In [73]:
import pandas as pd
import re
import glob
from pathlib import Path
from tqdm.notebook import tqdm
from langdetect import detect
import numpy as np
import shutil

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
# import random
# import requests
# import pickle
# import time

In [67]:
def process_file_one_string(file_name, file_list):
    txt_re = re.compile(re.escape(file_name))
    fname = list(filter(txt_re.search, file_list))[0]
    
    # check file size to make sure the txt file actually has text
    file_size = 0
    while file_size == 0:
        file_size = Path(fname).stat().st_size
        if file_size != 0: # check if conversion to txt didn't work
            with open(fname, 'r') as f:
                text = f.read()
        else:
            print('Empty file, returning None')
            return None
    text_lower = text.lower()
    
    return text_lower

In [68]:
def tokenize_file(file_name, file_list, mode = 'lemm'):
    txt_re = re.compile(file_name)
    fname = list(filter(txt_re.search, file_list))[0]
    
    # check file size to make sure the txt file actually has text
    file_size = 0
    while file_size == 0:
        file_size = Path(fname).stat().st_size
        if file_size != 0: # check if conversion to txt didn't work
            with open(fname, 'r') as f:
                text = f.read()
        else:
            print('Empty file, returning None')
            return None
    text_lower = text.lower()
    tokens = word_tokenize(text_lower)
    stop_words = set(stopwords.words('english'))

    filtered_tokens = [t for t in tokens if t not in stop_words]
    
    if mode == 'lemm':
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]

        return lemmatized_words
    
    elif mode == 'stem':
        stemmer = PorterStemmer()
        stemmed_words = [stemmer.stem(t) for t in filtered_tokens]
    
        return stemmed_words

In [69]:
txt_file_paths = [str(path) for path in list(Path('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts').rglob('*.txt'))]


In [70]:
txt_files = [path.stem for path in list(Path('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts').rglob('*.txt'))]


In [71]:
long_word_lists = [process_file_one_string(f, txt_file_paths) for f in tqdm(txt_files)]

  0%|          | 0/9449 [00:00<?, ?it/s]

In [72]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
token_lists = []

for word_list in tqdm(long_word_lists):
    tokens = word_tokenize(word_list)
    filtered_tokens = [t for t in tokens if t not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]
    token_lists.append(lemmatized_words)

  0%|          | 0/9449 [00:00<?, ?it/s]

In [75]:
df = pd.DataFrame({'txt_file': txt_files,
                   'article_text': long_word_lists,
                    'tokens': token_lists})

In [79]:
df['avg_word_len'] = df.apply(lambda x: np.mean(list(map(len, x['tokens']))), axis=1)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [80]:
df.iterrows

Unnamed: 0,txt_file,article_text,tokens,avg_word_len
0,jsc.0000000000000862,\n \n \n\n \n \n \n\n \n \n \n \n \n \n\n \n\...,"[copyright, notice, feduni, researchonline, ht...",4.686279
1,fnut.2021.718936,original research\npublished: 21 september 202...,"[original, research, published, :, 21, septemb...",5.113729
2,s003600000151,51© springer international publishing switzerl...,"[51©, springer, international, publishing, swi...",4.651454
3,s00421-022-04928-0,european journal of applied physiology (2022) ...,"[european, journal, applied, physiology, (, 20...",4.697708
4,s00421-007-0554-0,the travelling athlete\n\nnebojša nikolić\n\n1...,"[travelling, athlete, nebojša, nikolić, 16, 16...",5.178954
...,...,...,...,...
9444,chest.124.6.2377,https://api.elsevier.com/content/article/pii/s...,"[http, :, //api.elsevier.com/content/article/p...",5.658039
9445,chest.125.4.1292,https://api.elsevier.com/content/article/pii/s...,"[http, :, //api.elsevier.com/content/article/p...",5.603693
9446,77.12.1295,https://api.elsevier.com/content/article/pii/s...,"[http, :, //api.elsevier.com/content/article/p...",6.413466
9447,chest.107.5.1206,https://api.elsevier.com/content/article/pii/s...,"[http, :, //api.elsevier.com/content/article/p...",7.885780


In [None]:
# move files with parsing error
for idx, row in tqdm(df.iterrows(), total=len(df)):
    if row['avg_word_len'] < 1.5:
        try:
            txt_re = re.compile(re.escape(row['txt_file']))
            fname = list(filter(txt_re.search, txt_file_paths))[0]
            dest = '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/pdf_to_txt_parsing_error/' + row['txt_file'] + '.txt'
            shutil.move(fname, dest)
        except Exception as e:
            print(e)


In [98]:
df = df[df['avg_word_len'] > 1.5].reset_index(drop=True) # removes articles with pdf to txt conversion issue


In [99]:
df['language'] = df.apply(lambda x: detect(x['article_text']), axis=1)

In [100]:
df['language'].value_counts()

en    9097
pt      44
ro      39
es      14
de      11
ja       4
ko       2
fr       2
no       1
tr       1
it       1
ru       1
ca       1
Name: language, dtype: int64

In [110]:
# move files not in english
error_list = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    if row['language'] not in ['en', 'cy']:
        try:
            txt_re = re.compile(re.escape(row['txt_file']))
            fname = list(filter(txt_re.search, txt_file_paths))[0]
            dest = '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/non-english/' + row['txt_file'] + '.txt'
            shutil.move(fname, dest)
        except Exception as e:
            print(e)
            error_list.append(

  0%|          | 0/9218 [00:00<?, ?it/s]

[Errno 2] No such file or directory: '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts/06.26.pms.119c11z4.txt'
[Errno 2] No such file or directory: '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts/s1517-86922009000200010.txt'
[Errno 2] No such file or directory: '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts/s0066-782x2006000600007.txt'
[Errno 2] No such file or directory: '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts/s00408-003-1009-y.txt'
[Errno 2] No such file or directory: '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts/jcj.52.162.txt'
[Errno 2] No such file or directory: '/Users/antonhesse/Desktop/Anton/Educ

In [109]:
# df = df[df['language'].isin(['en', 'cy'])].reset_index(drop=True)
row = df.iloc[0,:]
row['language'] not in ['en', 'cy']

False