In [84]:
import glob
import random
import re
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [42]:
# Collect the path of every converted article (.txt) in the corpus directory.
# glob.glob gives no ordering guarantee (it depends on the file system), so the
# paths are sorted to keep the file order reproducible between runs and machines.
# NOTE(review): the absolute local path makes the notebook machine-specific;
# consider a configurable data directory.
txt_files = sorted(glob.glob('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/txts/*.txt'))
len(txt_files)

11057

In [10]:
# Draw random article files until one with a non-zero size turns up, then read it.
# A zero-byte file means the conversion to txt didn't work, so it is skipped.
while True:
    txt_file = random.choice(txt_files)
    file_size = Path(txt_file).stat().st_size
    if file_size == 0:
        print('Skipping empty file')
        continue
    with open(txt_file, 'r') as f:
        text = f.read()
    print(txt_file)
    break
# Lower-case once so the later token comparisons are case-insensitive.
text_lower = text.lower()

/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/txts/k.52.1.2.txt


In [11]:
# Split the lower-cased article text into tokens with NLTK's word tokenizer.
tokens = word_tokenize(text_lower)
# Peek at the first ten tokens to sanity-check the tokenizer output.
tokens[:10]

['comparison',
 'of',
 'non-maximal',
 'tests',
 'for',
 'estimating',
 'exercise',
 'capacity',
 'reem',
 'a.']

In [16]:
# English stop words from the NLTK corpus, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stop words; nothing else is filtered out here.
filtered_tokens = list(filter(lambda t: t not in stop_words, tokens))

filtered_tokens[:10]

['comparison',
 'non-maximal',
 'tests',
 'estimating',
 'exercise',
 'capacity',
 'reem',
 'a.',
 'alajmi',
 ',']

In [21]:
# Reduce every remaining token to its Porter stem.
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, filtered_tokens))
# Show the first twenty stems for a quick look at what stemming does to the tokens.
stemmed_words[:20]

['comparison',
 'non-maxim',
 'test',
 'estim',
 'exercis',
 'capac',
 'reem',
 'a.',
 'alajmi',
 ',',
 'carl',
 'foster',
 ',',
 'john',
 'p.',
 'porcari',
 ',',
 'kimberley',
 'radtk',
 ',']

In [26]:
# Lemmatizing yields actual words, whereas stemming often returns only the root,
# which is frequently not a word itself.
# WordNetLemmatizer is imported in the notebook's import cell with the other NLTK names.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]
lemmatized_words[:10]

['comparison',
 'non-maximal',
 'test',
 'estimating',
 'exercise',
 'capacity',
 'reem',
 'a.',
 'alajmi',
 ',']

In [41]:
# Load the table of manually analyzed articles.
manual_text_df = pd.read_csv('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/manual_text_analysis.csv')
# Build each article's text-file name by appending '.txt' to its 'Article' value.
manual_text_df['txt_file_name'] = manual_text_df['Article'].map(lambda article: str(article + '.txt'))
# Plain Python list of the file names, in table order.
analyzed_txt_files = manual_text_df['txt_file_name'].tolist()

['nu9080917.txt',
 'bmjopen-2019-034610.txt',
 'jsc.0000000000000862.txt',
 'journal.pone.0114729.txt',
 'jsc.0000000000001679.txt',
 'fspor.2020.00099.txt',
 's00421-006-0321-7.txt',
 'j.jpsychires.2015.03.011.txt',
 's00421-019-04255-x.txt',
 'biom11040504.txt',
 'journal.pone.0017007.txt',
 'bmjsem-2015-000056.txt',
 'geosciences9020066.txt',
 'ajpp11.848.txt',
 '1414-431x20176400.txt',
 'circimaging.116.005511.txt',
 'eneuro.0008-16.2016.txt',
 'japplphysiol.00837.2019.txt',
 'biom10101394.txt',
 'journal.pone.0209069.txt',
 'cpf.12454.txt',
 'jeb.017533.txt',
 'hrt.49.6.584.txt',
 'meps13827.txt',
 'japplphysiol.00056.2019.txt',
 'mss.0000000000002637.txt']

In [63]:
def process_file(file_name, file_list):
    """Load one article's text file and return its lower-cased, stop-word-free tokens.

    Parameters
    ----------
    file_name : str
        Base name of the text file to load (e.g. ``'nu9080917.txt'``). It is
        compared literally against the final path component of each candidate.
    file_list : list of str
        Candidate file paths in which to look the name up.

    Returns
    -------
    list of str or None
        Tokens produced by ``word_tokenize`` on the lower-cased text with English
        stop words removed, or ``None`` when the matching file is empty.

    Raises
    ------
    FileNotFoundError
        If no path in ``file_list`` has ``file_name`` as its base name.
    """
    # Compare base names literally. The file names are full of dots, which a regex
    # would treat as wildcards, and a regex search would also accept partial names.
    fname = next((p for p in file_list if Path(p).name == file_name), None)
    if fname is None:
        raise FileNotFoundError(f'{file_name!r} was not found in file_list')

    # A zero-byte file means the conversion to txt didn't work.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    # Explicit encoding so the result does not depend on the platform default.
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    return [t for t in tokens if t not in stop_words]

In [119]:
# Tokenize every manually analyzed article, in table order.
# An entry is None when process_file found the article's text file to be empty.
words = [process_file(f, txt_files) for f in analyzed_txt_files]

In [81]:
# Attach each article's token list to the manually coded table.
manual_text_df['words'] = words

# Pair every article's tokens with its manual 'Gas data?' label.
# process_file returns None for empty text files; those articles have no tokens
# and would make the later word loops fail on None, so they are left out here.
documents = [
    (doc_words, label)
    for doc_words, label in zip(words, manual_text_df['Gas data?'])
    if doc_words is not None
]
n_skipped = len(words) - len(documents)
if n_skipped:
    print(f'Skipped {n_skipped} article(s) with no text')

# Shuffle in place. NOTE(review): the shuffle is unseeded, so the document order
# differs between runs; seed `random` if reproducibility matters.
random.shuffle(documents)

In [111]:
# Pool the tokens of every document into one flat list.
all_words = []
for doc in documents:
    all_words.extend(doc[0])
# Count how often each token occurs across the pooled documents.
fdist = FreqDist(all_words)
# Every distinct token becomes a candidate feature.
word_features = [*fdist.keys()]

In [117]:
def find_features(document, ref_features):
    """Map each reference feature to whether it occurs in ``document``.

    ``document`` is an iterable of tokens and ``ref_features`` an iterable of
    candidate tokens. The returned dict has one key per reference feature, in
    iteration order, with value ``True`` if that token appears in the document
    and ``False`` otherwise.
    """
    # Set membership keeps each lookup cheap regardless of document length.
    present = set(document)
    return {feature: (feature in present) for feature in ref_features}

In [123]:
# Turn each (tokens, label) document into a (feature dict, label) pair, where the
# feature dict marks which of the tokens in word_features occur in that document.
feature_sets = [(find_features(words, word_features), gas) for (words, gas) in documents]

In [126]:
# Inspect the first (feature dict, label) pair.
# NOTE(review): the feature dict has one entry per token in word_features, so this
# prints a very large output; consider displaying only a small slice of it.
feature_sets[0]

({'constant': True,
  'versus': True,
  'variable-intensity': True,
  'cycling': True,
  ':': True,
  'effects': True,
  'subsequent': True,
  'running': True,
  'performance': True,
  'thierry': True,
  'bernard': True,
  ',': True,
  'fabrice': True,
  'vercruyssen': True,
  'cyrille': True,
  'mazure': True,
  'philippe': True,
  'gorce': True,
  'christophe': True,
  'hausswirth': True,
  'jeanick': True,
  'brisswalter': True,
  'cite': True,
  'version': True,
  'et': True,
  'al': True,
  '..': True,
  '.': True,
  'european': True,
  'journal': True,
  'applied': True,
  'physiology': True,
  'springer': True,
  'verlag': True,
  '2007': True,
  '99': True,
  '(': True,
  '2': True,
  ')': True,
  'pp.103-111': True,
  '\uffff10.1007/s00421-': True,
  '006-0321-7\uffff': True,
  '\uffffhal-01708106\uffff': True,
  'hal': True,
  'id': True,
  'hal-01708106': True,
  'https': True,
  '//hal-insep.archives-ouvertes.fr/hal-01708106': True,
  'submitted': True,
  '13': True,
  'feb