In [5]:
import glob
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm.notebook import tqdm
# import random
# import requests
# import pickle
# import time

In [6]:
# Root folder of the article data. Both inputs below live under it, so the
# machine-specific location is stated (and edited) in exactly one place.
CPET_ARTICLES_DIR = Path('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles')

# Every converted full-text file, as a list of path strings.
txt_files = glob.glob(str(CPET_ARTICLES_DIR / 'full_texts' / 'txts' / '*.txt'))

# Manually labelled articles; the columns used below are 'Article',
# 'Eligible', 'External ref' and 'Gas data'.
gas_clf_df = pd.read_csv(CPET_ARTICLES_DIR / 'Manual text analysis - exer.csv')

# Name of the text file expected for each article (vectorised string concat).
gas_clf_df['txt_file_name'] = gas_clf_df['Article'] + '.txt'
# gas_clf_df = gas_clf_df[(gas_clf_df['Eligible'] == 'e') & (gas_clf_df['External ref'] == 'n')].reset_index(drop=True)

In [7]:
# Assigning into the `row` yielded by DataFrame.iterrows() is not guaranteed
# to reach gas_clf_df (pandas may hand back a copy), so the relabelling could
# be silently lost. Relabel through boolean masks and .loc instead, which
# always writes to the frame itself.

# Step 1: anything not marked eligible ('e') is labelled as having no gas data.
not_eligible = gas_clf_df['Eligible'] != 'e'
gas_clf_df.loc[not_eligible, 'Gas data'] = 'n'

# Step 2: externally referenced articles whose label is not an explicit 'y'
# are labelled 'n' as well. The mask is built after step 1 so it sees the
# updated labels, matching the order of the per-row checks it replaces.
external_without_gas = (gas_clf_df['External ref'] == 'y') & (gas_clf_df['Gas data'] != 'y')
gas_clf_df.loc[external_without_gas, 'Gas data'] = 'n'

In [8]:
def process_file_one_string(file_name, file_list):
    """Return the lower-cased contents of the text file named ``file_name``.

    The name is matched literally, never as a regular expression, so dots
    and characters such as ``+`` or ``(`` in article identifiers cannot
    select the wrong file or raise ``re.error``.

    Parameters
    ----------
    file_name : str
        File name to look for, e.g. ``'srep42485.txt'``.
    file_list : list of str
        Candidate file paths. A path whose final component equals
        ``file_name`` is preferred; otherwise the first path containing
        ``file_name`` as a literal substring is used.

    Returns
    -------
    str or None
        The file's text in lower case, or ``None`` (after printing a
        notice) when the file is empty.

    Raises
    ------
    IndexError
        If no path in ``file_list`` matches ``file_name``.
    """
    # Exact file-name match first; literal substring match as a fallback so
    # callers that pass a partial name keep working.
    candidates = [f for f in file_list if Path(f).name == file_name]
    if not candidates:
        candidates = [f for f in file_list if file_name in str(f)]
    if not candidates:
        raise IndexError(f'no file matching {file_name!r} in file_list')
    fname = candidates[0]

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r') as f:
        text = f.read()

    return text.lower()

In [9]:
def tokenize_file(file_name, file_list, mode = 'lemm'):
    """Tokenize one article text file and normalise its tokens.

    The file is read, lower-cased, split with ``word_tokenize``, stripped of
    NLTK's English stop words, and then lemmatized or stemmed.

    Parameters
    ----------
    file_name : str
        File name to look for, e.g. ``'srep42485.txt'``. It is matched
        literally, never as a regular expression.
    file_list : list of str
        Candidate file paths. A path whose final component equals
        ``file_name`` is preferred; otherwise the first path containing
        ``file_name`` as a literal substring is used.
    mode : {'lemm', 'stem'}, default 'lemm'
        ``'lemm'`` applies ``WordNetLemmatizer``; ``'stem'`` applies
        ``PorterStemmer``.

    Returns
    -------
    list of str or None
        The processed tokens, or ``None`` (after printing a notice) when
        the file is empty.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'lemm'`` nor ``'stem'``.
    IndexError
        If no path in ``file_list`` matches ``file_name``.
    """
    # Reject an unknown mode up front instead of doing all the work and then
    # falling off the end of the function with an implicit None.
    if mode not in ('lemm', 'stem'):
        raise ValueError(f"mode must be 'lemm' or 'stem', got {mode!r}")

    # Exact file-name match first; literal substring match as a fallback so
    # callers that pass a partial name keep working.
    candidates = [f for f in file_list if Path(f).name == file_name]
    if not candidates:
        candidates = [f for f in file_list if file_name in str(f)]
    if not candidates:
        raise IndexError(f'no file matching {file_name!r} in file_list')
    fname = candidates[0]

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r') as f:
        text = f.read()

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t not in stop_words]

    if mode == 'lemm':
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(t) for t in filtered_tokens]

    # mode == 'stem' (validated above)
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in filtered_tokens]

In [10]:
# Expected text-file name of every labelled article, in DataFrame row order.
analyzed_txt_files = gas_clf_df['txt_file_name'].tolist()

In [11]:
# Lemmatized token list per article, in the same order as analyzed_txt_files.
token_word_lists = []
for txt_name in tqdm(analyzed_txt_files):
    token_word_lists.append(tokenize_file(txt_name, txt_files, mode='lemm'))

# Whole lower-cased text per article, in the same order.
long_word_lists = []
for txt_name in tqdm(analyzed_txt_files):
    long_word_lists.append(process_file_one_string(txt_name, txt_files))

  0%|          | 0/380 [00:00<?, ?it/s]

  0%|          | 0/380 [00:00<?, ?it/s]

In [12]:
gas_clf_df['article_text'] = long_word_lists
gas_clf_df['tokens'] = token_word_lists

# Empty source files come back from the loaders as None. Give those rows (and
# rows with an empty token list) NaN instead of crashing on len()/mean(), so
# the threshold below drops them: NaN > 1.5 evaluates to False.
gas_clf_df['avg_word_len'] = gas_clf_df['tokens'].map(
    lambda toks: np.mean([len(tok) for tok in toks]) if toks else np.nan
)
gas_clf_df = gas_clf_df[gas_clf_df['avg_word_len'] > 1.5].reset_index(drop=True) # removes articles with pdf to txt conversion issue

In [13]:
# langdetect is non-deterministic unless its factory seed is fixed; pin it so
# the detected languages (and therefore the rows kept by the language filter)
# are the same on every Restart & Run All.
DetectorFactory.seed = 0

# ISO 639-1 code guessed by langdetect for each article's full text.
gas_clf_df['language'] = gas_clf_df['article_text'].map(detect)
# gas_clf_df['language'].value_counts()

In [14]:
# Keep articles detected as English ('en') or Welsh ('cy').
# NOTE(review): 'cy' is presumably kept because some English texts get
# labelled Welsh by the detector - confirm.
detected_en_or_cy = gas_clf_df['language'].isin(['en', 'cy'])
gas_clf_df = gas_clf_df.loc[detected_en_or_cy].reset_index(drop=True)

In [128]:
def oxygen_uptake_re(text):
    """Report whether ``text`` contains oxygen-uptake terminology.

    Three patterns are tried: "oxygen ... uptake/consumption", an "o2"
    abbreviation optionally followed by max/peak, and "aerobic
    power/capacity" not preceded by "an". Matching is case-sensitive, so
    callers should pass lower-cased text.

    Parameters
    ----------
    text : str
        Article text to search.

    Returns
    -------
    bool
        ``True`` if at least one pattern matches, else ``False``.
    """
    # The compiled patterns are bound to the same names the searches below
    # use. Previously they were bound to different names, so the function
    # only ran when stale globals with the searched names existed in the kernel.
    o2_uptake_re = re.compile(r'oxygen.{0,5}(uptake|consumption)', re.DOTALL)
    # NOTE(review): everything except 'o2' is optional here, so this also
    # matches e.g. 'co2' - confirm that this breadth is intended.
    vo2max_re = re.compile(r'(v)?o2.{0,2}(max|peak)?', re.DOTALL)
    # Negative lookbehind keeps 'anaerobic power/capacity' from matching.
    aerobic_power_re = re.compile(r'(?<!an)aerobic.{0,2}(power|capacity)', re.DOTALL)

    mo_list = [
        o2_uptake_re.search(text),
        vo2max_re.search(text),
        aerobic_power_re.search(text)]

    mentions_o2_uptake = any(mo is not None for mo in mo_list)

    return mentions_o2_uptake

def gas_collection_methods_re(text):
    """Report whether ``text`` mentions one of three gas-collection phrases.

    The phrases are "breath ... breath", "douglas ... bag" and
    "mixing ... chamber", each allowing up to five arbitrary characters
    (newlines included) between the two words. Matching is case-sensitive.

    Parameters
    ----------
    text : str
        Article text to search.

    Returns
    -------
    bool
        ``True`` if any phrase is found, else ``False``.
    """
    phrase_patterns = (
        r'breath.{0,5}breath',
        r'douglas.{0,5}bag',
        r'mixing.{0,5}chamber',
    )

    for phrase in phrase_patterns:
        if re.search(phrase, text, re.DOTALL) is not None:
            return True

    return False

def vo2_units_re(text):
    """Report whether ``text`` contains an ml/kg/min-style unit.

    A hit is "ml" followed by "kg" then "min", or "min" then "kg", with
    only non-letter characters between the parts. Matching is
    case-sensitive.

    Parameters
    ----------
    text : str
        Article text to search.

    Returns
    -------
    bool
        ``True`` if such a unit is found, else ``False``.
    """
    # Alternatives that are currently switched off:
    # mL_min_kg_re = re.compile(r'ml[^a-zA-Z]*min[^a-zA-Z]*kg')
    # L_mL_min = re.compile(r'(m)?l[^a-zA-Z]*min')
    relative_unit_hit = re.search(
        r'ml([^a-zA-Z]*kg[^a-zA-Z]*min|[^a-zA-Z]*min[^a-zA-Z]*kg)',
        text,
    )

    return relative_unit_hit is not None

def estimated_vo2_re(text):
    """Report whether ``text`` pairs an estimation word with a VO2 term.

    An estimation word ("estimat", "indirect" or "calculat") must occur
    within 30 characters, before or after, of one of: "oxygen
    uptake/consumption", an "o2 max/peak" abbreviation, or an
    ml/kg/min-style unit. Matching is case-sensitive.

    Parameters
    ----------
    text : str
        Article text to search.

    Returns
    -------
    bool
        ``True`` if any of the three pairings is found, else ``False``.
    """
    # The patterns are written across several lines, hence VERBOSE; DOTALL
    # lets '.' span line breaks in the article text.
    pattern_flags = re.DOTALL | re.VERBOSE

    # Estimation word near the spelled-out term.
    wording_pattern = r'''(
    (estimat|indirect|calculat).{0,30}oxygen.{0,2}(uptake|consumption)|
    oxygen.{0,2}(uptake|consumption).{0,30}(estimat|indirect|calculat)
    )'''

    # Estimation word near the abbreviation.
    abbreviation_pattern = r'''(
    (estimat|indirect|calculat).{0,30}(v)?o2.{0,2}(max|peak)|
    (v)?o2.{0,2}(max|peak).{0,30}(estimat|indirect|calculat)
    )'''

    # Estimation word near the units.
    units_pattern = r'''(
    (estimat|indirect|calculat).{0,30}ml([^a-zA-Z]*kg[^a-zA-Z]*min|[^a-zA-Z]*min[^a-zA-Z]*kg)|
    ml([^a-zA-Z]*kg[^a-zA-Z]*min|[^a-zA-Z]*min[^a-zA-Z]*kg).{0,30}(estimat|indirect|calculat)
    )'''

    for candidate in (wording_pattern, abbreviation_pattern, units_pattern):
        if re.search(candidate, text, pattern_flags) is not None:
            return True

    return False
    # assessment of aerobic capacity

In [129]:
# One boolean column per regex detector, each computed from the article text.
article_texts = gas_clf_df['article_text']
gas_clf_df['o2_uptake'] = article_texts.map(oxygen_uptake_re)
gas_clf_df['vo2_units'] = article_texts.map(vo2_units_re)
gas_clf_df['gas_collection_methods'] = article_texts.map(gas_collection_methods_re)
gas_clf_df['estimated_vo2'] = article_texts.map(estimated_vo2_re)

In [130]:
# if there is NO reference to O2 uptake, we can for sure say it's a no
pd.crosstab(index=gas_clf_df['Gas data'], columns=gas_clf_df['o2_uptake'])

o2_uptake,False,True
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1
n,19,106
y,0,233


In [131]:
# Manual 'Gas data' label against the estimated-VO2 regex flag.
pd.crosstab(index=gas_clf_df['Gas data'], columns=gas_clf_df['estimated_vo2'])

estimated_vo2,False,True
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1
n,95,30
y,166,67


In [135]:
# Manual 'Gas data' label against the gas-collection-method regex flag.
pd.crosstab(index=gas_clf_df['Gas data'], columns=gas_clf_df['gas_collection_methods'])

gas_collection_methods,False,True
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1
n,122,3
y,122,111


In [136]:
# Manual 'Gas data' label against the VO2-units regex flag.
pd.crosstab(index=gas_clf_df['Gas data'], columns=gas_clf_df['vo2_units'])

vo2_units,False,True
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1
n,92,33
y,52,181


In [134]:
# Articles flagged as "estimated VO2" that are nevertheless labelled 'y'.
flagged_estimated = gas_clf_df['estimated_vo2'].eq(True)
labelled_gas = gas_clf_df['Gas data'].eq('y')
gas_clf_df.loc[flagged_estimated & labelled_gas]

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,txt_file_name,article_text,tokens,avg_word_len,language,o2_uptake,vo2_units,gas_collection_methods,estimated_vo2
5,tad-2005-17207,e,,n,y,tad-2005-17207.txt,pdf hosted at the radboud repository of the ra...,"[pdf, hosted, radboud, repository, radboud, un...",4.708717,en,True,True,False,True
8,s41746-021-00531-3,e,,n,y,s41746-021-00531-3.txt,www.nature.com/npjdigitalmed\n\nopen\n\narticl...,"[www.nature.com/npjdigitalmed, open, article, ...",5.023291,en,True,True,True,True
10,srep42485,e,,n,y,srep42485.txt,open\n\nreceived: 15 august 2016\naccepted: 11...,"[open, received, :, 15, august, 2016, accepted...",4.568605,en,True,True,True,True
13,s1517-86922011000100008,e,,n,y,s1517-86922011000100008.txt,physiological and neuromuscular variables \nas...,"[physiological, neuromuscular, variable, assoc...",4.896287,en,True,True,False,True
24,sc.2017.34,e,,n,y,sc.2017.34.txt,"spinal cord (2017) 55, 935–939\n& 2017 interna...","[spinal, cord, (, 2017, ), 55, ,, 935–939, &, ...",4.579608,en,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,ajpregu.00182.2018,e,,n,y,ajpregu.00182.2018.txt,am j physiol regul integr comp physiol 315: r5...,"[j, physiol, regul, integr, comp, physiol, 315...",4.960239,en,True,False,False,True
324,02640414.2011.562232,e,,n,y,02640414.2011.562232.txt,"effects of optimal pacing strategies for 400-,...","[effect, optimal, pacing, strategy, 400-, ,, 8...",4.443731,en,True,True,True,True
332,fphys.2019.00365,e,,n,y,fphys.2019.00365.txt,original research\npublished: 05 april 2019\nd...,"[original, research, published, :, 05, april, ...",4.758354,en,True,True,True,True
337,thx.44.9.716,e,,n,y,thx.44.9.716.txt,thorax 1989;44:716-720\n\nadverse effect of ad...,"[thorax, 1989, ;, 44:716-720, adverse, effect,...",4.214140,en,True,False,False,True


In [78]:
# Manual label against the combination of the three regex flags.
flag_columns = [
    gas_clf_df['gas_collection_methods'],
    gas_clf_df['vo2_units'],
    gas_clf_df['o2_uptake'],
]
pd.crosstab(index=gas_clf_df['Gas data'], columns=flag_columns)

gas_collection_methods,False,False,False,True,True
vo2_units,False,False,True,False,True
o2_uptake,False,True,True,True,True
Gas data,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
n,15,77,30,0,3
y,0,34,88,18,93


In [118]:
# Labelled 'n', yet the text has VO2 units and O2-uptake wording but no
# gas-collection method.
units_without_methods_negatives = (
    gas_clf_df['Gas data'].eq('n')
    & gas_clf_df['gas_collection_methods'].eq(False)
    & gas_clf_df['vo2_units'].eq(True)
    & gas_clf_df['o2_uptake'].eq(True)
)
gas_clf_df.loc[units_without_methods_negatives]

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,txt_file_name,article_text,tokens,avg_word_len,language,o2_uptake,vo2_units,gas_collection_methods
26,s12887-022-03257-7,e,,n,n,s12887-022-03257-7.txt,eskandarifard et al. bmc pediatrics (...,"[eskandarifard, et, al, ., bmc, pediatrics, (,...",4.564543,en,True,True,False
31,s00421-017-3677-y,i,,,n,s00421-017-3677-y.txt,the metabolic costs of walking and running up ...,"[metabolic, cost, walking, running, 30, degree...",4.747726,en,True,True,False
40,s0066-782x2012005000092,e,,n,n,s0066-782x2012005000092.txt,original article\n\naerobic training does not ...,"[original, article, aerobic, training, alter, ...",4.940076,en,True,True,False
83,pes.2018-0141,i,Not OP-RR,,n,pes.2018-0141.txt,"pediatric exercise science, 2019, 31, 184-190\...","[pediatric, exercise, science, ,, 2019, ,, 31,...",4.860014,en,True,True,False
100,jsc.0b013e3181876ad0,e,,n,n,jsc.0b013e3181876ad0.txt,"bampouras, theodoros and marrin, kelly (2009) ...","[bampouras, ,, theodoros, marrin, ,, kelly, (,...",4.916015,en,True,True,False
120,nu10070815,e,,n,n,nu10070815.txt,article\noptimal adherence to a mediterranean ...,"[article, optimal, adherence, mediterranean, d...",4.444631,en,True,True,False
184,bjsm.2008.046391,e,,n,n,bjsm.2008.046391.txt,original article\n\n1unit for preventive \nnut...,"[original, article, 1unit, preventive, nutriti...",4.602708,en,True,True,False
212,journal.pone.0118809,i,Non-human,,n,journal.pone.0118809.txt,research article\n\nendurance training inhibit...,"[research, article, endurance, training, inhib...",4.905543,en,True,True,False
217,s12931-020-01350-y,e,,n,n,s12931-020-01350-y.txt,swinarew et al. respiratory research ...,"[swinarew, et, al, ., respiratory, research, (...",5.22577,en,True,True,False
227,s12889-017-4237-4,e,,n,n,s12889-017-4237-4.txt,tikanmäki et al. bmc public health (2017) 17:...,"[tikanmäki, et, al, ., bmc, public, health, (,...",4.792403,en,True,True,False


In [106]:
# Rows where O2 uptake is mentioned but neither a collection method nor
# VO2 units were found; the two lists differ only in the manual label.
uptake_mention_only = (
    gas_clf_df['gas_collection_methods'].eq(False)
    & gas_clf_df['vo2_units'].eq(False)
    & gas_clf_df['o2_uptake'].eq(True)
)

token_list_n = gas_clf_df.loc[
    gas_clf_df['Gas data'].eq('n') & uptake_mention_only, 'tokens'
].tolist()

token_list_y = gas_clf_df.loc[
    gas_clf_df['Gas data'].eq('y') & uptake_mention_only, 'tokens'
].tolist()

In [107]:
# Token frequencies across the 'n' articles.
fdist_n = FreqDist()
for article_tokens in tqdm(token_list_n):
    fdist_n.update(article_tokens)

# Token frequencies across the 'y' articles.
fdist_y = FreqDist()
for article_tokens in tqdm(token_list_y):
    fdist_y.update(article_tokens)


  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

In [115]:
# fdist_n.most_common()[0:40]

Articles where they *estimated* physical fitness are going to be tricky

In [116]:
# fdist_y.most_common()[0:40]

## Passes
1. If the article does NOT mention anything about oxygen uptake, it does NOT include gas data
2. If the article DOES mention gas collection methods, it INCLUDES gas data