In [1]:
import pandas as pd
import re
import glob
from pathlib import Path
from tqdm.notebook import tqdm
from langdetect import detect
import sys
sys.path.append('code/cpet_articles/analysis/')
from article_screening_re import *

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import numpy as np
# from nltk.probability import FreqDist

# import random
# import pandas as pd

# # import requests

# import pickle
# import time

In [2]:
# Collect the paths of all converted article .txt files.
txt_files = glob.glob(
    '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/txts/*.txt'
)

# Load the manually coded article table (one row per article).
gas_clf_df = pd.read_csv(
    '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/Manual text analysis - exer.csv'
)

# The expected text file name is the article identifier plus a '.txt' suffix.
gas_clf_df['txt_file_name'] = [str(article + '.txt') for article in gas_clf_df['Article']]

# Optional filter (disabled): keep only eligible articles without an external reference.
# gas_clf_df = gas_clf_df[(gas_clf_df['Eligible'] == 'e') & (gas_clf_df['External ref'] == 'n')].reset_index(drop=True)

In [3]:
# Normalise the 'Gas data' label with boolean-mask assignments on the frame.
# Writing to the `row` object yielded by DataFrame.iterrows() is not guaranteed
# to modify the DataFrame (rows of a mixed-dtype frame are copies), so the
# relabelling must go through .loc to actually take effect.

# Articles that are not marked eligible ('e') are labelled as having no gas data.
not_eligible = gas_clf_df['Eligible'] != 'e'
gas_clf_df.loc[not_eligible, 'Gas data'] = 'n'

# Articles flagged with an external reference are labelled 'n' unless they were
# explicitly coded 'y'. Evaluated after the step above, as in the original loop.
external_without_gas = (gas_clf_df['External ref'] == 'y') & (gas_clf_df['Gas data'] != 'y')
gas_clf_df.loc[external_without_gas, 'Gas data'] = 'n'

In [4]:
def process_file_one_string(file_name, file_list):
    """Return the lower-cased contents of the text file matching ``file_name``.

    Parameters
    ----------
    file_name : str
        File name (or path fragment) to look for. It is matched literally, so
        characters such as ``.`` are not treated as regex wildcards.
    file_list : list of str
        Candidate file paths; the first path containing ``file_name`` is used.

    Returns
    -------
    str or None
        The whole file as one lower-cased string, or ``None`` (after printing a
        notice) when the file is empty on disk.

    Raises
    ------
    IndexError
        If no path in ``file_list`` contains ``file_name``.
    """
    # Literal substring match: compiling the raw name as a regex let '.' match
    # any character, which could silently pick up a different file.
    fname = next((path for path in file_list if file_name in path), None)
    if fname is None:
        raise IndexError(f'No file in file_list matches {file_name!r}')

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r') as f:
        text = f.read()

    return text.lower()

In [5]:
def process_file(file_name, file_list, mode = 'lemm'):
    """Read an article text file and return its normalised, stop-word-free tokens.

    The text is lower-cased, tokenised with ``word_tokenize``, stripped of NLTK's
    English stop words, and then either lemmatised or stemmed.

    Parameters
    ----------
    file_name : str
        File name (or path fragment) to look for. It is matched literally, so
        characters such as ``.`` are not treated as regex wildcards.
    file_list : list of str
        Candidate file paths; the first path containing ``file_name`` is used.
    mode : {'lemm', 'stem'}, default 'lemm'
        ``'lemm'`` applies ``WordNetLemmatizer``; ``'stem'`` applies ``PorterStemmer``.

    Returns
    -------
    list of str or None
        The processed tokens, or ``None`` (after printing a notice) when the
        file is empty on disk.

    Raises
    ------
    ValueError
        If ``mode`` is not ``'lemm'`` or ``'stem'`` (previously this fell
        through and silently returned ``None``).
    IndexError
        If no path in ``file_list`` contains ``file_name``.
    """
    if mode not in ('lemm', 'stem'):
        raise ValueError(f"mode must be 'lemm' or 'stem', got {mode!r}")

    # Literal substring match: compiling the raw name as a regex let '.' match
    # any character, which could silently pick up a different file.
    fname = next((path for path in file_list if file_name in path), None)
    if fname is None:
        raise IndexError(f'No file in file_list matches {file_name!r}')

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r') as f:
        text_lower = f.read().lower()

    tokens = word_tokenize(text_lower)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t not in stop_words]

    if mode == 'lemm':
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(t) for t in filtered_tokens]

    # mode == 'stem' (validated above)
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in filtered_tokens]

In [6]:
# Plain Python list of the text file names to be processed, in table order.
analyzed_txt_files = gas_clf_df['txt_file_name'].tolist()

In [8]:
# Lemmatised token list for every analysed article (None for empty files).
token_word_lists = list(
    map(lambda txt_name: process_file(txt_name, txt_files, mode='lemm'), tqdm(analyzed_txt_files))
)

# Full lower-cased text of every analysed article as a single string.
long_word_lists = list(
    map(lambda txt_name: process_file_one_string(txt_name, txt_files), tqdm(analyzed_txt_files))
)

  0%|          | 0/380 [00:00<?, ?it/s]

  0%|          | 0/380 [00:00<?, ?it/s]

In [10]:
# Minimum mean token length for an article to be considered properly converted.
MIN_AVG_WORD_LEN = 1.5


def _mean_token_length(tokens):
    """Return the mean length of ``tokens``, or NaN when there are none.

    ``process_file`` returns ``None`` for empty files; mapping ``len`` over
    ``None`` would raise ``TypeError``, so missing or empty token lists yield
    NaN instead and are removed by the threshold filter below.
    """
    if not isinstance(tokens, (list, tuple)) or len(tokens) == 0:
        return np.nan
    return np.mean([len(token) for token in tokens])


gas_clf_df['article_text'] = long_word_lists
gas_clf_df['tokens'] = token_word_lists
gas_clf_df['avg_word_len'] = gas_clf_df['tokens'].apply(_mean_token_length)

# Removes articles with a pdf-to-txt conversion issue; NaN compares False and is dropped too.
gas_clf_df = gas_clf_df[gas_clf_df['avg_word_len'] > MIN_AVG_WORD_LEN].reset_index(drop=True)

In [11]:
# langdetect's algorithm is non-deterministic by default, so the same text can be
# assigned different languages between runs. Fixing the seed before the first
# detection makes this cell reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# Detected language code for each article's full text.
gas_clf_df['language'] = gas_clf_df['article_text'].apply(detect)
# gas_clf_df['language'].value_counts()

In [12]:
# Keep articles whose detected language is English ('en') or Welsh ('cy').
# NOTE(review): 'cy' is presumably kept because some English articles are
# misdetected as Welsh — confirm.
language_is_accepted = gas_clf_df['language'].isin(['en', 'cy'])
gas_clf_df = gas_clf_df.loc[language_is_accepted].reset_index(drop=True)

In [29]:
def oxygen_uptake_re(text):
    """Return True if ``text`` matches any of the oxygen-uptake patterns.

    Patterns are case-sensitive and written in lower case. ``re.DOTALL`` lets
    the short ``.{0,n}`` gaps span line breaks.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    bool
        True when at least one pattern is found, otherwise False.
    """
    o2_uptake_re = re.compile(r'oxygen.{0,5}uptake', re.DOTALL)
    # Previously spelled 'oxyen', so "oxygen consumption" could never match.
    o2_consumption_re = re.compile(r'oxygen.{0,5}consumption', re.DOTALL)
    # NOTE(review): in the two patterns below everything except 'o2' is optional,
    # so each matches any text containing 'o2' (e.g. 'co2'). Left unchanged here;
    # tighten if only vo2max / vo2peak mentions are intended.
    vo2max_re = re.compile(r'(v)?o2.{0,2}(max)?', re.DOTALL)
    vo2peak_re = re.compile(r'(v)?o2.{0,2}(peak)?', re.DOTALL)
    aerobic_power_re = re.compile(r'aerobic.{0,2}power', re.DOTALL)

    patterns = (
        o2_uptake_re,
        o2_consumption_re,
        vo2max_re,
        vo2peak_re,
        aerobic_power_re,
    )

    mentions_o2_uptake = any(pattern.search(text) is not None for pattern in patterns)

    return mentions_o2_uptake

def gas_collection_methods_re(text):
    """Return True if ``text`` matches any gas-collection-method pattern.

    Three patterns are searched (breath ... breath, douglas ... bag and
    mixing ... chamber), each allowing up to five arbitrary characters,
    including newlines, between the two words.

    Parameters
    ----------
    text : str
        Text to search; patterns are lower case and case-sensitive.

    Returns
    -------
    bool
        True when at least one pattern is found, otherwise False.
    """
    method_patterns = (
        r'breath.{0,5}breath',
        r'douglas.{0,5}bag',
        r'mixing.{0,5}chamber',
    )

    for pattern in method_patterns:
        if re.search(pattern, text, re.DOTALL) is not None:
            return True
    return False

def vo2_units_re(text):
    """Return True if ``text`` contains an ml/kg/min-style unit in either order.

    The parts 'ml', 'kg' and 'min' may be separated by any run of non-letter
    characters (slashes, dots, exponents, whitespace, ...). Matching is
    case-sensitive and expects lower-case text.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    bool
        True when 'ml ... kg ... min' or 'ml ... min ... kg' is found.
    """
    unit_patterns = (
        r'ml[^a-zA-Z]*kg[^a-zA-Z]*min',
        r'ml[^a-zA-Z]*min[^a-zA-Z]*kg',
    )
    # Absolute units (disabled): r'(m)?l[^a-zA-Z]*min'

    for pattern in unit_patterns:
        if re.search(pattern, text) is not None:
            return True
    return False

In [30]:
# Apply each regex screen to the full article text and store the boolean result.
article_texts = gas_clf_df['article_text']
gas_clf_df['o2_uptake'] = article_texts.apply(oxygen_uptake_re)
gas_clf_df['vo2_units'] = article_texts.apply(vo2_units_re)
gas_clf_df['gas_collection_methods'] = article_texts.apply(gas_collection_methods_re)

In [31]:
# Manual 'Gas data' label (rows) against the oxygen-uptake regex flag (columns).
pd.crosstab(index=gas_clf_df['Gas data'], columns=gas_clf_df['o2_uptake'])

o2_uptake,False,True
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1
n,19,105
y,0,234


In [33]:
# Articles labelled 'n' for gas data that the oxygen-uptake screen still flagged.
gas_clf_df.loc[lambda df: df['Gas data'].eq('n') & df['o2_uptake'].eq(True)]

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,txt_file_name,article_text,tokens,avg_word_len,language,o2_uptake,vo2_units,gas_collection_methods
1,sjweh.2914,e,,n,n,sjweh.2914.txt,"downloaded from www.sjweh.fi on may 26, 2022\n...","[downloaded, www.sjweh.fi, may, 26, ,, 2022, s...",4.782363,en,True,False,False
7,systems2040425,e,,n,n,systems2040425.txt,"systems 2014, 2, 425-450; doi:10.3390/systems2...","[system, 2014, ,, 2, ,, 425-450, ;, doi:10.339...",4.628654,en,True,False,False
15,s40798-021-00363-1,e,,n,n,s40798-021-00363-1.txt,most et al. sports med - open (2021...,"[et, al, ., sport, med, -, open, (, 2021, ), 7...",4.669721,en,True,False,False
21,s40279-019-01103-y,e,,n,n,s40279-019-01103-y.txt,sports medicine (2019) 49:1465–1473 \nhttps://...,"[sport, medicine, (, 2019, ), 49:1465–1473, ht...",4.783203,en,True,False,False
22,s41467-019-10925-3,e,,n,n,s41467-019-10925-3.txt,article\n\nhttps://doi.org/10.1038/s41467-019-...,"[article, http, :, //doi.org/10.1038/s41467-01...",4.529885,en,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,jeb.02440,i,Non-human,,n,jeb.02440.txt,3940\n\nthe journal of experimental biology 20...,"[3940, journal, experimental, biology, 209, ,,...",4.784446,en,True,True,False
352,j.msard.2022.103721,e,,n,n,j.msard.2022.103721.txt,contents lists available at sciencedirect \n\n...,"[content, list, available, sciencedirect, mult...",4.775318,en,True,False,False
354,s00421-006-0181-1,i,Not OP-RR,,n,s00421-006-0181-1.txt,grimmer et al. journal of neuroengineering and...,"[grimmer, et, al, ., journal, neuroengineering...",4.421915,en,True,True,False
355,mss.0000000000001204,i,Not OP-RR,,n,mss.0000000000001204.txt,\n\n. . . published ahead of print \n \n \n \...,"[., ., ., published, ahead, print, meta-analys...",4.594075,en,True,True,False


In [178]:
# Manual 'Gas data' label (rows) against the gas-collection-method regex flag (columns).
pd.crosstab(index=gas_clf_df['Gas data'], columns=gas_clf_df['gas_collection_methods'])

gas_collection_methods,False,True
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1
n,122,3
y,140,103


In [77]:
# Articles labelled 'n' for gas data that the gas-collection-method screen still flagged.
gas_clf_df.loc[lambda df: df['Gas data'].eq('n') & df['gas_collection_methods'].eq(True)]

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,txt_file_name,article_text,language,vo2_units,says_bbb,gas_collection_methods
131,journal.pone.0140616,e,,n,n,journal.pone.0140616.txt,research article\n\ninfluence of hypoxic inter...,en,True,False,True
143,japplphysiol.00864.2015,i,Not OP-RR,,n,japplphysiol.00864.2015.txt,"j appl physiol 120: 481–494, 2016.\nfirst publ...",en,True,True,True
159,422056,i,Non-human,,n,422056.txt,division of comparative physiology and biochem...,en,True,True,True


In [75]:
# Manual 'Gas data' label (rows) against the combination of both regex flags (columns).
pd.crosstab(
    index=gas_clf_df['Gas data'],
    columns=[gas_clf_df['gas_collection_methods'], gas_clf_df['vo2_units']],
)

gas_collection_methods,False,False,True,True
vo2_units,False,True,False,True
Gas data,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
n,92,30,0,3
y,45,95,16,87


In [102]:
# Articles labelled 'y' for gas data that neither the units nor the methods screen flagged.
gas_clf_df.loc[
    lambda df: df['Gas data'].eq('y')
    & df['vo2_units'].eq(False)
    & df['gas_collection_methods'].eq(False)
]

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,txt_file_name,article_text,language,vo2_units,gas_collection_methods
6,thx.35.9.680,e,,n,y,thx.35.9.680.txt,"thorax, 198), 35, 680-685\n\ncholinergic block...",en,False,False
11,tjem.233.135,e,,n,y,tjem.233.135.txt,"tohoku j. exp. med., 2014, 233, 135-140\n\n135...",en,False,False
52,s0100-879x2011007500073,e,,n,y,s0100-879x2011007500073.txt,www.bjournal.com.br\nwww.bjournal.com.br\n\nis...,en,False,False
60,s0007114521002208,e,,n,y,s0007114521002208.txt,h\nt\nt\np\ns\n:\n/\n/\nd\no\n\ni\n.\n\n.\n\no...,en,False,False
62,nu14091776,e,,n,y,nu14091776.txt,article\neffects of trehalose solutions at dif...,en,False,False
78,s00421-011-2129-3,e,,n,y,s00421-011-2129-3.txt,university of montana \nuniversity of montana ...,en,False,False
81,oem.34.2.126,e,,n,y,oem.34.2.126.txt,"british journal ofindustrial medicine, 1977, 3...",en,False,False
91,journal.pone.0208452,e,,n,y,journal.pone.0208452.txt,research article\neccentric cycling does not i...,en,False,False
93,jrrd.2007.09.0153,e,,n,y,jrrd.2007.09.0153.txt,"jrrdjrrd volume 45, number 6, 2008\n\njournal ...",en,False,False
103,mss.0000000000001353,e,,n,y,mss.0000000000001353.txt,washington university school of medicine \nwas...,en,False,False
