In [34]:
import glob
import random
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from langdetect import detect
from langdetect import DetectorFactory
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm.notebook import tqdm

# import sys
# sys.path.append('code/cpet_articles/analysis/')
# from article_screening_re import *
# from nltk.probability import FreqDist

# import requests
# import pickle
# import time

In [35]:
# Glob pattern for the converted full-text articles (one .txt per article).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this directory exists.
TXT_GLOB = '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts/*.txt'
txt_files = glob.glob(TXT_GLOB)

In [36]:
def process_file_one_string(file_name, file_list):
    """Read one text file and return its contents as a single lower-cased string.

    Parameters
    ----------
    file_name : str
        Path, path fragment, or regular expression identifying the file.
    file_list : list of str
        Candidate paths; the first entry that matches `file_name` is read.

    Returns
    -------
    str or None
        The lower-cased file contents, or None when the file is 0 bytes.

    Raises
    ------
    IndexError
        If no entry of `file_list` matches `file_name`.
    """
    # File names often contain regex metacharacters ("(1)", "+", "["). A name
    # that is not a valid pattern must not abort the lookup, so fall back to
    # literal matching only in that case.
    try:
        txt_re = re.compile(file_name)
    except re.error:
        txt_re = None

    def _matches(path):
        # Literal containment covers real paths; the regex search keeps the
        # original pattern-based lookup working.
        if file_name in path:
            return True
        return txt_re is not None and txt_re.search(path) is not None

    fname = next((path for path in file_list if _matches(path)), None)
    if fname is None:
        # Same exception type the original `[0]` lookup raised, with context.
        raise IndexError(f'no file in file_list matches {file_name!r}')

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r') as f:
        text = f.read()

    return text.lower()

In [37]:
def tokenize_file(file_name, file_list, mode = 'lemm'):
    """Read one text file and return its stop-word-filtered, normalised tokens.

    Parameters
    ----------
    file_name : str
        Path, path fragment, or regular expression identifying the file.
    file_list : list of str
        Candidate paths; the first entry that matches `file_name` is read.
    mode : {'lemm', 'stem'}
        'lemm' lemmatizes tokens with WordNetLemmatizer, 'stem' stems them
        with PorterStemmer.

    Returns
    -------
    list of str or None
        The processed tokens, or None when the file is 0 bytes.

    Raises
    ------
    ValueError
        If `mode` is neither 'lemm' nor 'stem'.
    IndexError
        If no entry of `file_list` matches `file_name`.
    """
    # Validate up front: an unknown mode used to fall through and return None
    # after tokenizing, which looks exactly like the "empty file" result.
    if mode not in ('lemm', 'stem'):
        raise ValueError(f"mode must be 'lemm' or 'stem', got {mode!r}")

    # File names often contain regex metacharacters ("(1)", "+", "["). A name
    # that is not a valid pattern must not abort the lookup.
    try:
        txt_re = re.compile(file_name)
    except re.error:
        txt_re = None

    def _matches(path):
        # Literal containment covers real paths; the regex search keeps the
        # original pattern-based lookup working.
        if file_name in path:
            return True
        return txt_re is not None and txt_re.search(path) is not None

    fname = next((path for path in file_list if _matches(path)), None)
    if fname is None:
        # Same exception type the original `[0]` lookup raised, with context.
        raise IndexError(f'no file in file_list matches {file_name!r}')

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r') as f:
        text = f.read()

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t not in stop_words]

    if mode == 'lemm':
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(t) for t in filtered_tokens]

    # mode == 'stem' (guaranteed by the validation above)
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in filtered_tokens]

In [75]:
def oxygen_uptake_re(text):
    """Return True when `text` contains any oxygen-uptake related phrase.

    Three patterns are tried, all with re.DOTALL so the short gaps may span
    line breaks. Matching is case-sensitive.
    """
    patterns = (
        # "oxygen uptake" / "oxygen consumption", up to 5 characters apart
        r'oxygen.{0,5}(uptake|consumption)',
        # NOTE(review): the leading "v" and the trailing max/peak are both
        # optional, so this matches ANY "o2" substring (e.g. "co2").
        r'(v)?o2.{0,2}(max|peak)?',
        # "aerobic power" / "aerobic capacity", but not "anaerobic ..."
        r'(?<!an)aerobic.{0,2}(power|capacity)',
    )

    return any(re.search(p, text, re.DOTALL) is not None for p in patterns)

def gas_collection_methods_re(text):
    """Return True when `text` names a gas-collection method.

    Looks for "breath...breath", "douglas...bag" or "mixing...chamber", where
    the two words are at most 5 characters apart (re.DOTALL, so the gap may
    include a line break). Matching is case-sensitive.
    """
    method_patterns = (
        r'breath.{0,5}breath',
        r'douglas.{0,5}bag',
        r'mixing.{0,5}chamber',
    )

    for pattern in method_patterns:
        if re.search(pattern, text, re.DOTALL) is not None:
            return True
    return False

def vo2_units_re(text):
    """Return True when `text` contains relative VO2 units.

    Matches "ml", "kg" and "min" (kg and min in either order) separated only
    by non-letter characters, e.g. "ml/kg/min" or "ml.min-1.kg-1". Matching is
    case-sensitive.
    """
    relative_units = r'ml([^a-zA-Z]*kg[^a-zA-Z]*min|[^a-zA-Z]*min[^a-zA-Z]*kg)'
    # Disabled in the original (absolute units): r'(m)?l[^a-zA-Z]*min'

    return re.search(relative_units, text) is not None

def estimated_vo2_re(text):
    """Return True when `text` suggests oxygen uptake was estimated.

    Each pattern pairs one of the stems "estimat", "indirect" or "calculat"
    with an oxygen-uptake expression, in either order and at most 30
    characters apart. Patterns use re.VERBOSE (layout whitespace inside the
    literals is ignored) and re.DOTALL (gaps may span line breaks).
    """
    flags = re.DOTALL | re.VERBOSE

    # stem <-> "oxygen uptake" / "oxygen consumption"
    uptake_pattern = r'''(
    (estimat|indirect|calculat).{0,30}oxygen.{0,2}(uptake|consumption)|
    oxygen.{0,2}(uptake|consumption).{0,30}(estimat|indirect|calculat)
    )'''

    # stem <-> "(v)o2max" / "(v)o2peak"
    vo2_pattern = r'''(
    (estimat|indirect|calculat).{0,30}(v)?o2.{0,2}(max|peak)|
    (v)?o2.{0,2}(max|peak).{0,30}(estimat|indirect|calculat)
    )'''

    # stem <-> relative units (ml, kg, min separated by non-letters)
    units_pattern = r'''(
    (estimat|indirect|calculat).{0,30}ml([^a-zA-Z]*kg[^a-zA-Z]*min|[^a-zA-Z]*min[^a-zA-Z]*kg)|
    ml([^a-zA-Z]*kg[^a-zA-Z]*min|[^a-zA-Z]*min[^a-zA-Z]*kg).{0,30}(estimat|indirect|calculat)
    )'''

    # NOTE(review): the original carried a trailing note "assessment of
    # aerobic capacity" -- presumably a phrase still to be added; confirm.
    for pattern in (uptake_pattern, vo2_pattern, units_pattern):
        if re.search(pattern, text, flags) is not None:
            return True
    return False

In [38]:
# Draw the screening sample WITHOUT replacement. random.choices samples with
# replacement, so the same article could appear several times and inflate
# every downstream count. Sorting the pool and seeding a dedicated generator
# makes the draw reproducible across kernel restarts.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 0
test_files = random.Random(SAMPLE_SEED).sample(
    sorted(txt_files), k=min(SAMPLE_SIZE, len(txt_files))
)

In [53]:
# Keep only the part of each path that follows ".../full_texts/txts/".
test_files_short = [
    match.group()
    for match in (
        re.search(r'(?<=/full_texts/txts/).*', path) for path in test_files
    )
]

In [39]:
# Read each sampled article as one lower-cased string (None for empty files).
# process_file_one_string treats its first argument as a regular expression,
# so the path is escaped to stop file names with "(", "[" or "+" from failing
# to match themselves. Each lookup is limited to the file itself rather than
# re-scanning all of txt_files for every article.
long_word_lists = [
    process_file_one_string(re.escape(f), [f]) for f in tqdm(test_files)
]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [41]:
# Tokenize each article, drop English stop words, and lemmatize what is left.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
token_lists = []

for word_list in tqdm(long_word_lists):
    # process_file_one_string returns None for an empty txt file, and
    # word_tokenize cannot handle None. Keep an empty placeholder so
    # token_lists stays index-aligned with long_word_lists.
    if word_list is None:
        token_lists.append([])
        continue
    tokens = word_tokenize(word_list)
    filtered_tokens = [t for t in tokens if t not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]
    token_lists.append(lemmatized_words)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [58]:
# One row per sampled article: short file name, full lower-cased text, and
# the lemmatized tokens. The three lists must be the same length.
df = pd.DataFrame(
    data={
        'txt_file': test_files_short,
        'article_text': long_word_lists,
        'tokens': token_lists,
    }
)

In [63]:
# Mean token length per article. Only the 'tokens' column is needed, so map
# over that Series instead of a row-wise DataFrame.apply(axis=1), which is
# slower and does not return a Series on an empty frame. An article with no
# tokens gets NaN explicitly (np.mean([]) would give NaN plus a
# RuntimeWarning); NaN fails the comparison below and is dropped.
MIN_AVG_WORD_LEN = 1.5
df['avg_word_len'] = df['tokens'].map(
    lambda toks: np.mean([len(t) for t in toks]) if len(toks) > 0 else np.nan
)
df = df[df['avg_word_len'] > MIN_AVG_WORD_LEN].reset_index(drop=True) # removes articles with pdf to txt conversion issue

In [65]:
# langdetect is non-deterministic unless seeded: the same text can get a
# different label between runs. Fix the seed so the language filter is
# reproducible.
DetectorFactory.seed = 0
# Only 'article_text' is needed, so map over that column directly.
df['language'] = df['article_text'].map(detect)

In [66]:
# Keep rows whose detected language code is 'en' or 'cy'.
# NOTE(review): 'cy' is Welsh -- presumably kept because some English
# articles are labelled 'cy' by the detector; confirm.
keep_language = df['language'].isin(['en', 'cy'])
df = df.loc[keep_language].reset_index(drop=True)

In [76]:
# Flag each article with the four regex screens. Every screen reads only
# 'article_text', so map over that column instead of a row-wise
# DataFrame.apply(axis=1), which builds a Series per row and does not return
# a Series when the frame is empty.
df['o2_uptake'] = df['article_text'].map(oxygen_uptake_re)
df['vo2_units'] = df['article_text'].map(vo2_units_re)
df['gas_collection_methods'] = df['article_text'].map(gas_collection_methods_re)
df['estimated_vo2'] = df['article_text'].map(estimated_vo2_re)

In [None]:
# df[df['o2_uptake'] == False]  # articles with no oxygen-uptake wording are VERY likely NOT to include gas data

In [81]:
# (rows, columns) of the articles that name a gas-collection method.
df[df['gas_collection_methods'].eq(True)].shape  # these articles are VERY likely to INCLUDE gas data

(365, 9)