In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from pathlib import Path
import glob
import random
import pandas as pd
import re
# import requests
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold, train_test_split, GridSearchCV
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB,ComplementNB,CategoricalNB
from statistics import mean, stdev
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time

In [2]:
def process_file(file_name, file_list, mode = 'lemm'):
    """Read one article's text file and return its normalized tokens.

    The file is looked up in ``file_list`` by name, read, lower-cased,
    tokenized with ``word_tokenize``, stripped of English stop words and then
    lemmatized or stemmed depending on ``mode``.

    Parameters
    ----------
    file_name : str
        Name of the text file to find (e.g. ``'article.txt'``). It is matched
        literally, never interpreted as a regular expression.
    file_list : list of str
        Candidate file paths to search.
    mode : {'lemm', 'stem'}, default 'lemm'
        ``'lemm'`` applies ``WordNetLemmatizer``; ``'stem'`` applies
        ``PorterStemmer``.

    Returns
    -------
    list of str or None
        The processed tokens, or ``None`` when the matched file is empty
        (0 bytes).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'lemm'`` nor ``'stem'``.
    IndexError
        If no path in ``file_list`` matches ``file_name``.
    """
    if mode not in ('lemm', 'stem'):
        raise ValueError(f"mode must be 'lemm' or 'stem', got {mode!r}")

    # Prefer an exact base-name match; fall back to a literal substring match.
    # Neither treats '.', '(', '+', ... in the file name as regex syntax.
    matches = [f for f in file_list if Path(f).name == file_name]
    if not matches:
        matches = [f for f in file_list if file_name in str(f)]
    if not matches:
        raise IndexError(f'No file matching {file_name!r} found in file_list')
    fname = matches[0]

    # A 0-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t not in stop_words]

    if mode == 'lemm':
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(t) for t in filtered_tokens]

    # mode == 'stem' (validated above)
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in filtered_tokens]

In [3]:
# Folder holding the converted article texts and the manual classification sheet.
# NOTE(review): machine-specific absolute path -- point this at your own copy of the data.
DATA_DIR = Path('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles')

# glob returns paths in arbitrary order; sort so the candidate list (and therefore
# which file is picked first for a given name) is the same on every run.
txt_files = sorted(glob.glob(str(DATA_DIR / 'txts' / '*.txt')))
gas_clf_df = pd.read_csv(DATA_DIR / 'Manual text analysis - Gas clf.csv')

# Expected text-file name for each article: '<Article>.txt'.
gas_clf_df['txt_file_name'] = gas_clf_df['Article'] + '.txt'
# gas_clf_df = gas_clf_df[(gas_clf_df['Eligible'] == 'e') & (gas_clf_df['External ref'] == 'n')].reset_index(drop=True)

In [4]:
# Force 'Gas data' to 'n' for articles that are not eligible, and for articles
# that rely on an external reference unless they are explicitly labelled 'y'.
# Assign through .loc on the DataFrame itself: writing to the `row` yielded by
# iterrows() only changes a copy and leaves the DataFrame untouched.
not_eligible = gas_clf_df['Eligible'] != 'e'
gas_clf_df.loc[not_eligible, 'Gas data'] = 'n'

# Evaluated after the update above, mirroring the order of the two row-level checks.
external_without_gas = (gas_clf_df['External ref'] == 'y') & (gas_clf_df['Gas data'] != 'y')
gas_clf_df.loc[external_without_gas, 'Gas data'] = 'n'

In [5]:
# Text-file names of every article in the classification sheet, in row order.
analyzed_txt_files = gas_clf_df['txt_file_name'].tolist()

In [6]:
# Tokenize and lemmatize every listed article; tqdm shows progress over the files.
word_lists = [
    process_file(txt_name, txt_files, mode='lemm')
    for txt_name in tqdm(analyzed_txt_files)
]
# joined_word_lists = [' '.join(text) for text in word_lists]

  0%|          | 0/620 [00:00<?, ?it/s]

In [7]:
# Store each article's processed token list as a new 'words' column
# (one entry per row; an entry is None where process_file found an empty file).
gas_clf_df['words'] = word_lists

## Removing articles with the pdf-to-txt word length issue

Sometimes pdf2txt.py makes every character its own word. I'm going to remove the affected articles for the sake of simplicity.

In [8]:
# Articles whose average token length is at or below this threshold are treated
# as broken conversions (every character became its own token) and dropped.
MIN_AVG_WORD_LEN = 1.5


def _avg_word_len(words):
    """Return the mean token length of ``words``, or NaN when there are no tokens.

    ``words`` may be None (process_file returns None for empty files) or an
    empty list; both yield NaN instead of raising or emitting a warning.
    """
    if not isinstance(words, (list, tuple)) or len(words) == 0:
        return np.nan
    return np.mean([len(word) for word in words])


gas_clf_df['avg_word_len'] = gas_clf_df['words'].apply(_avg_word_len)
# NaN compares False, so articles without tokens are removed here as well.
# .copy() makes the result an independent frame, so columns added to it later
# do not trigger SettingWithCopyWarning.
gas_clf_df = gas_clf_df[gas_clf_df['avg_word_len'] > MIN_AVG_WORD_LEN].copy()

In [9]:
# Corpus-wide token counts: fold each article's token list into one FreqDist.
fdist = FreqDist()
for article_tokens in tqdm(gas_clf_df['words']):
    fdist.update(article_tokens)

  0%|          | 0/604 [00:00<?, ?it/s]

In [10]:
# Show only the top of the distribution instead of dumping the whole vocabulary.
# Punctuation aside, "exercise" is by far the most common token.
fdist.most_common(60)

[(',', 313552),
 ('.', 223151),
 (')', 124261),
 ('(', 123542),
 (';', 38446),
 (':', 38132),
 ('exercise', 22944),
 ('%', 18398),
 ('[', 18094),
 (']', 18049),
 ('p', 17703),
 ('et', 16718),
 ('±', 16016),
 ('1', 15881),
 ('study', 15306),
 ('2', 14500),
 ('=', 13426),
 ('e', 13312),
 ('j', 13256),
 ('n', 12839),
 ('r', 12564),
 ('al', 12346),
 ('effect', 10286),
 ('3', 10018),
 ('c', 9912),
 ('test', 9125),
 ('training', 8984),
 ('muscle', 8833),
 ('*', 8368),
 ('group', 8223),
 ('rate', 8017),
 ('b', 7679),
 ('5', 7662),
 ('h', 7420),
 ('4', 7383),
 ('time', 7279),
 ('performance', 6959),
 ('level', 6700),
 ('6', 6661),
 ('0', 6529),
 ('<', 6431),
 ('oxygen', 6400),
 ('data', 6388),
 ('j.', 6376),
 ('sport', 6335),
 ('activity', 6324),
 ('10', 6238),
 ('l', 6214),
 ('’', 6081),
 ('mean', 6069),
 ('body', 6048),
 ('change', 6026),
 ('increase', 6008),
 ('value', 5872),
 ('using', 5851),
 ('physical', 5717),
 ('subject', 5648),
 ('patient', 5599),
 ('blood', 5566),
 ('result', 5562),


In [11]:
def test_word(df, indicator_col_name, text_col_name, test_word):
    test_words = []
    temp_df = df
    for idx, row in temp_df.iterrows():
        if test_word in row[text_col_name]:
            test_words.append('y')
        else:
            test_words.append('n')
    temp_df[test_word] = test_words
    ct = pd.crosstab(temp_df[indicator_col_name], temp_df[test_word], margins=True)
    return ct

In [12]:
# Cross-tabulate the manual 'Gas data' label against whether each article contains "exercise".
# exercise seems like the next best word to add to the electronic search
test_word(gas_clf_df, 'Gas data', 'words', 'exercise')

exercise,n,y,All
Gas data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
n,91,220,311
y,5,288,293
All,96,508,604


In [14]:
# Flag each article 'y'/'n' according to whether "exercise" is among its tokens.
exercise = ['y' if 'exercise' in tokens else 'n' for tokens in gas_clf_df['words']]
gas_clf_df['exercise'] = exercise

In [15]:
# Articles labelled as having gas data whose text never contains "exercise".
df = gas_clf_df.loc[gas_clf_df['exercise'].eq('n') & gas_clf_df['Gas data'].eq('y')]
df

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,txt_file_name,words,avg_word_len,exercise
329,s40279-017-0811-2,e,,n,y,s40279-017-0811-2.txt,"[sport, med, (, 2018, ), 48:1009–1019, http, :...",3.926697,n
364,fspor.2020.00044,e,,n,y,fspor.2020.00044.txt,"[original, research, published, :, 28, april, ...",5.034149,n
377,jeb.024927,e,,n,y,jeb.024927.txt,"[corrigendum, control, function, arm, swing, h...",4.810747,n
386,jeb.02782,e,,n,y,jeb.02782.txt,"[2390, journal, experimental, biology, 210, ,,...",4.966608,n
567,sms.12514,e,,n,y,sms.12514.txt,"[reduced, oxygen, cost, running, related, alig...",4.76857,n


In [None]:
# For every article, the character length of each of its tokens.
word_len = [[len(token) for token in tokens] for tokens in gas_clf_df['words']]

In [None]:
# Mean token length per article, then display the articles averaging under 3.5 characters.
gas_clf_df['avg_word_len'] = [np.mean(lengths) for lengths in word_len]
gas_clf_df.loc[gas_clf_df['avg_word_len'].lt(3.5)]