In [1]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
import numpy as np

In [2]:
import sys

# make the project's helper modules importable from this notebook;
# the membership guard keeps sys.path free of duplicate entries when the cell is re-run
helper_funcs_dir = '/Users/antonhesse/Desktop/Anton/Education/UMN/PhD/Dissertation/CPET_scoping_review/code/cpet_articles/analysis/helper_funcs'
if helper_funcs_dir not in sys.path:
    sys.path.append(helper_funcs_dir)
from text_analysis import tokenize_file

In [3]:
# collect every full-text .txt file and its stem (the stem is later matched against doi_suffix)
txts_dir = Path('/Users/antonhesse/Desktop/Anton/Education/UMN/PhD/Dissertation/CPET_scoping_review/data/cpet_articles/full_texts/txts')
txt_file_paths = [*txts_dir.glob('*.txt')]
txt_file_stems = [txt_path.stem for txt_path in txt_file_paths]

In [4]:
# read the op-rr eligibility sheet and keep only rows that already carry an op-rr value
op_rr_csv = '/Users/antonhesse/Desktop/Anton/Education/UMN/PhD/Dissertation/CPET_scoping_review/data/cpet_articles/text_analysis/eligibility/Eligibility - op-rr.csv'
op_rr_df = (
    pd.read_csv(op_rr_csv)
    .dropna(subset=['op-rr'])   # articles that haven't been assessed for op-rr status are dropped
    .drop_duplicates()
    .reset_index(drop=True)
)

In [5]:
# gather file paths of articles used to build the random forest ML model
# build the lookup set once: membership tests are then O(1) instead of
# re-creating and scanning a list of every doi_suffix for each path
labelled_stems = set(op_rr_df['doi_suffix'])
file_paths_for_model = [path for path in txt_file_paths if path.stem in labelled_stems]
file_stems_for_model = [path.stem for path in file_paths_for_model]

# one row per labelled article that has a text file on disk
files_df = pd.DataFrame(
    {'doi_suffix': file_stems_for_model,
     'file_path': file_paths_for_model}
)

In [6]:
# attach the op-rr labels to the files found on disk; the path and prediction
# columns already present in the CSV are discarded before merging
csv_only_cols = ['file_path', 'pred_op-rr', 'pred_0.5']
labels_df = op_rr_df.drop(columns=csv_only_cols)
merge_df = files_df.merge(labels_df, how='inner', on='doi_suffix')
merge_df

Unnamed: 0,doi_suffix,file_path,op-rr
0,s00421-007-0554-0,/Users/antonhesse/Desktop/Anton/Education/UMN/...,False
1,s40279-021-01523-9,/Users/antonhesse/Desktop/Anton/Education/UMN/...,False
2,jbc.m117.817510,/Users/antonhesse/Desktop/Anton/Education/UMN/...,True
3,ijspp.2013-0486,/Users/antonhesse/Desktop/Anton/Education/UMN/...,True
4,s12984-018-0401-z,/Users/antonhesse/Desktop/Anton/Education/UMN/...,False
...,...,...,...
1500,s13063-019-3560-8,/Users/antonhesse/Desktop/Anton/Education/UMN/...,False
1501,jbc.m112.440354,/Users/antonhesse/Desktop/Anton/Education/UMN/...,True
1502,chest.107.5.1206,/Users/antonhesse/Desktop/Anton/Education/UMN/...,True
1503,a-1273-7589,/Users/antonhesse/Desktop/Anton/Education/UMN/...,True


In [7]:
# tokenize every labelled article (mode='lemm') and keep the token list in its own column
merge_df['tokens'] = merge_df['file_path'].progress_apply(
    lambda article_path: tokenize_file(article_path, mode='lemm')
)
# collapse each token list into a single space-separated string
merge_df['joined_tokens'] = merge_df['tokens'].progress_apply(' '.join)

  0%|          | 0/1505 [00:00<?, ?it/s]

  0%|          | 0/1505 [00:00<?, ?it/s]

In [8]:
# instantiate the TF-IDF vectorizer and the random forest classifier
vectorizer = TfidfVectorizer(stop_words='english')
# fixed seed so the fitted forest, and therefore its predictions, are reproducible across re-runs
rf_clf = RandomForestClassifier(random_state=42, verbose=1)

In [9]:
# fit the TF-IDF vocabulary on the labelled articles, then train the forest on the dense matrix
# (accuracy is not computed in this cell; see the cross-validation cell)
train_texts = merge_df['joined_tokens'].to_list()
train_labels = merge_df['op-rr'].to_list()
X = vectorizer.fit_transform(train_texts)
rf_clf.fit(X.toarray(), train_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.0min finished


RandomForestClassifier(verbose=1)

In [15]:
# estimate accuracy with repeated stratified 5-fold cross-validation (2 repeats);
# the splitter is seeded so the reported score is reproducible
rskf_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
scores = cross_val_score(rf_clf, X.toarray(), merge_df['op-rr'].to_list(), cv=rskf_cv)
# scale to percent before rounding: rounding first and multiplying afterwards can
# expose float noise (e.g. 0.57 * 100 == 56.99999999999999)
mean_score = round(np.mean(scores) * 100, 1)
print(f'Current Accuracy: {mean_score}%')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   23.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   22.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   16.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

Current Accuracy: 83.8%


In [10]:
# read the full bbb article list and the list of articles already known to be ineligible
bbb_csv = '/Users/antonhesse/Desktop/Anton/Education/UMN/PhD/Dissertation/CPET_scoping_review/data/cpet_articles/text_analysis/bbb_articles.csv'
ineligible_csv = '/Users/antonhesse/Desktop/Anton/Education/UMN/PhD/Dissertation/CPET_scoping_review/data/cpet_articles/text_analysis/eligibility/ineligible_articles_combined.csv'
all_bbb_articles = pd.read_csv(bbb_csv)
ineligible_articles = pd.read_csv(ineligible_csv)

# keep only bbb rows whose doi_suffix is not on the ineligible list
is_ineligible = all_bbb_articles['doi_suffix'].isin(ineligible_articles['doi_suffix'])
bbb_articles = all_bbb_articles.loc[~is_ineligible].drop_duplicates().reset_index(drop=True)

In [11]:
# find the bbb article files whose stem is not in op_rr_df
# sets are built once, so each path costs one O(1) lookup instead of
# re-creating and scanning two lists for every file
assessed_stems = set(op_rr_df['doi_suffix'])
unassessed_bbb_stems = set(bbb_articles['doi_suffix']) - assessed_stems
remaining_bbb_file_paths = [path for path in txt_file_paths if path.stem in unassessed_bbb_stems]
remaining_bbb_file_stems = [path.stem for path in remaining_bbb_file_paths]

In [12]:
# tokenize every remaining article; unusable ones are stored as None so that
# test_texts stays aligned one-to-one with remaining_bbb_file_paths
# (None entries are filtered out when test_df is built)
test_texts = []
for path in tqdm(remaining_bbb_file_paths, total=len(remaining_bbb_file_paths)):
    try:
        tokens = tokenize_file(path, mode='lemm')
    except FileNotFoundError as e:
        print(e)
        tokens = None
    # a None result must be recorded as a missing text; appending `joined_tokens`
    # unconditionally would repeat the previous article's text under this article's
    # name (or raise NameError if the very first file yields None)
    test_texts.append(' '.join(tokens) if tokens is not None else None)

  0%|          | 0/7896 [00:00<?, ?it/s]

In [13]:
# build the test frame (one row per remaining article) and discard rows without text
test_df = (
    pd.DataFrame({
        'doi_suffix': [article_path.stem for article_path in remaining_bbb_file_paths],
        'file_path': list(remaining_bbb_file_paths),
        'joined_tokens': test_texts,
    })
    .dropna(subset=['joined_tokens'])
    .reset_index(drop=True)
)

In [14]:
# create predictions and calculate class probabilities
# vectorize the filtered test_df column rather than the raw test_texts list:
# test_texts can hold None placeholders that test_df has already dropped, which
# would either crash the vectorizer or leave preds with more rows than test_df
X_test = vectorizer.transform(test_df['joined_tokens'].to_list())
preds = rf_clf.predict_proba(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    9.5s finished


In [74]:
# Uncomment to get info on the prediction classes
# print(preds)
# print(rf_clf.classes_)

In [75]:
# create output data frame: one row per test article with its predicted label
# and the distance of the predicted probability from the 0.5 decision boundary
assert len(preds) == len(test_df), 'preds must have exactly one row per test_df row'

# column 0 of predict_proba is taken as P(op-rr == False)
# NOTE(review): this assumes rf_clf.classes_ is ordered [False, True] — confirm with rf_clf.classes_
p_false = preds[:, 0]

out_df = test_df[['doi_suffix', 'file_path']].copy()
out_df['op-rr'] = None                      # true label not yet assigned for these articles
out_df['pred_op-rr'] = p_false <= 0.5       # vectorized form of: False if p_false > 0.5 else True
out_df['pred_0.5'] = abs(p_false - 0.5)     # 0 = maximally uncertain, 0.5 = maximally confident

In [76]:
# stack the rows from op_rr_df on top of the model predictions, order them
# (descending) by label, predicted label and distance from 0.5, and store doi_suffix as str
sort_cols = ['op-rr', 'pred_op-rr', 'pred_0.5']
comb_df = (
    pd.concat([op_rr_df, out_df])
    .sort_values(sort_cols, ascending=False)
    .reset_index(drop=True)
    .assign(doi_suffix=lambda frame: frame['doi_suffix'].astype('str'))
)

In [77]:
# comb_df

In [80]:
# copy the combined table to the clipboard, leaving out the index column
comb_df.to_clipboard(index=False)

In [79]:
# write the combined table (op_rr_df rows plus predictions) to disk without the index column
pred_csv_path = Path(
    '/Users/antonhesse/Desktop/Anton/Education/UMN/PhD/Dissertation/CPET_scoping_review/data/cpet_articles/text_analysis/eligibility/pred_op-rr.csv')
comb_df.to_csv(pred_csv_path, index=False)