In [19]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# from nltk.probability import FreqDist
from pathlib import Path
import glob
import random
import pandas as pd
import re
# import requests
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold, train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB,ComplementNB,CategoricalNB
from statistics import mean, stdev
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import VotingClassifier
import numpy as np

In [9]:
def process_file(file_name, file_list):
    txt_re = re.compile(file_name)
    fname = list(filter(txt_re.search, file_list))[0]
    
    # check file size to make sure the txt file actually has text
    file_size = 0
    while file_size == 0:
        file_size = Path(fname).stat().st_size
        if file_size != 0: # check if conversion to txt didn't work
            with open(fname, 'r') as f:
                text = f.read()
        else:
            print('Empty file, returning None')
            return None
    text_lower = text.lower()
    tokens = word_tokenize(text_lower)
    stop_words = set(stopwords.words('english'))

    filtered_tokens = [t for t in tokens if t not in stop_words]
    
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]
    
    return lemmatized_words

In [10]:
txt_files = glob.glob('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/txts/*.txt')
manual_text_df = pd.read_csv('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/Manual text analysis - Data.csv')
manual_text_df['txt_file_name'] = manual_text_df.apply(lambda x: str(x['Article'] + '.txt'), axis=1)
# manual_text_df = manual_text_df[(manual_text_df['Eligible'] == 'e') & (manual_text_df['External ref'] == 'n')].reset_index(drop=True)

In [11]:
for idx, row in manual_text_df.iterrows():
    if row['Eligible'] != 'e':
        row['Gas data'] = 'n'
    if row['External ref'] == 'y':
        row['Gas data'] = 'n'

In [12]:
analyzed_txt_files = manual_text_df['txt_file_name'].to_list()

In [13]:
word_lists = [process_file(f, txt_files) for f in tqdm(analyzed_txt_files)]
joined_word_lists = [' '.join(text) for text in word_lists]

  0%|          | 0/317 [00:00<?, ?it/s]

In [14]:
manual_text_df[manual_text_df['Gas data'].isnull()]

Unnamed: 0,Article,Eligible,Eligibility note,External ref,Gas data,Gas Analyzer Type,Outliers,Interpolation type,Interpolation time (s),Avg type,Avg subtype,Avg amount,Avg MOS,Avg mean type,Data Processing Text,Avg phrase,Notes,Unnamed: 17,Unnamed: 18,txt_file_name


In [15]:
gas_data = manual_text_df['Gas data'].to_list()
counts = manual_text_df['Gas data'].value_counts()
print(f'Values counts:\n{counts}')
props = round(manual_text_df['Gas data'].value_counts() / sum(manual_text_df['Gas data'].value_counts()) * 100,1)
print()
print(f'Proportions:\n{props}')

Values counts:
y    162
n    155
Name: Gas data, dtype: int64

Proportions:
y    51.1
n    48.9
Name: Gas data, dtype: float64


You can use CountVectorizier and TfidfTransformer together, or just use TfidfVectorizer because that combines those steps together. With a Pipeline, however, I'm unsure if you need to separate those two steps or not.

In [77]:
names = [
#     'MultinomialNB',
    'BernoulliNB',
#     'GaussianNB',
#     'ComplementNB',
#     'SGDClassifier',
#     'SVC',
    'Logistic',
#     'LinearSVC',
    'NuSVC'
]

classifiers = [
#     MultinomialNB(),
    BernoulliNB(),
#     GaussianNB(),
#     ComplementNB(),
#     SGDClassifier(),
    SVC(kernel='linear', degree=3, gamma='auto', probability=True,),
    LogisticRegression(),
#     LinearSVC(),
    NuSVC(probability=True)
]

scores_df = pd.DataFrame(columns = [
    'Metric',
#     'MultinomialNB',
    'BernoulliNB',
#     'GaussianNB',
#     'ComplementNB',
#     'SGDClassifier',
    'SVC',
    'Logistic',
#     'LinearSVC',
    'NuSVC'
])
scores_df['Metric'] = ['Mean', 'Median', 'Min', 'Max', 'SD']

In [75]:
# compare accuracy of different models
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(joined_word_lists)
rskf_cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 5)

for name, clf in tqdm(zip(names, classifiers), total=len(classifiers)):
    clf.fit(X.toarray(), gas_data)
    scores = cross_val_score(clf, X.toarray(), gas_data, cv = rskf_cv)
    scores_df[name] = [
        round(np.mean(scores)*100,1),
        round(np.median(scores)*100,1),
        round(min(scores)*100,1),
        round(max(scores)*100,1),
        round(stdev(scores*100),1)
    ]

  0%|          | 0/4 [00:00<?, ?it/s]

In [76]:
scores_df

Unnamed: 0,Metric,BernoulliNB,Logistic,NuSVC
0,Mean,77.9,81.9,81.9
1,Median,77.8,82.5,81.2
2,Min,68.8,75.0,71.4
3,Max,88.9,90.5,90.5
4,SD,3.8,4.3,5.1


In [78]:
manual_analysis_pdfs = glob.glob('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/pdfs/manual_pdf_analysis/*.pdf')
# manual_analysis_pdfs[0].split('/')[-1].replace('.pdf','.txt')

manual_analysis_txts = [x.split('/')[-1].replace('.pdf', '.txt') for x in manual_analysis_pdfs]
# len(manual_analysis_txts)


In [39]:
def list_intersection(lst1, lst2, output = 'intersection'):
    if output == 'intersection':
        return list(set(lst1).intersection(lst2))
    elif output == 'difference':
        return list(set(lst1).difference(lst2))
    elif output == 'symmetric_difference':
        return list(set(lst1).symmetric_difference(lst2))
    else:
        print('Bad input, returning None')
        return None


In [40]:
test_articles = list_intersection(manual_analysis_txts, analyzed_txt_files, output='difference')
word_lists_test = [process_file(f, txt_files) for f in tqdm(test_articles)]
joined_word_lists_test = [' '.join(text) for text in word_lists_test]

  0%|          | 0/211 [00:00<?, ?it/s]

In [48]:
estimators = [(name, clf) for name, clf in zip(names, classifiers)]
estimators
vote_soft = VotingClassifier(estimators=estimators, voting='soft')
vote_soft.fit(X, gas_data)

VotingClassifier(estimators=[('BernoulliNB', BernoulliNB()),
                             ('Logistic',
                              SVC(gamma='auto', kernel='linear',
                                  probability=True)),
                             ('NuSVC', LogisticRegression())],
                 voting='soft')

In [52]:
X_test = vectorizer.transform(joined_word_lists_test)
preds = vote_soft.predict_proba(X_test)
# SVC_clf = SVC(kernel='linear', degree=3, gamma='auto')
# SVC_clf.fit(X, gas_data)
# preds = SVC_clf.predict(X_test)
# BNB_clf = BernoulliNB()
# BNB_clf.fit(X, gas_data)
# preds = BNB_clf.predict(X_test)

In [None]:
test_dict = {'article': test_articles, 'pred_n': preds[:,0], 'pred_y': preds[:,1]}
test_df = pd.DataFrame.from_dict(test_dict)
test_df['article'] = test_df['article'].apply(lambda x: x.replace('.txt', ''))

In [74]:
test_df.sort_values(by = 'pred_n', ascending=False).iloc[0:20,:]

Unnamed: 0,article,pred_n,pred_y
210,jnnp.37.11.1236,0.699058,0.300942
21,1517-869220152103137534,0.576079,0.423921
79,jaha.118.008837,0.413298,0.586702
93,j.1365-2125.1979.tb04720.x,0.411176,0.588824
143,hrt.62.6.445,0.391335,0.608665
90,hrt.61.2.161,0.390081,0.609919
106,fphys.2021.683942,0.376728,0.623272
149,ijn.s282200,0.375942,0.624058
138,circheartfailure.113.000187,0.375424,0.624576
166,hbm.22658,0.369019,0.630981


In [48]:
# test_dict = {'article': test_articles, 'pred': preds[0,1]}
# test_df = pd.DataFrame.from_dict(test_dict)
# pred_as_n = test_df[test_df['pred'] == 'n']['article']
# # pred_as_n.apply(lambda x: x.replace('.txt', '')).to_csv('/Users/antonhesse/Desktop/no_preds.csv')

In [49]:
pred_as_n.apply(lambda x: x.replace('.txt', ''))

55      j.1365-2125.1979.tb04720.x
107                      hbm.22658
122                jnnp.37.11.1236
137                   hrt.61.2.161
141                jaha.118.008837
158                    hrt.70.1.17
166                   hrt.44.3.259
180    circheartfailure.113.000187
181              fphys.2021.683942
185                   hrt.62.6.445
190                    ijn.s282200
Name: article, dtype: object