In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from pathlib import Path
import glob
import random
import pandas as pd
import re
import requests
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB,ComplementNB
from statistics import mean
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [6]:
def process_file(file_name, file_list):
    """Load one article's text file and return its cleaned, lemmatized tokens.

    Parameters
    ----------
    file_name : str
        Identifier of the article to load (the notebook passes the DOI
        suffix, i.e. the file stem). It is matched *literally*; characters
        such as ``.``, ``(`` or ``)`` are not interpreted as regex syntax.
    file_list : iterable of str or path-like
        Candidate text-file paths to search.

    Returns
    -------
    list of str or None
        Lower-cased tokens with English stop words removed and each token
        lemmatized. ``None`` (after printing a notice) when the matched file
        is empty.

    Raises
    ------
    IndexError
        If no entry of ``file_list`` matches ``file_name``.
    """
    candidates = [str(p) for p in file_list]

    # Prefer an exact stem match so that e.g. 'abc' cannot pick 'abc-2.txt';
    # fall back to a literal substring match for looser identifiers.
    matches = [p for p in candidates if Path(p).stem == file_name]
    if not matches:
        matches = [p for p in candidates if file_name in p]
    if not matches:
        raise IndexError(f'no file matching {file_name!r} found in file_list')
    fname = matches[0]

    # A zero-byte file means the conversion to txt did not work.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    text_lower = text.lower()
    tokens = word_tokenize(text_lower)
    stop_words = set(stopwords.words('english'))

    filtered_tokens = [t for t in tokens if t not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]

    return lemmatized_words

In [7]:
def get_raw_text(file_name, file_list):
    """Return the lower-cased, untokenized text of one article's text file.

    Parameters
    ----------
    file_name : str
        Identifier of the article to load (e.g. the DOI suffix / file stem).
        It is matched *literally*, never as a regular expression.
    file_list : iterable of str or path-like
        Candidate text-file paths to search.

    Returns
    -------
    str or None
        The file's full contents, lower-cased. ``None`` (after printing a
        notice) when the matched file is empty.

    Raises
    ------
    IndexError
        If no entry of ``file_list`` matches ``file_name``.
    """
    candidates = [str(p) for p in file_list]

    # Prefer an exact stem match; fall back to a literal substring match.
    matches = [p for p in candidates if Path(p).stem == file_name]
    if not matches:
        matches = [p for p in candidates if file_name in p]
    if not matches:
        raise IndexError(f'no file matching {file_name!r} found in file_list')
    fname = matches[0]

    # A zero-byte file means the conversion to txt did not work.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    # The original computed this value but never returned it.
    text_lower = text.lower()
    return text_lower

In [8]:
# Directory holding the plain-text conversions of the full-text articles.
# NOTE: machine-specific absolute path; adjust when running elsewhere.
TXT_DIR = Path('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/txts')

# glob() yields files in arbitrary, filesystem-dependent order; sort so that
# row order (and therefore the seeded train/test split) is reproducible.
txt_file_paths = sorted(TXT_DIR.glob('*.txt'))

# File stems are used as the join key against the manual label table.
txt_files = [path.stem for path in txt_file_paths]
txt_files_df = pd.DataFrame({'doi_suffix': txt_files})

In [16]:
# Load the manually coded label table and keep only usable rows:
# eligible articles ('e') that have a 'Gas data' label, de-duplicated.
# manual_text_df['txt_file_name'] = manual_text_df.apply(lambda x: str(x['doi_suffix'] + '.txt'), axis=1) # not sure I need this
manual_text_df = (
    pd.read_csv('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/Manual text analysis - Data.csv')
    # remove non-original research, animal studies, etc.
    .loc[lambda df: df['Eligible'].eq('e') & df['Gas data'].notna()]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [26]:
# Inner join on the DOI suffix: only analyze articles for which we have
# both a txt file and a manual label row.
merge_df = txt_files_df.merge(manual_text_df, how='inner', on='doi_suffix')

In [27]:
# Sanity check: (rows, columns) of the merged table, i.e. how many labelled
# articles have a matching txt file.
merge_df.shape

(320, 18)

In [28]:
# Convert the paths to strings once instead of once per article.
txt_file_names = list(map(str, txt_file_paths))

# One token list per article, in the same row order as merge_df.
words = [process_file(f, txt_file_names) for f in tqdm(merge_df['doi_suffix'].to_list())]

# process_file returns None for an empty txt file. Map that to an empty
# document instead of crashing in join(), and keep one entry per row so the
# documents stay aligned with the labels taken from merge_df.
joined_words = [' '.join(text) if text is not None else '' for text in words]

100%|██████████| 320/320 [00:29<00:00, 10.87it/s]


In [29]:
# Target labels, one per row of merge_df (same order as joined_words).
gas_data = merge_df.loc[:, 'Gas data'].tolist()

In [30]:
# Hold out 25% of the articles for evaluation; the fixed seed keeps the
# split the same across runs for a given input order.
features_train, features_test, labels_train, labels_test = train_test_split(
    joined_words,
    gas_data,
    test_size=0.25,
    random_state=10,
)


In [31]:
# TF-IDF vectorizer using scikit-learn's built-in English stop-word list.
# NOTE(review): process_file already removes NLTK English stop words before
# the documents reach this step, so this second filter may be redundant - confirm.
vectorizer = TfidfVectorizer(stop_words='english')

In [32]:
# Fit the vocabulary/IDF weights on the training documents only, then apply
# the already-fitted vectorizer to the test documents.
# NOTE: this rebinds features_train/features_test from lists of strings to
# vectorized matrices, so the train/test split cell must be re-run before
# re-running this cell.
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test)

In [152]:
# X = vectorizer.fit_transform(joined_words)
# vocabulary = vectorizer.get_feature_names_out()

In [153]:
# Bernoulli naive Bayes with default settings, trained on the TF-IDF
# training matrix.
BNB = BernoulliNB()
BNB.fit(X=features_train, y=labels_train)

BernoulliNB()

In [154]:
# Score the Bernoulli model on the held-out articles.
BNB_pred = BNB.predict(features_test)
accuracy_BNB = metrics.accuracy_score(labels_test, BNB_pred)
print(f"BernoulliNB accuracy: {accuracy_BNB}")

# Rows are true labels, columns are predicted labels.
BNB_confusion = metrics.confusion_matrix(labels_test, BNB_pred)
print("confusion matrix:")
print(BNB_confusion)

BernoulliNB accuracy: 0.6923076923076923
confusion matrix:
[[ 0  8]
 [ 0 18]]


In [155]:
# Multinomial naive Bayes with default settings, trained on the same
# TF-IDF training matrix for comparison.
MNB = MultinomialNB()
MNB.fit(X=features_train, y=labels_train)

MultinomialNB()

In [156]:
# Score the multinomial model on the held-out articles.
MNB_pred = MNB.predict(features_test)
accuracy_MNB = metrics.accuracy_score(labels_test, MNB_pred)
# The label previously read "BernoulliNB" (copy-paste slip), which made the
# two models' results indistinguishable in the output.
print("MultinomialNB accuracy:", accuracy_MNB)
print("confusion matrix:")
# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(labels_test, MNB_pred))

BernoulliNB accuracy: 0.6923076923076923
confusion matrix:
[[ 0  8]
 [ 0 18]]


In [16]:
# vector = TfidfVectorizer(stop_words='english')
# vector.fit(joined_words)
# vector.get_feature_names_out()
# tfidf = vector.transform(joined_words)

In [17]:
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform

In [28]:
# tfidf = vector.transform(joined_words)

TfidfVectorizer(stop_words='english')

In [161]:
from sklearn.pipeline import Pipeline

# End-to-end text classifier: raw counts -> TF-IDF weighting -> Bernoulli NB.
# NOTE(review): the first step is a CountVectorizer, so this pipeline expects
# raw text documents; features_train/features_test have already been
# vectorized by the time this cell runs - confirm what is passed to fit().
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', BernoulliNB()),
    ]
)

In [164]:
# text_clf.fit(X = features_train, y = labels_train)

<78x29823 sparse matrix of type '<class 'numpy.float64'>'
	with 112336 stored elements in Compressed Sparse Row format>