In [1]:
import glob
import random
import re
from pathlib import Path
from statistics import mean

import nltk
import pandas as pd
import requests
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from tqdm.notebook import tqdm

In [2]:
# Directory holding the article .txt files.
# NOTE: absolute, machine-specific path -- adjust when running elsewhere.
TXT_DIR = '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/txts'

# glob returns paths in filesystem-dependent order; sort them so that code
# which picks the "first matching file" behaves the same on every run.
txt_files = sorted(glob.glob(f'{TXT_DIR}/*.txt'))

In [6]:
# file_size = 0
# while file_size == 0:
#     txt_file = random.choice(txt_files)
#     file_size = Path(txt_file).stat().st_size
#     if file_size != 0: # check if conversion to txt didn't work
#         with open(txt_file, 'r') as f:
#             text = f.read()
#         print(txt_file)
#     else:
#         print('Skipping empty file')
#         continue
# text_lower = text.lower()

In [7]:
# tokens = word_tokenize(text_lower)
# tokens[:10]

In [8]:
# stop_words = set(stopwords.words('english'))
        
# filtered_tokens = [t for t in tokens if t not in stop_words]

# filtered_tokens[0:10]

In [9]:
# ps = PorterStemmer()
# stemmed_words = [ps.stem(t) for t in filtered_tokens]
# stemmed_words[0:20]

In [10]:
# lemmatizer = WordNetLemmatizer()
# # Lemmatizing yields actual dictionary words, whereas stemming often yields only a word's root, which may not be a real word.
# lemmatized_words = [lemmatizer.lemmatize(t) for t in filtered_tokens]
# lemmatized_words[0:10]

In [3]:
# Load the manually analysed article table (one row per article).
manual_text_df = pd.read_csv('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/Manual text analysis - Data.csv')

# Build the expected txt file name for each article. The cast to str happens
# *before* the concatenation so that non-string 'Article' values do not raise
# a TypeError; the operation is vectorised instead of a row-wise apply.
# NOTE(review): a missing 'Article' value becomes 'nan.txt' here -- confirm
# the column has no blanks.
manual_text_df['txt_file_name'] = manual_text_df['Article'].astype(str) + '.txt'
analyzed_txt_files = manual_text_df['txt_file_name'].to_list()

In [4]:
def process_file(file_name, file_list):
    """Load one article's text file and return its cleaned, lemmatized tokens.

    Parameters
    ----------
    file_name : str
        Name (or distinguishing fragment) of the text file to look for. It is
        matched literally as a substring of each path in ``file_list``.
    file_list : list of str
        Candidate file paths; the first one containing ``file_name`` is used.

    Returns
    -------
    list of str or None
        Tokens of the lower-cased text with English stop words removed and
        each remaining token lemmatized. ``None`` if the matched file is empty.

    Raises
    ------
    IndexError
        If no path in ``file_list`` contains ``file_name``.
    """
    # Literal substring match: file names may contain characters such as
    # "(", "+", "[" or "." that a regular expression would misinterpret.
    fname = next((path for path in file_list if file_name in path), None)
    if fname is None:
        raise IndexError(f'No file matching {file_name!r} in file_list')

    # A zero-byte file means the conversion to txt produced no text.
    if Path(fname).stat().st_size == 0:
        print('Empty file, returning None')
        return None

    # Explicit encoding so the result does not depend on the platform default.
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Drop stop words first, then lemmatize what remains.
    return [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]

In [5]:
# Token list for every manually analysed article, in table order
# (an entry is None when process_file returns None).
words = [
    process_file(article_txt, txt_files)
    for article_txt in tqdm(analyzed_txt_files)
]

  0%|          | 0/114 [00:00<?, ?it/s]

In [7]:
# Store the processed tokens next to each article's manual labels.
manual_text_df['words'] = words

# Pair each article's tokens with its 'Gas data' label. process_file returns
# None for empty text files; those articles are skipped because the later
# vocabulary and feature-extraction steps iterate over the token list.
# NOTE(review): the recorded classifier output lists NaN as a label, presumably
# from blank 'Gas data' cells -- confirm whether those should be mapped to an
# explicit class value.
documents = [
    (doc_words, gas_label)
    for doc_words, gas_label in zip(manual_text_df['words'], manual_text_df['Gas data'])
    if doc_words is not None
]

# Fixed seed so the shuffled order (and every split derived from it) is reproducible.
random.seed(2312)
random.shuffle(documents)

0it [00:00, ?it/s]

In [8]:
# Pool the tokens of every document into one flat list.
all_words = []
for doc_words, _label in tqdm(documents):
    all_words.extend(doc_words)

# Every distinct token becomes a candidate feature.
fdist = FreqDist(all_words)
word_features = list(fdist)

  0%|          | 0/114 [00:00<?, ?it/s]

In [9]:
def find_features(document, ref_features):
    """Build a presence/absence feature dict for one document.

    Each entry of ``ref_features`` becomes a key whose value is ``True`` when
    that entry occurs in ``document`` and ``False`` otherwise.
    """
    present = set(document)  # set membership avoids rescanning the document per feature
    return {feature: feature in present for feature in ref_features}

In [10]:
# One (feature dict, label) pair per document, in the shuffled document order.
feature_sets = [
    (find_features(doc_words, word_features), gas_label)
    for doc_words, gas_label in tqdm(documents)
]

  0%|          | 0/114 [00:00<?, ?it/s]

In [65]:
# rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=None)
# skf = StratifiedKFold(n_splits=5, random_state=None)
# CNB = ComplementNB()
# classifier = nltk.NaiveBayesClassifier()
from sklearn.model_selection import KFold

In [107]:
# k-fold cross-validation of a Complement Naive Bayes classifier.
num_folds = 5
CNB_classifier = SklearnClassifier(ComplementNB())
accuracy = []

# KFold (no shuffling here; `documents` was already shuffled with a fixed seed)
# puts every document in exactly one test fold. Slicing with an
# integer-division fold size would never test the trailing
# len(feature_sets) % num_folds documents.
fold_splitter = KFold(n_splits=num_folds)
for train_idx, test_idx in tqdm(fold_splitter.split(feature_sets), total=num_folds):
    train_set = [feature_sets[j] for j in train_idx]
    test_set = [feature_sets[j] for j in test_idx]
    classifier = CNB_classifier.train(train_set)
    # Per-fold accuracy as a percentage, rounded to one decimal.
    accuracy.append(round(nltk.classify.accuracy(classifier, test_set)*100,1))

# Keep a copy under a model-specific name: `accuracy` is rebound by the
# Bernoulli NB cross-validation cell.
cnb_accuracy = list(accuracy)

  0%|          | 0/5 [00:00<?, ?it/s]

In [114]:
# NOTE(review): `accuracy` is also assigned by the Bernoulli NB cross-validation
# cell, so this prints whichever cross-validation cell ran most recently. Run
# it immediately after the Complement NB cell. The recorded outputs of the two
# print cells are identical, which is consistent with both having been run
# after the Bernoulli cell -- re-run to confirm.
print('Complement NB classifier accuracy:', accuracy)
print('Mean Complement NB accuracy:', mean(accuracy))

Compliment NB classifier accuracy: [75.0, 75.0, 62.5, 50.0, 87.5]
Mean Compliment NB accuracy: 70.0


In [110]:
# k-fold cross-validation of a Bernoulli Naive Bayes classifier.
num_folds = 5
BNB_classifier = SklearnClassifier(BernoulliNB())
accuracy = []

# KFold (no shuffling here; `documents` was already shuffled with a fixed seed)
# puts every document in exactly one test fold. Slicing with an
# integer-division fold size would never test the trailing
# len(feature_sets) % num_folds documents.
fold_splitter = KFold(n_splits=num_folds)
for train_idx, test_idx in tqdm(fold_splitter.split(feature_sets), total=num_folds):
    train_set = [feature_sets[j] for j in train_idx]
    test_set = [feature_sets[j] for j in test_idx]
    classifier = BNB_classifier.train(train_set)
    # Per-fold accuracy as a percentage, rounded to one decimal.
    accuracy.append(round(nltk.classify.accuracy(classifier, test_set)*100,1))

# Keep a copy under a model-specific name: `accuracy` is rebound by the
# Complement NB cross-validation cell.
bnb_accuracy = list(accuracy)

  0%|          | 0/5 [00:00<?, ?it/s]

In [115]:
# Report the per-fold accuracies followed by their mean.
for caption, value in (
    ('Bernoulli NB classifier accuracy:', accuracy),
    ('Mean Bernoulli NB accuracy:', mean(accuracy)),
):
    print(caption, value)

Bernoulli NB classifier accuracy: [75.0, 75.0, 62.5, 50.0, 87.5]
Mean Bernoulli NB accuracy: 70.0


In [11]:
# Hold-out split: first two thirds for training, the remainder for testing.
# (train_set / test_set are also assigned inside the cross-validation loops,
# so re-run this cell before training the hold-out classifier.)
split_at = len(feature_sets)*2//3
train_set, test_set = feature_sets[:split_at], feature_sets[split_at:]
print(f'{len(train_set)} articles in training set')
print(f'{len(test_set)} articles in testing set')

76 articles in training set
38 articles in testing set


In [12]:
# Fit NLTK's Naive Bayes classifier on the current training split.
classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets=train_set)

In [13]:
# Accuracy on the current test split, shown as a percentage with one decimal.
holdout_accuracy = nltk.classify.accuracy(classifier, test_set)
print("Classifier accuracy percent:", round(holdout_accuracy * 100, 1))

Classifier accuracy percent: 73.7


In [14]:
# List the 20 features NLTK reports as most informative for the fitted classifier.
classifier.show_most_informative_features(n=20)

Most Informative Features
                     may = False             nan : y      =     21.9 : 1.0
                    0,05 = True              nan : y      =     17.0 : 1.0
                      4a = True              nan : y      =     17.0 : 1.0
                    also = False             nan : y      =     17.0 : 1.0
                corporal = True              nan : y      =     17.0 : 1.0
              discussion = False             nan : y      =     17.0 : 1.0
                 housing = True              nan : y      =     17.0 : 1.0
                inferior = True              nan : y      =     17.0 : 1.0
                 shapiro = True              nan : y      =     17.0 : 1.0
                   table = False             nan : y      =     17.0 : 1.0
                    toda = True              nan : y      =     17.0 : 1.0
                    2.02 = True              nan : y      =     12.1 : 1.0
                    24.8 = True              nan : y      =     12.1 : 1.0