In [1]:
import pandas as pd
import numpy as np
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# read the data into a pandas dataframe

def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfneg = data2df('HealthProNonPro/NonPro/', 0) # NEG
dfpos = data2df('HealthProNonPro/Pro/', 1) # POS

df = pd.concat([dfpos, dfneg], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
734,a31658.txt,if u dont mind she needs change she will be al...,0
766,ans1689.txt,Vaginal bleeding/spotting that occurs in betwe...,1
941,a61238.txt,I dont know. See a shrink.,0
267,ans1239.txt,Thank you for your question. The use of antibi...,1
1072,a61372.txt,It hurts like you wouldnt imagine! I had train...,0
132,ans1117.txt,Thank you for your question. A severe peanut a...,1
1833,ans962.txt,"Pain in the stomach after meals, especially wi...",1
420,ans1377.txt,"The lower back pain has many causes, from main...",1
1614,ans765.txt,It is quite common to experience pain in the f...,1
1027,a61327.txt,MAKE LOVE WITH SOME ONE,0


In [3]:
# setup the data
X, y = df['text'], df['class']


Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [4]:
def custom_tokenizer(doc):

    # use spacy to filter out noise
    tokens = [token.lemma_.lower() 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are greater than 2 characters long
                                    token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve selected pos
                                    #token.text in nlp.vocab and # check if token in vocab 
                                    token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop and # get rid of tokens that are stop words
                                    not token.is_currency # get rid of tokens that denote currencies
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

In [5]:
%%time

nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtrain))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
X = pd.Series(clean_corpus)
X.head()

Wall time: 25.2 s


0    common cause itchy palm contact dermatitis exp...
1    take money support famlys need food shelter do...
2                         sign married doctor approval
3    speed amphetamine psychostimulant commonly abu...
4                                          tantric sex
dtype: object

In [6]:
# setup the preprocessing->model pipeline

clf = Pipeline(steps=[('tfidf', TfidfVectorizer()),('nb', MultinomialNB())])

In [7]:
# setup grid search

param_grid = {
    'nb__alpha': [0, 1], 
    'tfidf__sublinear_tf':[True,False] 
}
gscv = GridSearchCV(clf, param_grid, cv=4, return_train_score=False)

In [8]:
gscv.fit(Xtrain, ytrain)

print ("-"*100)
print(gscv.best_estimator_, "\n")


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


----------------------------------------------------------------------------------------------------
Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1, class_pr

In [9]:
# predict and evaluate best_estimator_ on test data

ypred = gscv.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9427012278308322
[[317  41]
 [  1 374]]
              precision    recall  f1-score   support

           0       1.00      0.89      0.94       358
           1       0.90      1.00      0.95       375

    accuracy                           0.94       733
   macro avg       0.95      0.94      0.94       733
weighted avg       0.95      0.94      0.94       733



In [10]:
TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()

In [11]:
precision0=round(TP/(TP+FN),2)
precision1=round(TP/(TP+FP),2)
print("Precision0:",precision0)
print("Precision1:",precision1)

print()

recall0=round(TN/(TN+FP),2)
recall1=round(TP/(TP+FN),2)
print("Recall0:",recall0)
print("Recall1:",recall1)

print()


F_1_0=round((2*precision0*recall0)/(precision0+recall0),2)
F_1_1=round((2*precision1*recall1)/(precision1+recall1),2)
print("F1_Score:",F_1_0)
print("F1_Score:",F_1_1)

print()

Accuracy=round((TN+TP)/(TN+FP+FN+TP),2)
print("Accuracy:",Accuracy)

Precision0: 1.0
Precision1: 0.9

Recall0: 0.89
Recall1: 1.0

F1_Score: 0.94
F1_Score: 0.95

Accuracy: 0.94
