In [1]:
import pandas as pd

In [2]:
# 1) read the data into a pandas dataframe
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfneg = data2df('HealthProNonPro/NonPro/', 0) # NonPro
dfpos = data2df('HealthProNonPro/Pro/', 1) # Pro

df = pd.concat([dfpos, dfneg], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
183,a24415.txt,Eat alot! Food is good for the soul.,0
416,ans1373.txt,The level of severity depend on the amount of ...,1
127,a24359.txt,Thats a good thing.,0
648,a31572.txt,Allergies can get worse over time but there ar...,0
1252,ans439.txt,I do understand your concern for cervical canc...,1
1308,ans49.txt,According to your description it seems that yo...,1
910,a54316.txt,hamburgers and french fries work for me,0
934,a61231.txt,you are lucky to have not had the decisions ca...,0
913,a54319.txt,well first of all make sure its a cold and n...,0
1321,a69453.txt,You can always be a Monk.,0


In [3]:
# 2)	Setup the data for Training/Testing. Use 20% for testing.
X, y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()
Xtrain.head()

439     The most common cause of itchy palms is contac...
720     1. It takes two to make the money to support t...
307     there isnt always signs\nso do not do it until...
87      Speed is an amphetamine, a psychostimulant, wh...
1066                                         tantric sex 
Name: text, dtype: object

In [4]:
#Pre-Processing using Spacy
def custom_tokenizer(doc):

    # use spacy to filter out noise
    tokens = [token.lemma_.lower() 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are greater than 2 characters long
                                    #token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve selected pos
                                    #token.text in nlp.vocab and # check if token in vocab 
                                    token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop and # get rid of tokens that are stop words
                                    not token.is_currency # get rid of tokens that denote currencies
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

In [5]:
#Preprocessing Xtrain using Spacy
import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtrain))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtrain = pd.Series(clean_corpus,index=Xtrain.index)
Xtrain.head()

439     common cause itchy palm contact dermatitis exp...
720     take money support famlys need like food shelt...
307                      not sign married doctor approval
87      speed amphetamine psychostimulant commonly abu...
1066                                          tantric sex
dtype: object

In [6]:
#4)Setup a Pipeline with TfidfVectorizer and Naïve Bayes. 

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB



nb=Pipeline(steps=[('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [7]:
#5) Grid Search with 4-fold Cross Validation to search for the best values 

from sklearn.model_selection import GridSearchCV
param_grid = {
    'mnb__alpha': [1.0,0.5,0.25,1.5], # getting best alpha
    'tfidf__sublinear_tf':[True,False], # fublinear_tf from tfidf
    'tfidf__norm':['l1','l2'] #finding best norm
}
gscv = GridSearchCV(nb, param_grid, cv=4, return_train_score=False)

In [8]:
# 6) Use the Best Estimator resulting from the Grid Search for fitting training dataset and finding best-estimator

gscv.fit(Xtrain, ytrain)

print ("-"*50)
print(gscv.best_estimator_, "\n")
print ("-"*50)
print(gscv.best_score_, "\n")
print ("-"*50)
print(gscv.best_params_, "\n")
print ("-"*50)
print(gscv.cv_results_, "\n")
print ("-"*50)

--------------------------------------------------
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ue,
        vocabulary=None)), ('mnb', MultinomialNB(alpha=0.25, class_prior=None, fit_prior=True))]) 

--------------------------------------------------
0.9323770491803278 

--------------------------------------------------
{'mnb__alpha': 0.25, 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False} 

--------------------------------------------------
{'mean_fit_time': array([0.13672966, 0.11328357, 0.11928457, 0.11719626, 0.12500364,
       0.11327982, 0.13098782, 0.10937279, 0.11719352, 0.16797191,
       0.17969096, 0.13672692, 0.14453399, 0.13672298, 0.14844042,
       0.13281971]), 'std_fit_time': array([0.00

In [9]:
# Preprocessing Xtest using Spacy

import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtest))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtest = pd.Series(clean_corpus,index=Xtest.index)
Xtest.head()

956     procrastinating inevitable drinking problem da...
1247    understand concern pain knee surgery pain weig...
102     swallow complex act involve mouth throat area ...
994                                          course break
dtype: object

In [10]:
# predict and evaluate best_estimator_ on test data

ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9454297407912687
[[323  35]
 [  5 370]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       358
           1       0.91      0.99      0.95       375

   micro avg       0.95      0.95      0.95       733
   macro avg       0.95      0.94      0.95       733
weighted avg       0.95      0.95      0.95       733



In [11]:
#Extract the true negatives (TN), false positives (FP), false negatives (FN), and true positives (TP)

TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()
print("TN:",TN,"\n"
      'FP:',FP,"\n" 
      'FN:',FN,"\n" 
      'TP:',TP)


TN: 323 
FP: 35 
FN: 5 
TP: 370


In [12]:
#Overall_Accuracy
Overall_Accuracy = (TP + TN) / (TP + TN + FP + FN)
Overall_Accuracy

0.9454297407912687

In [13]:
#Precision for Class 0 and Class 1

Precision_Class_0 = round(TN / (TN + FN),2)
print("Precision for Class 0:",Precision_Class_0)

Precision_Class_1 = round(TP / (TP + FP),2)
print("Precision for Class 1:",Precision_Class_1)

Precision for Class 0: 0.98
Precision for Class 1: 0.91


In [14]:
#Recall for Class 0 and Class 1

Recall_Class_0 = round(TN / (TN + FP),2)
print("Recall for Class 0:",Recall_Class_0)

Recall_Class_1 = round(TP / (TP + FN),2)
print("Recall for Class 1:",Recall_Class_1)



Recall for Class 0: 0.9
Recall for Class 1: 0.99


In [15]:
#F1-Score for Class 0 and Class 1

F1score_Class_0 = round((2*Recall_Class_0*Precision_Class_0)/(Recall_Class_0+Precision_Class_0),2)
print("F1-score for Class 0:",F1score_Class_0)

F1score_Class_1 = round((2*Recall_Class_1*Precision_Class_1)/(Recall_Class_1+Precision_Class_1),2)
print("F1-score for Class 1:",F1score_Class_1)


F1-score for Class 0: 0.94
F1-score for Class 1: 0.95
