In [1]:
import pandas as pd

In [2]:
# read the data into a pandas dataframe
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfPro = data2df('Pro/', 0) # PRO
dfNonPro = data2df('NonPro/', 1) # NONPRO

df = pd.concat([dfPro, dfNonPro], axis=0)
df.sample(frac=0.005)

Unnamed: 0,file,text,class
856,a24408.txt,Grey Goose,1
1382,a24442.txt,"Hmm impressive, you have got a large audience!...",1
1491,ans632.txt,In my opinion your symptoms of burning sensati...,0
1697,ans1069.txt,Smoking may not cause an ear infection in an a...,0
772,a61401.txt,"i had them 1 year ago, it just hurts the first...",1
840,ans1404.txt,The most probable cause could be due to leg cr...,0
672,a61370.txt,"I had braces and teeth out when I was younger,...",1
1467,a69568.txt,"Committ Suicide, really, this is a serious ans...",1
1605,ans1681.txt,Usually the oral mucosal wounds are suture if ...,0
911,ans281.txt,"Even though you do not show any symptoms, you ...",0


In [3]:
# setup the data
X, y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [4]:
def custom_tokenizer(doc):

    # clean up text
    tokens = [token.lemma_.lower() # lemmatize and lower-case 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are 2 or more characters long
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop # get rid of tokens that are stop words
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

In [5]:
import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
nlpXtrain = nlp.pipe(Xtrain)
clean_Xtrain = [custom_tokenizer(doc) for doc in nlpXtrain]
clean_Xtrain
nlpXtest = nlp.pipe(Xtest)
clean_Xtest = [custom_tokenizer(doc) for doc in nlpXtest]
clean_Xtest

['eat chicken soop',
 'thank question notice require dose antibiotic medication day recommend henceforth pill day prescribe doctor sure antibiotic course completely finish pill remain provide good result treat throat infection',
 'tylenol safe pain medication pregnancy need tablet hourly stay away ibuprofen nonsteroidal anti inflammatory drug nsaids aspirin naproxen pregnant relieve pain need schedule appointment dentist',
 'try have orgasm have actual sex 2nd time usually take long reach climax',
 'need sugar daily intake suppose use sparingly little visit mayoclinic.com view food pyramid',
 'difficult pregnant bleed regular period implantation bleed pregnant need urine pregnancy test perform test week unprotected sex test morning blood test pregnancy earlier sensitive hope answer question care',
 'measure helpful relieve constipation proper diet hydration diet rich green leafy vegetable add bulk stool relieve symptom constipation.ii bulk form laxative like methylcellulose psyllium br

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
tfpipeline = Pipeline(steps=[('tfif',TfidfVectorizer(binary=False,sublinear_tf=True,use_idf=True,smooth_idf=True,norm='l2',
                                            stop_words='english',min_df=1,max_df=1.0,max_features=None,ngram_range=(1,1))),
                    ('nb',MultinomialNB(alpha=1.0,fit_prior=True,class_prior=None))])

In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
param_grid = {'tfif__sublinear_tf':['True','False'],
                'nb__alpha':[1.0,2.0,3.0] }
gscv = GridSearchCV(tfpipeline,param_grid,cv=4,return_train_score=False)

In [15]:
gscv.fit(Xtrain,ytrain)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfif',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [16]:
print ("-"*100)
print(gscv.best_estimator_, "\n")
print ("-"*100)
print(gscv.best_score_, "\n")
print ("-"*100)
print(gscv.best_params_, "\n")
print ("-"*100)
print(gscv.cv_results_, "\n")
print ("-"*100)

----------------------------------------------------------------------------------------------------
Pipeline(memory=None,
         steps=[('tfif',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf='True',
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, 

In [17]:
ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9263301500682128
[[372   3]
 [ 51 307]]
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       375
           1       0.99      0.86      0.92       358

    accuracy                           0.93       733
   macro avg       0.93      0.92      0.93       733
weighted avg       0.93      0.93      0.93       733



In [18]:
TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()
Accu = (TN + TP) /( TN + TP + FP + FN)
print("Accuracy_score:", round(Accu,2))
Precision_class0 = (TN) / (FN + TN)
print("Precision for class 0 is:",round(Precision_class0,2))

Precision_class1 = (TP) / (FP + TP)
print("Precision for class 1 is:",round(Precision_class1,2))
Recall_class0 = (TN) / (FP + TN)
print("Recall for class 0 is:",round(Recall_class0,2))

Recall_class1 = (TP) / (FN + TP)
print("Recall for class 1 is:",round(Recall_class1,2))
F1_score_class0 = 2 * ((Precision_class0*Recall_class0)/(Precision_class0+Recall_class0))
print('F1 Score for Class 0 is :',round(F1_score_class0,2))

F1_score_class1 = 2 * ((Precision_class1*Recall_class1)/(Precision_class1+Recall_class1))
print('F1 Score for Class 1 is :',round(F1_score_class1,2))

Accuracy_score: 0.93
Precision for class 0 is: 0.88
Precision for class 1 is: 0.99
Recall for class 0 is: 0.99
Recall for class 1 is: 0.86
F1 Score for Class 0 is : 0.93
F1 Score for Class 1 is : 0.92
