# Text Classification with Naive Bayes

### Step 1: Read Data

In [1]:
# read the data into a pandas dataframe
import pandas as pd
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r',encoding="utf8", errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

NonPro = data2df('Data/HealthProNonPro/NonPro/', 0) # NonPro
Pro = data2df('Data/HealthProNonPro/Pro/', 1) # Pro

df = pd.concat([NonPro, Pro], axis=0)
df.sample(frac=0.005)


Unnamed: 0,file,text,class
848,ans1809.txt,Your description may correspond to Neuralgia; ...,1
697,ans1758.txt,You are most probably experiencing a musculo-s...,1
1468,a7369.txt,Restrain yourself to eat during the hours that...,0
1155,a31604.txt,Hi I wouldnt do it cause if you start you migh...,0
234,a55.txt,Fool proof way.\n\nFill a glass of water.\nLea...,0
1451,a116.txt,"Ashtanga is great, like the above answer says....",0
1730,a61443.txt,Not a lot.,0
471,ans1428.txt,The overall effects of cetrizine and iterax (h...,1
542,ans128.txt,Because you missed 3 birth control pills in we...,1
520,a69660.txt,I have the same problem. When school comes aro...,0


### Step 2:  Set up data and split data


In [2]:
X=df['text']
y=df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

### Step 3: Create a custom preprocessing function

In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Create Preprocess function

def preprocess(text):
    # replace one or more white-space characters with a space
    regex = re.compile(r"\s+")                               
    text = regex.sub(' ', text)    
    # lower case
    text = text.lower()          
    # remove digits and punctuation
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub(' ', text)           
    # remove stop words
    sw = stopwords.words('english')
    text = text.split()                                              
    text = ' '.join([w for w in text if w not in sw]) 
    # remove short words
    ' '.join([w for w in text.split() if len(w) >= 4])
    # lemmatize
    text = ' '.join([(WordNetLemmatizer()).lemmatize(w) for w in text.split()]) 
    return text

### Step 4: Pipeline with preprocess and TF-IDF

In [4]:
# import necessary module
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Step up Pipeline
classifier = Pipeline(steps=[
    ('pp', TfidfVectorizer(
        preprocessor=preprocess,
        use_idf=True, smooth_idf=True, norm='l2',
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1))),
    ('NaiveBayes', MultinomialNB())])  #Naive Bayes Classifier


### Step 5: Fit best parameter with Grid Search

In [5]:
# setup grid search

from sklearn.model_selection import GridSearchCV
param_grid = {
    'pp__norm':['l1','l2',None],
    'NaiveBayes__alpha':[0.01,0.1,0.2,0.5,1]
}
gscv = GridSearchCV(classifier, param_grid, iid=False, cv=5, return_train_score=False)
gscv.fit(Xtrain, ytrain)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pp', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...True, vocabulary=None)), ('NaiveBayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'pp__norm': ['l1', 'l2', None], 'NaiveBayes__alpha': [0.01, 0.1, 0.2, 0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [6]:
# search for best parameters/estimator

print(gscv.best_estimator_, "\n")
#print(gscv.best_score_, "\n")
#print(gscv.best_params_, "\n")
#print(gscv.cv_results_, "\n")


Pipeline(memory=None,
     steps=[('pp', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...True, vocabulary=None)), ('NaiveBayes', MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))]) 

0.9434049732490273 

{'NaiveBayes__alpha': 0.2, 'pp__norm': 'l2'} 

{'mean_fit_time': array([1.6609736 , 1.20189147, 1.22921376, 1.33773479, 1.23872137,
       1.29899669, 1.17581186, 1.17546954, 1.17001977, 1.17027044,
       1.23830199, 1.30429502, 1.16846867, 1.1676506 , 1.17528872]), 'std_fit_time': array([0.92369759, 0.05437371, 0.06230801, 0.10516984, 0.02459072,
       0.12923569, 0.00541774, 0.00886681, 0.00335653, 0.00600648,
       0.07924392, 0.10250802, 0.00642457, 0.00406087, 0.00852419]), 'mean_score_time': array([0.32524099, 0.3063952 , 0.318918

### Step 6: Prediction and Evaluation

In [7]:
# evaluate best_estimator_ on test data
ypred = gscv.best_estimator_.predict(Xtest)
from sklearn import metrics
print ("Accuracy score:",round(metrics.accuracy_score(ytest, ypred),3))
print ("Confusion matrix:")
print (metrics.confusion_matrix(ytest, ypred))
print( )
print("Classification Report:")
print (metrics.classification_report(ytest, ypred))

Accuracy score: 0.955
Confusion matrix:
[[502  49]
 [  1 547]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       551
           1       0.92      1.00      0.96       548

   micro avg       0.95      0.95      0.95      1099
   macro avg       0.96      0.95      0.95      1099
weighted avg       0.96      0.95      0.95      1099

