In [1]:
#import libraries
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib 

In [2]:
#load data
train_data = pd.read_csv("drugsComTrain_raw.tsv", sep="\t")
test_data = pd.read_csv("drugsComTest_raw.tsv", sep="\t")

In [3]:
#create sentiment labels
train_data["sentiment"] = np.where(train_data["rating"] > 7, 1, np.where(train_data["rating"] < 4, -1, 0))
test_data["sentiment"] = np.where(test_data["rating"] > 7, 1, np.where(test_data["rating"] < 4, -1, 0))

In [4]:
#assign x and y
train_x = train_data["review"]
test_x = test_data["review"]
train_y = train_data["sentiment"]
test_y = test_data["sentiment"]

In [6]:
def preprocessor(s):
    s = s.strip('"')
    s = s.lower()
    s = re.sub(r'\d+', 'DG', s)
    return s

vect = CountVectorizer(preprocessor=preprocessor)
nb = MultinomialNB(fit_prior=False)
pipe = Pipeline(steps=[("vectorizer", vect), ("naivebayes", nb)])
param_grid = {"vectorizer__ngram_range": [(1,1),(1,2),(1,3)],
              "vectorizer__max_df": [0.8,0.9,1.0],
              "naivebayes__alpha": [0.01, 0.1, 1.0, 10.0]}

start = time.time()
search = GridSearchCV(pipe, param_grid, cv=3, verbose=1)
search.fit(train_x, train_y)
print(f"\n... search took {time.time() - start} seconds")

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 95.6min finished



... search took 5807.562975883484 seconds


In [7]:
search.best_params_

{'naivebayes__alpha': 0.1,
 'vectorizer__max_df': 0.9,
 'vectorizer__ngram_range': (1, 3)}

In [10]:
search_results = pd.DataFrame(search.cv_results_)[["mean_fit_time","mean_test_score","mean_train_score","param_naivebayes__alpha",
                                                   "param_vectorizer__max_df", "param_vectorizer__ngram_range"]]
search_results.sort_values("mean_test_score", ascending=False)



Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_naivebayes__alpha,param_vectorizer__max_df,param_vectorizer__ngram_range
17,48.409272,0.837926,0.99548,0.1,1.0,"(1, 3)"
14,50.595173,0.837926,0.99548,0.1,0.9,"(1, 3)"
11,50.569403,0.83779,0.995502,0.1,0.8,"(1, 3)"
2,55.435096,0.83022,0.99699,0.01,0.8,"(1, 3)"
5,55.5405,0.830183,0.996981,0.01,0.9,"(1, 3)"
8,52.281219,0.830183,0.996981,0.01,1.0,"(1, 3)"
4,24.268135,0.814727,0.980375,0.01,0.9,"(1, 2)"
7,23.524424,0.814727,0.980375,0.01,1.0,"(1, 2)"
1,23.424415,0.814683,0.980331,0.01,0.8,"(1, 2)"
10,22.004706,0.808912,0.964048,0.1,0.8,"(1, 2)"


In [11]:
train_pred = search.predict(train_x)
accuracy_score(train_pred, train_y)

0.9948542130355803

In [12]:
confusion_matrix(train_pred, train_y)

array([[34931,    66,   186],
       [   85, 28632,   320],
       [   47,   126, 96904]])

In [13]:
test_pred = search.predict(test_x)
accuracy_score(test_pred, test_y)

0.8900048357698174

In [14]:
confusion_matrix(test_pred, test_y)

array([[10105,   774,   547],
       [  725,  6936,   991],
       [ 1008,  1869, 30811]])

In [15]:
joblib.dump(search, "Drug_reviews_naive_bayes.pkl") 

['Drug_reviews_naive_bayes.pkl']