In [2]:
#import libraries
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.externals import joblib 

In [47]:
#load data
train_data = pd.read_csv("undersampled/train.tsv", delimiter='\t', lineterminator='\n', header=None)
dev_data = pd.read_csv("undersampled/dev.tsv", delimiter='\t', lineterminator='\n', header=None)
test_data = pd.read_csv("undersampled/test.tsv", delimiter='\t', lineterminator='\n', header=None)

In [48]:
(train_data.shape[0], dev_data.shape[0], test_data.shape[0])

(835, 834, 834)

In [49]:
train_dev_data = pd.concat([train_data, dev_data], axis=0)
train_dev_data.shape[0]

1669

In [50]:
train_dev_data.head()

Unnamed: 0,0,1
0,"If you take Byetta, Victoza, or Januvia, read ...",0
1,"Side Effects of Levaquin, Cipro May Increase R...",0
2,Finding out I'm allergic to fluoxetine was a b...,1
3,Weird #Pristiq is sold by #Pfizer but also by ...,0
4,Somebody pass the cymbalta,0


In [51]:
#assign x and y
train_x = train_dev_data[0]
test_x = test_data[0]
train_y = train_dev_data[1]
test_y = test_data[1]

In [52]:
print(f"training data: {np.round(train_y.value_counts()[1]/train_data.shape[0],4)*100}% positive class")
print(f"test data: {np.round(test_y.value_counts()[1]/test_data.shape[0],4)*100}% positive class")

training data: 44.190000000000005% positive class
test data: 11.03% positive class


In [53]:
def preprocessor(s):
    s = s.lower()
    s = re.sub(r'\d+', 'DG', s)
    s = re.sub(r'@\w+', "@USER", s)
    return s

vect = CountVectorizer(preprocessor=preprocessor)
nb = MultinomialNB(fit_prior=False)
pipe = Pipeline(steps=[("vectorizer", vect), ("naivebayes", nb)])
param_grid = {"vectorizer__ngram_range": [(1,1),(1,2),(1,3)],
              "vectorizer__max_df": [0.8,0.9,1.0],
              "naivebayes__alpha": [0.01, 0.1, 1.0, 10.0]}

search = GridSearchCV(pipe, param_grid, cv=3, verbose=1)
search.fit(train_x, train_y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   13.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function preprocessor...one, vocabulary=None)), ('naivebayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorizer__max_df': [0.8, 0.9, 1.0], 'naivebayes__alpha': [0.01, 0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [54]:
search.best_params_

{'naivebayes__alpha': 1.0,
 'vectorizer__max_df': 0.8,
 'vectorizer__ngram_range': (1, 2)}

In [55]:
search_results = pd.DataFrame(search.cv_results_)[["mean_fit_time","mean_test_score","mean_train_score",
                                                   "param_naivebayes__alpha","param_vectorizer__max_df", 
                                                   "param_vectorizer__ngram_range"]]
search_results.sort_values("mean_test_score", ascending=False)



Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_naivebayes__alpha,param_vectorizer__max_df,param_vectorizer__ngram_range
22,0.064144,0.817855,0.996405,1.0,0.9,"(1, 2)"
19,0.064373,0.817855,0.996405,1.0,0.8,"(1, 2)"
25,0.064236,0.817855,0.996405,1.0,1.0,"(1, 2)"
26,0.1068,0.813062,0.998502,1.0,1.0,"(1, 3)"
20,0.107644,0.813062,0.998502,1.0,0.8,"(1, 3)"
23,0.10693,0.813062,0.998502,1.0,0.9,"(1, 3)"
21,0.027561,0.810066,0.955961,1.0,0.9,"(1, 1)"
24,0.026975,0.810066,0.955961,1.0,1.0,"(1, 1)"
18,0.027318,0.810066,0.955961,1.0,0.8,"(1, 1)"
14,0.104946,0.794488,0.998801,0.1,0.9,"(1, 3)"


In [56]:
train_pred = search.predict(train_x)
print(f"accuracy: {np.round(accuracy_score(train_pred, train_y),3)}")
print(f"f1-score: {np.round(f1_score(train_pred, train_y),3)}")

accuracy: 0.995
f1-score: 0.988


In [57]:
confusion_matrix(train_pred, train_y)

array([[1292,    1],
       [   8,  368]])

In [58]:
test_pred = search.predict(test_x)
print(f"accuracy: {np.round(accuracy_score(test_pred, test_y),3)}")
print(f"f1-score: {np.round(f1_score(test_pred, test_y),3)}")

accuracy: 0.888
f1-score: 0.408


In [59]:
confusion_matrix(test_pred, test_y)

array([[709,  60],
       [ 33,  32]])