In [18]:
#import libraries
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.externals import joblib 

In [2]:
#load data
data = pd.read_csv("binary_tweets_valid.csv", lineterminator="\n")
data.drop(columns=["index"], inplace=True)
data.head()

Unnamed: 0,tweet_id,user_id,id,tweet,url
0,339867818843594756,246979971,0,"@DoctorChristian scared to start fluoxetine, w...",http://twitter.com/246979971/status/3398678188...
1,349294537367236611,149749939,0,"@IntuitiveGal1 ok, if you stopped taking the L...",http://twitter.com/149749939/status/3492945373...
2,354256195432882177,54516759,0,Novartis announces secukinumab (AIN457) demons...,http://twitter.com/54516759/status/35425619543...
3,352456944537178112,1267743056,1,"""U wailed all night; now y'r disembodied sobbi...",http://twitter.com/1267743056/status/352456944...
4,332479707004170241,273421529,0,@irapaps you're so fucking selfish. I've got L...,http://twitter.com/273421529/status/3324797070...


In [3]:
#split into train and test data
train_data, test_data = train_test_split(data, test_size=0.25, random_state=0, stratify=data["id"])
train_data.reset_index(inplace=True)
test_data.reset_index(inplace=True)

In [4]:
print(f"training data: {train_data.shape[0]} tweets")
print(f"test data: {test_data.shape[0]} tweets")

training data: 3126 tweets
test data: 1043 tweets


In [5]:
#assign x and y
train_x = train_data["tweet"]
test_x = test_data["tweet"]
train_y = train_data["id"]
test_y = test_data["id"]

In [6]:
print(f"training data: {np.round(train_y.value_counts()[1]/train_data.shape[0],4)*100}% positive class")
print(f"test data: {np.round(test_y.value_counts()[1]/test_data.shape[0],4)*100}% positive class")

training data: 11.07% positive class
test data: 11.03% positive class


In [7]:
def preprocessor(s):
    s = s.lower()
    s = re.sub(r'\d+', 'DG', s)
    s = re.sub(r'@\w+', "@USER", s)
    return s

vect = CountVectorizer(preprocessor=preprocessor)
nb = MultinomialNB(fit_prior=False)
pipe = Pipeline(steps=[("vectorizer", vect), ("naivebayes", nb)])
param_grid = {"vectorizer__ngram_range": [(1,1),(1,2),(1,3)],
              "vectorizer__max_df": [0.8,0.9,1.0],
              "naivebayes__alpha": [0.01, 0.1, 1.0, 10.0]}

search = GridSearchCV(pipe, param_grid, cv=3, verbose=1)
search.fit(train_x, train_y)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   24.6s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1),
        preprocessor=<function preprocessor...one, vocabulary=None)), ('naivebayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorizer__max_df': [0.8, 0.9, 1.0], 'naivebayes__alpha': [0.01, 0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [8]:
search.best_params_

{'naivebayes__alpha': 1.0,
 'vectorizer__max_df': 0.8,
 'vectorizer__ngram_range': (1, 2)}

In [9]:
search_results = pd.DataFrame(search.cv_results_)[["mean_fit_time","mean_test_score","mean_train_score",
                                                   "param_naivebayes__alpha","param_vectorizer__max_df", 
                                                   "param_vectorizer__ngram_range"]]
search_results.sort_values("mean_test_score", ascending=False)



Unnamed: 0,mean_fit_time,mean_test_score,mean_train_score,param_naivebayes__alpha,param_vectorizer__max_df,param_vectorizer__ngram_range
20,0.214362,0.893794,0.99904,1.0,0.8,"(1, 3)"
22,0.124207,0.893794,0.994562,1.0,0.9,"(1, 2)"
23,0.202088,0.893794,0.99904,1.0,0.9,"(1, 3)"
25,0.113499,0.893794,0.994562,1.0,1.0,"(1, 2)"
26,0.192821,0.893794,0.99904,1.0,1.0,"(1, 3)"
19,0.117914,0.893794,0.994562,1.0,0.8,"(1, 2)"
29,0.207913,0.888996,0.893794,10.0,0.8,"(1, 3)"
35,0.199304,0.888996,0.893794,10.0,1.0,"(1, 3)"
31,0.11518,0.888996,0.890755,10.0,0.9,"(1, 2)"
32,0.196928,0.888996,0.893794,10.0,0.9,"(1, 3)"


In [19]:
train_pred = search.predict(train_x)
print(f"accuracy: {np.round(accuracy_score(train_pred, train_y),4)*100}%")
print(f"f1-score: {np.round(f1_score(train_pred, train_y),4)*100}%")

accuracy: 99.14%
f1-score: 96.17%


In [11]:
confusion_matrix(train_pred, train_y)

array([[2760,    7],
       [  20,  339]])

In [20]:
test_pred = search.predict(test_x)
print(f"accuracy: {np.round(accuracy_score(test_pred, test_y),4)*100}%")
print(f"f1-score: {np.round(f1_score(test_pred, test_y),4)*100}%")

accuracy: 89.84%
f1-score: 19.7%


In [13]:
confusion_matrix(test_pred, test_y)

array([[924, 102],
       [  4,  13]])

In [14]:
joblib.dump(search, "Twitter_binary_naive_bayes.pkl") 

['Twitter_binary_naive_bayes.pkl']