# Multinomial Naive Bayes - Production
Of the models I tried, Multinomial Naive Bayes performed best with this amount of data, and it needed the least parameter tuning to reach an accuracy I felt comfortable with (about 72% on the held-out test set, against a roughly 50/50 class split). Given text from one of my two subreddits (LateStageCapitalism or Libertarian), the model predicts which subreddit the text came from.
The model could be improved by also training on articles and headlines, which should help it generalize better.

In [1]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# GridSearchCV lives in sklearn.model_selection; the legacy sklearn.grid_search
# module was removed in scikit-learn 0.20, so it is kept only as a fallback.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV



In [23]:
# Load the full combined dataset; its 'subreddit' column is used further down
# to check the class balance.
combined = pd.read_pickle('../data/combined.pkl')


In [12]:
# Load the previously pickled train/test split (features and labels) in one pass.
X_train, X_test, y_train, y_test = map(pd.read_pickle, (
    '../data/X_train.pkl',
    '../data/X_test.pkl',
    '../data/y_train.pkl',
    '../data/y_test.pkl',
))

In [13]:
# Two-stage text classifier: TF-IDF features (English stop words removed)
# feeding a Multinomial Naive Bayes model. The step names 'tfidf' and 'nb'
# are the prefixes used by the grid-search parameter names.
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nb', MultinomialNB()),
    ]
)


In [14]:
# Search space for the grid search, keyed as '<pipeline step>__<parameter>':
#   - tfidf__min_df: odd values 1..9
#   - tfidf__max_df: three upper document-frequency cut-offs
#   - nb__alpha:     four Naive Bayes smoothing strengths
# 5 * 3 * 4 = 60 candidate combinations in total.
param_grid = {
    'tfidf__min_df': np.arange(start=1, stop=10, step=2),
    'tfidf__max_df': [0.95, 0.98, 1.0],
    'nb__alpha': [0.01, 0.1, 0.2, 0.5],
}

In [15]:
# Exhaustive cross-validated search over param_grid using the pipeline above.
# cv is pinned to 3 folds: that is what the recorded run used
# ("Fitting 3 folds for each of 60 candidates"), whereas newer scikit-learn
# releases default to 5 folds when cv is left unset.
gs = GridSearchCV(pipe, param_grid=param_grid, cv=3, verbose=1)

In [16]:
# Run the grid search: every parameter combination is cross-validated on the
# training data, then (refit=True) the best pipeline is refit on all of it.
# The recorded single-job run took about 20 minutes.
gs.fit(X_train,y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 20.3min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tfidf__min_df': array([1, 3, 5, 7, 9]), 'tfidf__max_df': [0.95, 0.98, 1.0], 'nb__alpha': [0.01, 0.1, 0.2, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [17]:
# Parameter combination that achieved the best mean cross-validated score.
gs.best_params_

{'nb__alpha': 0.5, 'tfidf__max_df': 0.95, 'tfidf__min_df': 1}

In [18]:
# Score of the refit best pipeline on the training data (no custom scoring was
# set, so this is the classifier's default score, i.e. accuracy).
gs.score(X_train,y_train)

0.7667547535651739

In [19]:
# Same score on the held-out test data; compare with the training score to
# gauge how much the model overfits.
gs.score(X_test,y_test)

0.7234255327592365

In [20]:
# Predicted class probabilities for each test document: one row per document,
# one column per class (column order follows the fitted classifier's classes_).
gs.predict_proba(X_test)

array([[0.15037602, 0.84962398],
       [0.43696979, 0.56303021],
       [0.25381307, 0.74618693],
       ...,
       [0.47298755, 0.52701245],
       [0.39041347, 0.60958653],
       [0.23486574, 0.76513426]])

In [21]:
# Mean cross-validated score obtained by the best parameter combination.
gs.best_score_

0.7204700400300225

In [25]:
# Baseline check: share of rows per subreddit in the full dataset. The classes
# are close to evenly split, so majority-class guessing scores about 50%.
combined['subreddit'].value_counts() / len(combined)

LateStageCapitalism    0.502404
Libertarian            0.497596
Name: subreddit, dtype: float64

In [27]:
# One hand-written sample document for a quick sanity check of the model.
# The text keeps its surrounding newlines and single quotes.
foo = ["\n'I love Marx'\n"]

In [28]:
# Sanity check: classify the hand-written sample with the fitted grid search.
gs.predict(foo)

array(['LateStageCapitalism'], dtype='<U19')

In [29]:
# Predicted subreddit label for every test document; reused below to build the
# confusion matrix.
preds = gs.predict(X_test)
preds

array(['Libertarian', 'Libertarian', 'Libertarian', ..., 'Libertarian',
       'Libertarian', 'Libertarian'], dtype='<U19')

In [30]:
# Confusion matrix of true vs. predicted subreddit. `labels` pins the row and
# column order to LateStageCapitalism first, Libertarian second.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=preds,
    labels=['LateStageCapitalism', 'Libertarian'],
)

In [31]:
# Wrap the confusion matrix in a labelled DataFrame for display.
# Rows are the actual subreddit and columns the predicted one, both in the
# order given to confusion_matrix: LateStageCapitalism (LSC), then
# Libertarian (LIB). Row labels use the same LSC/LIB naming as the columns
# instead of the misspelled, ambiguous Negative/Positive.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual_LSC', 'actual_LIB'],
    columns=['pred_LSC', 'pred_LIB'],
)
cm_df

Unnamed: 0,pred_LSC,pred_LIB
actual_Negtive,39054,20604
actual_Positive,14243,52094


In [35]:
# Number of test documents per true class (Libertarian first, then
# LateStageCapitalism); these should match the confusion-matrix row sums.
# NOTE(review): .count(value) implies y_test is a plain list rather than a
# pandas Series — confirm.
y_test.count('Libertarian'),y_test.count('LateStageCapitalism')

(66337, 59658)

In [33]:
# Persist the fitted grid-search object (which includes the refit best
# pipeline) so it can be loaded later without re-running the search.
with open('../assets/naive_bayes_model.pkl', 'wb+') as model_file:
    pickle.dump(gs, model_file)