# Multinomial Naive Bayes

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
import pandas as pd
from pprint import pprint
import pickle
import numpy as np

In [14]:
# Short identifier for this model; it is embedded in the pickle file names
# written in the results cell.
model_name = 'multi_NB'

# Training data; the first CSV column becomes the DataFrame index.
df_train = pd.read_csv(filepath_or_buffer='../data/training.csv', index_col=0)

In [15]:

# Features are the 'article_words' column and the target is the 'topic' column.
X_train = df_train['article_words'].values
y_train = df_train['topic'].values

print(X_train.shape, y_train.shape)

# Every class present in the training data takes part in the averaged metrics.
labels = set(y_train)
# Materialise the label set once, in sorted order, so all scorers share one
# reproducible label list instead of re-listing an unordered set each time.
label_list = sorted(labels)


def _label_scorer(metric, average):
    """Wrap `metric` as a scorer restricted to `label_list` with the given averaging."""
    return make_scorer(metric, labels=label_list, average=average)


# Macro F1 is the model-selection criterion (GridSearchCV uses refit='custom').
custom_f1 = _label_scorer(f1_score, 'macro')

# NOTE: while `labels` contains every class, the micro-averaged precision,
# recall and F1 all coincide with accuracy, and 'custom' coincides with
# 'f1_macro'. The keys are kept so the result columns stay unchanged.
scorer = {
    'accuracy': 'accuracy',
    'precision': _label_scorer(precision_score, 'micro'),
    'recall': _label_scorer(recall_score, 'micro'),
    'f1': _label_scorer(f1_score, 'micro'),
    'precision_macro': _label_scorer(precision_score, 'macro'),
    'recall_macro': _label_scorer(recall_score, 'macro'),
    'f1_macro': _label_scorer(f1_score, 'macro'),
    'custom': custom_f1,
}

# TF-IDF features -> SMOTE oversampling -> Multinomial NB.
# The imblearn pipeline applies the sampler only while fitting, so the
# validation folds are scored on un-resampled data.
# SMOTE is left single-threaded: the grid search already fans out with
# n_jobs=-1, and SMOTE's own `n_jobs` argument is deprecated in newer
# imbalanced-learn releases. The resampled data is unaffected because the
# sampling is fixed by random_state.
clf = make_pipeline_imb(
    TfidfVectorizer(),
    SMOTE(random_state=0),
    MultinomialNB(),
)

# Only the NB smoothing strength and class-prior handling are searched
# (5 x 2 = 10 candidates); the vectorizer settings are pinned to single values.
grid_params = {
    'multinomialnb__alpha': np.linspace(0.5, 1.5, 5),
    'multinomialnb__fit_prior': [True, False],
    'tfidfvectorizer__max_features': [20_000],
    'tfidfvectorizer__max_df': [0.5],
    'tfidfvectorizer__sublinear_tf': [True],
    'tfidfvectorizer__lowercase': [False],
}

(9500,) (9500,) (500,) (500,)


In [16]:
%%time
# 5-fold cross-validated search over `grid_params`, recording every metric in
# `scorer` for both the training and validation folds. The 'custom' scorer
# decides which candidate is refit as the best estimator.
grid = GridSearchCV(
    estimator=clf,
    param_grid=grid_params,
    scoring=scorer,
    refit='custom',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
grid.fit(X_train, y_train)

# best_score_ is the mean cross-validated value of the refit ('custom') metric.
print(f'Best cross-validation score: {grid.best_score_:.2f}')
print(f'Best params: {grid.best_params_}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


Best cross-validation score: 0.58
Best params: {'multinomialnb__alpha': 0.5, 'multinomialnb__fit_prior': True, 'tfidfvectorizer__lowercase': False, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': 20000, 'tfidfvectorizer__sublinear_tf': True}
CPU times: user 2.94 s, sys: 759 ms, total: 3.7 s
Wall time: 1min 30s


In [17]:
# The search was run with refit='custom', so GridSearchCV has already refit the
# winning pipeline on the full training set. Fitting it again would only repeat
# the TF-IDF + SMOTE + NB work and produce the same model (SMOTE is seeded).
best_clf = grid.best_estimator_
best_clf

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=0.5,
                                 max_features=20000, min_df=1,
                                 ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('smote',
                 SMOTE(k_neighbors=5, n_jobs=-1, random_state=0,
                       sampling_stra

In [18]:
# Row of cv_results_ belonging to the selected candidate. best_index_ is the
# attribute GridSearchCV exposes for this, so no search by dict equality over
# the parameter list is needed.
best_idx = grid.best_index_

# One-row summary: the model name plus mean/std of every metric on the
# training and validation ('test') folds for the best candidate.
d = {
    'model': model_name,
}

for metric in scorer:
    for dataset in ('train', 'test'):
        for stat in ('mean', 'std'):
            column = f'{stat}_{dataset}_{metric}'
            d[column] = grid.cv_results_[column][best_idx]

df_models_clf = pd.DataFrame(d, index=[0])

# Persist the fitted pipeline and its score summary.
# NOTE: only unpickle these files from a trusted location, because
# pickle.load executes arbitrary code.
with open(f'../data/pickles/best_{model_name}.pickle', 'wb') as output:
    pickle.dump(best_clf, output)

with open(f'../data/pickles/df_results_{model_name}.pickle', 'wb') as output:
    pickle.dump(df_models_clf, output)

df_models_clf

Unnamed: 0,model,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_precision,std_train_precision,mean_test_precision,std_test_precision,mean_train_recall,...,mean_test_recall_macro,std_test_recall_macro,mean_train_f1_macro,std_train_f1_macro,mean_test_f1_macro,std_test_f1_macro,mean_train_custom,std_train_custom,mean_test_custom,std_test_custom
0,multi_NB,0.756316,0.003184,0.666737,0.010077,0.756316,0.003184,0.666737,0.010077,0.756316,...,0.710465,0.016987,0.753196,0.003805,0.584823,0.015956,0.753196,0.003805,0.584823,0.015956
