# Bernoulli Naive Bayes

## Baseline implementation

In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import sklearn
print(sklearn.__version__)
from sklearn.naive_bayes import BernoulliNB
from collections import Counter
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.model_selection import GridSearchCV
import warnings
from pprint import pprint
import pickle

warnings.filterwarnings('ignore')

# Load the training set. The first CSV column is used as the index; the
# 'article_words' column is the input passed to the vectorizer and 'topic'
# is the class label.
df_train = pd.read_csv('../data/training.csv', index_col=0)

X_train = df_train['article_words'].values
y_train = df_train['topic'].values

# Sanity check: the rest of the notebook assumes the 9,500-row training file.
assert len(X_train) == 9500, f'expected 9500 training documents, got {len(X_train)}'
assert len(y_train) == 9500, f'expected 9500 training labels, got {len(y_train)}'

# Every topic except 'IRRELEVANT'. The label-restricted scorers below only
# average over these classes.
labels = set(y_train)
labels.remove('IRRELEVANT')

# Build the label list once, in sorted order. Iterating a set of strings can
# yield a different order in every kernel session; a fixed order keeps the
# per-label computation (and hence the averaged floats) reproducible.
relevant_labels = sorted(labels)

# Macro F1 over the relevant topics; used as the refit criterion.
custom_f1 = make_scorer(f1_score, labels=relevant_labels, average='macro')

# Metrics collected for every grid-search candidate. Insertion order matters:
# it determines the column order of the results table built later.
# Note: 'f1_macro' and 'custom' are defined identically.
scorer = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, labels=relevant_labels, average='micro'),
    'recall': make_scorer(recall_score, labels=relevant_labels, average='micro'),
    'f1': make_scorer(f1_score, labels=relevant_labels, average='micro'),
    'precision_macro': make_scorer(precision_score, labels=relevant_labels, average='macro'),
    'recall_macro': make_scorer(recall_score, labels=relevant_labels, average='macro'),
    'f1_macro': make_scorer(f1_score, labels=relevant_labels, average='macro'),
    'custom': custom_f1,
}

0.22.2.post1


Using TensorFlow backend.


### Setting up the pipeline and running the grid search

In [2]:
# Pipeline: TF-IDF vectorizer -> random oversampling -> Bernoulli Naive Bayes.
# Built with imblearn's make_pipeline so the sampler can sit between the
# vectorizer and the classifier.
#
# sampling_strategy gives the absolute number of samples each class should
# have after resampling.
# NOTE(review): an oversampler cannot shrink a class, so these targets must
# stay >= the class counts of whatever data is being fit (each CV training
# fold) -- confirm if the dataset changes.
#
# NOTE(review): BernoulliNB binarizes its input (default binarize=0.0), so the
# classifier only sees term presence/absence, not the TF-IDF weights.
clf = make_pipeline_imb(
    TfidfVectorizer(stop_words='english'),
    #SMOTE(random_state=0,n_jobs=-1),
    RandomOverSampler(
        sampling_strategy={
            'IRRELEVANT': 8000,
            'MONEY MARKETS': 1673,
            'SPORTS': 1673,
            'FOREX MARKETS': 1500,
            'DEFENCE': 1500,
            'SHARE LISTINGS': 1500,
            'HEALTH': 1500,
            'BIOGRAPHIES PERSONALITIES PEOPLE': 1673,
            'DOMESTIC MARKETS': 1500,
            'ARTS CULTURE ENTERTAINMENT': 1500,
            'SCIENCE AND TECHNOLOGY': 1500,
        },
        # Fixed seed: without it the duplicated rows, and therefore the
        # cross-validation scores, change from run to run.
        random_state=0,
    ),
    BernoulliNB(),
)



# Hyper-parameter grid, keyed as '<pipeline step>__<parameter>'.
# Only the smoothing strength alpha is actually searched; the vectorizer
# options each have a single candidate value.
grid_params = dict(
    bernoullinb__alpha=[0.01, 0.1, 0.5, 1.0, 2.0],
    tfidfvectorizer__max_features=[20000],
    tfidfvectorizer__max_df=[0.5],
    tfidfvectorizer__sublinear_tf=[True],
    tfidfvectorizer__lowercase=[False],
)



# 5-fold grid search over `grid_params`, recording every metric in `scorer`
# for both the training and the validation folds. refit='custom' makes the
# 'custom' scorer the selection criterion and refits the winning pipeline on
# the full training data.
grid = GridSearchCV(
    estimator=clf,
    param_grid=grid_params,
    scoring=scorer,
    refit='custom',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

# best_score_ is the mean validation score of the refit metric.
print(f'Best cross-validation score: {grid.best_score_:.2f}')
print(f'Best params: {grid.best_params_}')
      
      

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   33.9s finished


Best cross-validation score: 0.55
Best params: {'bernoullinb__alpha': 2.0, 'tfidfvectorizer__lowercase': False, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': 20000, 'tfidfvectorizer__sublinear_tf': True}


### Best parameters found by the grid search (the winning pipeline is already refit via `refit='custom'`)

In [3]:
# Report the winning score and parameter set of the fitted grid search.
print(
    f'Best cross-validation score: {grid.best_score_:.2f}',
    f'Best params: {grid.best_params_}',
    sep='\n',
)


Best cross-validation score: 0.55
Best params: {'bernoullinb__alpha': 2.0, 'tfidfvectorizer__lowercase': False, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': 20000, 'tfidfvectorizer__sublinear_tf': True}


### Saving the results to a DataFrame and pickling them

In [4]:
model_name = 'BNB'

# Row of cv_results_ belonging to the winning parameter set. GridSearchCV
# exposes it directly, so no lookup through the 'params' list is needed.
best_bnb = grid.best_index_

# One-row summary: mean/std of every metric on the train and test
# (validation) folds for the winning candidate.
d = {
    'model': model_name,
}

for metric in scorer:
    for dataset in ['train', 'test']:
        for stat in ['mean', 'std']:
            metric_item = f'{stat}_{dataset}_{metric}'
            d[metric_item] = grid.cv_results_[metric_item][best_bnb]

df_models_clf = pd.DataFrame(d, index=[0])

# Persist the refit winning pipeline itself. The integer row index alone
# cannot be used for prediction once this kernel (and `grid`) is gone.
with open(f'../data/pickles/best_{model_name}.pickle',
          'wb') as output:
    pickle.dump(grid.best_estimator_, output)

# Persist the summary row.
with open(f'../data/pickles/df_results_{model_name}.pickle',
          'wb') as output:
    pickle.dump(df_models_clf, output)

df_models_clf

Unnamed: 0,model,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_precision,std_train_precision,mean_test_precision,std_test_precision,mean_train_recall,...,mean_test_recall_macro,std_test_recall_macro,mean_train_f1_macro,std_train_f1_macro,mean_test_f1_macro,std_test_f1_macro,mean_train_custom,std_train_custom,mean_test_custom,std_test_custom
0,BNB,0.771842,0.002917,0.709053,0.002036,0.671089,0.003097,0.604036,0.006571,0.759914,...,0.564236,0.022818,0.758379,0.002977,0.547403,0.021601,0.758379,0.002977,0.547403,0.021601
