### Import Libraries

In [37]:
import pickle

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### VBID

In [43]:
# Load the pre-split VBID train and test objects from local pickle files.
# Each file is opened in a `with` block so the handle is closed even when
# unpickling raises; `dataset_file` stays bound to the (closed) handle afterwards.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with open('train_vbid_df.pkl', 'rb') as dataset_file:
    train_vbid_df = pickle.load(dataset_file)

with open('test_vbid_df.pkl', 'rb') as dataset_file:
    test_vbid_df = pickle.load(dataset_file)

In [44]:
# Model inputs come from the 'text_cleaned' column, targets from the 'vbid' column.
X_train, y_train = train_vbid_df['text_cleaned'], train_vbid_df['vbid']
X_test, y_test = test_vbid_df['text_cleaned'], test_vbid_df['vbid']

In [45]:
# Grid-search Multinomial Naive Bayes over four text representations:
# raw counts vs. TF-IDF, each with the vectorizer's default n-grams or bigrams only.

# Hyperparameters searched for the 'mnb' pipeline step.
# NOTE(review): how alpha = 0 is treated (clipped to a tiny value vs. used as-is)
# appears to depend on the installed scikit-learn version -- confirm.
parameters = {
    'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
    'mnb__fit_prior': [True, False],
}

# One pipeline per vectorizer / n-gram combination.
mnb_cv_uni = Pipeline(steps=[('cv', CountVectorizer()),
                             ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline(steps=[('cv', CountVectorizer(ngram_range=(2, 2))),
                            ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(2, 2))),
                               ('mnb', MultinomialNB())])

# 4-fold cross-validated search over the same grid for every pipeline.
mnb_cv_uni_grid = GridSearchCV(estimator=mnb_cv_uni, param_grid=parameters, cv=4)
mnb_cv_bi_grid = GridSearchCV(estimator=mnb_cv_bi, param_grid=parameters, cv=4)
mnb_tfidf_uni_grid = GridSearchCV(estimator=mnb_tfidf_uni, param_grid=parameters, cv=4)
mnb_tfidf_bi_grid = GridSearchCV(estimator=mnb_tfidf_bi, param_grid=parameters, cv=4)

# Fixed ordering reused below so every report lists the models the same way.
_mnb_grids = (mnb_cv_uni_grid, mnb_cv_bi_grid, mnb_tfidf_uni_grid, mnb_tfidf_bi_grid)

# Fit every search on the training split.
for _grid in _mnb_grids:
    _grid.fit(X_train, y_train)

# Best mean cross-validated score found by each search (not a training-set score).
(mnb_cv_uni_grid_score,
 mnb_cv_bi_grid_score,
 mnb_tfidf_uni_grid_score,
 mnb_tfidf_bi_grid_score) = [_g.best_score_ for _g in _mnb_grids]

print('Best GridSearch Scores')
for _label, _value in (('mnb_cv_uni:', mnb_cv_uni_grid_score),
                       ('mnb_cv_bi:', mnb_cv_bi_grid_score),
                       ('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score),
                       ('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)):
    print(_label, _value)

# Parameter setting that achieved each best cross-validated score.
(mnb_cv_uni_grid_params,
 mnb_cv_bi_grid_params,
 mnb_tfidf_uni_grid_params,
 mnb_tfidf_bi_grid_params) = [_g.best_params_ for _g in _mnb_grids]

print('Best GridSearch Params')
for _label, _value in (('mnb_cv_uni:', mnb_cv_uni_grid_params),
                       ('mnb_cv_bi:', mnb_cv_bi_grid_params),
                       ('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params),
                       ('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)):
    print(_label, _value)

# Score of each search on the held-out test split.
(mnb_cv_uni_grid_score_test,
 mnb_cv_bi_grid_score_test,
 mnb_tfidf_uni_grid_score_test,
 mnb_tfidf_bi_grid_score_test) = [_g.score(X_test, y_test) for _g in _mnb_grids]

print('Test Scores')
for _label, _value in (('mnb_cv_uni:', mnb_cv_uni_grid_score_test),
                       ('mnb_cv_bi:', mnb_cv_bi_grid_score_test),
                       ('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test),
                       ('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)):
    print(_label, _value)

# Test-set predictions from each search (kept for later use; nothing is printed here).
(mnb_cv_uni_pred_light,
 mnb_cv_bi_pred_light,
 mnb_tfidf_uni_pred_light,
 mnb_tfidf_bi_pred_light) = [_g.predict(X_test) for _g in _mnb_grids]



Best GridSearch Scores
mnb_cv_uni: 0.8846153846153847
mnb_cv_bi: 0.8846153846153847
mnb_tfidf_uni: 0.8942307692307693
mnb_tfidf_bi: 0.8942307692307693
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': False}
mnb_tfidf_bi: {'mnb__alpha': 0.01, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.8518518518518519
mnb_cv_bi: 0.8888888888888888
mnb_tfidf_uni: 0.8518518518518519
mnb_tfidf_bi: 0.8518518518518519


### OTC Rollover

In [46]:
# Load the pre-split OTC rollover train and test objects from local pickle files.
# Each file is opened in a `with` block so the handle is closed even when
# unpickling raises; `dataset_file` stays bound to the (closed) handle afterwards.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with open('train_otc_rollover_df.pkl', 'rb') as dataset_file:
    train_otc_rollover_df = pickle.load(dataset_file)

with open('test_otc_rollover_df.pkl', 'rb') as dataset_file:
    test_otc_rollover_df = pickle.load(dataset_file)

In [47]:
# Model inputs come from the 'text_cleaned' column, targets from 'otc_rollover'.
X_train, y_train = train_otc_rollover_df['text_cleaned'], train_otc_rollover_df['otc_rollover']
X_test, y_test = test_otc_rollover_df['text_cleaned'], test_otc_rollover_df['otc_rollover']

In [48]:
# Grid-search Multinomial Naive Bayes over four text representations:
# raw counts vs. TF-IDF, each with the vectorizer's default n-grams or bigrams only.

# Hyperparameters searched for the 'mnb' pipeline step.
# NOTE(review): how alpha = 0 is treated (clipped to a tiny value vs. used as-is)
# appears to depend on the installed scikit-learn version -- confirm.
parameters = {
    'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
    'mnb__fit_prior': [True, False],
}

# One pipeline per vectorizer / n-gram combination.
mnb_cv_uni = Pipeline(steps=[('cv', CountVectorizer()),
                             ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline(steps=[('cv', CountVectorizer(ngram_range=(2, 2))),
                            ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(2, 2))),
                               ('mnb', MultinomialNB())])

# 4-fold cross-validated search over the same grid for every pipeline.
mnb_cv_uni_grid = GridSearchCV(estimator=mnb_cv_uni, param_grid=parameters, cv=4)
mnb_cv_bi_grid = GridSearchCV(estimator=mnb_cv_bi, param_grid=parameters, cv=4)
mnb_tfidf_uni_grid = GridSearchCV(estimator=mnb_tfidf_uni, param_grid=parameters, cv=4)
mnb_tfidf_bi_grid = GridSearchCV(estimator=mnb_tfidf_bi, param_grid=parameters, cv=4)

# Fixed ordering reused below so every report lists the models the same way.
_mnb_grids = (mnb_cv_uni_grid, mnb_cv_bi_grid, mnb_tfidf_uni_grid, mnb_tfidf_bi_grid)

# Fit every search on the training split.
for _grid in _mnb_grids:
    _grid.fit(X_train, y_train)

# Best mean cross-validated score found by each search (not a training-set score).
(mnb_cv_uni_grid_score,
 mnb_cv_bi_grid_score,
 mnb_tfidf_uni_grid_score,
 mnb_tfidf_bi_grid_score) = [_g.best_score_ for _g in _mnb_grids]

print('Best GridSearch Scores')
for _label, _value in (('mnb_cv_uni:', mnb_cv_uni_grid_score),
                       ('mnb_cv_bi:', mnb_cv_bi_grid_score),
                       ('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score),
                       ('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)):
    print(_label, _value)

# Parameter setting that achieved each best cross-validated score.
(mnb_cv_uni_grid_params,
 mnb_cv_bi_grid_params,
 mnb_tfidf_uni_grid_params,
 mnb_tfidf_bi_grid_params) = [_g.best_params_ for _g in _mnb_grids]

print('Best GridSearch Params')
for _label, _value in (('mnb_cv_uni:', mnb_cv_uni_grid_params),
                       ('mnb_cv_bi:', mnb_cv_bi_grid_params),
                       ('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params),
                       ('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)):
    print(_label, _value)

# Score of each search on the held-out test split.
(mnb_cv_uni_grid_score_test,
 mnb_cv_bi_grid_score_test,
 mnb_tfidf_uni_grid_score_test,
 mnb_tfidf_bi_grid_score_test) = [_g.score(X_test, y_test) for _g in _mnb_grids]

print('Test Scores')
for _label, _value in (('mnb_cv_uni:', mnb_cv_uni_grid_score_test),
                       ('mnb_cv_bi:', mnb_cv_bi_grid_score_test),
                       ('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test),
                       ('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)):
    print(_label, _value)

# Test-set predictions from each search (kept for later use; nothing is printed here).
(mnb_cv_uni_pred_light,
 mnb_cv_bi_pred_light,
 mnb_tfidf_uni_pred_light,
 mnb_tfidf_bi_pred_light) = [_g.predict(X_test) for _g in _mnb_grids]



Best GridSearch Scores
mnb_cv_uni: 0.7524350649350651
mnb_cv_bi: 0.7387987012987014
mnb_tfidf_uni: 0.8198051948051948
mnb_tfidf_bi: 0.7477272727272728
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': False}
Test Scores
mnb_cv_uni: 0.7857142857142857
mnb_cv_bi: 0.8392857142857143
mnb_tfidf_uni: 0.8928571428571429
mnb_tfidf_bi: 0.8392857142857143
