### Import Libraries

In [49]:
import pickle

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

### Implant Coverage

In [50]:
dataset_file = open('train_implant_coverage_df.pkl', 'rb')
train_implant_coverage_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_implant_coverage_df.pkl', 'rb')
test_implant_coverage_df = pickle.load(dataset_file)
dataset_file.close()

In [51]:
X_train = train_implant_coverage_df['text_cleaned']
y_train = train_implant_coverage_df['implant_coverage']

X_test = test_implant_coverage_df['text_cleaned']
y_test = test_implant_coverage_df['implant_coverage']

In [52]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred_light = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred_light = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred_light = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred_light = mnb_tfidf_bi_grid.predict(X_test)



Best GridSearch Scores
mnb_cv_uni: 0.8838114754098361
mnb_cv_bi: 0.8921448087431694
mnb_tfidf_uni: 0.8796448087431693
mnb_tfidf_bi: 0.8921448087431694
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': False}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.9508196721311475
mnb_cv_bi: 0.9344262295081968
mnb_tfidf_uni: 0.9508196721311475
mnb_tfidf_bi: 0.9672131147540983


### Root Canal

In [53]:
dataset_file = open('train_root_canal_df.pkl', 'rb')
train_root_canal_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_root_canal_df.pkl', 'rb')
test_root_canal_df = pickle.load(dataset_file)
dataset_file.close()

In [54]:
X_train = train_root_canal_df['text_cleaned']
y_train = train_root_canal_df['root_canal']

X_test = test_root_canal_df['text_cleaned']
y_test = test_root_canal_df['root_canal']

In [55]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred_light = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred_light = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred_light = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred_light = mnb_tfidf_bi_grid.predict(X_test)



Best GridSearch Scores
mnb_cv_uni: 0.7260928961748634
mnb_cv_bi: 0.730327868852459
mnb_tfidf_uni: 0.8256147540983607
mnb_tfidf_bi: 0.8297814207650274
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0.001, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0.01, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.7377049180327869
mnb_cv_bi: 0.7704918032786885
mnb_tfidf_uni: 0.8524590163934426
mnb_tfidf_bi: 0.8524590163934426


### Healthy Food Rollover

In [56]:
dataset_file = open('train_healthy_food_rollover_df.pkl', 'rb')
train_healthy_food_rollover_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_healthy_food_rollover_df.pkl', 'rb')
test_healthy_food_rollover_df = pickle.load(dataset_file)
dataset_file.close()

In [57]:
X_train = train_healthy_food_rollover_df['text_cleaned']
y_train = train_healthy_food_rollover_df['healthy_food_rollover']

X_test = test_healthy_food_rollover_df['text_cleaned']
y_test = test_healthy_food_rollover_df['healthy_food_rollover']

In [58]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred_light = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred_light = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred_light = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred_light = mnb_tfidf_bi_grid.predict(X_test)



Best GridSearch Scores
mnb_cv_uni: 0.9334699453551912
mnb_cv_bi: 0.9793032786885245
mnb_tfidf_uni: 0.9752049180327869
mnb_tfidf_bi: 0.9793032786885245
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.9016393442622951
mnb_cv_bi: 1.0
mnb_tfidf_uni: 1.0
mnb_tfidf_bi: 1.0


### OTC Rollover

In [59]:
dataset_file = open('train_otc_rollover_df.pkl', 'rb')
train_otc_rollover_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_otc_rollover_df.pkl', 'rb')
test_otc_rollover_df = pickle.load(dataset_file)
dataset_file.close()

In [60]:
X_train = train_otc_rollover_df['text_cleaned']
y_train = train_otc_rollover_df['otc_rollover']

X_test = test_otc_rollover_df['text_cleaned']
y_test = test_otc_rollover_df['otc_rollover']

In [61]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred_light = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred_light = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred_light = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred_light = mnb_tfidf_bi_grid.predict(X_test)



Best GridSearch Scores
mnb_cv_uni: 0.8426912568306011
mnb_cv_bi: 0.8468579234972677
mnb_tfidf_uni: 0.921243169398907
mnb_tfidf_bi: 0.8675546448087431
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.8360655737704918
mnb_cv_bi: 0.8524590163934426
mnb_tfidf_uni: 0.8852459016393442
mnb_tfidf_bi: 0.8524590163934426
