### Import Libraries

In [41]:
import pickle

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support

import shap
from lime.lime_text import LimeTextExplainer
import warnings

In [18]:
warnings.filterwarnings('ignore')

### Implant Coverage

In [19]:
dataset_file = open('train_implant_coverage_df.pkl', 'rb')
train_implant_coverage_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_implant_coverage_df.pkl', 'rb')
test_implant_coverage_df = pickle.load(dataset_file)
dataset_file.close()

In [20]:
X_train = train_implant_coverage_df['text_cleaned']
y_train = train_implant_coverage_df['implant_coverage']

X_test = test_implant_coverage_df['text_cleaned']
y_test = test_implant_coverage_df['implant_coverage']

In [26]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.8806082505269497
mnb_cv_bi: 0.886668172237278
mnb_tfidf_uni: 0.8744730502860585
mnb_tfidf_bi: 0.8836193917494731
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.9024390243902439
mnb_cv_bi: 0.9024390243902439
mnb_tfidf_uni: 0.9024390243902439
mnb_tfidf_bi: 0.9024390243902439
Test Results
mnb_cv_uni: (0.8823529411764706, 0.8823529411764706, 0.8823529411764706, None)
mnb_cv_bi: (0.8823529411764706, 0.8823529411764706, 0.8823529411764706, None)
mnb_tfidf_uni: (0.8823529411764706, 0.8823529411764706, 0.8823529411764706, None)
mnb_tfidf_bi: (0.8823529411764706, 0.8823529411764706, 0.8823529411764706, None)


### Root Canal

In [49]:
dataset_file = open('train_root_canal_df.pkl', 'rb')
train_root_canal_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_root_canal_df.pkl', 'rb')
test_root_canal_df = pickle.load(dataset_file)
dataset_file.close()

In [50]:
X_train = train_root_canal_df['text_cleaned']
y_train = train_root_canal_df['root_canal']

X_test = test_root_canal_df['text_cleaned']
y_test = test_root_canal_df['root_canal']

In [51]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.7433002107798856
mnb_cv_bi: 0.7708145137006925
mnb_tfidf_uni: 0.8135350797952425
mnb_tfidf_bi: 0.8256549232158988
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0.01, 'mnb__fit_prior': False}
Test Scores
mnb_cv_uni: 0.7682926829268293
mnb_cv_bi: 0.7926829268292683
mnb_tfidf_uni: 0.8414634146341463
mnb_tfidf_bi: 0.8658536585365854
Test Results
mnb_cv_uni: (0.8709677419354839, 0.8307692307692308, 0.8503937007874016, None)
mnb_cv_bi: (0.8870967741935484, 0.8461538461538461, 0.8661417322834646, None)
mnb_tfidf_uni: (0.8421052631578947, 0.9846153846153847, 0.9078014184397163, None)
mnb_tfidf_bi: (0.8552631578947368, 1.0, 0.9219858156028369, None)


### Healthy Food Rollover

In [52]:
dataset_file = open('train_healthy_food_rollover_df.pkl', 'rb')
train_healthy_food_rollover_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_healthy_food_rollover_df.pkl', 'rb')
test_healthy_food_rollover_df = pickle.load(dataset_file)
dataset_file.close()

In [53]:
X_train = train_healthy_food_rollover_df['text_cleaned']
y_train = train_healthy_food_rollover_df['healthy_food_rollover']

X_test = test_healthy_food_rollover_df['text_cleaned']
y_test = test_healthy_food_rollover_df['healthy_food_rollover']

In [54]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.9050361336946702
mnb_cv_bi: 0.9908160192713038
mnb_tfidf_uni: 0.9877672387834989
mnb_tfidf_bi: 0.9938647997591087
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': False}
Test Scores
mnb_cv_uni: 0.8536585365853658
mnb_cv_bi: 1.0
mnb_tfidf_uni: 0.975609756097561
mnb_tfidf_bi: 1.0
Test Results
mnb_cv_uni: (0.14285714285714285, 1.0, 0.25, None)
mnb_cv_bi: (1.0, 1.0, 1.0, None)
mnb_tfidf_uni: (0.0, 0.0, 0.0, None)
mnb_tfidf_bi: (1.0, 1.0, 1.0, None)


### OTC Rollover

In [55]:
dataset_file = open('train_otc_rollover_df.pkl', 'rb')
train_otc_rollover_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_otc_rollover_df.pkl', 'rb')
test_otc_rollover_df = pickle.load(dataset_file)
dataset_file.close()

In [56]:
X_train = train_otc_rollover_df['text_cleaned']
y_train = train_otc_rollover_df['otc_rollover']

X_test = test_otc_rollover_df['text_cleaned']
y_test = test_otc_rollover_df['otc_rollover']

In [57]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.8654019873532068
mnb_cv_bi: 0.8593044263775971
mnb_tfidf_uni: 0.9235922914784704
mnb_tfidf_bi: 0.9143706714844926
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0.01, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.8902439024390244
mnb_cv_bi: 0.8902439024390244
mnb_tfidf_uni: 0.926829268292683
mnb_tfidf_bi: 0.8780487804878049
Test Results
mnb_cv_uni: (0.5294117647058824, 0.9, 0.6666666666666667, None)
mnb_cv_bi: (0.5294117647058824, 0.9, 0.6666666666666667, None)
mnb_tfidf_uni: (0.75, 0.6, 0.6666666666666665, None)
mnb_tfidf_bi: (0.5, 0.2, 0.28571428571428575, None)


### Explainers

In [None]:
X_train_implant_coverage = train_implant_coverage_df['text_cleaned']
y_train_implant_coverage = train_implant_coverage_df['implant_coverage']

X_test_implant_coverage = test_implant_coverage_df['text_cleaned']
y_test_implant_coverage = test_implant_coverage_df['implant_coverage']

model_implant_coverage = MultinomialNB(alpha = 0, fit_prior = True)
vectorizer_implant_coverage = CountVectorizer(ngram_range = (2, 2))

X_train_vec_implant_coverage = vectorizer_implant_coverage.fit_transform(X_train).toarray()
X_test_vec_implant_coverage = vectorizer_implant_coverage.transform(X_test).toarray()
model_implant_coverage.fit(X_train_vec_implant_coverage, y_train_implant_coverage)

pipeline_implant_coverage = make_pipeline(vectorizer_implant_coverage, model_implant_coverage)

class_names = ['Y', 'N']
explainer_implant_coverage = LimeTextExplainer(class_names = class_names)

In [None]:
exp_implant_coverage = explainer_implant_coverage.explain_instance(X_test[14], pipeline_implant_coverage.predict_proba, num_features = 50)
exp_implant_coverage.show_in_notebook(text = True)

In [None]:
# model = MultinomialNB(alpha = 0, fit_prior = True)
# vectorizer = CountVectorizer(ngram_range = (2, 2))

# X_train_vec = vectorizer.fit_transform(X_train).toarray()
# X_test_vec = vectorizer.transform(X_test).toarray()
# model.fit(X_train_vec, y_train)
# feature_names = vectorizer.get_feature_names_out()

# explainer = shap.Explainer(model.predict, X_train_vec, feature_names = feature_names)
# shap_values = explainer(X_test_vec[:1])