### Import Libraries

In [75]:
import pickle

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support

import shap
from lime.lime_text import LimeTextExplainer
import warnings

In [76]:
warnings.filterwarnings('ignore')

### Implant Coverage

In [77]:
dataset_file = open('train_implant_coverage_df.pkl', 'rb')
train_implant_coverage_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_implant_coverage_df.pkl', 'rb')
test_implant_coverage_df = pickle.load(dataset_file)
dataset_file.close()

In [78]:
X_train = train_implant_coverage_df['text_cleaned']
y_train = train_implant_coverage_df['implant_coverage']

X_test = test_implant_coverage_df['text_cleaned']
y_test = test_implant_coverage_df['implant_coverage']

In [79]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.8746236073471846
mnb_cv_bi: 0.877710027100271
mnb_tfidf_uni: 0.8837699488105991
mnb_tfidf_bi: 0.889942788316772
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.8536585365853658
mnb_cv_bi: 0.8658536585365854
mnb_tfidf_uni: 0.8658536585365854
mnb_tfidf_bi: 0.8658536585365854
Test Results
mnb_cv_uni: (0.8928571428571429, 0.7352941176470589, 0.806451612903226, None)
mnb_cv_bi: (0.896551724137931, 0.7647058823529411, 0.8253968253968255, None)
mnb_tfidf_uni: (0.9259259259259259, 0.7352941176470589, 0.819672131147541, None)
mnb_tfidf_bi: (0.896551724137931, 0.7647058823529411, 0.8253968253968255, None)


### Root Canal

In [80]:
dataset_file = open('train_root_canal_df.pkl', 'rb')
train_root_canal_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_root_canal_df.pkl', 'rb')
test_root_canal_df = pickle.load(dataset_file)
dataset_file.close()

In [81]:
X_train = train_root_canal_df['text_cleaned']
y_train = train_root_canal_df['root_canal']

X_test = test_root_canal_df['text_cleaned']
y_test = test_root_canal_df['root_canal']

In [83]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.7398373983739838
mnb_cv_bi: 0.7430367359229147
mnb_tfidf_uni: 0.8042758205359832
mnb_tfidf_bi: 0.8134974405299609
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 1, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0.01, 'mnb__fit_prior': False}
Test Scores
mnb_cv_uni: 0.7804878048780488
mnb_cv_bi: 0.7560975609756098
mnb_tfidf_uni: 0.8414634146341463
mnb_tfidf_bi: 0.8536585365853658
Test Results
mnb_cv_uni: (0.873015873015873, 0.8461538461538461, 0.859375, None)
mnb_cv_bi: (0.8813559322033898, 0.8, 0.8387096774193548, None)
mnb_tfidf_uni: (0.8611111111111112, 0.9538461538461539, 0.9051094890510949, None)
mnb_tfidf_bi: (0.8732394366197183, 0.9538461538461539, 0.9117647058823529, None)


### Healthy Food Rollover

In [84]:
dataset_file = open('train_healthy_food_rollover_df.pkl', 'rb')
train_healthy_food_rollover_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_healthy_food_rollover_df.pkl', 'rb')
test_healthy_food_rollover_df = pickle.load(dataset_file)
dataset_file.close()

In [85]:
X_train = train_healthy_food_rollover_df['text_cleaned']
y_train = train_healthy_food_rollover_df['healthy_food_rollover']

X_test = test_healthy_food_rollover_df['text_cleaned']
y_test = test_healthy_food_rollover_df['healthy_food_rollover']

In [86]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.9327009936766034
mnb_cv_bi: 0.9877672387834989
mnb_tfidf_uni: 0.9908160192713038
mnb_tfidf_bi: 0.9908160192713038
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0, 'mnb__fit_prior': False}
Test Scores
mnb_cv_uni: 0.9390243902439024
mnb_cv_bi: 1.0
mnb_tfidf_uni: 1.0
mnb_tfidf_bi: 1.0
Test Results
mnb_cv_uni: (0.2857142857142857, 1.0, 0.4444444444444445, None)
mnb_cv_bi: (1.0, 1.0, 1.0, None)
mnb_tfidf_uni: (1.0, 1.0, 1.0, None)
mnb_tfidf_bi: (1.0, 1.0, 1.0, None)


### OTC Rollover

In [87]:
dataset_file = open('train_otc_rollover_df.pkl', 'rb')
train_otc_rollover_df = pickle.load(dataset_file)
dataset_file.close()

dataset_file = open('test_otc_rollover_df.pkl', 'rb')
test_otc_rollover_df = pickle.load(dataset_file)
dataset_file.close()

In [88]:
X_train = train_otc_rollover_df['text_cleaned']
y_train = train_otc_rollover_df['otc_rollover']

X_test = test_otc_rollover_df['text_cleaned']
y_test = test_otc_rollover_df['otc_rollover']

In [90]:
# Setup MNB using CountVectorizer, TFIDF, unigrams, bigrams, & different alpha values

parameters = {'mnb__alpha': [0, 0.001, 0.01, 0.1, 1],
              'mnb__fit_prior': [True, False]}

mnb_cv_uni = Pipeline([('cv', CountVectorizer()),
                       ('mnb', MultinomialNB())])
mnb_cv_bi = Pipeline([('cv', CountVectorizer(ngram_range = (2, 2))),
                      ('mnb', MultinomialNB())])
mnb_tfidf_uni = Pipeline([('tfidf', TfidfVectorizer()),
                ('mnb', MultinomialNB())])
mnb_tfidf_bi = Pipeline([('tfidf', TfidfVectorizer(ngram_range = (2, 2))),
                         ('mnb', MultinomialNB())])

mnb_cv_uni_grid = GridSearchCV(mnb_cv_uni,
                               parameters,
                               cv = 4)
mnb_cv_bi_grid = GridSearchCV(mnb_cv_bi,
                              parameters,
                              cv = 4)
mnb_tfidf_uni_grid = GridSearchCV(mnb_tfidf_uni,
                                  parameters,
                                  cv = 4)
mnb_tfidf_bi_grid = GridSearchCV(mnb_tfidf_bi,
                                 parameters,
                                 cv = 4)

# Train MNB

mnb_cv_uni_grid.fit(X_train, y_train)
mnb_cv_bi_grid.fit(X_train, y_train)
mnb_tfidf_uni_grid.fit(X_train, y_train)
mnb_tfidf_bi_grid.fit(X_train, y_train)

# MNB best train scores

mnb_cv_uni_grid_score = mnb_cv_uni_grid.best_score_
mnb_cv_bi_grid_score = mnb_cv_bi_grid.best_score_
mnb_tfidf_uni_grid_score = mnb_tfidf_uni_grid.best_score_
mnb_tfidf_bi_grid_score = mnb_tfidf_bi_grid.best_score_

print('Best GridSearch Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score)
print('mnb_cv_bi:', mnb_cv_bi_grid_score)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score)

# MNB best params

mnb_cv_uni_grid_params = mnb_cv_uni_grid.best_params_
mnb_cv_bi_grid_params = mnb_cv_bi_grid.best_params_
mnb_tfidf_uni_grid_params = mnb_tfidf_uni_grid.best_params_
mnb_tfidf_bi_grid_params = mnb_tfidf_bi_grid.best_params_

print('Best GridSearch Params')
print('mnb_cv_uni:', mnb_cv_uni_grid_params)
print('mnb_cv_bi:', mnb_cv_bi_grid_params)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_params)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_params)

# MNB best test scores

mnb_cv_uni_grid_score_test = mnb_cv_uni_grid.score(X_test, y_test)
mnb_cv_bi_grid_score_test = mnb_cv_bi_grid.score(X_test, y_test)
mnb_tfidf_uni_grid_score_test = mnb_tfidf_uni_grid.score(X_test, y_test)
mnb_tfidf_bi_grid_score_test = mnb_tfidf_bi_grid.score(X_test, y_test)

print('Test Scores')
print('mnb_cv_uni:', mnb_cv_uni_grid_score_test)
print('mnb_cv_bi:', mnb_cv_bi_grid_score_test)
print('mnb_tfidf_uni:', mnb_tfidf_uni_grid_score_test)
print('mnb_tfidf_bi:', mnb_tfidf_bi_grid_score_test)

# Get test scores
mnb_cv_uni_pred = mnb_cv_uni_grid.predict(X_test)
mnb_cv_bi_pred = mnb_cv_bi_grid.predict(X_test)
mnb_tfidf_uni_pred = mnb_tfidf_uni_grid.predict(X_test)
mnb_tfidf_bi_pred = mnb_tfidf_bi_grid.predict(X_test)

mnb_cv_uni_prf = precision_recall_fscore_support(y_test, mnb_cv_uni_pred, average = 'binary', pos_label = 'Y')
mnb_cv_bi_prf = precision_recall_fscore_support(y_test, mnb_cv_bi_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_uni_prf = precision_recall_fscore_support(y_test, mnb_tfidf_uni_pred, average = 'binary', pos_label = 'Y')
mnb_tfidf_bi_prf = precision_recall_fscore_support(y_test, mnb_tfidf_bi_pred, average = 'binary', pos_label = 'Y')

print('Test Results')
print('mnb_cv_uni:', mnb_cv_uni_prf)
print('mnb_cv_bi:', mnb_cv_bi_prf)
print('mnb_tfidf_uni:', mnb_tfidf_uni_prf)
print('mnb_tfidf_bi:', mnb_tfidf_bi_prf)

Best GridSearch Scores
mnb_cv_uni: 0.8563309244203553
mnb_cv_bi: 0.8624661246612466
mnb_tfidf_uni: 0.9326633544113219
mnb_tfidf_bi: 0.9143330322192111
Best GridSearch Params
mnb_cv_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_cv_bi: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_uni: {'mnb__alpha': 0, 'mnb__fit_prior': True}
mnb_tfidf_bi: {'mnb__alpha': 0.01, 'mnb__fit_prior': True}
Test Scores
mnb_cv_uni: 0.9024390243902439
mnb_cv_bi: 0.926829268292683
mnb_tfidf_uni: 0.926829268292683
mnb_tfidf_bi: 0.8780487804878049
Test Results
mnb_cv_uni: (0.5625, 0.9, 0.6923076923076923, None)
mnb_cv_bi: (0.6428571428571429, 0.9, 0.75, None)
mnb_tfidf_uni: (0.7, 0.7, 0.7, None)
mnb_tfidf_bi: (0.5, 0.1, 0.16666666666666669, None)


### Explainers

In [103]:
X_train_implant_coverage = train_implant_coverage_df['text_cleaned']
y_train_implant_coverage = train_implant_coverage_df['implant_coverage']

X_test_implant_coverage = test_implant_coverage_df['text_cleaned']
y_test_implant_coverage = test_implant_coverage_df['implant_coverage']

model_implant_coverage = MultinomialNB(alpha = 0, fit_prior = True)
vectorizer_implant_coverage = CountVectorizer(ngram_range = (2, 2))

X_train_vec_implant_coverage = vectorizer_implant_coverage.fit_transform(X_train_implant_coverage).toarray()
X_test_vec_implant_coverage = vectorizer_implant_coverage.transform(X_test_implant_coverage).toarray()
model_implant_coverage.fit(X_train_vec_implant_coverage, y_train_implant_coverage)

pipeline_implant_coverage = make_pipeline(vectorizer_implant_coverage, model_implant_coverage)

class_names = ['Y', 'N']
explainer_implant_coverage = LimeTextExplainer(class_names = class_names)

In [104]:
exp_implant_coverage = explainer_implant_coverage.explain_instance(X_test_implant_coverage[615], pipeline_implant_coverage.predict_proba, num_features = 10)
exp_implant_coverage.show_in_notebook(text = True)

In [105]:
X_train_root_canal = train_root_canal_df['text_cleaned']
y_train_root_canal = train_root_canal_df['root_canal']

X_test_root_canal = test_root_canal_df['text_cleaned']
y_test_root_canal = test_root_canal_df['root_canal']

model_root_canal = MultinomialNB(alpha = 0.01, fit_prior = False)
vectorizer_root_canal = TfidfVectorizer(ngram_range = (2, 2))

X_train_vec_root_canal = vectorizer_root_canal.fit_transform(X_train_root_canal).toarray()
X_test_vec_root_canal = vectorizer_root_canal.transform(X_test_root_canal).toarray()
model_root_canal.fit(X_train_vec_root_canal, y_train_root_canal)

pipeline_root_canal = make_pipeline(vectorizer_root_canal, model_root_canal)

class_names = ['Y', 'N']
explainer_root_canal = LimeTextExplainer(class_names = class_names)

In [106]:
exp_root_canal = explainer_root_canal.explain_instance(X_test_root_canal[195], pipeline_root_canal.predict_proba, num_features = 10)
exp_root_canal.show_in_notebook(text = True)

In [108]:
X_train_healthy_food_rollover = train_healthy_food_rollover_df['text_cleaned']
y_train_healthy_food_rollover = train_healthy_food_rollover_df['healthy_food_rollover']

X_test_healthy_food_rollover = test_healthy_food_rollover_df['text_cleaned']
y_test_healthy_food_rollover = test_healthy_food_rollover_df['healthy_food_rollover']

model_healthy_food_rollover = MultinomialNB(alpha = 0, fit_prior = False)
vectorizer_healthy_food_rollover = TfidfVectorizer(ngram_range = (2, 2))

X_train_vec_healthy_food_rollover = vectorizer_healthy_food_rollover.fit_transform(X_train_healthy_food_rollover).toarray()
X_test_vec_healthy_food_rollover = vectorizer_healthy_food_rollover.transform(X_test_healthy_food_rollover).toarray()
model_healthy_food_rollover.fit(X_train_vec_healthy_food_rollover, y_train_healthy_food_rollover)

pipeline_healthy_food_rollover = make_pipeline(vectorizer_healthy_food_rollover, model_healthy_food_rollover)

class_names = ['Y', 'N']
explainer_healthy_food_rollover = LimeTextExplainer(class_names = class_names)

In [110]:
exp_healthy_food_rollover = explainer_healthy_food_rollover.explain_instance(X_test_healthy_food_rollover[395], pipeline_healthy_food_rollover.predict_proba, num_features = 10)
exp_healthy_food_rollover.show_in_notebook(text = True)

In [111]:
X_train_otc_rollover = train_otc_rollover_df['text_cleaned']
y_train_otc_rollover = train_otc_rollover_df['otc_rollover']

X_test_otc_rollover = test_otc_rollover_df['text_cleaned']
y_test_otc_rollover = test_otc_rollover_df['otc_rollover']

model_otc_rollover = MultinomialNB(alpha = 0, fit_prior = True)
vectorizer_otc_rollover = CountVectorizer()

X_train_vec_otc_rollover = vectorizer_otc_rollover.fit_transform(X_train_otc_rollover).toarray()
X_test_vec_otc_rollover = vectorizer_otc_rollover.transform(X_test_otc_rollover).toarray()
model_otc_rollover.fit(X_train_vec_otc_rollover, y_train_otc_rollover)

pipeline_otc_rollover = make_pipeline(vectorizer_otc_rollover, model_otc_rollover)

class_names = ['Y', 'N']
explainer_otc_rollover = LimeTextExplainer(class_names = class_names)

In [113]:
exp_otc_rollover = explainer_otc_rollover.explain_instance(X_test_otc_rollover[389], pipeline_otc_rollover.predict_proba, num_features = 10)
exp_otc_rollover.show_in_notebook(text = True)