# Naive Bayes

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

#switch the kernel's working directory; the relative "data/..." path used to load the CSV resolves against it
#NOTE(review): this is a machine-specific absolute path - adjust it (or make it configurable) before running elsewhere
%cd "G:/Archive"

G:\Archive


## Data Prep

In [4]:
#read the red-wine dataset; the first row of the CSV supplies the column names
data = pd.read_csv("data/wine quality red.csv", header = 0)

#binarise the target: quality scores above 5 become "above_avg", everything else "below_avg"
quality_labels = np.where(data["quality"].gt(5), "above_avg", "below_avg")
data["quality"] = pd.Categorical(quality_labels)

#the first 11 columns serve as predictors, the column at position 11 as the target
X, y = data.iloc[:, :11], data.iloc[:, 11]

#hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1234, test_size = 0.2)

## Searching for Optimal Parameters

In [11]:
#every predictor is continuous, so GaussianNB is the matching Naive Bayes variant
#var_smoothing is the only hyperparameter tuned; candidates are 1, 0.1, 0.01, 0.001 and 0.0001
#each candidate is scored by 10-fold cross-validated accuracy
#NOTE(review): the recorded run picked 0.0001, the smallest candidate - consider extending the grid towards smaller values
grid1 = {"var_smoothing": 1 / 10 ** np.arange(5)}
model1 = GridSearchCV(estimator = GaussianNB(), param_grid = grid1, scoring = "accuracy", cv = 10)
model1.fit(X_train, y_train)

#report the outcome of the search
print("Best score: ", model1.best_score_)
print("Corresponding parms: ", model1.best_params_)
print("Model description: ", model1.best_estimator_)

#score the held-out test set with the best estimator found by the search
pred1 = model1.predict(X_test)

#test-set performance; recall, precision and F1 treat "above_avg" as the positive class
binary_args = {"pos_label": "above_avg", "average": "binary"}
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred1)))
for line_format, scorer in (
    ("Recall: {0:.4f}", metrics.recall_score),
    ("Precision: {0:.4f}", metrics.precision_score),
    ("F1 score: {0:.4f}", metrics.f1_score),
):
    print(line_format.format(scorer(y_test, pred1, **binary_args)))

Best score:  0.7114849901574802
Corresponding parms:  {'var_smoothing': 0.0001}
Model description:  GaussianNB(priors=None, var_smoothing=0.0001)
Accuracy: 0.7594
Recall: 0.7500
Precision: 0.7914
F1 score: 0.7701


## Visualization