# Gaussian Naive Bayes Classifier

The aim of this project is to implement and test a Gaussian Naive Bayes classifier.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
import timeit

## GaussianNB Classifier
An implementation of a Gaussian Naive Bayes classifier that fits the scikit-learn estimator framework.

In [None]:
class MyGaussianNB(BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes classifier following the scikit-learn estimator API.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    data. Likelihoods are accumulated in log-space so that multiplying many
    small densities cannot underflow to zero.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest feature variance that is added to every
        per-class variance. It keeps the densities finite when a feature is
        constant within a class. Use 0 to disable smoothing.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    @staticmethod
    def _as_2d(x):
        """Return *x* as a float array, treating 1-D input as one feature column."""
        x = np.asarray(x, dtype=float)
        return x.reshape(-1, 1) if x.ndim == 1 else x

    def fit(self, xt, yt):
        """Estimate class priors and per-class feature means and variances.

        Parameters
        ----------
        xt : array-like of shape (n_samples, n_features)
            Training samples.
        yt : array-like of shape (n_samples,)
            Class label of each training sample.

        Returns
        -------
        self : MyGaussianNB
            The fitted estimator.

        Raises
        ------
        ValueError
            If the training set contains no samples.
        """
        xt = self._as_2d(xt)
        yt = np.asarray(yt)
        if xt.shape[0] == 0:
            raise ValueError("Cannot fit MyGaussianNB on an empty training set.")

        # first phase: compute class priors (relative class frequencies)
        self._classes, counts = np.unique(yt, return_counts=True)
        self._class_priors = counts / counts.sum()
        # Public alias using the scikit-learn naming convention.
        self.classes_ = self._classes

        # second phase: compute all features' means and variances per class
        xt_split = [xt[yt == cl] for cl in self._classes]
        self._means = np.array([np.mean(xtcl, axis=0) for xtcl in xt_split])
        self._vars = np.array([np.var(xtcl, axis=0) for xtcl in xt_split])

        # A feature that is constant within a class has zero variance, which
        # would make the Gaussian density divide by zero; add a small
        # data-scaled offset to every variance to avoid that.
        self._vars = self._vars + self.var_smoothing * np.var(xt, axis=0).max()
        return self

    def predict(self, xtests):
        """Return the most probable class label for each sample in *xtests*."""
        log_joint = self._joint_log_likelihood(xtests)
        # argmax over log-probabilities: identical ranking, no underflow ties.
        return self._classes[np.argmax(log_joint, axis=0)]

    def predict_proba(self, xtests):
        """Return posterior class probabilities, shape (n_samples, n_classes).

        Columns follow the sorted order of the class labels seen in ``fit``.
        """
        log_joint = self._joint_log_likelihood(xtests)
        # Subtract the per-sample maximum before exponentiating so that at
        # least one class keeps a non-zero weight (avoids 0/0 -> NaN).
        shifted = np.exp(log_joint - log_joint.max(axis=0, keepdims=True))
        return np.transpose(shifted / shifted.sum(axis=0, keepdims=True))

    def _joint_log_likelihood(self, xtests):
        """Return log(prior * likelihood), shape (n_classes, n_samples)."""
        xtests = self._as_2d(xtests)
        return np.array([
            np.log(prior)
            - 0.5 * np.sum(np.log(2 * np.pi * var) + (xtests - mean) ** 2 / var, axis=1)
            for prior, mean, var in zip(self._class_priors, self._means, self._vars)
        ])

    def _compute_probas(self, xtests):
        """Return unnormalised joint probabilities, shape (n_classes, n_samples).

        Kept for backward compatibility; ``predict`` and ``predict_proba``
        work on the log-space values directly because these raw products can
        underflow to zero when there are many features.
        """
        return np.exp(self._joint_log_likelihood(xtests))

    @staticmethod
    def _conditional_proba(xt, m, var):
        """Return the normal density of *xt* for mean *m* and variance *var*."""
        return np.exp(-np.power(xt - m, 2) / (2 * var)) / np.sqrt(2 * np.pi * var)

The function below compares scikit-learn's Gaussian NB with my implementation of Gaussian NB using the confusion matrix, accuracy, and ROC analysis.

In [None]:
def test_gaussian_on_dataset(x_train, y_train, x_test, y_test):
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print("Scikit GNB:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy score:")
    print(accuracy_score(y_test, y_pred))

    print()
    
    mgnb = MyGaussianNB()
    mgnb.fit(x_train,y_train)
    y_pred = mgnb.predict(x_test)
    print("My GNB:")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy score:")
    print(accuracy_score(y_test, y_pred))

    y_score = gnb.predict_proba(x_test)
    fprG, tprG, t = roc_curve(y_test, y_score[:,1])
    roc_aucG  = auc(fprG, tprG)

    y_score = mgnb.predict_proba(x_test)
    fprN, tprN, t = roc_curve(y_test, y_score[:,1])
    roc_aucN = auc(fprN, tprN)
    
    %matplotlib inline
    plt.figure()
    lw = 2
    plt.plot(fprG, tprG, color='red',
            lw=lw, label='ROC NB (area = %0.2f)' % roc_aucG)
    plt.plot(fprN, tprN, color='blue',
            lw=lw, label='ROC custom NB (area = %0.2f)' % roc_aucN)

    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Analysis for Diabetes data')
    plt.legend(loc="lower right")
    plt.show()

The function below compares scikit-learn's Gaussian NB and mine in terms of speed.

In [None]:
def test_speed_on_dataset(x_train, y_train, x_test, y_test, repeat=1000):
    """Time ``fit`` and ``predict`` for GaussianNB and MyGaussianNB.

    Each operation is executed *repeat* times with ``timeit`` and the total
    elapsed time in seconds is printed, first for scikit-learn's classifier
    and then for the custom one.
    """
    def clock(action):
        # Total wall time for `repeat` consecutive calls of `action`.
        return timeit.Timer(action).timeit(number=repeat)

    sk_model = GaussianNB()
    sk_fit = clock(lambda: sk_model.fit(x_train, y_train))
    sk_predict = clock(lambda: sk_model.predict(x_test))

    print(f'Scikit GNB fit performance: {sk_fit}s ({repeat} times)')
    print(f'Scikit GNB predict performance: {sk_predict}s ({repeat} times)')

    my_model = MyGaussianNB()
    my_fit = clock(lambda: my_model.fit(x_train, y_train))
    my_predict = clock(lambda: my_model.predict(x_test))

    print(f'My GNB fit performance: {my_fit}s ({repeat} times)')
    print(f'My GNB predict performance: {my_predict}s ({repeat} times)')

## Penguins dataset
Running and testing Gaussian Naive Bayes on the penguin dataset.

In [None]:
# Load the penguins CSV, using its first column as the row index.
penguins_af = pd.read_csv('penguins_af.csv', index_col=0)
print(penguins_af.shape)  # quick (rows, columns) sanity check
penguins_af.head()

In [None]:
# Binary label: 1 for Adelie penguins, 0 for every other species.
penguins_af['target'] = np.where(penguins_af['species'] == 'Adelie', 1, 0)

# Keep the four measurements plus the species and target columns.
f_names = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'species',
    'target',
]
penguins = penguins_af[f_names]

# Restrict the data to two species (Adelie and Chinstrap), then drop the
# text column, which `target` already encodes.
penguins2C = penguins[penguins['species'].isin(['Adelie', 'Chinstrap'])]
penguins2C.pop('species')
penguins2C.head()

In [None]:
# Separate the labels from the feature matrix.
y = penguins2C.pop('target').values
X_raw = penguins2C.values
feature_names = penguins2C.columns

# Hold out half of the samples for testing.
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.5)

# Learn the min/max scaling on the training half only, then apply it to both.
scaler = MinMaxScaler()
X_train = scaler.fit(X_tr_raw).transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)
X_train.shape, X_test.shape

In [None]:
# Accuracy, confusion matrix and ROC comparison on the penguins split.
test_gaussian_on_dataset(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
)

In [None]:
# Fit/predict timing comparison on the penguins split.
test_speed_on_dataset(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
)

## Diabetes dataset
Running and testing Gaussian Naive Bayes on the diabetes test dataset.

In [None]:
# Load the diabetes CSV, using its first column as the row index.
diabetes = pd.read_csv('diabetes.csv', index_col=0)
print(diabetes.shape)  # quick (rows, columns) sanity check
diabetes.head()

In [None]:
# Encode the label as 1 for 'tested_positive' and 0 otherwise, then remove
# the original text column from the frame.
diabetes['target'] = np.where(diabetes['neg_pos'] == 'tested_positive', 1, 0)
diabetes.pop('neg_pos')
diabetes.head()

In [None]:
# Separate the labels from the features; what is left in the frame is X.
y = diabetes.pop('target').values
Xorig = diabetes.values

# Standardise every feature to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit(Xorig).transform(Xorig)
X.shape, y.shape

In [None]:
# Split the raw features first and fit the scaler on the training part only,
# so that no statistics from the held-out third leak into training (the same
# protocol as in the penguins section).
X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(Xorig, y, test_size=1/3)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)

In [None]:
# Accuracy, confusion matrix and ROC comparison on the diabetes split.
test_gaussian_on_dataset(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
)

In [None]:
# Fit/predict timing comparison on the diabetes split.
test_speed_on_dataset(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
)

## Hotel Reviews dataset
Running and testing Gaussian Naive Bayes on the hotel review helpfulness dataset.

In [None]:
# Load the hotel reviews CSV, using its first column as the row index.
reviews = pd.read_csv('HotelRevHelpfulness.csv', index_col=0)
print(reviews.shape)  # quick (rows, columns) sanity check
reviews.head()

In [None]:
# The 'reviewHelpfulness' column is the label; the remaining columns are features.
y = reviews.pop('reviewHelpfulness').values
x_raw = reviews.values

# Standardise every feature to zero mean and unit variance.
scaler = StandardScaler()
x = scaler.fit(x_raw).transform(x_raw)

In [None]:
# Split the raw features first and fit the scaler on the training part only,
# so that no statistics from the held-out third leak into training (the same
# protocol as in the penguins section).
x_tr_raw, x_ts_raw, y_train, y_test = train_test_split(x_raw, y, test_size=1/3)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_tr_raw)
x_test = scaler.transform(x_ts_raw)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Accuracy, confusion matrix and ROC comparison on the hotel reviews split.
test_gaussian_on_dataset(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [None]:
# Fit/predict timing comparison on the hotel reviews split.
test_speed_on_dataset(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

# Conclusion

For binary classification:

Evaluation: 
  - their accuracies are equal ;
  - their confusion matrices are equal ;
  - their ROC curves overlap, with the same area under the curve.

Speed: 
  - `fit` method: Scikit is slower than mine (about 4 times) ;
  - `predict` method: Scikit is faster than mine (about 40 times).

What I suspect for these differences:
  - Scikit may compute other values when fitting the model in order to optimize the predictions ;
  - Scikit may use a better but equivalent algorithm to compute the conditional probabilities.