## We are going to use a preprocessed credit datasset from a German bank on its customers. 

In [1]:
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB, BernoulliNB
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

## Creating a numpy array for the features and label.

In [2]:
Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
Labels = Labels.reshape(Labels.shape[0],)
print(Features.shape)
print(Labels.shape)

(1000, 35)
(1000,)


## We first define a 10-fold cross validation object. We then define a Gaussian naive Bayes model. Finally, we print results from the cross validation. 

In [3]:
nr.seed(321)
cv_folds = ms.KFold(n_splits=10, shuffle = True)
    
nr.seed(498)
NB_credit = GaussianNB()
cv_estimate = ms.cross_val_score(NB_credit, Features, Labels, 
                                 cv = cv_folds) # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean performance metric = 0.691
SDT of the metric       = 0.093
Outcomes by cv fold
Fold  1    0.720
Fold  2    0.660
Fold  3    0.770
Fold  4    0.690
Fold  5    0.750
Fold  6    0.430
Fold  7    0.690
Fold  8    0.760
Fold  9    0.710
Fold 10    0.730


In [4]:
nr.seed(498)
NB_credit = GaussianNB()
cv_estimate = ms.cross_val_score(NB_credit, Features, Labels, 
                                 cv = 10) # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean performance metric = 0.679
SDT of the metric       = 0.106
Outcomes by cv fold
Fold  1    0.670
Fold  2    0.390
Fold  3    0.680
Fold  4    0.710
Fold  5    0.760
Fold  6    0.770
Fold  7    0.620
Fold  8    0.760
Fold  9    0.730
Fold 10    0.700


## We now build and test a model using a single split of the dataset. 

In [5]:
nr.seed(1115)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
X_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
X_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

## # Define a naive Bayes model object and then fit the model to the training data. 

In [6]:
NB_credit_mod = GaussianNB() 
NB_credit_mod.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Next, we score and print evaluation metrics for the model using the test data subset.

In [7]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = NB_credit_mod.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5)

                 Confusion matrix
                 Score positive    Score negative
Actual positive        30               182
Actual negative         6                82

Accuracy        0.37
AUC             0.69
Macro precision 0.57
Macro recall    0.54
 
           Positive      Negative
Num case      212            88
Precision    0.83          0.31
Recall       0.14          0.93
F1           0.24          0.47


In [8]:
## Next we will use a Bernoulli naive Bayes model.

In [9]:
# To do ernoulli naive Bayes model the numeric features must be dropped from the array and are done below.

Features = Features[:,4:]
Features[:3,:]

array([[0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.]])

## 10-fold nested cross validation is used to estimate the optimal hyperparameter and perform model selection for the naive Bayes model.

In [11]:
# We define inside and outside fold objects of the Bernoulli naive Bayes model.

nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle = True)

## We estimate the best hyperparameters using 10 fold cross validation. 

In [12]:
# In this case, a grid for one hyperparameter: alpha is the smoothing parameter to avoid zero probabilities.
# The model is fit on the grid and the best estimated hyperparameters are printed.

nr.seed(3456)
## Define the dictionary for the grid search and the model object to search on
param_grid = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]}
## Define the NB regression model
NB_clf = BernoulliNB() 

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator = NB_clf, param_grid = param_grid, 
                      cv = inside, # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
clf.fit(Features, Labels)
print(clf.best_estimator_.alpha)

1


## We now perform the outer loop of the cross validation to estimate model performance with the optimal hyperparameter. 

In [13]:
#NB_credit = BernoulliNB(alpha = clf.best_estimator_.alpha)
nr.seed(498)
cv_estimate = ms.cross_val_score(clf, Features, Labels, 
                                 cv = outside) # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean performance metric = 0.754
SDT of the metric       = 0.040
Outcomes by cv fold
Fold  1    0.735
Fold  2    0.701
Fold  3    0.733
Fold  4    0.745
Fold  5    0.771
Fold  6    0.757
Fold  7    0.762
Fold  8    0.857
Fold  9    0.765
Fold 10    0.713


## Split the dataset into training and testing subsets. 

In [14]:
nr.seed(1115)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
X_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
X_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [15]:
## We finally fit and score the Bernoulli naive Bayes model and display the performance metrics. 

NB_credit_mod = BernoulliNB(alpha = clf.best_estimator_.alpha) 
NB_credit_mod.fit(X_train, y_train)
probabilities = NB_credit_mod.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5)

                 Confusion matrix
                 Score positive    Score negative
Actual positive       178                34
Actual negative        34                54

Accuracy        0.77
AUC             0.78
Macro precision 0.73
Macro recall    0.73
 
           Positive      Negative
Num case      212            88
Precision    0.84          0.61
Recall       0.84          0.61
F1           0.84          0.61


## This probability is invariably skewed toward the majority case. Since the bank cares more about the minority case, we now redefine the model object with prior probability of 0.6 for the minority case.


In [16]:
NB_credit_mod = BernoulliNB(alpha = clf.best_estimator_.alpha,
                            class_prior = [0.4,0.6]) 
NB_credit_mod.fit(X_train, y_train)
probabilities = NB_credit_mod.predict_proba(X_test)
print_metrics(y_test, probabilities, 0.5) 

                 Confusion matrix
                 Score positive    Score negative
Actual positive       116                96
Actual negative        17                71

Accuracy        0.62
AUC             0.78
Macro precision 0.65
Macro recall    0.68
 
           Positive      Negative
Num case      212            88
Precision    0.87          0.43
Recall       0.55          0.81
F1           0.67          0.56
