In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# Set random seed to reproduce results
seed = 2107
# Set number of cross-validation splits
splits = 10


def binary_conv(value):
    """Convert binary variable from string to integer"""
    if value == "no":
        return 0
    elif value == "yes":
        return 1
    else:
        return np.nan


# Load the data for the variables needed and conver to integer
train = pd.read_csv('bank.csv', sep=';',
                    usecols=['default', 'housing', 'loan', 'response'],
                    converters={'default': binary_conv, 'housing': binary_conv,
                                'loan': binary_conv, 'response': binary_conv})

# Convert training data to arrays for sklearn
X = train[['default', 'housing', 'loan']].values
y = train['response'].values

# Use KFold cross-validation to split data for training
cv = KFold(n_splits=splits, shuffle=False, random_state=seed)

# Construct a list of classifiers to iterate over
models = [LogisticRegression(), BernoulliNB()]

# Initialize list to store stats from each fold and model
stats = []

# Iterate over the folds and models to save OoF AUC for each
i = 0
for training, test in cv.split(X, y):
    fold = [i]
    for alg in models:
        scores = alg.fit(X[training], y[training]).predict_proba(X[test])
        fold.append(roc_auc_score(y[test], scores[:, 1]))
    stats.append(fold)
    i += 1

# Create a dataframe of the OoF AUCs for each model
labels = ['Fold', 'Logistic Regression AUC', 'Naive Bayes AUC']
results = pd.DataFrame(stats, columns=labels)

# Add mean and standard deviation to summary
m = results.mean().tolist()
m[0] = "Mean"
s = results.std().tolist()
s[0] = "Std Dev"

results_summary = results.append([pd.Series(m, index=labels),
                                  pd.Series(s, index=labels)],
                                 ignore_index=True).set_index('Fold')

# Save CV results to a CSV
results_summary.to_csv("results_summary.csv")

# Fit the logistic regression to the entire data and score
full_lr_model = LogisticRegression().fit(X, y)
scores = full_lr_model.predict_proba(X)
train['lr_score'] = [i[1] for i in scores]

# Fit the Naive Bayes classifier to the entrie data and score
full_nb_model = BernoulliNB().fit(X, y)
scores = full_nb_model.predict_proba(X)
train['nb_score'] = [i[1] for i in scores]

# Calculate the correlation between model scores
model_corr = train[['lr_score', 'nb_score']].corr()

# Save correlation results to a CSV
model_corr.to_csv("results_corr.csv")

# Calculate the coefficients from the logistic regression
labels = train.columns[0:3].tolist()
lr_coef = pd.DataFrame([full_lr_model.coef_.tolist()[0]], columns=labels)

# Save coefficients to a CSV
lr_coef.to_csv("results_coef.csv")


In [2]:
results_summary

Unnamed: 0_level_0,Logistic Regression AUC,Naive Bayes AUC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.587852,0.587852
1,0.633728,0.633728
2,0.657535,0.657535
3,0.635565,0.637388
4,0.574399,0.574399
5,0.584238,0.574684
6,0.562512,0.562512
7,0.731144,0.731144
8,0.573533,0.573533
9,0.57682,0.577827


In [3]:
model_corr

Unnamed: 0,lr_score,nb_score
lr_score,1.0,0.99992
nb_score,0.99992,1.0


In [4]:
lr_coef

Unnamed: 0,default,housing,loan
0,0.157708,-0.651981,-0.743002
