In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the semicolon-delimited bank marketing file and display its (rows, columns).
df_bank = pd.read_csv('bank/bank.csv', delimiter=';')
df_bank.shape

(4521, 17)

# Data Preparation

### 1. Add dummy variables

In [27]:
# a function to add dummy variables
def add_dummy(feature_list, df=None):
    '''
    One-hot encode the given categorical columns of a dataframe.

    For every feature the first level is dropped (drop_first=True) and the
    remaining levels become columns named "<feature>_<level>". The encoded
    columns are appended after the untouched columns, in the order the
    features are listed; the original categorical columns are removed.

    Arg:
    feature_list: a list of categorical variables (column names); repeated
                  names are encoded once
    df: optional dataframe to encode; when omitted, the notebook-level
        df_bank is used (the original behaviour)

    Return:
    a new dataframe with dummy variables added and original categorical
    columns removed; the input dataframe is not modified
    '''
    if df is None:
        # fall back to the notebook's global frame so existing calls keep working
        df = df_bank

    # keep the first occurrence of each name so a repeated feature is not encoded twice
    columns = list(dict.fromkeys(feature_list))
    if not columns:
        return df.copy()

    # a single get_dummies call replaces the concat/drop loop and yields the
    # same column names and ordering
    return pd.get_dummies(df, columns=columns, prefix_sep='_', drop_first=True)

In [30]:
# columns to be one-hot encoded
# NOTE(review): 'campaign' takes integer values here (the encoded columns are
# campaign_2 ... campaign_50), so it expands into many sparse columns; confirm
# that treating it as categorical is intended.
dum_list = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'campaign', 'poutcome',
]
# build the encoded frame from the list above
df_dummy = add_dummy(dum_list)

In [31]:
# (rows, columns) of the frame after dummy encoding
df_dummy.shape

(4521, 73)

# Modeling

### Feature selection

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [35]:
# preview the candidate feature columns: every column except the response 'y' and 'day'
df_dummy.columns.difference(['y','day'])

Index(['age', 'balance', 'campaign_10', 'campaign_11', 'campaign_12',
       'campaign_13', 'campaign_14', 'campaign_15', 'campaign_16',
       'campaign_17', 'campaign_18', 'campaign_19', 'campaign_2',
       'campaign_20', 'campaign_21', 'campaign_22', 'campaign_23',
       'campaign_24', 'campaign_25', 'campaign_28', 'campaign_29',
       'campaign_3', 'campaign_30', 'campaign_31', 'campaign_32', 'campaign_4',
       'campaign_44', 'campaign_5', 'campaign_50', 'campaign_6', 'campaign_7',
       'campaign_8', 'campaign_9', 'contact_telephone', 'contact_unknown',
       'default_yes', 'duration', 'education_secondary', 'education_tertiary',
       'education_unknown', 'housing_yes', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'loan_yes', 'marital_married',
       'marital_single', 'month_aug', 'month_dec', 'month_feb',

In [37]:
# hold-out settings: 20% of the rows go to the test set, fixed seed for reproducibility
test_size = 0.20
seed = 666

# explanatory variables: all columns other than 'y' and 'day'; response: 'y'
X = df_dummy.loc[:, df_dummy.columns.difference(['y','day'])]
y = df_dummy['y']

# split into training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

### Five classification algorithms

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [47]:
# (short name, unfitted estimator) pairs; every estimator uses its default settings
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('TREE', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluation

In [48]:
# per-model cross-validation scores and the matching model names
results_c = []
names_c = []

# 10-fold splitter shared by all models so each one is scored on the same folds.
# random_state is intentionally not passed: with the default shuffle=False it has
# no effect on the folds, and recent scikit-learn versions raise a ValueError when
# random_state is set without shuffle=True.
kfold = KFold(n_splits=10)

# print mean accuracy (and standard deviation across folds) for each model
for name, model in models:
    # cross-validated accuracy on the training set only; the test set stays untouched
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results_c.append(cv_results)
    names_c.append(name)
    result_f = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(result_f)

LR: 0.894353 (0.013225)
KNN: 0.878317 (0.014669)
TREE: 0.863377 (0.025192)
NB: 0.832391 (0.023209)
SVM: 0.882740 (0.012651)
