In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read the semicolon-delimited bank dataset and show its (rows, columns)
df_bank = pd.read_csv('bank/bank.csv', delimiter=';')
df_bank.shape

(4521, 17)

In [4]:
# preview the first five rows of the raw data
df_bank.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


# Data Preparation

### 1. Remove outliers
**An observation is treated as an outlier when its `balance` or `duration` value lies more than 4 standard deviations from the column mean (|z-score| > 4).**

In [11]:
# z-score balance and duration on a working copy; df_bank itself is left untouched
df_bank_2 = df_bank.assign(
    balance_z=stats.zscore(df_bank['balance']),
    duration_z=stats.zscore(df_bank['duration']),
)

# a row is an outlier when either z-score is more than 4 away from zero
condition_1 = df_bank_2['balance_z'].abs() > 4
condition_2 = df_bank_2['duration_z'].abs() > 4
outlier_index = df_bank_2.loc[condition_1 | condition_2].index

# drop the flagged rows from the original frame (so the helper z columns are not carried over)
df_bank_3 = df_bank.drop(index=outlier_index)

n_removed = df_bank.shape[0] - df_bank_3.shape[0]
print(f'There are {n_removed} entries removed.')
print(f'The shape of the dataset now is {df_bank_3.shape}.')

There are 96 entries removed.
The shape of the dataset now is (4425, 17).


### 2. Add dummy variables

In [28]:
# a function to add dummy variables
def add_dummy(df_data, feature_list):
    '''
    Replace categorical columns with drop-first dummy (indicator) columns.

    Arg:
    df_data: the dataset to add dummy variables to; it is not modified
    feature_list: a list of categorical column names in df_data
                  (a single column name given as a string is also accepted)

    Return:
    df_og: a new dataframe with the columns in feature_list removed and their
           dummy columns appended on the right, in feature_list order.
           Dummy columns are named '<feature>_<level>'; the first level of each
           feature is dropped (drop_first=True).
    '''
    # a bare string would otherwise be iterated character by character
    if isinstance(feature_list, str):
        feature_list = [feature_list]
    else:
        feature_list = list(feature_list)

    # encode every feature from the untouched input frame
    dummy_frames = [
        pd.get_dummies(df_data[feature], prefix=feature, prefix_sep='_', drop_first=True)
        for feature in feature_list
    ]

    # remove the encoded source columns, then attach all dummies in a single concat
    # instead of rebuilding the whole frame once per feature
    df_og = df_data.drop(feature_list, axis=1)
    return pd.concat([df_og] + dummy_frames, axis=1)

In [29]:
# categorical columns to replace with dummy variables
# NOTE(review): 'campaign' holds integers in the data preview, yet it is encoded
# as a category here (one dummy column per distinct value) — confirm this is intended
dum_list = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'campaign', 'poutcome',
]
# encode them on the outlier-free frame
df_dummy = add_dummy(df_data=df_bank_3, feature_list=dum_list)

In [30]:
# (rows, columns) of the frame after dummy encoding
df_dummy.shape

(4425, 73)

# Modeling

### Feature selection

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [32]:
# candidate feature columns: everything except the response 'y' and 'day'
# (Index.difference returns the remaining labels in sorted order)
df_dummy.columns.difference(['y','day'])

Index(['age', 'balance', 'campaign_10', 'campaign_11', 'campaign_12',
       'campaign_13', 'campaign_14', 'campaign_15', 'campaign_16',
       'campaign_17', 'campaign_18', 'campaign_19', 'campaign_2',
       'campaign_20', 'campaign_21', 'campaign_22', 'campaign_23',
       'campaign_24', 'campaign_25', 'campaign_28', 'campaign_29',
       'campaign_3', 'campaign_30', 'campaign_31', 'campaign_32', 'campaign_4',
       'campaign_44', 'campaign_5', 'campaign_50', 'campaign_6', 'campaign_7',
       'campaign_8', 'campaign_9', 'contact_telephone', 'contact_unknown',
       'default_yes', 'duration', 'education_secondary', 'education_tertiary',
       'education_unknown', 'housing_yes', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'loan_yes', 'marital_married',
       'marital_single', 'month_aug', 'month_dec', 'month_feb',

In [33]:
# predictors are every column except the response 'y' and 'day'; the response is 'y'
feature_cols = df_dummy.columns.difference(['y','day'])
X = df_dummy.loc[:, feature_cols]
y = df_dummy['y']

# hold out 20% of the rows for testing; the seed fixes which rows are held out
test_size = 0.20
seed = 666
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

### Five classification algorithms

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [35]:
# (short name, unfitted estimator) pairs to compare
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    # the tree permutes features at random when choosing splits, so it is seeded
    # with the notebook's seed to make its score repeatable between runs
    ('TREE', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluation

In [36]:
results_c = []
names_c = []

# 10-fold cross-validation over contiguous, unshuffled folds.
# random_state is deliberately not passed: without shuffle=True it has no effect,
# and recent scikit-learn releases raise ValueError for that combination.
# The rows of X_train were already shuffled by train_test_split.
# The splitter does not depend on the model, so it is built once outside the loop.
kfold = KFold(n_splits=10)

# print mean accuracy (and its standard deviation across folds) for each model
for name, model in models:
    # fit and score the model on each fold of the training set
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results_c.append(cv_results)
    names_c.append(name)
    result_f = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(result_f)

LR: 0.896328 (0.010421)
KNN: 0.879944 (0.009219)
TREE: 0.870339 (0.021005)
NB: 0.816102 (0.021678)
SVM: 0.889266 (0.010248)
