## Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [4]:
dataset = pd.read_csv('bank.csv', delimiter=';', quoting=3)
dataset.shape

(4521, 17)

In [5]:
dataset = dataset.rename(columns = lambda x : x.replace('"', ''))
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,"""30","""""unemployed""""","""""married""""","""""primary""""","""""no""""",1787,"""""no""""","""""no""""","""""cellular""""",19,"""""oct""""",79,1,-1,0,"""""unknown""""","""""no"""""""
1,"""33","""""services""""","""""married""""","""""secondary""""","""""no""""",4789,"""""yes""""","""""yes""""","""""cellular""""",11,"""""may""""",220,1,339,4,"""""failure""""","""""no"""""""
2,"""35","""""management""""","""""single""""","""""tertiary""""","""""no""""",1350,"""""yes""""","""""no""""","""""cellular""""",16,"""""apr""""",185,1,330,1,"""""failure""""","""""no"""""""
3,"""30","""""management""""","""""married""""","""""tertiary""""","""""no""""",1476,"""""yes""""","""""yes""""","""""unknown""""",3,"""""jun""""",199,4,-1,0,"""""unknown""""","""""no"""""""
4,"""59","""""blue-collar""""","""""married""""","""""secondary""""","""""no""""",0,"""""yes""""","""""no""""","""""unknown""""",5,"""""may""""",226,1,-1,0,"""""unknown""""","""""no"""""""


In [6]:
columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']

for x in columns:
    dataset[x] = dataset[x].apply(lambda x : x.replace('"', ''))

In [7]:
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [8]:
dataset['age'] = pd.to_numeric(dataset['age'])

## Create X and Y

In [9]:
X = dataset.iloc[:, 0:16].values
Y = dataset.iloc[:, 16].values

In [10]:
X.shape

(4521, 16)

In [11]:
Y.shape

(4521,)

In [12]:
X

array([[30, 'unemployed', 'married', ..., -1, 0, 'unknown'],
       [33, 'services', 'married', ..., 339, 4, 'failure'],
       [35, 'management', 'single', ..., 330, 1, 'failure'],
       ..., 
       [57, 'technician', 'married', ..., -1, 0, 'unknown'],
       [28, 'blue-collar', 'married', ..., 211, 3, 'other'],
       [44, 'entrepreneur', 'single', ..., 249, 7, 'other']], dtype=object)

In [13]:
Y

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

## Preprocess the Data

In [14]:
le_Y = LabelEncoder()

In [15]:
Y = le_Y.fit_transform(Y)
Y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
cols_to_encode = [1, 2, 3, 4, 6, 7, 8, 10, 15]
def encoder_X(index):
    le_X = LabelEncoder()
    X[:, index] = le_X.fit_transform(X[:, index])
    return

In [17]:
for i in cols_to_encode:
    encoder_X(i)

In [18]:
X[0, :]

array([30, 10, 1, 0, 0, 1787, 0, 0, 0, 19, 10, 79, 1, -1, 0, 3], dtype=object)

In [19]:
ohe_X = OneHotEncoder(categorical_features = [15])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(4521, 18)

In [20]:
pd.DataFrame(pd.DataFrame(X[:, 13])[0].value_counts())

Unnamed: 0,0
8.0,1398
5.0,706
1.0,633
6.0,531
9.0,389
0.0,293
3.0,222
4.0,148
10.0,80
11.0,52


In [21]:
ohe_X = OneHotEncoder(categorical_features = [13])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(4521, 28)

In [22]:
pd.DataFrame(pd.DataFrame(X[:, 22])[0].value_counts())

Unnamed: 0,0
0.0,2896
2.0,1324
1.0,301


In [23]:
ohe_X = OneHotEncoder(categorical_features = [22])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(4521, 29)

In [24]:
pd.DataFrame(pd.DataFrame(X[:, 19])[0].value_counts())

Unnamed: 0,0
1.0,2306
2.0,1350
0.0,678
3.0,187


In [25]:
ohe_X = OneHotEncoder(categorical_features = [19])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(4521, 31)

In [26]:
pd.DataFrame(pd.DataFrame(X[:, 21])[0].value_counts())

Unnamed: 0,0
1.0,2797
2.0,1196
0.0,528


In [27]:
ohe_X = OneHotEncoder(categorical_features = [21])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(4521, 32)

In [28]:
pd.DataFrame(pd.DataFrame(X[:, 22])[0].value_counts())

Unnamed: 0,0
4.0,969
1.0,946
9.0,768
0.0,478
7.0,417
5.0,230
6.0,183
2.0,168
10.0,128
3.0,112


In [29]:
ohe_X = OneHotEncoder(categorical_features = [22])
X = ohe_X.fit_transform(X).toarray()
X = X[:, 1:]
X.shape

(4521, 42)

In [30]:
sc_X = StandardScaler()

In [31]:
X = sc_X.fit_transform(X)

In [32]:
X

array([[-0.5144078 , -0.19645374, -0.15938189, ..., -0.57682947,
        -0.4072183 , -0.32041282],
       [-0.5144078 , -0.19645374, -0.15938189, ..., -0.57682947,
         2.98904408,  2.04173372],
       [-0.5144078 , -0.19645374, -0.15938189, ..., -0.57682947,
         2.89914302,  0.27012381],
       ..., 
       [-0.5144078 , -0.19645374, -0.15938189, ...,  2.63916021,
        -0.4072183 , -0.32041282],
       [ 1.94398296, -0.19645374, -0.15938189, ...,  0.38796743,
         1.71045119,  1.45119709],
       [-0.5144078 ,  5.0902568 , -0.15938189, ..., -0.2552305 ,
         2.09003345,  3.81334363]])

## Create Train and Test Data

In [33]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [34]:
X_train.shape

(3616, 42)

In [35]:
X_test.shape

(905, 42)

In [36]:
Y_train.shape

(3616,)

In [37]:
Y_test.shape

(905,)

In [38]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,3194
1,422


In [39]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,806
1,99


## DecisionTree

In [40]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [41]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [42]:
Y_pred_dt = clf_dt.predict(X_test)

In [43]:
confusion_matrix(Y_test, Y_pred_dt)

array([[746,  60],
       [ 46,  53]], dtype=int64)

## Random Forest

In [44]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [45]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
Y_pred_rf = clf_rf.predict(X_test)

In [47]:
confusion_matrix(Y_test, Y_pred_rf)

array([[782,  24],
       [ 69,  30]], dtype=int64)

## Naive Bayes

In [48]:
clf_nb = GaussianNB()

In [49]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [50]:
Y_pred_nb = clf_nb.predict(X_test)

In [51]:
confusion_matrix(Y_test, Y_pred_nb)

array([[716,  90],
       [ 49,  50]], dtype=int64)

## KNN

In [52]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [53]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [54]:
Y_pred_knn = clf_knn.predict(X_test)

In [55]:
confusion_matrix(Y_test, Y_pred_knn)

array([[786,  20],
       [ 73,  26]], dtype=int64)

## Logistic Regression

In [56]:
clf_lr = LogisticRegression()

In [57]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [58]:
Y_pred_lr = clf_lr.predict(X_test)

In [59]:
confusion_matrix(Y_test, Y_pred_lr)

array([[779,  27],
       [ 57,  42]], dtype=int64)

## Linear SVC

In [60]:
clf_lsvc = SVC(kernel = 'linear')

In [61]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [62]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [63]:
confusion_matrix(Y_test, Y_pred_lsvc)

array([[792,  14],
       [ 82,  17]], dtype=int64)

## Kernel SVC

In [64]:
clf_ksvc = SVC(kernel = 'rbf')

In [65]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [66]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [67]:
confusion_matrix(Y_test, Y_pred_ksvc)

array([[782,  24],
       [ 70,  29]], dtype=int64)

## Accuracy of Various Models

In [68]:
model_accuracies['DT'] = accuracy_score(Y_test, Y_pred_dt)
model_accuracies['KNN'] = accuracy_score(Y_test, Y_pred_knn)
model_accuracies['KernelSVC'] = accuracy_score(Y_test, Y_pred_ksvc)
model_accuracies['LinearSVC'] = accuracy_score(Y_test, Y_pred_lsvc)
model_accuracies['LogReg'] = accuracy_score(Y_test, Y_pred_lr)
model_accuracies['NB'] = accuracy_score(Y_test, Y_pred_nb)
model_accuracies['RF'] = accuracy_score(Y_test, Y_pred_rf)
model_accuracies

{'DT': 0.88287292817679563,
 'KNN': 0.89723756906077345,
 'KernelSVC': 0.89613259668508283,
 'LinearSVC': 0.89392265193370168,
 'LogReg': 0.90718232044198899,
 'NB': 0.8464088397790055,
 'RF': 0.89723756906077345}