# Importing libraries

In [1]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the UniversalBank records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='UniversalBank.csv')

In [3]:
# Remove the ID and ZIP Code columns, which this analysis treats as
# unimportant for the model. The result is reassigned instead of using
# inplace=True, so the cell reads as "new frame from old frame".
dataset = dataset.drop(columns=['ID', 'ZIP Code'])

In [4]:
# Preview the first rows to confirm the ID and ZIP Code columns are gone.
dataset.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [5]:
# X : input columns (every column except the target), as a NumPy array
# y : output column ('Personal Loan'), as a NumPy array
# Columns are selected by name rather than by position so the feature/label
# split stays correct even if the column order of the CSV changes.
TARGET = 'Personal Loan'
X = dataset.drop(columns=[TARGET]).values
y = dataset[TARGET].values

In [6]:
# Inspect the label vector taken from the 'Personal Loan' column.
y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# splitting the dataset

In [7]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore every score below, reproducible.
# NOTE(review): the test split is imbalanced (1808 negatives vs 192 positives
# in the confusion matrices below) - consider stratify=y if the class ratio
# should be preserved exactly in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=0
)

# Standardization of a dataset 

In [8]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to the test rows, so the test set
# does not influence the scaling parameters.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
df = pd.DataFrame()  # empty frame that later cells fill with the true labels and each model's predictions

# Logistic regression

In [10]:
# Fit a logistic-regression model on the standardised training data.
# The fit call is left as the last expression so the cell shows the estimator.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [11]:
# Class labels predicted by the logistic-regression model for the test rows.
y_pred_logis = classifier.predict(X_test)

In [12]:
# Confusion matrix for logistic regression: rows are the actual classes and
# columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred_logis)

In [13]:
cm  # logistic regression: rows = actual class, columns = predicted class

array([[1781,   27],
       [  75,  117]], dtype=int64)

In [14]:
# Accuracy of logistic regression: fraction of test rows predicted correctly.
accuracy_score(y_test, y_pred_logis)

0.949

In [15]:
# Record the true labels and the logistic-regression predictions side by side.
df = df.assign(y_true_logis=y_test, y_pred_logis=y_pred_logis)

# KNN

In [16]:
# k-nearest neighbours: classify each row from its 3 nearest training rows,
# measured with the Minkowski distance of order p=3.
# Note: this rebinds `classifier`, so the logistic-regression model fitted
# earlier is no longer reachable under that name.
classifier = KNeighborsClassifier(n_neighbors=3, metric='minkowski', p=3)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3, p=3)

In [17]:
# Class labels predicted by the KNN model for the test rows.
y_pred_knn = classifier.predict(X_test)

In [18]:
# Confusion matrix for KNN: rows are the actual classes and columns are the
# predicted classes. confusion_matrix is already imported in the notebook's
# first cell, so it is not re-imported here.
cm = confusion_matrix(y_test, y_pred_knn)

In [19]:
cm  # KNN confusion matrix: rows = actual class, columns = predicted class

array([[1794,   14],
       [  63,  129]], dtype=int64)

In [20]:
accuracy_score(y_test, y_pred_knn)  # KNN accuracy: fraction of test rows predicted correctly

0.9615

In [21]:
# Record the true labels and the KNN predictions side by side.
df = df.assign(y_true_knn=y_test, y_pred_knn=y_pred_knn)

# Classification Tree

In [22]:
# Fit a decision tree on the training set, using entropy as the split
# criterion; random_state fixes the estimator's internal randomness so the
# fitted tree is reproducible.
# Note: this rebinds `classifier`, replacing the KNN model under that name.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [23]:
# Class labels predicted by the decision tree for the test rows.
y_pred_tree = classifier.predict(X_test)

In [24]:
# Confusion matrix for the decision tree: rows are the actual classes and
# columns are the predicted classes. confusion_matrix and accuracy_score are
# already imported in the notebook's first cell, so they are not re-imported.
cm = confusion_matrix(y_test, y_pred_tree)

In [25]:
cm  # decision-tree confusion matrix: rows = actual class, columns = predicted class

array([[1799,    9],
       [  24,  168]], dtype=int64)

In [26]:
accuracy_score(y_test, y_pred_tree)  # decision-tree accuracy: fraction of test rows predicted correctly

0.9835

In [27]:
# Record the true labels and the decision-tree predictions side by side.
df = df.assign(y_true_tree=y_test, y_pred_tree=y_pred_tree)

# top 10 rows

In [28]:
# True label next to each model's prediction for the first ten test rows.
df.head(10)

Unnamed: 0,y_true_logis,y_pred_logis,y_true_knn,y_pred_knn,y_true_tree,y_pred_tree
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,1
7,1,1,1,1,1,1
8,0,0,0,0,0,0
9,0,0,0,0,0,0


# adding 2 new columns

In [29]:
# Ensemble by majority vote: 0 when fewer than two of the three models
# predict 1, otherwise 1.
pred_cols = ['y_pred_logis', 'y_pred_knn', 'y_pred_tree']
df['majority vote'] = np.where(df[pred_cols].sum(axis=1) < 2, 0, 1)

In [30]:
# Mean of the three models' predicted labels for each row.
# NOTE(review): despite the column name, the inputs are the hard labels
# returned by .predict(), so this is the share of models voting 1, not an
# average of predicted class probabilities.
model_votes = df[['y_pred_logis', 'y_pred_knn', 'y_pred_tree']]
df['Average Probability'] = model_votes.sum(axis=1) / 3

In [31]:
# Same ten-row preview, now including the 'majority vote' and
# 'Average Probability' columns.
df.head(10)

Unnamed: 0,y_true_logis,y_pred_logis,y_true_knn,y_pred_knn,y_true_tree,y_pred_tree,majority vote,Average Probability
0,0,0,0,0,0,0,0,0.0
1,0,0,0,0,0,0,0,0.0
2,0,0,0,0,0,0,0,0.0
3,0,0,0,0,0,0,0,0.0
4,0,0,0,0,0,0,0,0.0
5,0,0,0,0,0,0,0,0.0
6,1,1,1,0,1,1,1,0.666667
7,1,1,1,1,1,1,1,1.0
8,0,0,0,0,0,0,0,0.0
9,0,0,0,0,0,0,0,0.0


# comparing the accuracy

In [32]:
# Logistic regression: test-set accuracy.
accuracy_score(y_test, y_pred_logis)

0.949

In [33]:
# KNN: test-set accuracy.
accuracy_score(y_test, y_pred_knn)

0.9615

In [34]:
# Decision tree: test-set accuracy.
accuracy_score(y_test, y_pred_tree)

0.9835

In [35]:
# Hence, on this test split, the decision tree is the most accurate of the three models (0.9835, versus 0.9615 for KNN and 0.949 for logistic regression).