# Comparing Statistical approach vs Sklearn

We will first implement logistic regression (a classification algorithm) by hand and compare it with scikit-learn's implementation, and then try K-Nearest Neighbors, Support Vector Machine, Naive Bayes, Decision Tree, and Random Forest classifiers.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation/convergence warnings, so comment it out while debugging.
warnings.filterwarnings('ignore')
from sklearn import set_config
# Ask scikit-learn to use its "diagram" display mode for estimators.
set_config(display="diagram")

# Read Data

In [None]:
# Load the semicolon-separated raw CSV into a DataFrame
df = pd.read_csv("marketing/data/bank_full_raw.csv", sep=';')

In [None]:
# Preview the first 10 rows of the raw data
df.head(n=10)

#### Map the yes/no columns of the CSV to 1 and 0 respectively

In [None]:
# Encode the yes/no columns as 1/0. `default`, `housing` and `loan` are
# overwritten in place, while `y` is kept and its encoding is stored in a new
# `Target` column.
yes_no_to_binary = {'yes': 1, 'no': 0}
for column_name in ('default', 'housing', 'loan'):
    df[column_name] = df[column_name].map(yes_no_to_binary)
df['Target'] = df['y'].map(yes_no_to_binary)

In [None]:
# Preview the first 5 rows after the yes/no encoding
df.head(n=5)

# Data Exploration

In [None]:
# Class balance of the encoded target column
df["Target"].value_counts()

In [None]:
# Distribution of the encoded `loan` flag
df["loan"].value_counts()

In [None]:
# Bar chart of how many rows fall into each Target class
sns.countplot(data=df, x="Target", palette="bwr")
plt.show()

In [None]:
# `Target` is the encoded `y` column, which the plots in this notebook label as
# deposit accepted (1) / rejected (0); it is not the `loan` column, so the
# messages describe the deposit outcome. The variable names are left unchanged
# so that any other cell reading them keeps working.
countNoLoan = len(df[df.Target == 0])
countHaveLoan = len(df[df.Target == 1])
print("Percentage of Customers who rejected the Deposit: {:.2f}%".format((countNoLoan / (len(df.Target))*100)))
print("Percentage of Customers who accepted the Deposit: {:.2f}%".format((countHaveLoan / (len(df.Target))*100)))

In [None]:
# Bar chart of how many rows fall into each value of the `loan` flag
sns.countplot(data=df, x="loan", palette="bwr")
plt.show()

In [None]:
# Share of customers without (0) and with (1) a loan, from the `loan` column.
# The percentages use the counts computed in this cell, not the Target counts
# from an earlier cell.
countNANLoan = len(df[df.loan == 0])
countOKLoan = len(df[df.loan == 1])
print("Percentage of Customer who don't have a Loan: {:.2f}%".format((countNANLoan / (len(df.loan))*100)))
print("Percentage of Customer who have a Loan: {:.2f}%".format((countOKLoan / (len(df.loan))*100)))

In [None]:
# Per-class mean of every numeric feature. The frame still contains string
# columns at this point (job, marital, ...), so restrict the aggregation to
# numeric columns explicitly; recent pandas versions raise otherwise.
df.groupby('Target').mean(numeric_only=True)

In [None]:
# Count of each Target class per age, drawn as grouped bars and saved to disk
age_by_target = pd.crosstab(df.age, df.Target)
age_by_target.plot.bar(figsize=(20, 6))
plt.title('Deposit Accepted/Rejected by Ages')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.savefig('Age_frequency.png')
plt.show()

In [None]:
# Crosstab rows are the housing flag (x axis) and columns are Target, so each
# bar colour -- and therefore each legend entry -- is a Target value, in column
# order 0 (rejected) then 1 (accepted).
# NOTE(review): the labels treat `housing` as house ownership; it may instead
# flag a housing loan -- confirm against the dataset description.
pd.crosstab(df.housing,df.Target).plot(kind="bar",color=['#1CA53B','#AA1111' ])
plt.title('Deposit accepted/Rejected based on House ownership status')
# `housing` was encoded to 0/1 earlier, so the tick labels are 0 and 1.
plt.xlabel('housing (0 = Have no house , 1 = Have House)')
plt.xticks(rotation=0)
plt.legend(["Rejected", "Accepted"])
plt.ylabel('Frequency')
plt.show()

In [None]:
# Age vs default flag, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
# The title names the two plotted variables (age and default), matching the
# wording of the other scatter plots in this notebook.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df.age[accepted_rows], y=df.default[accepted_rows], c="red")
plt.scatter(x=df.age[rejected_rows], y=df.default[rejected_rows])
plt.title('Accepted/Rejected - Age vs Default coorelation')
plt.legend(["Accepted", "Rejected"])
plt.xlabel("Age")
plt.ylabel("Default")
plt.show()

In [None]:
# Age vs account balance, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df.age[accepted_rows], y=df.balance[accepted_rows], c="red")
plt.scatter(x=df.age[rejected_rows], y=df.balance[rejected_rows])
plt.title('Accepted/Rejected - Age vs Balance coorelation')
plt.legend(["Accepted", "Rejected"])
plt.xlabel("Age")
# The y axis shows `balance`, so label it accordingly.
plt.ylabel("Balance")
plt.show()

In [None]:
# Age vs housing flag, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df["age"][accepted_rows], y=df["housing"][accepted_rows], c="red")
plt.scatter(x=df["age"][rejected_rows], y=df["housing"][rejected_rows])
plt.title('Accepted/Rejected - Age vs Housing coorelation')
plt.xlabel("Age")
plt.ylabel("Housing")
plt.legend(["Accepted", "Rejected"])
plt.show()

In [None]:
# Age vs loan flag, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df["age"][accepted_rows], y=df["loan"][accepted_rows], c="red")
plt.scatter(x=df["age"][rejected_rows], y=df["loan"][rejected_rows])
plt.title('Accepted/Rejected - Age vs Loan coorelation')
plt.xlabel("Age")
plt.ylabel("Loan")
plt.legend(["Accepted", "Rejected"])
plt.show()

In [None]:
# Age vs `day`, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df["age"][accepted_rows], y=df["day"][accepted_rows], c="red")
plt.scatter(x=df["age"][rejected_rows], y=df["day"][rejected_rows])
plt.title('Accepted/Rejected - Age vs Day coorelation')
plt.xlabel("Age")
plt.ylabel("Day")
plt.legend(["Accepted", "Rejected"])
plt.show()

In [None]:
# Age vs `campaign`, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df["age"][accepted_rows], y=df["campaign"][accepted_rows], c="red")
plt.scatter(x=df["age"][rejected_rows], y=df["campaign"][rejected_rows])
plt.title('Accepted/Rejected - Age vs Campaign coorelation')
plt.xlabel("Age")
plt.ylabel("Campaign")
plt.legend(["Accepted", "Rejected"])
plt.show()

In [None]:
# Age vs `pdays`, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df["age"][accepted_rows], y=df["pdays"][accepted_rows], c="red")
plt.scatter(x=df["age"][rejected_rows], y=df["pdays"][rejected_rows])
plt.title('Accepted/Rejected - Age vs pDays coorelation')
plt.xlabel("Age")
plt.ylabel("pDays")
plt.legend(["Accepted", "Rejected"])
plt.show()

In [None]:
# Age vs `previous`, coloured by outcome. Target == 1 is drawn first and
# Target == 0 second, so the legend entries follow the plotting order.
accepted_rows = df.Target == 1
rejected_rows = df.Target == 0
plt.scatter(x=df["age"][accepted_rows], y=df["previous"][accepted_rows], c="red")
plt.scatter(x=df["age"][rejected_rows], y=df["previous"][rejected_rows])
plt.title('Accepted/Rejected - Age vs Previous coorelation')
plt.xlabel("Age")
plt.ylabel("Previous")
plt.legend(["Accepted", "Rejected"])
plt.show()

In [None]:
# Drop the columns that are not fed to the models, plus the original `y`
# column (its 0/1 encoding lives in `Target`).
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(
    columns=['job', 'marital', 'education', 'contact', 'duration', 'month', 'poutcome', 'y'],
    errors="ignore",
)
df.head()

# Creating Model for Logistic Regression

In [None]:
# Labels as a NumPy array; features are every remaining column
y = df["Target"].values
x_data = df.drop(columns=["Target"])
x_data

# Normalize Data

In [None]:
# Min-max normalisation: rescale every feature column to the [0, 1] range.
# Use the DataFrame's own column-wise min()/max(). np.min/np.max on a DataFrame
# reduce over the whole frame to a single scalar on recent pandas (2.0+), which
# breaks both the per-column scaling and the `.values` access.
feature_min = x_data.min()
feature_range = x_data.max() - feature_min
# A constant column has zero range; divide by 1 instead so that it maps to 0
# rather than NaN.
feature_range = feature_range.replace(0, 1)
x = (x_data - feature_min) / feature_range

**We will split our data. 80% of our data will be train data and 20% of it will be test data.**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Transpose the splits so the hand-written logistic regression below can index
# samples along axis 1 (it divides by x_train.shape[1]).
# NOTE: this cell is not idempotent -- running it twice transposes back.
x_train, x_test = x_train.T, x_test.T
# presumably the label arrays are 1-D, in which case .T leaves them unchanged -- confirm
y_train, y_test = y_train.T, y_test.T

**Let's say weight = 0.01 and bias = 0.0**

In [None]:
# Initial parameters
def initialize(dimension):
    """Return the starting parameters for logistic regression.

    Parameters
    ----------
    dimension : int
        Number of weights to create (one per feature).

    Returns
    -------
    tuple
        ``(weight, bias)`` where ``weight`` is a ``(dimension, 1)`` array
        filled with 0.01 and ``bias`` is ``0.0``.
    """
    initial_weight = np.full(shape=(dimension, 1), fill_value=0.01)
    initial_bias = 0.0
    return initial_weight, initial_bias

In [None]:
def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` element-wise.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Same shape as ``z``
        Values strictly between 0 and 1 (up to floating-point rounding).
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [None]:
def forwardBackward(weight,bias,x_train,y_train):
    """Run one forward and backward pass of logistic regression.

    Parameters
    ----------
    weight : ndarray
        Weight column vector, one row per feature (``initialize`` produces
        shape ``(n_features, 1)``).
    bias : float
        Scalar bias added to every sample's linear score.
    x_train : array-like
        Features with samples along axis 1; the cost and the gradients are
        averaged over ``x_train.shape[1]``.
    y_train : array-like
        0/1 labels, one per sample.

    Returns
    -------
    tuple
        ``(cost, gradients)`` where ``cost`` is the mean cross-entropy loss and
        ``gradients`` is a dict with keys ``"Derivative Weight"`` and
        ``"Derivative Bias"``.
    """
    sample_count = x_train.shape[1]

    # Forward
    y_head = sigmoid(np.dot(weight.T,x_train) + bias)
    # Clip the probabilities for the loss only: when the sigmoid saturates to
    # exactly 0 or 1, log(0) would make the cost inf or NaN (0 * -inf).
    eps = 1e-15
    y_safe = np.clip(y_head, eps, 1 - eps)
    loss = -(y_train*np.log(y_safe) + (1-y_train)*np.log(1-y_safe))
    cost = np.sum(loss) / sample_count

    # Backward (uses the unclipped predictions, so the gradients are unchanged)
    derivative_weight = np.dot(x_train,((y_head-y_train).T))/sample_count
    derivative_bias = np.sum(y_head-y_train)/sample_count
    gradients = {"Derivative Weight" : derivative_weight, "Derivative Bias" : derivative_bias}

    return cost,gradients

In [None]:
def update(weight,bias,x_train,y_train,learningRate,iteration) :
    """Fit the weight and bias with plain gradient descent and plot the cost.

    Parameters
    ----------
    weight, bias :
        Starting parameters (see ``initialize``).
    x_train, y_train :
        Training features and labels, passed straight to ``forwardBackward``.
    learningRate : float
        Step size applied to each gradient.
    iteration : int
        Number of gradient-descent steps to run.

    Returns
    -------
    tuple
        ``(parameters, gradients)``: ``parameters`` is a dict with keys
        ``"weight"`` and ``"bias"`` holding the fitted values; ``gradients`` is
        the gradient dict from the last step (empty if ``iteration`` is 0).
    """
    costList = []
    # Defined up front so that iteration == 0 does not raise NameError below.
    cost = None
    gradients = {}

    # for each iteration, update weight and bias values
    for _ in range(iteration):
        cost,gradients = forwardBackward(weight,bias,x_train,y_train)
        weight = weight - learningRate * gradients["Derivative Weight"]
        bias = bias - learningRate * gradients["Derivative Bias"]
        costList.append(cost)

    parameters = {"weight": weight,"bias": bias}

    # The reported cost is the one computed at the start of the final step,
    # i.e. before the last parameter update was applied.
    print("iteration:",iteration)
    print("cost:",cost)

    # x axis is simply the step number 0 .. iteration-1
    plt.plot(range(len(costList)),costList)
    plt.xlabel("Number of Iteration")
    plt.ylabel("Cost")
    plt.show()

    return parameters, gradients

In [None]:
def predict(weight,bias,x_test):
    """Predict 0/1 labels with a fitted logistic-regression model.

    Parameters
    ----------
    weight : ndarray
        Fitted weight column vector (one row per feature).
    bias : float
        Fitted bias.
    x_test : array-like
        Features with samples along axis 1.

    Returns
    -------
    ndarray
        Float array of shape ``(1, n_samples)``: 0.0 where the predicted
        probability is <= 0.5, otherwise 1.0.
    """
    z = np.dot(weight.T,x_test) + bias
    y_head = sigmoid(z)

    # Vectorised threshold instead of a Python loop over every sample.
    # Only the first row of y_head is used, giving a (1, n_samples) result.
    return np.where(y_head[:1] <= 0.5, 0.0, 1.0)

In [None]:
def logistic_regression(x_train,y_train,x_test,y_test,learningRate,iteration):
    """Train the hand-written logistic regression and print its test accuracy.

    Parameters
    ----------
    x_train, x_test :
        Feature matrices with features along axis 0 (the number of weights is
        taken from ``x_train.shape[0]``).
    y_train, y_test :
        0/1 labels for the corresponding feature matrices.
    learningRate : float
        Gradient-descent step size.
    iteration : int
        Number of gradient-descent steps.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    initial_weight, initial_bias = initialize(x_train.shape[0])

    parameters, _ = update(initial_weight, initial_bias, x_train, y_train, learningRate, iteration)

    y_prediction = predict(parameters["weight"], parameters["bias"], x_test)

    # With 0/1 labels the mean absolute difference is the misclassification rate
    mean_abs_error = np.mean(np.abs(y_prediction - y_test))
    print("Manuel Test Accuracy: {:.2f}%".format((100 - mean_abs_error*100)))

In [None]:
# Train with learning rate 1 for 100 gradient-descent steps
logistic_regression(x_train, y_train, x_test, y_test, learningRate=1, iteration=100)

Manual test accuracy is **88.25%**

**Let's find out sklearn's score.**

# Sklearn Logistic Regression

In [None]:
# Collects the test accuracy (in percent) of every model, keyed by model name
accuracies = {}

# The splits were transposed for the manual implementation; scikit-learn gets
# them transposed back via .T.
lr = LogisticRegression().fit(x_train.T, y_train.T)
acc = 100 * lr.score(x_test.T, y_test.T)

accuracies.update({'Logistic Regression': acc})
print("Test Accuracy {:.2f}%".format(acc))

Test Accuracy is 88.81% 

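The accuracy is slightly higher than that of our manual implementation, most likely because scikit-learn's `LogisticRegression` uses a more sophisticated solver (and applies regularization by default) instead of plain gradient descent with a fixed number of steps.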

# K-Nearest Neighbour Classification

In [None]:
# KNN model with k = 2 neighbours
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train.T, y_train.T)  # n_neighbors is k
prediction = knn.predict(x_test.T)

knn_score_percent = knn.score(x_test.T, y_test.T) * 100
print("{} KNN Score: {:.2f}%".format(2, knn_score_percent))

**Try to find best k value to improve our accuracy**

In [None]:
# Try to find the best k value by scoring k = 1..19.
# NOTE: k is chosen by its score on the test split, so the reported maximum is
# an optimistic estimate.
k_values = range(1, 20)
scoreList = []
for i in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=i).fit(x_train.T, y_train.T)  # n_neighbors is k
    scoreList.append(knn2.score(x_test.T, y_test.T))

plt.plot(k_values, scoreList)
plt.xticks(np.arange(1, 20, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

# Keep the best score (as a percentage) for the model comparison
acc = 100 * max(scoreList)
accuracies.update({'KNN': acc})
print("Maximum KNN Score is {:.2f}%".format(acc))

**Note that the accuracy now increases to 89.13%.**

# Support Vector Machine Algorithm

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings and a fixed seed
svm = SVC(random_state=1).fit(x_train.T, y_train.T)

acc = 100 * svm.score(x_test.T, y_test.T)
accuracies.update({'SVM': acc})
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

# Naive Bayes Algorithm

In [None]:
# Gaussian Naive Bayes with default settings
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB().fit(x_train.T, y_train.T)

acc = 100 * nb.score(x_test.T, y_test.T)
accuracies.update({'Naive Bayes': acc})
print("Accuracy of Naive Bayes: {:.2f}%".format(acc))

# Decision Tree Classification

In [None]:
# Decision tree with default settings and a fixed seed
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0).fit(x_train.T, y_train.T)

acc = 100 * dt.score(x_test.T, y_test.T)
accuracies.update({'Decision Tree': acc})
print("Accuracy of Decision Tree: {:.2f}%".format(acc))

# Random Forest Classification

In [None]:
# Random forest of 1000 trees with a fixed seed
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=1).fit(x_train.T, y_train.T)

acc = 100 * rf.score(x_test.T, y_test.T)
accuracies.update({'Random Forest': acc})
print("Random Forest Algorithm Accuracy Score : {:.2f}%".format(acc))

# Comparing Models

In [None]:
# One bar per model, using the accuracies collected in the cells above
colors = ["aqua", "tan", "teal", "olive", "wheat", "salmon"]
model_names = list(accuracies)
model_scores = list(accuracies.values())

sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
plt.yticks(np.arange(0, 100, 10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=model_names, y=model_scores, palette=colors)
plt.show()

# Confusion Matrix

In [None]:
# Test-set predictions of every fitted model, for the confusion matrices.
# KNN is refitted here with k = 3.
knn3 = KNeighborsClassifier(n_neighbors=3).fit(x_train.T, y_train.T)

y_head_lr = lr.predict(x_test.T)
y_head_knn = knn3.predict(x_test.T)
y_head_svm = svm.predict(x_test.T)
y_head_nb = nb.predict(x_test.T)
y_head_dt = dt.predict(x_test.T)
y_head_rf = rf.predict(x_test.T)

In [None]:
from sklearn.metrics import confusion_matrix

# One confusion matrix per model: true test labels against that model's predictions
cm_lr, cm_knn, cm_svm, cm_nb, cm_dt, cm_rf = (
    confusion_matrix(y_test, model_predictions)
    for model_predictions in (
        y_head_lr,
        y_head_knn,
        y_head_svm,
        y_head_nb,
        y_head_dt,
        y_head_rf,
    )
)


In [None]:
# Draw the six confusion matrices on a 2 x 3 grid. A single loop replaces six
# copy-pasted stanzas and gives every panel the same title size.
confusion_matrices = [
    ("Logistic Regression", cm_lr),
    ("K Nearest Neighbors", cm_knn),
    ("Support Vector Machine", cm_svm),
    ("Naive Bayes", cm_nb),
    ("Decision Tree", cm_dt),
    ("Random Forest", cm_rf),
]

plt.figure(figsize=(24,12))

plt.suptitle("Confusion Matrices",fontsize=24)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

for position, (model_name, matrix) in enumerate(confusion_matrices, start=1):
    plt.subplot(2,3,position)
    plt.title("{} Confusion Matrix\n".format(model_name), fontsize=16)
    sns.heatmap(matrix,annot=True,cmap="coolwarm",fmt="d",cbar=False, annot_kws={"size": 24})
    # scikit-learn's confusion_matrix puts true labels on rows and predicted
    # labels on columns.
    plt.xlabel("Predicted label")
    plt.ylabel("True label")

plt.show()