<a href="https://colab.research.google.com/github/anandaptralmira/classification/blob/master/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Classification Models**
```
1.  Age       : (numeric)
2.  Job       : (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3.  Marital   : (categorical: 'divorced','married','single','unknown')
4.  Education : (categorical: 'primary','secondary','tertiary','unknown')
5.  Default   : (categorical: 'no','yes','unknown')
6.  Housing   : (categorical: 'no','yes','unknown')
7.  Loan      : (categorical: 'no','yes','unknown')
8.  Balance   : Balance of the individual.
9.  Contact   : (categorical: 'cellular','telephone')
10. Month     : (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
11. Day       : (categorical: 'mon','tue','wed','thu','fri')
12. Duration  : Last contact duration, in seconds (numeric). 
13. Campaign  : (numeric, includes last contact)
14. Pdays     : (numeric; 999 means client was not previously contacted)
15. Previous  : (numeric)
16. Poutcome  : (categorical: 'failure','nonexistent','success')
17. y(deposit): (binary: 'yes','no')
```

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn import metrics

In [None]:
# load the bank dataset straight from the project's GitHub repository
df_dep = pd.read_csv('https://raw.githubusercontent.com/anandaptralmira/classification/master/data/bank.csv', sep=',')

# preview the first rows to get a feel for the data
# (only the last expression of a cell is rendered, so a bare `df_dep` before this line would show nothing)
df_dep.head()

In [None]:
# print a concise summary of the frame: column names, non-null counts and dtypes
df_dep.info()

In [None]:
# count the missing entries in every column (isna is the canonical name of isnull)
df_dep.isna().sum()

#### **Descriptive Statistics**

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numerical attributes
df_dep.describe()

In [None]:
# list every object-dtype (categorical) column together with its distinct values
for col_name, col_values in df_dep.select_dtypes(include='object').items():
  print(col_name)
  print(col_values.unique())

In [None]:
# drop the duration variable
# NOTE(review): presumably removed because the call duration is only known after the contact — confirm.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell idempotent:
# re-running it after the column is already gone no longer raises a KeyError.
df_dep = df_dep.drop(columns="duration", errors="ignore")

#### **Checking Class Balance**

In [None]:
# relative frequency of each target class among the non-missing `deposit` labels
df_dep['deposit'].value_counts().div(df_dep['deposit'].count())

no     0.52616
yes    0.47384
Name: deposit, dtype: float64

#### **Partitioning**

In [None]:
# preserve the class proportions with a single stratified shuffle split (70:30 train/test)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays
train_index, test_index = next(sss.split(df_dep.drop(columns="deposit"), df_dep.deposit))

# split() returns positional row numbers, so select with .iloc;
# label-based .loc only happens to work while the frame has a default RangeIndex
traindf = df_dep.iloc[train_index]
testdf = df_dep.iloc[test_index]

#### **Data Preprocessing**

In [None]:
# encode the target label as integers, then one-hot encode the categorical features;
# the target is replaced first so that get_dummies leaves the now-numeric column untouched
LE = LabelEncoder()
df_dep = pd.get_dummies(
    df_dep.assign(deposit=LE.fit_transform(df_dep['deposit'].values))
)

df_dep

In [None]:
# partition again, this time on the encoded frame, reusing the splitter `sss`
# (its fixed random_state makes the split reproducible)
train_index, test_index = next(sss.split(df_dep.drop(columns="deposit"), df_dep.deposit))

# split() returns positional row numbers, so select with .iloc;
# label-based .loc only happens to work while the frame has a default RangeIndex
traindf = df_dep.iloc[train_index]
testdf = df_dep.iloc[test_index]

In [None]:
# separate the predictors (x) from the target (y) in both partitions
xtrain, ytrain = traindf.drop(columns='deposit'), traindf['deposit']
xtest, ytest = testdf.drop(columns='deposit'), testdf['deposit']

# display the held-out target labels
ytest

#### **Classification Model Using Naive Bayes**

In [None]:
# Naive Bayes classifier: fit() returns the estimator itself,
# so construction and training are chained into one statement
gnb = GaussianNB().fit(xtrain, ytrain)

# class predictions for the test partition
y_pred_gnb = gnb.predict(xtest)

In [None]:
# confusion matrix for Naive Bayes (rows: actual class, columns: predicted class)
cm_gnb = metrics.confusion_matrix(y_true=ytest, y_pred=y_pred_gnb)
cm_gnb

array([[1460,  302],
       [ 704,  883]])

In [None]:
# score the Naive Bayes predictions against the true test labels
acc_gnb = metrics.accuracy_score(ytest, y_pred_gnb)
prec_gnb = metrics.precision_score(ytest, y_pred_gnb)
rec_gnb = metrics.recall_score(ytest, y_pred_gnb)
f1_gnb = metrics.f1_score(ytest, y_pred_gnb)
kappa_gnb = metrics.cohen_kappa_score(ytest, y_pred_gnb)

# report each score next to its label
for label, score in (
    ('Accuracy:', acc_gnb),
    ('Precision:', prec_gnb),
    ('Recall:', rec_gnb),
    ('F1 Score:', f1_gnb),
    ('Cohens Kappa Score:', kappa_gnb),
):
  print(label, score)

In [None]:
# plot settings: 10x10 figures drawn with the ggplot style sheet
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('ggplot')

# ROC curve, built from the predicted probability of the positive class (column 1)
y_pred_gnb_prob = gnb.predict_proba(xtest)[:, 1]
fprgnb, tprgnb, _ = metrics.roc_curve(ytest, y_pred_gnb_prob)
aucgnb = metrics.roc_auc_score(ytest, y_pred_gnb_prob)

plt.plot(fprgnb, tprgnb, label="Naive Bayes, auc=" + str(aucgnb))
plt.title('ROC Curve - Naive Bayes')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

#### **Classification Model Using KNN**

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and training are chained
# NOTE(review): the features are not standardized before fitting and KNN distances are
# scale-sensitive — consider make_pipeline(StandardScaler(), KNeighborsClassifier())
# and confirm how that affects the model comparison
knn_clf = KNeighborsClassifier().fit(xtrain, ytrain)

# class predictions for the test partition
y_pred_knn = knn_clf.predict(xtest)

In [None]:
# confusion matrix for KNN (rows: actual class, columns: predicted class)
cm_knn = metrics.confusion_matrix(y_true=ytest, y_pred=y_pred_knn)
cm_knn

In [None]:
# score the KNN predictions against the true test labels
acc_knn = metrics.accuracy_score(ytest, y_pred_knn)
prec_knn = metrics.precision_score(ytest, y_pred_knn)
rec_knn = metrics.recall_score(ytest, y_pred_knn)
f1_knn = metrics.f1_score(ytest, y_pred_knn)
kappa_knn = metrics.cohen_kappa_score(ytest, y_pred_knn)

# report each score next to its label
for label, score in (
    ('Accuracy:', acc_knn),
    ('Precision:', prec_knn),
    ('Recall:', rec_knn),
    ('F1 Score:', f1_knn),
    ('Cohens Kappa Score:', kappa_knn),
):
  print(label, score)

In [None]:
# plot settings: 10x10 figures drawn with the ggplot style sheet
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('ggplot')

# ROC curve, built from the predicted probability of the positive class (column 1)
y_pred_knn_prob = knn_clf.predict_proba(xtest)[:, 1]
fprknn, tprknn, _ = metrics.roc_curve(ytest, y_pred_knn_prob)
aucknn = metrics.roc_auc_score(ytest, y_pred_knn_prob)

plt.plot(fprknn, tprknn, label="KNN, auc=" + str(aucknn))
plt.title('ROC Curve - KNN')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

### **Comparison**

In [None]:
# side-by-side report of the test-set scores computed for both models
print('Naive-Bayes Accuracy:', acc_gnb)
print('Naive-Bayes Precision:', prec_gnb)
print('Naive-Bayes Recall:', rec_gnb)
print('Naive-Bayes F1 Score:', f1_gnb)
print("---------------------------")
# labels previously read 'KKN', a typo for the KNN model
print('KNN Accuracy:', acc_knn)
print('KNN Precision:', prec_knn)
print('KNN Recall:', rec_knn)
print('KNN F1 Score:', f1_knn)

In [None]:
# overlay both ROC curves on one set of axes so the models can be compared directly
plt.plot(fprgnb, tprgnb, label="Naive Bayes, auc=" + str(aucgnb))
plt.plot(fprknn, tprknn, label="KNN, auc=" + str(aucknn))

plt.title('ROC Curve Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)  # loc=4 places the legend in the lower-right corner
plt.show()

In [None]:
# create a DataFrame that contains the test features, the actual class and both models' predictions
# (assign works on a copy, so xtest itself is left unchanged)
df_compare = xtest.assign(
    ActualClass=ytest,
    NaiveBayes=y_pred_gnb,
    KNN=y_pred_knn,
)
df_compare.head()

## **CONCLUSION**

Based on the comparison of the evaluation metrics and ROC curves above, **Naive Bayes** is the better-performing of the two models.