# 1. Download new_train.csv (don't worry about new_test.csv) from the following website.
--------
https://www.kaggle.com/code/rashmiranu/banking-dataset-eda-and-binary-classification/data


In [1]:
import math
import json
import concurrent.futures as cf # doesn't work with sklearn
import pandas as pd
import numpy as np
import copy as copy
import statistics as stt
import seaborn as sns
from os import system, getcwd, startfile
from os.path import join
from time import time
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter

In [2]:
src_path = join(getcwd().rstrip('src'), 'data/banking_dataset_classification_train.csv').replace('\\', '/')

In [3]:
# src: https://stackoverflow.com/questions/31328861/python-pandas-replacing-header-with-top-row
data = pd.read_csv(src_path, header=None, low_memory=False)
header = data.iloc[0]
data = data[1:]
data.columns = header
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
1,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,999,0,nonexistent,no
2,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,999,1,failure,no
3,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,999,0,nonexistent,yes
4,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,999,0,nonexistent,no
5,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,999,0,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32946,28,services,single,high.school,no,yes,no,cellular,jul,tue,192,1,999,0,nonexistent,no
32947,52,technician,married,professional.course,no,yes,no,cellular,nov,fri,64,1,999,1,failure,no
32948,54,admin.,married,basic.9y,no,no,yes,cellular,jul,mon,131,4,999,0,nonexistent,no
32949,29,admin.,married,university.degree,no,no,no,telephone,may,fri,165,1,999,0,nonexistent,no


# 2. There are some numerical and object columns.  Find out the columns which are highest correlated to 'y' columns for both numerical and object columns.

In [4]:
# convert numerical object to int64
data['age'] = data['age'].astype('int64')
data['duration'] = data['duration'].astype('int64')
data['campaign'] = data['campaign'].astype('int64')
data['pdays'] = data['pdays'].astype('int64')
data['previous'] = data['previous'].astype('int64')

In [5]:
# check datatype
result = data.dtypes
print(result)

0
age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y              object
dtype: object


In [6]:
# change datatype
# src: https://stackoverflow.com/questions/51241575/calculate-correlation-between-columns-of-strings
# src: https://stackoverflow.com/questions/51102205/how-to-know-the-labels-assigned-by-astypecategory-cat-codes
columns = data.columns
column_dict = []
for x in columns:
    c = data[x].astype('category')
    d = dict(enumerate(c.cat.categories))
    column_dict.append(d)
    data[x] = data[x].astype('category').cat.codes
data_corr = data.corr()
data_corr.sort_values('y', ascending=False).head(1)

Unnamed: 0_level_0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,y
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
y,0.02862,0.026278,0.050084,0.059263,-0.099142,0.009821,-0.000452,-0.143238,-0.007508,0.011926,0.417566,-0.06571,-0.319886,0.229759,0.127784,1.0


### Numerical Columns
| column | y corr value | highest |
| :-: | - ||
| age | 0.028620 ||
| duration | 0.079003 ||
| campaign | -0.051431 ||
| pdays | -0.238267 ||
| previous | 0.229759 | $\star$ |

### Object Columns
| column | y corr value | highest |
| :-: | - ||
| job | 0.026276 ||
| marital | 0.050084 ||
| education |0.059263||
| default |-0.099142||
| housing |0.009821||
| loan |-0.000452||
| contact |-0.143238||
| month |-0.007508||
| day_of_week |0.011926||
| poutcome |0.127784|$\star$|

# 3. Convert object columns, esp. the one that is highly correlated to 'y' column, to numerical so that it can be included in ML.
<!-- All columns will be converted. I'll use the correlation numbers in q4 to improve recall rate. -->
All data had been converted to numbers back in step 2.
The followings are the convertion labels.

In [7]:
print('job:\t' + str(json.dumps(column_dict[1], indent=4)))
print('marital:\t' + str(json.dumps(column_dict[2], indent=4)))
print('education:\t' + str(json.dumps(column_dict[3], indent=4)))
print('default:\t' + str(json.dumps(column_dict[4], indent=4)))
print('housing:\t' + str(json.dumps(column_dict[5], indent=4)))
print('loan:\t' + str(json.dumps(column_dict[6], indent=4)))
print('contact:\t' + str(json.dumps(column_dict[7], indent=4)))
print('month:\t' + str(json.dumps(column_dict[8], indent=4)))
print('day_of_week:\t' + str(json.dumps(column_dict[9], indent=4)))
print('poutcome:\t' + str(json.dumps(column_dict[14], indent=4)))
print('y:\t' + str(json.dumps(column_dict[15], indent=4)))

job:	{
    "0": "admin.",
    "1": "blue-collar",
    "2": "entrepreneur",
    "3": "housemaid",
    "4": "management",
    "5": "retired",
    "6": "self-employed",
    "7": "services",
    "8": "student",
    "9": "technician",
    "10": "unemployed",
    "11": "unknown"
}
marital:	{
    "0": "divorced",
    "1": "married",
    "2": "single",
    "3": "unknown"
}
education:	{
    "0": "basic.4y",
    "1": "basic.6y",
    "2": "basic.9y",
    "3": "high.school",
    "4": "illiterate",
    "5": "professional.course",
    "6": "university.degree",
    "7": "unknown"
}
default:	{
    "0": "no",
    "1": "unknown",
    "2": "yes"
}
housing:	{
    "0": "no",
    "1": "unknown",
    "2": "yes"
}
loan:	{
    "0": "no",
    "1": "unknown",
    "2": "yes"
}
contact:	{
    "0": "cellular",
    "1": "telephone"
}
month:	{
    "0": "apr",
    "1": "aug",
    "2": "dec",
    "3": "jul",
    "4": "jun",
    "5": "mar",
    "6": "may",
    "7": "nov",
    "8": "oct",
    "9": "sep"
}
day_of_week:	{


# 4. Explore the option of class_weight and anything you can do to improve recall rate for SVM and logistic regression to some reasonable value (try to keep precision to be better than 30%).
Approach: [include cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

In [8]:
# split X, y
y = data.y
X = data.drop(columns='y')

In [9]:
# scale X
s = StandardScaler()
X = s.fit_transform(X)

In [10]:
# split data set
# train : cross_validation : test = 6 : 2 : 2
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=2018)
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=2018)
train_len = len(X_train)
cv_len = len(X_cv)
test_len = len(X_test)
total_len = len(data)
print('Percentage of Train set : CV set : Test set = {0} : {1} : {2}'.format(train_len/total_len*100, cv_len/total_len*100, test_len/total_len*100))
splitted_data = [X_train, y_train, X_cv, y_cv, X_test, y_test]

Percentage of Train set : CV set : Test set = 60.0 : 20.0 : 20.0


In [11]:
def evaluation(y_data, y_pred):
    accuracy = accuracy_score(y_data, y_pred)
    precision = precision_score(y_data, y_pred)
    recall = recall_score(y_data, y_pred)
    f1 = f1_score(y_data, y_pred)
    return [accuracy, precision, recall, f1]

def deploy_model(data, result=[], cv=True, default=False):
    data[0] = X_train
    data[1] = y_train
    data[2] = X_cv
    data[3] = y_cv
    data[4] = X_test
    data[5] = y_test

    # Logistic Regression
    if default:
        model = LogisticRegression()
    else:
        model = LogisticRegression(class_weight='balanced')
    model.fit(X_train, y_train)
    if cv:
        y_pred = model.predict(X_cv)
        eva_result = evaluation(y_cv, y_pred)
    else:
        y_pred = model.predict(X_test)
        eva_result = evaluation(y_test, y_pred)
    result.append(['LR'] + eva_result)

    # Decision Tree
    if default:
        model = DecisionTreeClassifier()
    else:
        model = DecisionTreeClassifier(class_weight='balanced')
    model.fit(X_train, y_train)
    if cv:
        y_pred = model.predict(X_cv)
        eva_result = evaluation(y_cv, y_pred)
    else:
        y_pred = model.predict(X_test)
        eva_result = evaluation(y_test, y_pred)
    result.append(['DT'] + eva_result)

    # Random Forest
    if default:
        model = RandomForestClassifier()
    else:
        model = RandomForestClassifier(class_weight='balanced')
    model.fit(X_train, y_train)
    if cv:
        y_pred = model.predict(X_cv)
        eva_result = evaluation(y_cv, y_pred)
    else:
        y_pred = model.predict(X_test)
        eva_result = evaluation(y_test, y_pred)
    result.append(['RF'] + eva_result)

    # SVM
    if default:
        model = SVC()
    else:
        model = SVC(class_weight='balanced')
    model.fit(X_train, y_train)
    if cv:
        y_pred = model.predict(X_cv)
        eva_result = evaluation(y_cv, y_pred)
    else:
        y_pred = model.predict(X_test)
        eva_result = evaluation(y_test, y_pred)
    result.append(['SVM'] + eva_result)

    # KNN
    if default:
        model = KNeighborsClassifier()
    else:
        model = KNeighborsClassifier(weights='distance')
    model.fit(X_train, y_train)
    if cv:
        y_pred = model.predict(X_cv)
        eva_result = evaluation(y_cv, y_pred)
    else:
        y_pred = model.predict(X_test)
        eva_result = evaluation(y_test, y_pred)
    result.append(['KNN'] + eva_result)

    return result

def result_evaluation(result):
    result = pd.DataFrame(result, columns=['model', 'accuracy', 'precision', 'recall', 'f1'])
    df_LR = result[result['model'] == 'LR']
    df_DT = result[result['model'] == 'DT']
    df_RF = result[result['model'] == 'RF']
    df_SVM = result[result['model'] == 'SVM']
    df_KNN = result[result['model'] == 'KNN']
    df_LR_dc = df_LR.describe()
    df_DT_dc = df_DT.describe()
    df_RF_dc = df_RF.describe()
    df_SVM_dc = df_SVM.describe()
    df_KNN_dc = df_KNN.describe()
    print('LR------------------------------------------------')
    print(df_LR_dc)
    print('DT------------------------------------------------')
    print(df_DT_dc)
    print('RF------------------------------------------------')
    print(df_RF_dc)
    print('SVM-----------------------------------------------')
    print(df_SVM_dc)
    print('KNN-----------------------------------------------')
    print(df_KNN_dc)

In [17]:
# cross validation 1
result = []
# with cf.ThreadPoolExecutor() as executor:
#     results = [executor.submit(deploy_model, splitted_data, result, True, False) for _ in range(5)]
#     result.append(results)
for i in range(5):
    result = deploy_model(splitted_data, result, True, False)
result_evaluation(result)

LR------------------------------------------------
       accuracy  precision    recall        f1
count  5.000000   5.000000  5.000000  5.000000
mean   0.832625   0.373064  0.763085  0.501131
std    0.000000   0.000000  0.000000  0.000000
min    0.832625   0.373064  0.763085  0.501131
25%    0.832625   0.373064  0.763085  0.501131
50%    0.832625   0.373064  0.763085  0.501131
75%    0.832625   0.373064  0.763085  0.501131
max    0.832625   0.373064  0.763085  0.501131
DT------------------------------------------------
       accuracy  precision    recall        f1
count  5.000000   5.000000  5.000000  5.000000
mean   0.878543   0.448219  0.443526  0.445857
std    0.001087   0.004989  0.005595  0.005115
min    0.877086   0.441341  0.435262  0.438280
25%    0.878300   0.446927  0.440771  0.443828
50%    0.878452   0.447989  0.444904  0.446441
75%    0.878756   0.449655  0.447658  0.449345
max    0.880121   0.455182  0.449036  0.451389
RF------------------------------------------------
 

In [13]:
# cross validation 2
result = []
for i in range(5):
    result = deploy_model(splitted_data, result, True, True)
result_evaluation(result)

LR------------------------------------------------
           accuracy  precision    recall            f1
count  5.000000e+00   5.000000  5.000000  5.000000e+00
mean   9.080425e-01   0.650754  0.356749  4.608541e-01
std    1.241267e-16   0.000000  0.000000  6.206335e-17
min    9.080425e-01   0.650754  0.356749  4.608541e-01
25%    9.080425e-01   0.650754  0.356749  4.608541e-01
50%    9.080425e-01   0.650754  0.356749  4.608541e-01
75%    9.080425e-01   0.650754  0.356749  4.608541e-01
max    9.080425e-01   0.650754  0.356749  4.608541e-01
DT------------------------------------------------
       accuracy  precision    recall        f1
count  5.000000   5.000000  5.000000  5.000000
mean   0.871745   0.423765  0.456198  0.439379
std    0.001575   0.006371  0.005956  0.005986
min    0.870106   0.417513  0.447658  0.432756
25%    0.870713   0.418814  0.453168  0.434610
50%    0.871624   0.423469  0.457300  0.439735
75%    0.872079   0.425478  0.460055  0.442091
max    0.874203   0.433548 

In [14]:
# test (settings from cross validation 1)
final_result = []
for i in range(5):
    final_result = deploy_model(splitted_data, final_result, False)
result_evaluation(final_result)

LR------------------------------------------------
       accuracy  precision    recall        f1
count  5.000000    5.00000  5.000000  5.000000
mean   0.836722    0.39165  0.789052  0.523472
std    0.000000    0.00000  0.000000  0.000000
min    0.836722    0.39165  0.789052  0.523472
25%    0.836722    0.39165  0.789052  0.523472
50%    0.836722    0.39165  0.789052  0.523472
75%    0.836722    0.39165  0.789052  0.523472
max    0.836722    0.39165  0.789052  0.523472
DT------------------------------------------------
       accuracy  precision    recall        f1
count  5.000000   5.000000  5.000000  5.000000
mean   0.879120   0.466292  0.439786  0.452648
std    0.000828   0.004040  0.006162  0.005136
min    0.878149   0.461429  0.431242  0.445825
25%    0.878300   0.462411  0.435247  0.448418
50%    0.879514   0.468354  0.443258  0.456044
75%    0.879818   0.469590  0.444593  0.456164
max    0.879818   0.469676  0.444593  0.456790
RF------------------------------------------------
 