# 1. Download new_train.csv (don't worry about new_test.csv) from the following website.
--------
https://www.kaggle.com/code/rashmiranu/banking-dataset-eda-and-binary-classification/data


In [1]:
import math
import json
import concurrent.futures as cf # doesn't work with sklearn
import pandas as pd
import numpy as np
import copy as copy
import statistics as stt
import seaborn as sns
from os import system, getcwd, startfile
from os.path import join
from time import time
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter

In [None]:
def project_root(cwd):
    """Return *cwd* without a trailing ``src`` directory name.

    The suffix is removed only when the last path component is exactly
    ``src``; the separator in front of it is kept. ``str.rstrip('src')`` is
    deliberately not used here: it strips any trailing run of the characters
    's', 'r' and 'c', so a directory such as ``.../docs`` would be turned
    into ``.../do``.
    """
    for sep in ('/', '\\'):
        if cwd.endswith(sep + 'src'):
            return cwd[:-len('src')]
    return cwd


# Location of the training CSV, always written with forward slashes.
src_path = join(project_root(getcwd()), 'data/banking_dataset_classification_train.csv').replace('\\', '/')

In [None]:
# Load the training CSV. Its first row holds the column names, so let
# read_csv consume it as the header directly instead of reading it as a data
# row and promoting it afterwards. That way numeric columns are parsed as
# numbers rather than strings, the row index starts at 0, and `data` is a
# standalone frame (not a slice of another one) that is safe to assign into.
data = pd.read_csv(src_path, low_memory=False)
data

# 2. The dataset has both numerical and object columns. For each of the two groups, find the column that is most highly correlated with the 'y' column.

In [None]:
# Cast the numeric columns to int64 so they are handled as numbers
# rather than as generic objects.
for int_column in ('age', 'duration', 'campaign', 'pdays', 'previous'):
    data[int_column] = data[int_column].astype('int64')

In [None]:
# Check the datatypes: print the dtype of every column of `data`
# (e.g. to confirm the int64 casts applied to the numeric columns).
result = data.dtypes
print(result)

In [None]:
# Encode every column (numeric ones included) as integer category codes and
# remember, per column, which code stands for which original value.
# src: https://stackoverflow.com/questions/51241575/calculate-correlation-between-columns-of-strings
# src: https://stackoverflow.com/questions/51102205/how-to-know-the-labels-assigned-by-astypecategory-cat-codes
columns = data.columns
column_dict = []  # one {code: original value} dict per column, in column order
for column_name in columns:
    as_category = data[column_name].astype('category')
    column_dict.append(dict(enumerate(as_category.cat.categories)))
    data[column_name] = as_category.cat.codes

# Pairwise correlation of the encoded columns.
data_corr = data.corr()
# Show the row that ranks first by its correlation with 'y'.
# NOTE(review): presumably this is the 'y' row itself (self-correlation 1.0),
# i.e. the row listing every column's correlation with 'y' - confirm.
data_corr.sort_values('y', ascending=False).head(1)

### Numerical Columns
| column | y corr value | highest |
| :-: | :-: | :-: |
| age | 0.028620 ||
| duration | 0.079003 ||
| campaign | -0.051431 ||
| pdays | -0.238267 ||
| previous | 0.229759 | $\star$ |

### Object Columns
| column | y corr value | highest |
| :-: | :-: | :-: |
| job | 0.026276 ||
| marital | 0.050084 ||
| education |0.059263||
| default |-0.099142||
| housing |0.009821||
| loan |-0.000452||
| contact |-0.143238||
| month |-0.007508||
| day_of_week |0.011926||
| poutcome |0.127784|$\star$|

# 3. Convert object columns, esp. the one that is highly correlated to 'y' column, to numerical so that it can be included in ML.
<!-- All columns will be converted. I'll use the correlation numbers in q4 to improve recall rate. -->
All columns were already converted to numeric codes in step 2.
The following are the conversion labels (numeric code → original value) for each object column.

In [None]:
# Print the code -> label mapping of each encoded object column.
# The number next to each name is that column's position in `column_dict`.
label_positions = (
    ('job', 1),
    ('marital', 2),
    ('education', 3),
    ('default', 4),
    ('housing', 5),
    ('loan', 6),
    ('contact', 7),
    ('month', 8),
    ('day_of_week', 9),
    ('poutcome', 14),
    ('y', 15),
)
for label_name, position in label_positions:
    print(label_name + ':\t' + json.dumps(column_dict[position], indent=4))

# 4. Explore the option of class_weight and anything you can do to improve recall rate for SVM and logistic regression to some reasonable value (try to keep precision to be better than 30%).
Approach: [include cross validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation)

In [None]:
# Separate the feature columns (X) from the target column (y).
X = data.drop(columns=['y'])
y = data['y']

In [None]:
# Standardise the features with a StandardScaler fitted on X.
# NOTE(review): the scaler is fitted on all rows before the train/CV/test
# split below, so held-out rows contribute to the scaling statistics -
# confirm this is acceptable for the experiment.
s = StandardScaler()
s.fit(X)
X = s.transform(X)

In [None]:
# Split into train : cross-validation : test = 6 : 2 : 2.
# First hold out 40% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=2018
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=2018
)

# Report the actual share of rows that ended up in each split.
train_len, cv_len, test_len = len(X_train), len(X_cv), len(X_test)
total_len = len(data)
split_shares = [size/total_len*100 for size in (train_len, cv_len, test_len)]
print('Percentage of Train set : CV set : Test set = {0} : {1} : {2}'.format(*split_shares))

# Bundle the six pieces in the order deploy_model expects.
splitted_data = [X_train, y_train, X_cv, y_cv, X_test, y_test]

In [None]:
def evaluation(y_data, y_pred):
    """Score predictions against the true labels.

    Args:
        y_data: True labels.
        y_pred: Predicted labels.

    Returns:
        A list ``[accuracy, precision, recall, f1]`` computed with the
        corresponding scikit-learn metric functions.
    """
    metrics = (accuracy_score, precision_score, recall_score, f1_score)
    return [metric(y_data, y_pred) for metric in metrics]

def deploy_model(data, result=None, cv=True, default=False):
    """Fit five classifiers on the training split and record their scores.

    Args:
        data: Sequence whose first six items are, in order,
            ``X_train, y_train, X_cv, y_cv, X_test, y_test``.
        result: List that score rows are appended to. A new list is created
            when omitted, so separate calls never share state.
        cv: When true, evaluate on the cross-validation split; otherwise on
            the test split.
        default: When true, build every estimator with its default settings.
            Otherwise LR, DT, RF and SVM use ``class_weight='balanced'`` and
            KNN uses ``weights='distance'``.

    Returns:
        ``result``, with one ``[label] + evaluation(...)`` row appended per
        model, in the order LR, DT, RF, SVM, KNN.
    """
    if result is None:
        result = []

    # Read the splits from the argument rather than from module globals.
    X_train, y_train, X_cv, y_cv, X_test, y_test = data[:6]
    if cv:
        X_eval, y_eval = X_cv, y_cv
    else:
        X_eval, y_eval = X_test, y_test

    # Non-default runs re-weight the classes; KNN has no class_weight
    # option, so it weights neighbours by distance instead.
    balanced = {} if default else {'class_weight': 'balanced'}
    knn_options = {} if default else {'weights': 'distance'}
    models = [
        ('LR', LogisticRegression(**balanced)),
        ('DT', DecisionTreeClassifier(**balanced)),
        ('RF', RandomForestClassifier(**balanced)),
        ('SVM', SVC(**balanced)),
        ('KNN', KNeighborsClassifier(**knn_options)),
    ]

    for label, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_eval)
        result.append([label] + evaluation(y_eval, y_pred))

    return result

def result_evaluation(result):
    """Print summary statistics of the recorded scores, one table per model.

    Args:
        result: List of ``[model, accuracy, precision, recall, f1]`` rows.

    For each of LR, DT, RF, SVM and KNN the rows of that model are selected
    and their ``describe()`` summary is printed under a banner line.
    Nothing is returned.
    """
    scores = pd.DataFrame(result, columns=['model', 'accuracy', 'precision', 'recall', 'f1'])
    banners = (
        ('LR', 'LR------------------------------------------------'),
        ('DT', 'DT------------------------------------------------'),
        ('RF', 'RF------------------------------------------------'),
        ('SVM', 'SVM-----------------------------------------------'),
        ('KNN', 'KNN-----------------------------------------------'),
    )
    # Build every summary first, then print them in model order.
    summaries = [
        (banner, scores[scores['model'] == label].describe())
        for label, banner in banners
    ]
    for banner, summary in summaries:
        print(banner)
        print(summary)

In [None]:
# cross validation 1: non-default (class-/distance-weighted) models,
# scored on the CV split over 100 repeated runs
result = []
for run_idx in range(100):
    result = deploy_model(splitted_data, result, cv=True, default=False)
result_evaluation(result)

In [None]:
# cross validation 2: estimators with default settings,
# scored on the CV split over 100 repeated runs
result = []
for run_idx in range(100):
    result = deploy_model(splitted_data, result, cv=True, default=True)
result_evaluation(result)

In [None]:
# test (settings from cross validation 1): non-default models,
# scored on the held-out test split over 100 repeated runs
final_result = []
for run_idx in range(100):
    final_result = deploy_model(splitted_data, final_result, cv=False)
result_evaluation(final_result)