# **1. CROSS VALIDATION (MODEL BENCHMARKING)**

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler,RobustScaler

from sklearn.model_selection import train_test_split, StratifiedKFold,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, classification_report, precision_score,f1_score,recall_score

In [36]:
data = pd.read_csv('bankloan.csv')
data.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


In [37]:
# split x and y

x = data[['age','employ','income','debtinc','creddebt','othdebt']]
y = data['default']

xtrain,xtest,ytrain,ytest = train_test_split(
    x,
    y,
    stratify=y,
    random_state=100,
    test_size = 0.2
)

In [38]:
#model benchmarking
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
logreg = LogisticRegression()

In [39]:
#create manual cross validation
skfold = StratifiedKFold(n_splits=10)
logreg_cv = cross_val_score(logreg,xtrain,ytrain, cv=skfold)
knn_cv = cross_val_score(knn,xtrain,ytrain, cv=skfold)
tree_cv = cross_val_score(tree,xtrain,ytrain, cv=skfold)

In [40]:
# see the result

print('logistic regression CV: ',logreg_cv)
print('KNN CV: ',knn_cv)
print('DT CV: ',tree_cv)

logistic regression CV:  [0.89285714 0.80357143 0.76785714 0.78571429 0.83928571 0.83928571
 0.76785714 0.82142857 0.76785714 0.80357143]
KNN CV:  [0.73214286 0.73214286 0.73214286 0.76785714 0.82142857 0.78571429
 0.66071429 0.75       0.71428571 0.78571429]
DT CV:  [0.69642857 0.82142857 0.64285714 0.75       0.78571429 0.71428571
 0.64285714 0.71428571 0.625      0.71428571]


In [41]:
#MEAN CV
print('logistic regression CV: ',logreg_cv.mean())
print('KNN CV: ',knn_cv.mean())
print('DT CV: ',tree_cv.mean())

logistic regression CV:  0.8089285714285716
KNN CV:  0.7482142857142857
DT CV:  0.7107142857142856


In [42]:
#STD CV --> makin kecil makin akurat, tergambarkan oleh mean. data nya gak beragam, itungannya lebih akurat.
# 0.0
print('logistic regression CV: ',logreg_cv.std())
print('KNN CV: ',knn_cv.std())
print('DT CV: ',tree_cv.std())


logistic regression CV:  0.03834091170282837
KNN CV:  0.042595930149559154
DT CV:  0.06018678409411683


In [43]:
#create looping to summarize data
from sklearn.svm import SVC
tree = DecisionTreeClassifier(max_depth = 10,criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=10)
logreg = LogisticRegression()
svm = SVC(kernel = 'rbf')

model = [tree, knn, logreg, svm]
score = []
recall = []
std = []


for i in model:
    model_cv = cross_val_score(i,xtrain,ytrain, cv=skfold, scoring = 'recall' )
    score.append(model_cv)
    recall.append(model_cv.mean())
    std.append(model_cv.std())

result = pd.DataFrame({
    'Model' : ['Decision Tree','KNN','Logistic Regression','SVM'],
    'Recall': recall ,
    'Standard deviation' : std}
)

result.sort_values('Recall',ascending=False)

Unnamed: 0,Model,Recall,Standard deviation
2,Logistic Regression,0.46619,0.069805
0,Decision Tree,0.457619,0.160871
1,KNN,0.292381,0.110476
3,SVM,0.212381,0.082015


# **2.HYPERPARAMETER TUNING**

In [44]:
#Tuning with GridsearchCV

hyperparam_space ={
    'criterion' : ['gini','entropy','log_loss'],
    'splitter' : ['random','best'],
    'max_depth' : np.arange(1,51)

}

skfold = StratifiedKFold(n_splits=5)
tree = DecisionTreeClassifier()

grid_search = GridSearchCV(
    tree, 
    param_grid=hyperparam_space,
    cv = skfold,
    scoring = 'f1',
    verbose = 1
)

grid_search.fit(xtrain,ytrain)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [45]:
#see the result

pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.016577,0.024049,0.010714,0.006942,gini,1,random,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.000000,0.000000,0.461538,0.400000,0.492754,0.270858,0.223160,296
1,0.002984,0.000430,0.004670,0.001255,gini,1,best,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.500000,0.470588,0.560000,0.387097,0.514286,0.486394,0.057429,60
2,0.002763,0.000596,0.003345,0.000444,gini,2,random,"{'criterion': 'gini', 'max_depth': 2, 'splitte...",0.560000,0.343750,0.000000,0.315789,0.350000,0.313908,0.179609,292
3,0.002991,0.000601,0.004216,0.000754,gini,2,best,"{'criterion': 'gini', 'max_depth': 2, 'splitte...",0.171429,0.470588,0.270270,0.391304,0.111111,0.282941,0.133473,294
4,0.002789,0.000396,0.003850,0.001021,gini,3,random,"{'criterion': 'gini', 'max_depth': 3, 'splitte...",0.500000,0.439024,0.487805,0.358974,0.481481,0.453457,0.051511,163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.004414,0.000274,0.003906,0.000914,log_loss,48,best,"{'criterion': 'log_loss', 'max_depth': 48, 'sp...",0.367347,0.474576,0.500000,0.372881,0.562500,0.455461,0.075352,150
296,0.002726,0.000096,0.003747,0.000834,log_loss,49,random,"{'criterion': 'log_loss', 'max_depth': 49, 'sp...",0.357143,0.415094,0.459016,0.385965,0.456140,0.414672,0.039547,268
297,0.004861,0.000829,0.003757,0.000712,log_loss,49,best,"{'criterion': 'log_loss', 'max_depth': 49, 'sp...",0.423077,0.440678,0.500000,0.372881,0.611111,0.469549,0.081615,96
298,0.003022,0.000649,0.003482,0.000610,log_loss,50,random,"{'criterion': 'log_loss', 'max_depth': 50, 'sp...",0.440678,0.400000,0.548387,0.372881,0.562500,0.464889,0.077152,113


In [46]:
#get best score

grid_search.best_score_
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 10, 'splitter': 'best'}

In [47]:
#see the result before the tuning
tree = DecisionTreeClassifier()
tree.fit(xtrain,ytrain)
print(classification_report(ytest,tree.predict(xtest)))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       103
           1       0.58      0.49      0.53        37

    accuracy                           0.77       140
   macro avg       0.70      0.68      0.69       140
weighted avg       0.76      0.77      0.76       140



In [48]:
#see the result after tuning
tree = grid_search.best_estimator_
tree.fit(xtrain,ytrain)
print(classification_report(ytest,tree.predict(xtest)))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       103
           1       0.52      0.38      0.44        37

    accuracy                           0.74       140
   macro avg       0.66      0.63      0.64       140
weighted avg       0.72      0.74      0.73       140



# **Random Search**

In [49]:
hyperparam_space ={
    'criterion' : ['gini','entropy','log_loss'],
    'splitter' : ['random','best'],
    'max_depth' : np.arange(1,51)

}

skfold = StratifiedKFold(n_splits=5)
tree = DecisionTreeClassifier()

grid_search = RandomizedSearchCV(
    tree, 
    param_distributions=hyperparam_space,
    cv = skfold,
    scoring = 'f1',
    n_iter = 100,
    verbose = 1
)

grid_search.fit(xtrain,ytrain)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


# Algorithm Chain

In [50]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [51]:
# Change '?' To Missing Value

# data.replace('?', np.nan, inplace=True)
# data.isna().sum()/len(data)*100

In [52]:
# Skema Preprocessing

BE_Pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NC')),
    ('BE', ce.BinaryEncoder())
])

transformer = ColumnTransformer([
    ('OHE', OneHotEncoder(drop='first'), ['relationship', 'race', 'sex']),
    ('Binary Enc', BE_Pipeline, ['workclass', 'marital.status', 'occupation', 'native.country'])
], remainder='passthrough')

In [53]:
# Data Spliting

X = data.drop(columns=['fnlwgt', 'education', 'income'])
y = np.where(data['income'] == '>50K', 1, 0)


xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    stratify=y,
    random_state=100,
    test_size=0.2
)

In [57]:
# CREATE PIPELINE

# tree = DecisionTreeClassifier(
#     criterion = 'entropy',
#     max_depth = 10,
#     random_state= 2023
# )

# tree_pipe = Pipeline([
#     ('Preprocess', transformer),
#     ('model',tree)
# ])

# tree_pipe.fit(xtrain,ytrain)


# CREATE PIPELINE

skfold = StratifiedKFold(n_splits=5)

tree = DecisionTreeClassifier(
    # criterion = 'entropy',
    # max_depth = 10,
    # random_state= 2023
)

tree_pipe = Pipeline([
    ('Preprocess', transformer),
    ('model',tree)
])

# HYPER PARAMETER TUNING

hyperparam_space ={
    'model__criterion' : ['gini','entropy','log_loss'],
    'model__splitter' : ['random','best'],
    'model__max_depth' : np.arange(1,51)
}

grid_search = GridSearchCV(
    tree_pipe, 
    param_grid=hyperparam_space,
    cv = skfold,
    scoring = 'f1',
    verbose = 1
)

grid_search.fit(xtrain,ytrain)


Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [None]:
tree_pipe

In [56]:
y_pred =  tree_pipe.predict(xtest)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.95      0.91      4945
           1       0.78      0.54      0.64      1568

    accuracy                           0.85      6513
   macro avg       0.82      0.75      0.77      6513
weighted avg       0.85      0.85      0.84      6513

