In [90]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

# train test split
from sklearn.model_selection import train_test_split

# impute missing values
from sklearn.impute import SimpleImputer # mean, median, most_frequent (mode), constant
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # regresi
from sklearn.impute import KNNImputer # regresi KKN

# encoding
from sklearn.preprocessing import OneHotEncoder
from category_encoders import OrdinalEncoder, BinaryEncoder

# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 

# column transformer & pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# cross validation
from sklearn.model_selection import cross_val_score

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [91]:
# load dataset
df = pd.read_csv('bankloan.csv')[['employ', 'debtinc','creddebt','othdebt', 'default']]
df.head()

Unnamed: 0,employ,debtinc,creddebt,othdebt,default
0,17,9.3,11.359392,5.008608,1
1,10,17.3,1.362202,4.000798,0
2,15,5.5,0.856075,2.168925,0
3,15,2.9,2.65872,0.82128,0
4,2,17.3,1.787436,3.056564,1


## Tentukan FP / FN yang merupakan kelas yang lebih parah, buat dikasih pinjeman 0 bisa bayar, 1 gagal bayar
- FP : ML memprediksi calon nasabah akan default, padahal aktualnya lunas. -> Kita gak kasih pinjaman padahal bisa bayar. -> Padahal ini untuk keuntungan.
- FN : ML memprekdisi calon nasabah akan lunas, padahal aktualmnya dia default. -> Kita pinjemin padagal gagal bayar.

Ketika sama-sama bahaya, pakai saja F-1 score.

In [92]:
df.isna().sum()

employ      0
debtinc     0
creddebt    0
othdebt     0
default     0
dtype: int64

In [93]:
df.duplicated().sum()

0

## Define Feature

In [94]:
x = df.drop(columns='default')
y = df['default']

In [95]:
# Data splitting
xtrain, xtest, ytrain, ytest = train_test_split(
                x,
                y,
                test_size=0.2,
                stratify=y,
                random_state=0
                )

## Pre Processing

In [96]:
scaler = RobustScaler()

In [97]:
transformer = ColumnTransformer([
    ('scaling', RobustScaler(), ['employ', 'debtinc', 'creddebt', 'othdebt'])
])
transformer

## Handling 4 skenario untuk Imblance Treatment

1. No Treatment
2. Optimized Threshold
3. Resampling (Oversampling or Undersampling)
4. Penalized model

### No Treament

Benchmark Model:

In [98]:
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

# predict
ypred_benchmark = pipe_model.predict(xtest)

# f1 score
f1_score(ytest, ypred_benchmark)

0.6101694915254238

In [99]:
pipe_model.predict_proba(xtest)

array([[0.5136731 , 0.4863269 ],
       [0.9879077 , 0.0120923 ],
       [0.95640033, 0.04359967],
       [0.83711282, 0.16288718],
       [0.97443242, 0.02556758],
       [0.86153037, 0.13846963],
       [0.93893686, 0.06106314],
       [0.97982765, 0.02017235],
       [0.97236611, 0.02763389],
       [0.95201433, 0.04798567],
       [0.93701082, 0.06298918],
       [0.22813487, 0.77186513],
       [0.68813742, 0.31186258],
       [0.79380327, 0.20619673],
       [0.37989445, 0.62010555],
       [0.69102679, 0.30897321],
       [0.65257812, 0.34742188],
       [0.71419602, 0.28580398],
       [0.43399589, 0.56600411],
       [0.93871143, 0.06128857],
       [0.99058339, 0.00941661],
       [0.19002858, 0.80997142],
       [0.24458056, 0.75541944],
       [0.75969159, 0.24030841],
       [0.57086808, 0.42913192],
       [0.73365392, 0.26634608],
       [0.73152543, 0.26847457],
       [0.35709099, 0.64290901],
       [0.99658213, 0.00341787],
       [0.51795285, 0.48204715],
       [0.

In [100]:
from sklearn.metrics import confusion_matrix

In [101]:
confmatrix = confusion_matrix(ytest, ypred_benchmark)
dfCM = pd.DataFrame(confmatrix, index=['Actual Negative', 'Actual Positive'], columns=['Predicted Negative', 'Predicted Positive'])
tp = dfCM.loc['Actual Positive', 'Predicted Positive']
fp = dfCM.loc['Actual Negative', 'Predicted Positive']
tn = dfCM.loc['Actual Negative', 'Predicted Negative']
fn = dfCM.loc['Actual Positive', 'Predicted Negative']
dfCM


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,99,4
Actual Positive,19,18


### Oprtimzed Threshold

Looping probability threshold untuk mencari performat terbaik.

In [102]:
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

# predict
ypred_benchmark = pipe_model.predict(xtest)

# f1 score
f1_score(ytest, ypred_benchmark)

0.6101694915254238

In [103]:
ypred_proba = pipe_model.predict_proba(xtest)[:,1]

ypred_class = np.where(ypred_proba>0.50, 1 ,0)
ypred_class

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [104]:
confusion_matrix(ytest,ypred_class)

array([[99,  4],
       [19, 18]])

In [105]:
f1_score(ytest,ypred_class)

0.6101694915254238

In [106]:
listTemp = []

model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

#=======================

#Looping THreshold
list_threshold = np.arange(0.01,1.00,0.01)

for threshold in list_threshold:

    #predict
    ypred_proba = pipe_model.predict_proba(xtest)[:,1]

    ypred_class = np.where(ypred_proba>threshold, 1 ,0)

    # f1 score
    listTemp.append(f1_score(ytest, ypred_class))

listTemp

[0.43274853801169594,
 0.4567901234567901,
 0.47435897435897434,
 0.4868421052631579,
 0.5068493150684932,
 0.5174825174825174,
 0.5285714285714286,
 0.5362318840579711,
 0.5441176470588236,
 0.5481481481481482,
 0.5522388059701492,
 0.5522388059701492,
 0.5648854961832062,
 0.5826771653543308,
 0.5737704918032787,
 0.5737704918032787,
 0.5619834710743802,
 0.5714285714285714,
 0.5964912280701754,
 0.6,
 0.6037735849056604,
 0.6153846153846153,
 0.6336633663366337,
 0.6597938144329898,
 0.6526315789473685,
 0.6236559139784947,
 0.6222222222222221,
 0.6292134831460674,
 0.6046511627906977,
 0.5853658536585367,
 0.5679012345679012,
 0.5569620253164557,
 0.5569620253164557,
 0.5194805194805195,
 0.5333333333333333,
 0.5135135135135135,
 0.5205479452054794,
 0.5205479452054794,
 0.5277777777777778,
 0.5142857142857143,
 0.5217391304347827,
 0.5217391304347827,
 0.5373134328358209,
 0.5454545454545455,
 0.5625,
 0.5625,
 0.5714285714285714,
 0.5714285714285714,
 0.6101694915254238,
 0.61016

In [107]:
dfTh = pd.DataFrame()
dfTh['threshold'] = list_threshold
dfTh['f1'] = listTemp

dfTh.sort_values('f1',ascending=False).reset_index(drop=True)

Unnamed: 0,threshold,f1
0,0.24,0.659794
1,0.25,0.652632
2,0.23,0.633663
3,0.28,0.629213
4,0.26,0.623656
...,...,...
94,0.95,0.052632
95,0.96,0.000000
96,0.97,0.000000
97,0.98,0.000000


In [110]:
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

#=======================

#Looping THreshold
list_threshold = np.arange(0.01,1.00,0.01)
listTotalCost = []

for threshold in list_threshold:

    #predict
    ypred_proba = pipe_model.predict_proba(xtest)[:,1]

    ypred_class = np.where(ypred_proba>threshold, 1 ,0)

    # total cost
    fn = confusion_matrix(ytest,ypred_class)[1,0] 
    cost_fn = fn*300

    fp = confusion_matrix(ytest,ypred_class)[0,1]
    cost_fp = fp*100

    totalCost = cost_fn + cost_fp
    listTotalCost.append(totalCost)

In [112]:
dfTh_cost = pd.DataFrame()
dfTh_cost['threshold'] = list_threshold
dfTh_cost['totalCost'] = listTotalCost

dfTh_cost.sort_values('totalCost',ascending=True).reset_index(drop=True)

Unnamed: 0,threshold,totalCost
0,0.24,4300
1,0.25,4500
2,0.23,4700
3,0.22,5000
4,0.28,5100
...,...,...
94,0.95,10800
95,0.98,11100
96,0.96,11100
97,0.97,11100


#### BEST THRESHOLD F1 SCORE TERBAIK

In [114]:
best_threshold = 0.24
y_pred_proba = pipe_model.predict_proba(xtest)[: ,1]
y_pred_threshold = np.where(y_pred_proba > best_threshold, 1, 0)
#f1
f1_score(ytest,y_pred_threshold)

0.6597938144329898

In [123]:
print(f1_score(ytest,ypred_benchmark), 'no treatment')
print(f1_score(ytest,y_pred_threshold), 'best threshold')

display(confusion_matrix(ytest,ypred_benchmark), 'matrix no treatment')
display(confusion_matrix(ytest,y_pred_threshold), 'matrix best threshold')

0.6101694915254238 no treatment
0.6597938144329898 best threshold


array([[99,  4],
       [19, 18]])

'matrix no treatment'

array([[75, 28],
       [ 5, 32]])

'matrix best threshold'

### RESAMPLING

In [119]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

In [120]:
ytrain.value_counts()

default
0    414
1    146
Name: count, dtype: int64

In [124]:
ytrain.value_counts() / len(ytrain)

default
0    0.739286
1    0.260714
Name: count, dtype: float64

In [129]:
# Define Resampler
over = RandomOverSampler(random_state=0)

#fit & resample xtrain
xtrainOver, ytrainOver = over.fit_resample(xtrain,ytrain)

#check
ytrainOver.value_counts()

default
0    414
1    414
Name: count, dtype: int64

In [130]:
# Define Resampler
under = RandomUnderSampler(random_state=0)

#fit & resample xtrain
xtrainUnder, ytrainUnder = under.fit_resample(xtrain,ytrain)

#check
ytrainUnder.value_counts()

default
0    146
1    146
Name: count, dtype: int64

#### Models

In [133]:
from imblearn.pipeline import Pipeline

In [137]:
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('resampling', over),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

#predict
yPred_over = pipe_model.predict(xtest)

# f1 score
f1_score(ytest, yPred_over)

0.6391752577319588

In [138]:
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('resampling', under),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

#predict
yPred_under = pipe_model.predict(xtest)

# f1 score
f1_score(ytest, yPred_under)

0.6451612903225806

In [139]:
print(f1_score(ytest,ypred_benchmark), 'benchmark no treatment')
print(f1_score(ytest,y_pred_threshold), 'best threshold')
print(f1_score(ytest,yPred_over), 'Oversampling')
print(f1_score(ytest,yPred_under), 'Undersampling')

0.6101694915254238 benchmark no treatment
0.6597938144329898 best threshold
0.6391752577319588 Oversampling
0.6451612903225806 Undersampling


### Penalized Models
Algortima ML akan membarikan perhatian lebih pada kelas minority )kelas 1). Error di kelas minority (kelas 1) akan diboboti lebih berat daripada error di kelas majority (kelas 0)

In [140]:
model = LogisticRegression(random_state=0, class_weight='balanced')

pipe_model = Pipeline([
    ('preprocessing',transformer),
    ('modeling', model)
])

# Fit
pipe_model.fit(xtrain,ytrain)

#cross validation di skip

#predict
yPred_penalize = pipe_model.predict(xtest)

# f1 score
f1_score(ytest, yPred_penalize)

0.6236559139784947

In [142]:
print(f1_score(ytest,ypred_benchmark), 'benchmark no treatment')
print(f1_score(ytest,y_pred_threshold), 'best threshold')
print(f1_score(ytest,yPred_over), 'Oversampling')
print(f1_score(ytest,yPred_under), 'Undersampling')
print(f1_score(ytest,yPred_penalize), 'Penalized Models')

0.6101694915254238 benchmark no treatment
0.6597938144329898 best threshold
0.6391752577319588 Oversampling
0.6451612903225806 Undersampling
0.6236559139784947 Penalized Models
