# Imbalance Classification

In [194]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

# train test split
from sklearn.model_selection import train_test_split

# impute missing values
from sklearn.impute import SimpleImputer # mean, median, most_frequent (mode), constant
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # regresi
from sklearn.impute import KNNImputer # regresi KKN

# encoding
from sklearn.preprocessing import OneHotEncoder
from category_encoders import OrdinalEncoder, BinaryEncoder

# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 

# column transformer & pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# cross validation
from sklearn.model_selection import cross_val_score

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

- FP: ML memprediksi calon nasabah akan default, padahal aktualnya lunas (cost 100jt)
- FN: ML memprediksi calon nasabah akan lunas, padahal aktualnya default (cost 100jt)

Metric: f1 score

In [195]:
# Read the bank-loan CSV and keep only the four predictors plus the target.
selected_columns = ['employ', 'debtinc', 'creddebt', 'othdebt', 'default']
df = pd.read_csv('bankloan.csv')[selected_columns]

# Preview the first rows.
df.head()

Unnamed: 0,employ,debtinc,creddebt,othdebt,default
0,17,9.3,11.359392,5.008608,1
1,10,17.3,1.362202,4.000798,0
2,15,5.5,0.856075,2.168925,0
3,15,2.9,2.65872,0.82128,0
4,2,17.3,1.787436,3.056564,1


In [196]:
# Number of missing values in each column.
df.isnull().sum()

employ      0
debtinc     0
creddebt    0
othdebt     0
default     0
dtype: int64

In [197]:
# Number of rows that repeat an earlier row exactly.
df.duplicated(keep='first').sum()

0

In [198]:
# Separate the target column (y) from the predictors (X).
y = df['default']
X = df.drop('default', axis=1)

In [199]:
# Hold out 20% of the rows as the test set. Stratifying on y and fixing the
# seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'stratify': y, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [200]:
# Preprocessing: a single RobustScaler instance, reused as the
# 'preprocessing' step of every pipeline built further down in this notebook.
scaler = RobustScaler()

In [201]:
# Column-wise preprocessing: apply a RobustScaler to the four numeric features.
# NOTE(review): the pipelines visible in this notebook pass `scaler` directly
# and never reference `transformer`.
numeric_features = ['employ', 'debtinc', 'creddebt', 'othdebt']
transformer = ColumnTransformer(
    transformers=[('scaling', RobustScaler(), numeric_features)]
)

## Imbalance Treatment:
Kita akan coba 4 skenario berbeda:

1. No treatment
2. Optimized Threshold
3. Resampling (Oversampling or Undersampling)
4. Penalized model

## 1. No treatment

Benchmark model 

In [202]:
# Benchmark: plain logistic regression with no imbalance treatment.
model = LogisticRegression(random_state=0)
pipe_model = Pipeline(steps=[('preprocessing', scaler), ('modeling', model)])

# Fit on the training split, then predict classes for the test split
# (fit returns the pipeline itself, so the calls can be chained).
y_pred_benchmark = pipe_model.fit(X_train, y_train).predict(X_test)

# F1 score of the benchmark on the test split.
f1_score(y_true=y_test, y_pred=y_pred_benchmark)

0.6101694915254238

In [203]:
from sklearn.metrics import confusion_matrix 

In [204]:
# Confusion matrix of the benchmark predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred_benchmark)

array([[99,  4],
       [19, 18]], dtype=int64)

## 2. Optimized Threshold

Looping probability threshold untuk mencari performa terbaik.

In [205]:
# Rebuild and refit the same untreated logistic-regression pipeline so this
# section starts from a freshly fitted `pipe_model`.
model = LogisticRegression(random_state=0)
pipe_model = Pipeline(steps=[('preprocessing', scaler), ('modeling', model)])

# Fit on the training split and predict classes for the test split.
y_pred_benchmark = pipe_model.fit(X_train, y_train).predict(X_test)

# F1 score at the default decision rule.
f1_score(y_true=y_test, y_pred=y_pred_benchmark)

0.6101694915254238

In [206]:
# Second column of predict_proba, used here as the score for class 1.
y_pred_proba = pipe_model.predict_proba(X_test)[:, 1]

# Label an observation 1 when its score is strictly above 0.50, else 0.
y_pred_class = (y_pred_proba > 0.50).astype(int)

In [207]:
# Confusion matrix for the labels obtained with the 0.50 threshold.
confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[99,  4],
       [19, 18]], dtype=int64)

In [208]:
# False positives: row 0, column 1 of the confusion matrix.
fp = confusion_matrix(y_test, y_pred_class)[0][1]

# Each false positive is charged 100.
cost_fp = 100 * fp
cost_fp

400

In [209]:
# False negatives: row 1, column 0 of the confusion matrix.
fn = confusion_matrix(y_test, y_pred_class)[1][0]

# Each false negative is charged 300.
cost_fn = 300 * fn
cost_fn

5700

In [210]:
# Overall misclassification cost: false-negative cost plus false-positive cost.
total_cost = cost_fn + cost_fp
total_cost

6100

In [211]:
# F1 score of the labels produced by the 0.50 threshold.
f1_score(y_true=y_test, y_pred=y_pred_class)

0.6101694915254238

In [212]:
# Refit the untreated logistic-regression pipeline used for the threshold sweep.
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing', scaler),
    ('modeling', model)
])

# fit
pipe_model.fit(X_train, y_train)

# ==============================================================
# Threshold sweep

# The predicted probabilities do not depend on the threshold, so compute them
# once up front instead of calling predict_proba on every iteration.
y_pred_proba = pipe_model.predict_proba(X_test)[:, 1]

# Candidate thresholds 0.01, 0.02, ..., 0.99 and the F1 score obtained at each.
list_threshold = np.arange(0.01, 1.00, 0.01)
list_f1 = []

# NOTE(review): the thresholds are scored on the test split, so the best F1
# found here is an optimistic estimate; a validation split or cross-validation
# would be needed to select the threshold without touching the test data.
for threshold in list_threshold:

    # label as 1 when the probability is strictly above the threshold
    y_pred_class = np.where(y_pred_proba > threshold, 1, 0)

    # f1 score at this threshold
    list_f1.append(f1_score(y_test, y_pred_class))

In [213]:
# Tabulate the sweep: one row per threshold with its F1 score.
df_th = pd.DataFrame({'threshold': list_threshold, 'f1': list_f1})

# Show the thresholds ordered from best to worst F1.
df_th.sort_values(by='f1', ascending=False)

Unnamed: 0,threshold,f1
23,0.24,0.659794
24,0.25,0.652632
22,0.23,0.633663
27,0.28,0.629213
25,0.26,0.623656
...,...,...
94,0.95,0.052632
95,0.96,0.000000
96,0.97,0.000000
97,0.98,0.000000


Jika 
- cost FN: 300 dollar
- cost FP: 100 dollar

Looping threshold untuk mencari cost terendah

In [214]:
# Refit the untreated logistic-regression pipeline used for the cost sweep.
model = LogisticRegression(random_state=0)

pipe_model = Pipeline([
    ('preprocessing', scaler),
    ('modeling', model)
])

# fit
pipe_model.fit(X_train, y_train)

# ==============================================================
# Threshold sweep

# Cost charged for each false negative / false positive.
COST_PER_FN = 300
COST_PER_FP = 100

# The predicted probabilities do not depend on the threshold, so compute them
# once up front instead of calling predict_proba on every iteration.
y_pred_proba = pipe_model.predict_proba(X_test)[:, 1]

# Candidate thresholds 0.01, 0.02, ..., 0.99 and the total cost at each.
list_threshold = np.arange(0.01, 1.00, 0.01)
list_total_cost = []

for threshold in list_threshold:

    # label as 1 when the probability is strictly above the threshold
    y_pred_class = np.where(y_pred_proba > threshold, 1, 0)

    # build the confusion matrix once and read both error counts from it
    cm = confusion_matrix(y_test, y_pred_class)

    fn = cm[1, 0]
    cost_fn = fn * COST_PER_FN

    fp = cm[0, 1]
    cost_fp = fp * COST_PER_FP

    # total cost at this threshold
    total_cost = cost_fn + cost_fp
    list_total_cost.append(total_cost)
    

In [215]:
# Tabulate the sweep: one row per threshold with its total cost.
df_th_cost = pd.DataFrame(
    {'threshold': list_threshold, 'total_cost': list_total_cost}
)

# Show the five cheapest thresholds.
df_th_cost.sort_values(by='total_cost').head()

Unnamed: 0,threshold,total_cost
23,0.24,4300
24,0.25,4500
22,0.23,4700
21,0.22,5000
27,0.28,5100


Best Threshold dengan f1 score terbaik

In [216]:
# Take the threshold with the highest F1 from the sweep table `df_th` rather
# than hard-coding a value copied from its printed output. This keeps the cell
# correct if the sweep results change, and it reuses the exact float that the
# sweep evaluated.
best_threshold = df_th.loc[df_th['f1'].idxmax(), 'threshold']

# predict proba and apply the selected threshold
y_pred_proba = pipe_model.predict_proba(X_test)[:, 1]
y_pred_threshold = np.where(y_pred_proba > best_threshold, 1, 0)

# f1
f1_score(y_test, y_pred_threshold)

0.6597938144329898

In [217]:
# Print the test F1 of each scenario so far, one line per scenario.
for label, y_pred in [
    ('benchmark (no treatment)', y_pred_benchmark),
    ('best threshold', y_pred_threshold),
]:
    print(f1_score(y_test, y_pred), label)

0.6101694915254238 benchmark (no treatment)
0.6597938144329898 best threshold


In [218]:
# Show the benchmark confusion matrix first, then the tuned-threshold one.
for y_pred in (y_pred_benchmark, y_pred_threshold):
    display(confusion_matrix(y_test, y_pred))

array([[99,  4],
       [19, 18]], dtype=int64)

array([[75, 28],
       [ 5, 32]], dtype=int64)

## 3. Resampling

- Oversampling
- Undersampling

In [219]:
# One-time setup if the `imblearn` import in the next cell fails
# (the package is published as imbalanced-learn): %pip install imbalanced-learn

In [220]:
from imblearn.over_sampling import RandomOverSampler, SMOTE 
from imblearn.under_sampling import RandomUnderSampler, NearMiss 

In [221]:
# Number of training rows in each target class.
y_train.value_counts()

default
0    414
1    146
Name: count, dtype: int64

In [222]:
# Share of the training rows that falls in each target class.
y_train.value_counts().div(len(y_train))

default
0    0.739286
1    0.260714
Name: count, dtype: float64

In [223]:
# Random oversampler with a fixed seed for reproducibility.
over = RandomOverSampler(random_state=0)

# Resample the training split only; the test split is left untouched.
X_train_over, y_train_over = over.fit_resample(X=X_train, y=y_train)

In [224]:
y_train_over.value_counts()
# The data is balanced after oversampling:
# class 1 (minority) is replicated until its count equals class 0 (majority).

default
0    414
1    414
Name: count, dtype: int64

In [225]:
# Random undersampler with a fixed seed for reproducibility.
under = RandomUnderSampler(random_state=0)

# Resample the training split only; the test split is left untouched.
X_train_under, y_train_under = under.fit_resample(X=X_train, y=y_train)

In [226]:
y_train_under.value_counts()
# The data is balanced after undersampling:
# class 0 (majority) is reduced until its count equals class 1 (minority).

default
0    146
1    146
Name: count, dtype: int64

Modeling

In [228]:
from imblearn.pipeline import Pipeline 

In [231]:
# Logistic regression trained on randomly oversampled data. The resampler is
# a pipeline step placed between the scaler and the model.
model = LogisticRegression(random_state=0)
pipe_model = Pipeline(steps=[
    ('preprocessing', scaler),
    ('resampling', over),
    ('modeling', model),
])

# Fit on the training split, then predict classes for the test split.
y_pred_over = pipe_model.fit(X_train, y_train).predict(X_test)

# F1 score with oversampling.
f1_score(y_true=y_test, y_pred=y_pred_over)

0.6391752577319588

In [232]:
# Logistic regression trained on randomly undersampled data. The resampler is
# a pipeline step placed between the scaler and the model.
model = LogisticRegression(random_state=0)
pipe_model = Pipeline(steps=[
    ('preprocessing', scaler),
    ('resampling', under),
    ('modeling', model),
])

# Fit on the training split, then predict classes for the test split.
y_pred_under = pipe_model.fit(X_train, y_train).predict(X_test)

# F1 score with undersampling.
f1_score(y_true=y_test, y_pred=y_pred_under)

0.6451612903225806

In [233]:
# Print the test F1 of each scenario so far, one line per scenario.
for label, y_pred in [
    ('benchmark (no treatment)', y_pred_benchmark),
    ('best threshold', y_pred_threshold),
    ('Oversampling', y_pred_over),
    ('Undersampling', y_pred_under),
]:
    print(f1_score(y_test, y_pred), label)

0.6101694915254238 benchmark (no treatment)
0.6597938144329898 best threshold
0.6391752577319588 Oversampling
0.6451612903225806 Undersampling


## 4. Penalized Model 

Algoritma ML akan memberikan perhatian lebih pada kelas minority (kelas 1).
Error di kelas minority (kelas 1) akan diboboti lebih berat daripada error 
di kelas majority (kelas 0).

In [241]:
# Penalized logistic regression: class_weight='balanced' lets the estimator
# weight the classes instead of resampling the data.
# Alternative with an explicit weight on class 1:
#   model = LogisticRegression(random_state=0, class_weight={1 : 2.83})
model = LogisticRegression(class_weight='balanced', random_state=0)
pipe_model = Pipeline(steps=[('preprocessing', scaler), ('modeling', model)])

# Fit on the training split, then predict classes for the test split.
y_pred_penalize = pipe_model.fit(X_train, y_train).predict(X_test)

# F1 score of the penalized model.
f1_score(y_true=y_test, y_pred=y_pred_penalize)

0.6236559139784947

In [235]:
# Print the test F1 of all scenarios, one line per scenario.
for label, y_pred in [
    ('benchmark (no treatment)', y_pred_benchmark),
    ('best threshold', y_pred_threshold),
    ('Oversampling', y_pred_over),
    ('Undersampling', y_pred_under),
    ('Penalize model', y_pred_penalize),
]:
    print(f1_score(y_test, y_pred), label)

0.6101694915254238 benchmark (no treatment)
0.6597938144329898 best threshold
0.6391752577319588 Oversampling
0.6451612903225806 Undersampling
0.6236559139784947 Penalize model


In [236]:
# Class counts of the training target, used to work out the majority/minority ratio.
y_train.value_counts()

default
0    414
1    146
Name: count, dtype: int64

In [237]:
# Majority-to-minority ratio of the training target, computed from the data
# instead of from hand-copied counts so it stays correct if the split changes.
class_counts = y_train.value_counts()
class_counts.loc[0] / class_counts.loc[1]

2.835616438356164

---
# Exercise: Imbalance Classification

Anda bekerja sebagai data scientist di perusahaan asuransi. Anda diminta untuk memprediksi apakah seorang calon nasabah akan mengajukan klaim atau tidak ketika nantinya dia memiliki asuransi di perusahaan Anda. 

- Dataset: Car_Insurance_Claim.csv
- Target: 'OUTCOME'
    - 0: tidak mengajukan claim
    - 1: mengajukan claim
<br><br>

1. Definisikan FP dan FN dalam kasus asuransi kendaraan ini. Menurut Anda, manakah kesalahan yang memiliki cost lebih tinggi (FP atau FN)? Gunakan evaluation metric yang sesuai dengan kebutuhan bisnis!
1. Lakukan data cleaning jika diperlukan!
1. Lakukan EDA singkat untuk memahami dataset yang Anda gunakan!
1. Lakukan data splitting!
1. Lakukan preprocessing/feature engineering yang sesuai dengan kebutuhan!
1. Lakukan cross validation menggunakan beberapa algoritma ML yang sudah Anda kuasai. Pilih algoritma terbaik untuk melakukan modeling akhir!
1. Lakukan hyperparameter tuning pada model terpilih (dari cross validation). Pilih hyperparameter terbaik untuk melakukan modeling akhir!
1. Bandingkan performa model sebelum dan sesudah hyperparameter tuning, apakah performanya meningkat?
    - Model sebelum hyperparameter tuning
    - Model setelah hyperparameter tuning