In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Splitting Data
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV

# Modeling, Fitting and Evaluation
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score

# Resampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.pipeline import Pipeline

In [2]:
bankloan = pd.read_csv('/Users/zaki/Downloads/bankloan.csv')
bankloan

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.658720,0.821280,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1
...,...,...,...,...,...,...,...,...,...
695,36,2,6,15,27,4.6,0.262062,0.979938,1
696,29,2,6,4,21,11.5,0.369495,2.045505,0
697,33,1,15,3,32,7.6,0.491264,1.940736,0
698,45,1,19,22,77,8.4,2.302608,4.165392,0


In [3]:
X = bankloan[['employ', 'debtinc', 'creddebt', 'othdebt']]
y = bankloan['default']

In [4]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X,y, stratify = y, test_size = 0.2, random_state = 1899)

**Cross Validation**

In [5]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier(random_state = 1899)
knn = KNeighborsClassifier()

In [6]:
logreg_pipe_scale = Pipeline([
    ('scale', StandardScaler()),
    ('logreg', logreg)
])

tree_pipe_scale = Pipeline([
    ('tree', tree)
])

knn_pipe_scale = Pipeline([
    ('scale', StandardScaler()),
    ('knn', knn)
])

In [7]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_train_val, y_train_val, cv = skfold, scoring = metric)
    return model_cv

In [8]:
def print_result(model_cv):
    print('cv score', model_cv)
    print('cv score mean', model_cv.mean())

In [9]:
logreg_scale_cv = model_evaluation(logreg_pipe_scale, 'f1')
print_result(logreg_scale_cv)

cv score [0.5        0.6122449  0.53333333 0.70833333 0.47058824]
cv score mean 0.5648999599839936


In [10]:
logreg_pipe_scale.fit(X_train_val, y_train_val)
y_pred_logreg = logreg_pipe_scale.predict(X_test)
print(classification_report(y_test, y_pred_logreg))

              precision    recall  f1-score   support

           0       0.82      0.94      0.87       103
           1       0.71      0.41      0.52        37

    accuracy                           0.80       140
   macro avg       0.76      0.67      0.70       140
weighted avg       0.79      0.80      0.78       140



In [11]:
tree_scale_cv = model_evaluation(tree_pipe_scale, 'f1')
print_result(tree_scale_cv)

cv score [0.49180328 0.42857143 0.46666667 0.56140351 0.46428571]
cv score mean 0.4825461193968528


In [12]:
tree_pipe_scale.fit(X_train_val, y_train_val)
y_pred_tree = tree_pipe_scale.predict(X_test)
print(classification_report(y_test, y_pred_tree))

              precision    recall  f1-score   support

           0       0.78      0.75      0.76       103
           1       0.37      0.41      0.38        37

    accuracy                           0.66       140
   macro avg       0.57      0.58      0.57       140
weighted avg       0.67      0.66      0.66       140



In [13]:
knn_scale_cv = model_evaluation(knn_pipe_scale, 'f1')
print_result(knn_scale_cv)

cv score [0.44       0.46511628 0.4        0.6        0.29787234]
cv score mean 0.44059772389906


In [14]:
knn_pipe_scale.fit(X_train_val, y_train_val)
y_pred_knn = knn_pipe_scale.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       103
           1       0.59      0.46      0.52        37

    accuracy                           0.77       140
   macro avg       0.70      0.67      0.68       140
weighted avg       0.76      0.77      0.76       140



**Resampling: UnderSampling**

In [15]:
rus = RandomUnderSampler(random_state = 1899)
X_under, y_under = rus.fit_resample(X_train_val, y_train_val)

In [16]:
logreg_pipe_scale_rus = Pipeline([
    ('scale', StandardScaler()),
    ('rus', rus),
    ('logreg', logreg)
])

tree_pipe_scale_rus = Pipeline([
    ('rus', rus),
    ('tree', tree)
])

knn_pipe_scale_rus = Pipeline([
    ('scale', StandardScaler()),
    ('rus', rus),
    ('knn', knn)
])

In [17]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_under, y_under, cv = skfold, scoring = metric)
    return model_cv

In [18]:
def print_result(model_cv):
    print('cv score', model_cv)
    print('cv score mean', model_cv.mean())

In [19]:
logreg_scale_rus_cv = model_evaluation(logreg_pipe_scale_rus, 'f1')
print_result(logreg_scale_rus_cv)

cv score [0.73015873 0.73333333 0.78571429 0.78571429 0.74509804]
cv score mean 0.7560037348272642


In [20]:
logreg_pipe_scale_rus.fit(X_train_val, y_train_val)
y_pred_logreg_rus = logreg_pipe_scale_rus.predict(X_test)
print(classification_report(y_test, y_pred_logreg_rus))

              precision    recall  f1-score   support

           0       0.90      0.69      0.78       103
           1       0.48      0.78      0.59        37

    accuracy                           0.71       140
   macro avg       0.69      0.74      0.69       140
weighted avg       0.79      0.71      0.73       140



In [21]:
tree_scale_rus_cv = model_evaluation(tree_pipe_scale_rus, 'f1')
print_result(tree_scale_rus_cv)

cv score [0.64615385 0.68965517 0.6        0.73846154 0.60714286]
cv score mean 0.656282682834407


In [22]:
tree_pipe_scale_rus.fit(X_train_val, y_train_val)
y_pred_tree_rus = tree_pipe_scale_rus.predict(X_test)
print(classification_report(y_test, y_pred_tree_rus))

              precision    recall  f1-score   support

           0       0.78      0.59      0.67       103
           1       0.32      0.54      0.40        37

    accuracy                           0.58       140
   macro avg       0.55      0.57      0.54       140
weighted avg       0.66      0.58      0.60       140



In [23]:
knn_scale_rus_cv = model_evaluation(knn_pipe_scale_rus, 'f1')
print_result(knn_scale_rus_cv)

cv score [0.70769231 0.67692308 0.74074074 0.78688525 0.66666667]
cv score mean 0.7157816075848864


In [24]:
knn_pipe_scale_rus.fit(X_train_val, y_train_val)
y_pred_knn_rus = knn_pipe_scale_rus.predict(X_test)
print(classification_report(y_test, y_pred_knn_rus))

              precision    recall  f1-score   support

           0       0.88      0.70      0.78       103
           1       0.47      0.73      0.57        37

    accuracy                           0.71       140
   macro avg       0.67      0.71      0.67       140
weighted avg       0.77      0.71      0.72       140



**Resampling: OverSampling**

In [25]:
ros = RandomOverSampler(random_state = 1899)
X_over, y_over = ros.fit_resample(X_train_val, y_train_val)

In [26]:
logreg_pipe_scale_ros = Pipeline([
    ('scale', StandardScaler()),
    ('ros', ros),
    ('logreg', logreg)
])

tree_pipe_scale_ros = Pipeline([
    ('ros', ros),
    ('tree', tree)
])

knn_pipe_scale_ros = Pipeline([
    ('scale', StandardScaler()),
    ('ros', ros),
    ('knn', knn)
])

In [27]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_over, y_over, cv = skfold, scoring = metric)
    return model_cv

In [28]:
def print_result(model_cv):
    print('cv score', model_cv)
    print('cv score mean', model_cv.mean())

In [29]:
logreg_scale_ros_cv = model_evaluation(logreg_pipe_scale_ros, 'f1')
print_result(logreg_scale_ros_cv)

cv score [0.70520231 0.76923077 0.79532164 0.73972603 0.75903614]
cv score mean 0.7537033781543944


In [30]:
logreg_pipe_scale_ros.fit(X_train_val, y_train_val)
y_pred_logreg_ros = logreg_pipe_scale_ros.predict(X_test)
print(classification_report(y_test, y_pred_logreg_ros))

              precision    recall  f1-score   support

           0       0.91      0.71      0.80       103
           1       0.50      0.81      0.62        37

    accuracy                           0.74       140
   macro avg       0.71      0.76      0.71       140
weighted avg       0.80      0.74      0.75       140



In [31]:
tree_scale_ros_cv = model_evaluation(tree_pipe_scale_ros, 'f1')
print_result(tree_scale_ros_cv)

cv score [0.86363636 0.87719298 0.92222222 0.9010989  0.88770053]
cv score mean 0.8903702008345971


In [32]:
tree_pipe_scale_ros.fit(X_train_val, y_train_val)
y_pred_tree_ros = tree_pipe_scale_ros.predict(X_test)
print(classification_report(y_test, y_pred_tree_ros))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80       103
           1       0.46      0.49      0.47        37

    accuracy                           0.71       140
   macro avg       0.64      0.64      0.64       140
weighted avg       0.72      0.71      0.72       140



In [33]:
knn_scale_ros_cv = model_evaluation(knn_pipe_scale_ros, 'f1')
print_result(knn_scale_ros_cv)

cv score [0.71676301 0.77192982 0.78212291 0.77575758 0.77777778]
cv score mean 0.7648702177810074


In [34]:
knn_pipe_scale_ros.fit(X_train_val, y_train_val)
y_pred_knn_ros = knn_pipe_scale_ros.predict(X_test)
print(classification_report(y_test, y_pred_knn_ros))

              precision    recall  f1-score   support

           0       0.89      0.73      0.80       103
           1       0.50      0.76      0.60        37

    accuracy                           0.74       140
   macro avg       0.70      0.74      0.70       140
weighted avg       0.79      0.74      0.75       140



**Resampling: SMOTE**

In [35]:
smote = SMOTE(random_state = 1899)
X_smote, y_smote = smote.fit_resample(X_train_val, y_train_val)

In [36]:
logreg_pipe_scale_smote = Pipeline([
    ('scale', StandardScaler()),
    ('smote', smote),
    ('logreg', logreg)
])

tree_pipe_scale_smote = Pipeline([
    ('smote', smote),
    ('tree', tree)
])

knn_pipe_scale_smote = Pipeline([
    ('scale', StandardScaler()),
    ('smote', smote),
    ('knn', knn)
])

In [37]:
def model_evaluation(model, metric):
    skfold = StratifiedKFold(n_splits = 5)
    model_cv = cross_val_score(model, X_smote, y_smote, cv = skfold, scoring = metric)
    return model_cv

In [38]:
def print_result(model_cv):
    print('cv score', model_cv)
    print('cv score mean', model_cv.mean())

In [39]:
logreg_scale_smote_cv = model_evaluation(logreg_pipe_scale_smote, 'f1')
print_result(logreg_scale_smote_cv)

cv score [0.72       0.75       0.725      0.81290323 0.74698795]
cv score mean 0.7509782355227361


In [40]:
logreg_pipe_scale_smote.fit(X_train_val, y_train_val)
y_pred_logreg_smote = logreg_pipe_scale_smote.predict(X_test)
print(classification_report(y_test, y_pred_logreg_smote))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82       103
           1       0.53      0.89      0.67        37

    accuracy                           0.76       140
   macro avg       0.74      0.81      0.74       140
weighted avg       0.84      0.76      0.78       140



In [41]:
tree_scale_smote_cv = model_evaluation(tree_pipe_scale_smote, 'f1')
print_result(tree_scale_smote_cv)

cv score [0.69512195 0.78571429 0.77894737 0.80722892 0.79310345]
cv score mean 0.7720231938586726


In [42]:
tree_pipe_scale_smote.fit(X_train_val, y_train_val)
y_pred_tree_smote = tree_pipe_scale_smote.predict(X_test)
print(classification_report(y_test, y_pred_tree_smote))

              precision    recall  f1-score   support

           0       0.82      0.76      0.79       103
           1       0.44      0.54      0.49        37

    accuracy                           0.70       140
   macro avg       0.63      0.65      0.64       140
weighted avg       0.72      0.70      0.71       140



In [43]:
knn_scale_smote_cv = model_evaluation(knn_pipe_scale_smote, 'f1')
print_result(knn_scale_smote_cv)

cv score [0.76300578 0.82222222 0.83428571 0.83977901 0.83870968]
cv score mean 0.8196004799597947


In [44]:
knn_pipe_scale_smote.fit(X_train_val, y_train_val)
y_pred_knn_smote = knn_pipe_scale_smote.predict(X_test)
print(classification_report(y_test, y_pred_knn_smote))

              precision    recall  f1-score   support

           0       0.91      0.77      0.83       103
           1       0.55      0.78      0.64        37

    accuracy                           0.77       140
   macro avg       0.73      0.78      0.74       140
weighted avg       0.81      0.77      0.78       140



# Summary

**Logistic Regression Model**

In [45]:
f1_logreg = f1_score(y_test, y_pred_logreg)
f1_logreg_scale_under = f1_score(y_test, y_pred_logreg_rus)
f1_logreg_scale_over = f1_score(y_test, y_pred_logreg_ros)
f1_logreg_scale_smote = f1_score(y_test, y_pred_logreg_smote)

In [46]:
score_list = [f1_logreg, f1_logreg_scale_under, f1_logreg_scale_over, f1_logreg_scale_smote]
method_name = ['f1 Score Logreg', 'f1 Score Logreg Undersampling', 
              'f1 Score Logreg Oversampling', 'f1 Score Logreg SMOTE']
summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
summary

Unnamed: 0,method,score
0,f1 Score Logreg,0.517241
1,f1 Score Logreg Undersampling,0.591837
2,f1 Score Logreg Oversampling,0.618557
3,f1 Score Logreg SMOTE,0.666667


**Decision Tree Classifier Model**

In [47]:
f1_tree = f1_score(y_test, y_pred_tree)
f1_tree_scale_under = f1_score(y_test, y_pred_tree_rus)
f1_tree_scale_over = f1_score(y_test, y_pred_tree_ros)
f1_tree_scale_smote = f1_score(y_test, y_pred_tree_smote)

In [48]:
score_list = [f1_tree, f1_tree_scale_under, f1_tree_scale_over, f1_tree_scale_smote]
method_name = ['f1 Score Tree', 'f1 Score Tree Undersampling', 
              'f1 Score Tree Oversampling', 'f1 Score Tree SMOTE']
summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
summary

Unnamed: 0,method,score
0,f1 Score Tree,0.384615
1,f1 Score Tree Undersampling,0.40404
2,f1 Score Tree Oversampling,0.473684
3,f1 Score Tree SMOTE,0.487805


**KNN Classifier Model**

In [49]:
f1_knn = f1_score(y_test, y_pred_knn)
f1_knn_scale_under = f1_score(y_test, y_pred_knn_rus)
f1_knn_scale_over = f1_score(y_test, y_pred_knn_ros)
f1_knn_scale_smote = f1_score(y_test, y_pred_knn_smote)

In [50]:
score_list = [f1_knn, f1_knn_scale_under, f1_knn_scale_over, f1_knn_scale_smote]
method_name = ['f1 Score KNN', 'f1 Score KNN Undersampling', 
              'f1 Score KNN Oversampling', 'f1 Score KNN SMOTE']
summary = pd.DataFrame({
    'method': method_name,
    'score': score_list
})
summary

Unnamed: 0,method,score
0,f1 Score KNN,0.515152
1,f1 Score KNN Undersampling,0.568421
2,f1 Score KNN Oversampling,0.602151
3,f1 Score KNN SMOTE,0.644444


# HyperParam Tuning

In [51]:
smote = SMOTE(random_state = 1899)
model = LogisticRegression()
scale = StandardScaler()
estimator = Pipeline([
    ('scale', scale),
    ('balancing', smote),
    ('model', model)
])

In [52]:
hyperparam_space = {
    'balancing__k_neighbors': [2, 5, 10, 15, 20],
    'model__C': [100, 10, 1, 0.1, 0.01, 0.001],
    'model__solver': ['liblinear', 'newton-cg', 'saga', 'lbfgs'],
    'model__max_iter': [100, 200, 300, 400]
}

skfold = StratifiedKFold(n_splits = 5)

In [53]:
grid_search = GridSearchCV(
                estimator,
                param_grid = hyperparam_space,
                cv = skfold,
                scoring = 'f1',
                n_jobs = -1)

In [54]:
grid_search.fit(X_train_val, y_train_val)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('balancing', SMOTE(random_state=1899)),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'balancing__k_neighbors': [2, 5, 10, 15, 20],
                         'model__C': [100, 10, 1, 0.1, 0.01, 0.001],
                         'model__max_iter': [100, 200, 300, 400],
                         'model__solver': ['liblinear', 'newton-cg', 'saga',
                                           'lbfgs']},
             scoring='f1')

In [55]:
estimator.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'scale', 'balancing', 'model', 'scale__copy', 'scale__with_mean', 'scale__with_std', 'balancing__k_neighbors', 'balancing__n_jobs', 'balancing__random_state', 'balancing__sampling_strategy', 'model__C', 'model__class_weight', 'model__dual', 'model__fit_intercept', 'model__intercept_scaling', 'model__l1_ratio', 'model__max_iter', 'model__multi_class', 'model__n_jobs', 'model__penalty', 'model__random_state', 'model__solver', 'model__tol', 'model__verbose', 'model__warm_start'])

In [56]:
print('best score', grid_search.best_score_)
print('best param', grid_search.best_params_)

best score 0.6231147457408557
best param {'balancing__k_neighbors': 5, 'model__C': 0.1, 'model__max_iter': 100, 'model__solver': 'newton-cg'}


# Before Tuning

In [57]:
smote = SMOTE(random_state = 1899)
model = LogisticRegression()
scale = StandardScaler()
estimator = Pipeline([
    ('scale', scale),
    ('balancing', smote),
    ('model', model)
])

In [58]:
estimator.fit(X_train_val, y_train_val)
y_pred = estimator.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82       103
           1       0.53      0.89      0.67        37

    accuracy                           0.76       140
   macro avg       0.74      0.81      0.74       140
weighted avg       0.84      0.76      0.78       140



# After Tuning

In [59]:
grid_search.best_estimator_.fit(X_train_val, y_train_val)
y_pred = grid_search.best_estimator_.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.71      0.80       103
           1       0.51      0.84      0.63        37

    accuracy                           0.74       140
   macro avg       0.72      0.77      0.72       140
weighted avg       0.81      0.74      0.76       140



Setelah hyperparameter tuning, nilai f1 menurun dari 0.67 ke 0.63