# DILAKUKAN SETELAH PRE PROCESSING DAN SEBELUM TUNING

In [93]:
# Library

#Standard
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats

#Data Pre-Processing
import statsmodels.api as sm
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, StandardScaler

#Data Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#Data Output
from sklearn.metrics import accuracy_score,recall_score, confusion_matrix,f1_score
from sklearn.pipeline import Pipeline #menggabungkan proces

warnings.filterwarnings('ignore')

In [94]:
data = pd.read_csv('bankloan.csv')
data.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


In [95]:
# Split X and Y
x = data[['age','employ','income','debtinc','creddebt','othdebt',]]
y = data['default']

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    stratify=y,
    random_state=100,
    test_size=0.2
)

In [96]:
# Model Benchmarking
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
logreg = LogisticRegression()

In [97]:
# Create Manual Cross Validation

skfold = StratifiedKFold(n_splits=5)

logreg_cv = cross_val_score(logreg,xtrain,ytrain, cv=skfold)
tree_cv = cross_val_score(tree,xtrain,ytrain, cv=skfold)
knn_cv = cross_val_score(knn,xtrain,ytrain, cv=skfold)

In [98]:
print('LogisticRegression CV : ',logreg_cv)
print('DecisionTreeClassifier : ',knn_cv)
print('KNeighborsClassifier CV : ',tree_cv)

LogisticRegression CV :  [0.82142857 0.76785714 0.84821429 0.79464286 0.78571429]
DecisionTreeClassifier :  [0.72321429 0.75892857 0.8125     0.72321429 0.76785714]
KNeighborsClassifier CV :  [0.75892857 0.74107143 0.74107143 0.69642857 0.8125    ]


muncul 5 akurasi sesuai dengan k yang disebutkan, setelah didapatkan hasil ini untuk menentukan yang paling bagus ambil mean-nya!

In [99]:
print('LogisticRegression CV : ',logreg_cv.mean())
print('DecisionTreeClassifier : ',knn_cv.mean())
print('KNeighborsClassifier CV : ',tree_cv.mean())

LogisticRegression CV :  0.8035714285714286
DecisionTreeClassifier :  0.7571428571428571
KNeighborsClassifier CV :  0.75


kalau tipis-tipis aja bisa gunakan standar deviasi, makin kecil makin bagus

In [100]:
print('LogisticRegression CV : ',logreg_cv.std())
print('DecisionTreeClassifier : ',knn_cv.std())
print('KNeighborsClassifier CV : ',tree_cv.std())

LogisticRegression CV :  0.028234621965789086
DecisionTreeClassifier :  0.03312006605534181
KNeighborsClassifier CV :  0.0374574588632197


bisa dibuat looping

In [104]:
tree = DecisionTreeClassifier(max_depth=10, criterion='entropy')
knn = KNeighborsClassifier(n_neighbors=10)
logreg = LogisticRegression()
svm = SVC(kernel='rbf')

skfold = StratifiedKFold(n_splits=5)

model = [tree, knn, logreg, svm]
score = []
recall = []
std = []

for i in model:
    model_cv = cross_val_score(i,xtrain,ytrain,cv=skfold, scoring='recall') #
    score.append(model_cv)
    recall.append(model_cv.mean())
    std.append(model_cv.std())

result = pd.DataFrame({
    'Model' : ['DecisionTreeClassifier','KNeighborsClassifier','LogisticRegression','SVM'],
    'Mean Recall' : recall, #Recall ->  nilai akurasi fokus pada FN
    'Standard Deviation' : std
}).sort_values('Mean Recall',ascending=False)

In [105]:
result

Unnamed: 0,Model,Mean Recall,Standard Deviation
2,LogisticRegression,0.45908,0.036807
0,DecisionTreeClassifier,0.437241,0.114208
1,KNeighborsClassifier,0.294253,0.069576
3,SVM,0.178391,0.067456
