In [16]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

# train test split
from sklearn.model_selection import train_test_split

# impute missing values
from sklearn.impute import SimpleImputer # mean, median, most_frequent (mode), constant
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # regresi
from sklearn.impute import KNNImputer # regresi KKN

# encoding
from sklearn.preprocessing import OneHotEncoder
from category_encoders import OrdinalEncoder, BinaryEncoder

# scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 

# column transformer & pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE 
from imblearn.under_sampling import RandomUnderSampler, NearMiss 

# cross validation
from sklearn.model_selection import cross_val_score

# algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [6]:
#Defining Function

def dataDescription(df):
    tempList = []
    for col in df.columns:
        tempList.append(
            [col,
            df[col].dtype,
            df[col].isna().sum(),
            round(df[col].isna().sum()/len(df)*100,2),
            df[col].nunique(),
            #list(df[col].drop_duplicates().sample(5,replace=True).values)
            list(df[col].drop_duplicates().sort_values().values)
            ]
        )

    descData = pd.DataFrame(data = tempList,
                            columns = ['Col','Data Type','Missing Value', 'Pct Missing Value','Num Unique','Unique Sample']
                            )
    display(descData)

def normalCheckShapiro(data):

    _, p_value = stats.shapiro(data)

    alpha = 0.05
    if p_value > alpha:
        print("The data is normally distributed.")
    else:
        print("The data is not normally distributed.")


In [4]:
# load dataset
df = pd.read_csv('bankloan.csv')
df.head() 

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


In [8]:
#Descriptive Analysis
display(df.info(),df.describe(),df.isnull().sum(),df.head(),dataDescription(df),df.corr('spearman'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       700 non-null    int64  
 1   ed        700 non-null    int64  
 2   employ    700 non-null    int64  
 3   address   700 non-null    int64  
 4   income    700 non-null    int64  
 5   debtinc   700 non-null    float64
 6   creddebt  700 non-null    float64
 7   othdebt   700 non-null    float64
 8   default   700 non-null    int64  
dtypes: float64(3), int64(6)
memory usage: 49.3 KB


Unnamed: 0,Col,Data Type,Missing Value,Pct Missing Value,Num Unique,Unique Sample
0,age,int64,0,0.0,37,"[20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 3..."
1,ed,int64,0,0.0,5,"[1, 2, 3, 4, 5]"
2,employ,int64,0,0.0,32,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
3,address,int64,0,0.0,31,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
4,income,int64,0,0.0,114,"[14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2..."
5,debtinc,float64,0,0.0,231,"[0.4, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, ..."
6,creddebt,float64,0,0.0,695,"[0.011696, 0.014835, 0.024528, 0.024576, 0.025..."
7,othdebt,float64,0,0.0,699,"[0.045584, 0.089488, 0.100926, 0.10752, 0.1295..."
8,default,int64,0,0.0,2,"[0, 1]"


None

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,34.86,1.722857,8.388571,8.278571,45.601429,10.260571,1.553553,3.058209,0.261429
std,7.997342,0.928206,6.658039,6.824877,36.814226,6.827234,2.117197,3.287555,0.439727
min,20.0,1.0,0.0,0.0,14.0,0.4,0.011696,0.045584,0.0
25%,29.0,1.0,3.0,3.0,24.0,5.0,0.369059,1.044178,0.0
50%,34.0,1.0,7.0,7.0,34.0,8.6,0.854869,1.987567,0.0
75%,40.0,2.0,12.0,12.0,55.0,14.125,1.901955,3.923065,1.0
max,56.0,5.0,31.0,34.0,446.0,41.3,20.56131,27.0336,1.0


age         0
ed          0
employ      0
address     0
income      0
debtinc     0
creddebt    0
othdebt     0
default     0
dtype: int64

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1
1,27,1,10,6,31,17.3,1.362202,4.000798,0
2,40,1,15,14,55,5.5,0.856075,2.168925,0
3,41,1,15,14,120,2.9,2.65872,0.82128,0
4,24,2,2,0,28,17.3,1.787436,3.056564,1


None

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
age,1.0,0.00301,0.529993,0.561612,0.585356,0.010888,0.314232,0.343798,-0.157212
ed,0.00301,1.0,-0.154336,0.048915,0.202078,0.006499,0.096669,0.130843,0.123968
employ,0.529993,-0.154336,1.0,0.303133,0.712129,-0.070538,0.32918,0.340974,-0.311875
address,0.561612,0.048915,0.303133,1.0,0.362405,0.035499,0.246965,0.237814,-0.16953
income,0.585356,0.202078,0.712129,0.362405,1.0,-0.016446,0.510601,0.539169,-0.152505
debtinc,0.010888,0.006499,-0.070538,0.035499,-0.016446,1.0,0.624044,0.734347,0.358176
creddebt,0.314232,0.096669,0.32918,0.246965,0.510601,0.624044,1.0,0.622144,0.207304
othdebt,0.343798,0.130843,0.340974,0.237814,0.539169,0.734347,0.622144,1.0,0.132298
default,-0.157212,0.123968,-0.311875,-0.16953,-0.152505,0.358176,0.207304,0.132298,1.0


In [11]:
X = df[['age','employ','debtinc','creddebt','othdebt']]
y = df['default']

In [12]:
y.value_counts() / len(y)
# imbalance

default
0    0.738571
1    0.261429
Name: count, dtype: float64

# Data Spliting

In [13]:
# Data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

# Pre Processing

In [77]:
# define resampler
over = RandomOverSampler(random_state=0)

# fit & resample
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

# define resampler
under = RandomUnderSampler(random_state=0)

# fit & resample
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

In [91]:
pipe_model = Pipeline([
    ('resample', over),
    ('scaler', RobustScaler())
    #('model', model)
])

# transformer = ColumnTransformer([
    
# ])

# fit
# pipe_model.fit(X_train, y_train)

# # predict
# y_pred_over = pipe_model.predict(X_test)

# # f1 score
# f1_score(y_test, y_pred_over)

In [92]:
pipe_model

In [100]:
model = RandomForestClassifier(n_estimators=1,criterion='gini',max_features=4)
#Untuk Estimator
pipe_grid = Pipeline([
        ('resample', over),
        ('scaler', RobustScaler()),
        #('preprocessing', pipe_model),
        ('modeling', model)
    ])

#untuk param+gtid
hyperparam = {
    'modeling__n_estimators':np.arange(50,200,1)#, #hyperparam untuk log reg #Flatting linear line agar lebih tergenalisir sehingga tidak terjadi overfitting (dekat dengan y test)
    #'modeling__criterion':['gini','entropy'], #hyperparam untuk knn
    #'modeling__max_depth': np.arange(1,11,1),
    #'modeling__min_samples_split': np.arange(2,20,2) #hyperparam untuk dec tree
}

grid = GridSearchCV(
    estimator=pipe_grid,
    param_grid= hyperparam,
    cv=5,
    scoring='f1',
    n_jobs= -1
)

In [101]:
pipe_grid

In [102]:
grid.fit(X_train, y_train)

In [103]:
df_grid = pd.DataFrame(grid.cv_results_).sort_values('rank_test_score')
df_grid.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_modeling__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
79,0.967924,0.061177,0.072343,0.003268,129,{'modeling__n_estimators': 129},0.5,0.482759,0.676471,0.655172,0.509091,0.564699,0.083272,1
33,1.224338,0.032824,0.096378,0.004823,83,{'modeling__n_estimators': 83},0.586207,0.5,0.637681,0.581818,0.508475,0.562836,0.051792,2
88,1.055583,0.071344,0.063095,0.003403,138,{'modeling__n_estimators': 138},0.481481,0.491228,0.676471,0.618182,0.517241,0.556921,0.076976,3
116,1.577739,0.212965,0.075901,0.007972,166,{'modeling__n_estimators': 166},0.509091,0.491228,0.65625,0.607143,0.517241,0.556191,0.06413,4
143,1.413338,0.02161,0.08411,0.004995,193,{'modeling__n_estimators': 193},0.535714,0.491228,0.647059,0.642857,0.45614,0.5546,0.077981,5


In [104]:
df_grid.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_modeling__n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

50 - 200
random oversampler