In [16]:
import warnings
import graphviz
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.tools.eval_measures import rmse
from statsmodels.stats.outliers_influence import variance_inflation_factor # 다중공선성 패키지 불러오기
from sklearn.linear_model import LinearRegression # sklearn 선형회귀 모형 -> 변수 선택법을 위함
from sklearn.feature_selection import RFE # 변수 선택법 (후진 제거법)
from sklearn.preprocessing import scale # 데이터 scale
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, fbeta_score
# Import ML Classification algos
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
warnings.filterwarnings('ignore')
%matplotlib inline
matplotlib.rc('font', family='NanumBarunGothic')
plt.rcParams['axes.unicode_minus']=False
ds_hmeq = pd.read_csv("./Data/HMEQ.csv", engine='python')
ds_hmeq.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320.0,HomeImp,Other,9.0,0,0.0,101.466002,1.0,8,37.113614
1,1,1800,28502,43034.0,HomeImp,Other,11.0,0,0.0,88.76603,0.0,8,36.884894
2,0,2300,102370,120953.0,HomeImp,Office,2.0,0,0.0,90.992533,0.0,13,31.588503
3,1,2400,34863,47471.0,HomeImp,Mgr,12.0,0,0.0,70.49108,1.0,21,38.263601
4,0,2400,98449,117195.0,HomeImp,Office,4.0,0,0.0,93.811775,0.0,13,29.681827


In [17]:
ds_hmeq["JOB"].fillna("Other",inplace=True)
ds_hmeq.fillna(ds_hmeq.mean(),inplace=True)

In [18]:
ds_hmeq_dummy = pd.get_dummies(ds_hmeq)
ds_hmeq_dummy.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,1,1700,30548,40320.0,9.0,0,0.0,101.466002,1.0,8,37.113614,0,1,0,0,1,0,0,0
1,1,1800,28502,43034.0,11.0,0,0.0,88.76603,0.0,8,36.884894,0,1,0,0,1,0,0,0
2,0,2300,102370,120953.0,2.0,0,0.0,90.992533,0.0,13,31.588503,0,1,0,1,0,0,0,0
3,1,2400,34863,47471.0,12.0,0,0.0,70.49108,1.0,21,38.263601,0,1,1,0,0,0,0,0
4,0,2400,98449,117195.0,4.0,0,0.0,93.811775,0.0,13,29.681827,0,1,0,1,0,0,0,0


In [19]:
# random seed 고정 매번 실행 시 같은 값을 얻음
np.random.seed(seed = 1234)
# 0.7(70%) 보다 작으면 True: train 데이터, 아니면 False: test 데이터
msk = np.random.rand(ds_hmeq_dummy.shape[0], ) < 0.7
ds_hmeq_train = ds_hmeq_dummy[msk]
ds_hmeq_test = ds_hmeq_dummy[~msk]
# train/test 데이터의 목표변수 설명변수 지정
ds_hmeq_train_y = ds_hmeq_train["BAD"]
ds_hmeq_train_x = ds_hmeq_train.drop("BAD", axis = 1, inplace = False)
ds_hmeq_test_y = ds_hmeq_test["BAD"]
ds_hmeq_test_x = ds_hmeq_test.drop("BAD", axis = 1, inplace = False)
# train 데이터와 test 데이터 크기
print("train data X size : {}".format(ds_hmeq_train_x.shape))
print("train data Y size : {}".format(ds_hmeq_train_y.shape))
print("test data X size : {}".format(ds_hmeq_test_x.shape))
print("test data Y size : {}".format(ds_hmeq_test_y.shape))

train data X size : (2604, 18)
train data Y size : (2604,)
test data X size : (1144, 18)
test data Y size : (1144,)


In [37]:
# train 데이터셋 스케일 조정
ds_hmeq_train_x_scaled= scale(ds_hmeq_train_x, axis=0)
# test 데이터셋 스케일 조정
ds_hmeq_test_x_scaled =scale(ds_hmeq_test_x, axis = 0)
# 스케일이 변경된 X변수 확인
pd.DataFrame(ds_hmeq_train_x_scaled, columns = ds_hmeq_train_x.columns).head()

model = ["DecisionTree","RandomForest","GradientBoosting","SVM"]
train_accuracy = []
test_accuracy = []
model_f1_score = []

In [38]:
tree = DecisionTreeClassifier(criterion = "gini",max_depth=10,min_samples_leaf=30,random_state=1234)
tree.fit(ds_hmeq_train_x_scaled,ds_hmeq_train_y)
train_accuracy.append(tree.score(ds_hmeq_train_x_scaled,ds_hmeq_train_y))
test_accuracy.append(tree.score(ds_hmeq_test_x_scaled,ds_hmeq_test_y))
model_f1_score.append(f1_score(ds_hmeq_test_y,tree.predict(ds_hmeq_test_x_scaled)))

In [39]:
rf = RandomForestClassifier(criterion="gini",n_estimators=500,max_depth=4,min_samples_leaf=30,random_state=1234)
rf.fit(ds_hmeq_train_x,ds_hmeq_train_y)
train_accuracy.append(rf.score(ds_hmeq_train_x_scaled,ds_hmeq_train_y))
test_accuracy.append(rf.score(ds_hmeq_test_x_scaled,ds_hmeq_test_y))

model_f1_score.append(f1_score(ds_hmeq_test_y,rf.predict(ds_hmeq_test_x_scaled)))

In [40]:
gb=GradientBoostingClassifier(learning_rate=0.1,max_depth=4,min_impurity_split=30,n_estimators=5,random_state=1234)
gb.fit(ds_hmeq_train_x_scaled,ds_hmeq_train_y)
train_accuracy.append(gb.score(ds_hmeq_train_x_scaled,ds_hmeq_train_y))
test_accuracy.append(gb.score(ds_hmeq_test_x_scaled,ds_hmeq_test_y))
model_f1_score.append(f1_score(ds_hmeq_test_y,rf.predict(ds_hmeq_test_x_scaled)))

In [41]:
svm=SVC(C=6,gamma=0.1,random_state=1234)
svm.fit(ds_hmeq_train_x_scaled,ds_hmeq_train_y)
train_accuracy.append(svm.score(ds_hmeq_train_x_scaled,ds_hmeq_train_y))
test_accuracy.append(svm.score(ds_hmeq_test_x_scaled,ds_hmeq_test_y))
model_f1_score.append(f1_score(ds_hmeq_test_y,rf.predict(ds_hmeq_test_x_scaled)))

In [43]:
ds_eval=pd.DataFrame(index=model)
ds_eval["TrainAccuracy"]=train_accuracy
ds_eval["TestAccuracy"]=test_accuracy
ds_eval["F1Score"]=model_f1_score
ds_eval.round(3)

Unnamed: 0,TrainAccuracy,TestAccuracy,F1Score
DecisionTree,0.929,0.927,0.42
RandomForest,0.907,0.91,0.0
GradientBoosting,0.907,0.91,0.0
SVM,0.981,0.957,0.0
