In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('./Total_0823.csv', encoding='euc-kr', index_col=0)

In [11]:
# 대출연체에 대한 분류모델이므로 기존 해약건수의 컬럼의 파생변수인 Target, Target_year의 값과 상관도가 너무 높기에 제거하고 모델링
df.drop(['Target', 'Target_year'], axis=1, inplace=True)

In [12]:
df.drop('CUST_ID', axis=1, inplace=True)

In [133]:
df['TARGET'].value_counts()

0    91424
1     3856
Name: TARGET, dtype: int64

# Get_dummies

In [13]:
from sklearn.model_selection import train_test_split


df_dummy = pd.get_dummies(df)

df_x = df_dummy.drop('TARGET', axis=1)
df_y = df_dummy['TARGET']

train_x, test_x, train_y, test_y = train_test_split(df_x, df_y, test_size=0.3, random_state=1234)

In [14]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report



tree = DecisionTreeClassifier(random_state = 2022)
tree.fit(train_x, train_y)

print(f"Train Accuracy: {tree.score(train_x, train_y)}")
print(f"Test Accuracy: {tree.score(test_x, test_y)}")


Train Accuracy: 1.0
Test Accuracy: 0.9314301707248811


# SMOTE

In [15]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy='auto')

x_resampled, y_resampeld = sm.fit_resample(train_x, train_y)

print('resamp 전 train x,y 수:', train_x.shape, train_y.shape)
print('resamp 후 train x,y 수: ', x_resampled.shape, y_resampeld.shape)
print('resamp 전 y: \n', train_y.value_counts())
print('resamp 후 y = 1 수:', sum(y_resampeld==1))
print('resamp 후 y = 0 수:', sum(y_resampeld==0)) 

resamp 전 train x,y 수: (66696, 66) (66696,)
resamp 후 train x,y 수:  (128038, 66) (128038,)
resamp 전 y: 
 0    64019
1     2677
Name: TARGET, dtype: int64
resamp 후 y = 1 수: 64019
resamp 후 y = 0 수: 64019


In [87]:
df_resampled = pd.concat([x_resampled, y_resampeld], axis=1)
df_resampled

Unnamed: 0,CUST_JOB_INCM,HSHD_INFR_INCM,ACTL_FMLY_NUM,CUST_FMLY_NUM,LAST_CHLD_AGE,CRDT_LOAN_CNT,MIN_CNTT_DATE,TOT_CRLN_AMT,TOT_REPY_AMT,CRLN_OVDU_RATE,...,OCCP_NAME_G_단순 노무직,OCCP_NAME_G_단순 사무직,OCCP_NAME_G_사무직,OCCP_NAME_G_예체능계 종사자,OCCP_NAME_G_운전직,OCCP_NAME_G_자영업,OCCP_NAME_G_전문직,OCCP_NAME_G_주부,OCCP_NAME_G_학생,TARGET
0,6000,16900,2,1,29.000000,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,4900,8500,4,2,29.000000,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3800,8300,3,1,29.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4800,4800,4,1,14.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4500,5800,2,1,0.000000,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128033,3600,9673,3,1,23.647326,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
128034,6967,11553,4,2,20.504329,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
128035,4462,7350,4,2,11.388565,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
128036,3600,9930,1,1,0.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [88]:
df_resampled.corr()['TARGET'].sort_values(ascending=False).reset_index().head(15)

Unnamed: 0,index,TARGET
0,TARGET,1.0
1,SPTCT_OCCR_MDIF,0.317496
2,SPART_LNIF_CNT,0.236045
3,Category,0.191789
4,LT1Y_PEOD_RATE,0.126058
5,CRDT_OCCR_MDIF,0.123772
6,CPT_LNIF_CNT,0.096819
7,CPT_LNIF_AMT,0.089966
8,AUTR_FAIL_MCNT,0.064304
9,ECT_LNIF_CNT,0.051587


In [89]:
df_resampled.corr()['TARGET'].sort_values(ascending=False).reset_index().tail(15)

Unnamed: 0,index,TARGET
52,OCCP_NAME_G_전문직,-0.141585
53,OCCP_NAME_G_공무원,-0.145967
54,FMLY_PLPY_CNT,-0.165594
55,OCCP_NAME_G_3차산업 종사자,-0.167805
56,ACTL_FMLY_NUM,-0.174518
57,OCCP_NAME_G_자영업,-0.182869
58,OCCP_NAME_G_2차산업 종사자,-0.183243
59,SEX,-0.217999
60,OCCP_NAME_G_사무직,-0.220112
61,TOT_LNIF_AMT,-0.234703


# 평가지표

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import roc_auc_score


def get_eval(test_y, pred=None):
    confusion = confusion_matrix(test_y, pred)
    accuracy = accuracy_score(test_y, pred)
    precision = precision_score(test_y, pred)
    recall = recall_score(test_y, pred)
    f1 = f1_score(test_y, pred)
    
    roc_auc = roc_auc_score(test_y, pred)
    print("오차행렬")
    print(confusion)
    
    print(f"정확도: {accuracy}, 정밀도: {precision}, 재현율: {recall}, F1: {f1}, AUC: {roc_auc}")

# DecisionTRee

In [16]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report



tree = DecisionTreeClassifier(random_state = 2022)
tree.fit(x_resampled, y_resampeld)

print(f"Train Accuracy: {tree.score(x_resampled, y_resampeld)}")
print(f"Test Accuracy: {tree.score(test_x, test_y)}")



Train Accuracy: 1.0
Test Accuracy: 0.919045619927232


In [140]:
y_pred = tree.predict(test_x)
print("Confusion Matrix:\n ", confusion_matrix(test_y, y_pred))

print(classification_report(test_y, y_pred, digits=3))

Confusion Matrix:
  [[26005  1400]
 [  900   279]]
              precision    recall  f1-score   support

           0      0.967     0.949     0.958     27405
           1      0.166     0.237     0.195      1179

    accuracy                          0.920     28584
   macro avg      0.566     0.593     0.576     28584
weighted avg      0.934     0.920     0.926     28584



In [None]:
from sklearn.tree import export_graphviz
import graphviz
v_feature_name = train_x.columns
export_graphviz(tree, out_file='tree_nature.dot', class_names=['연체X','연체O'],
                feature_names = v_feature_name, impurity = True, filled = True)

with open('tree_nature.dot') as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))


# LightGBM

In [141]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

lgbm_clf = LGBMClassifier(n_estimator=500)

evals = [(test_x, test_y)]
lgbm_clf.fit(x_resampled, y_resampeld, early_stopping_rounds=100, eval_metric='f1', eval_set = evals, verbose=False)

lgbm_roc_score = roc_auc_score(test_y, lgbm_clf.predict_proba(test_x)[:,1], average='macro')
print(f'ROC AUC: {round(lgbm_roc_score,3)}')

ROC AUC: 0.889


## GRidSearch

In [143]:
from sklearn.model_selection import GridSearchCV

lgbm = LGBMClassifier()

params = {'num_leaves':[32, 64],
          'max_depth': [128,160],
          'min_child_samples': [60,100],
          'sub_sample': [0.8,1],
          'learning_rate': [0.1,0.01,0.5]}

gridCv = GridSearchCV(lgbm, param_grid=params, cv=3)
gridCv.fit(x_resampled, y_resampeld, early_stopping_rounds = 30, eval_metric='f1', eval_set=[(x_resampled, y_resampeld), (test_x,test_y)])

print('GridSeach 최적 파라미터: ', gridCv.best_params_)
print("Best score ", gridCv.best_score_.round(3) )

[1]	valid_0's binary_logloss: 0.628799	valid_1's binary_logloss: 0.63019
[2]	valid_0's binary_logloss: 0.574219	valid_1's binary_logloss: 0.57679
[3]	valid_0's binary_logloss: 0.5299	valid_1's binary_logloss: 0.533199
[4]	valid_0's binary_logloss: 0.491868	valid_1's binary_logloss: 0.495935
[5]	valid_0's binary_logloss: 0.45767	valid_1's binary_logloss: 0.462304
[6]	valid_0's binary_logloss: 0.429688	valid_1's binary_logloss: 0.434448
[7]	valid_0's binary_logloss: 0.403196	valid_1's binary_logloss: 0.408621
[8]	valid_0's binary_logloss: 0.379917	valid_1's binary_logloss: 0.385735
[9]	valid_0's binary_logloss: 0.358947	valid_1's binary_logloss: 0.365271
[10]	valid_0's binary_logloss: 0.33714	valid_1's binary_logloss: 0.344273
[11]	valid_0's binary_logloss: 0.31972	valid_1's binary_logloss: 0.327408
[12]	valid_0's binary_logloss: 0.303172	valid_1's binary_logloss: 0.311506
[13]	valid_0's binary_logloss: 0.289258	valid_1's binary_logloss: 0.298202
[14]	valid_0's binary_logloss: 0.275664	v

In [147]:
lgbm_clf = LGBMClassifier( min_child_samples=100, max_depth=128, num_leaves = 64, reg_alpha=1.0)

evals = [(test_x, test_y)]

lgbm_clf.fit(x_resampled, y_resampeld, early_stopping_rounds=100, eval_metric=['f1', 'accuracy'], eval_set=evals, verbose=True)

pred_y = lgbm_clf.predict(test_x)
print("Train :", lgbm_clf.score(x_resampled, y_resampeld))
print('Test: ', lgbm_clf.score(test_x, test_y))
print("Confusion Matrix:\n ", confusion_matrix(test_y, pred_y))

print(classification_report(test_y, pred_y, digits=3))


[1]	valid_0's binary_logloss: 0.626995
[2]	valid_0's binary_logloss: 0.572424
[3]	valid_0's binary_logloss: 0.526577
[4]	valid_0's binary_logloss: 0.486353
[5]	valid_0's binary_logloss: 0.452282
[6]	valid_0's binary_logloss: 0.422578
[7]	valid_0's binary_logloss: 0.395473
[8]	valid_0's binary_logloss: 0.372608
[9]	valid_0's binary_logloss: 0.350448
[10]	valid_0's binary_logloss: 0.331603
[11]	valid_0's binary_logloss: 0.314663
[12]	valid_0's binary_logloss: 0.29812
[13]	valid_0's binary_logloss: 0.283707
[14]	valid_0's binary_logloss: 0.271391
[15]	valid_0's binary_logloss: 0.260029
[16]	valid_0's binary_logloss: 0.249453
[17]	valid_0's binary_logloss: 0.240173
[18]	valid_0's binary_logloss: 0.231293
[19]	valid_0's binary_logloss: 0.223065
[20]	valid_0's binary_logloss: 0.215875
[21]	valid_0's binary_logloss: 0.209472
[22]	valid_0's binary_logloss: 0.202832
[23]	valid_0's binary_logloss: 0.196773
[24]	valid_0's binary_logloss: 0.191216
[25]	valid_0's binary_logloss: 0.186529
[26]	valid

In [148]:
df_importance = pd.DataFrame()
df_importance['Feature'] = train_x.columns
df_importance['Importance'] = lgbm_clf.feature_importances_

df_importance.sort_values('Importance', ascending=False, inplace=True)
df_importance.round(3).head(10)

Unnamed: 0,Feature,Importance
44,SPTCT_OCCR_MDIF,412
43,CRDT_OCCR_MDIF,393
39,TOT_LNIF_AMT,361
42,CPT_LNIF_AMT,331
46,CTCD_OCCR_MDIF,318
33,AGE,293
4,LAST_CHLD_AGE,275
45,CRDT_CARD_CNT,271
15,LT1Y_PEOD_RATE,269
41,BNK_LNIF_AMT,259


# Random Forest

In [149]:
from sklearn.ensemble import RandomForestClassifier

params = {
    'n_estimators': [100],
    'max_depth': [6,8,10,12],
    'min_samples_leaf': [8,12,18],
    'min_samples_split' : [8,16,20]
}

rf_clf = RandomForestClassifier(random_state=1234, n_jobs=-1)
gridCv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
gridCv.fit(x_resampled, y_resampeld)


print('GridSeach 최적 파라미터: ', gridCv.best_params_)
print("Best score ", gridCv.best_score_.round(3) )


GridSeach 최적 파라미터:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
Best score  0.95


In [161]:
rf_clf = RandomForestClassifier(random_state=1234, n_jobs=-1, max_depth=12, min_samples_leaf=8, min_samples_split=20, n_estimators=100)



rf_clf.fit(x_resampled, y_resampeld)

pred_y = rf_clf.predict(test_x)

print('Accuracy Test: ', accuracy_score(test_y, pred_y))

print("Confusion Matrix:\n ", confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y, digits=3))



Accuracy Test:  0.9273369717324377
Confusion Matrix:
  [[26052  1353]
 [  724   455]]
              precision    recall  f1-score   support

           0      0.973     0.951     0.962     27405
           1      0.252     0.386     0.305      1179

    accuracy                          0.927     28584
   macro avg      0.612     0.668     0.633     28584
weighted avg      0.943     0.927     0.935     28584



# XGBOost

In [163]:
from xgboost import XGBClassifier


params = {
    'n_estimators': [100,300],
    'max_depth': [6,8,10,12],
    'min_samples_leaf': [8,12,18],
    'min_samples_split' : [8,16,20]
}

xg_clf = XGBClassifier(random_state=1234, n_jobs=-1)
gridCv = GridSearchCV(xg_clf, param_grid=params, cv=2, n_jobs=-1)
gridCv.fit(x_resampled, y_resampeld)


print('GridSeach 최적 파라미터: ', gridCv.best_params_)
print("Best score ", gridCv.best_score_.round(3) )

Parameters: { "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "min_samples_leaf", "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issu

In [171]:
import xgboost as xgb

xg_clf = XGBClassifier( min_samples_leaf =8, min_samples_split = 8, max_depth=6, n_estimators = 300)


evals = [(test_x,test_y)]
#xg_clf = xgb.train(early_stopping_rounds=100, evals =evals,  min_samples_leaf =8, min_samples_split = 8, max_depth=6, n_estimators = 300)

xg_clf.fit(x_resampled, y_resampeld, early_stopping_rounds=150, eval_metric='f1', eval_set = evals)

pred_y = xg_clf.predict(test_x)
print("Train :",xg_clf.score(x_resampled, y_resampeld))
print('Test: ', xg_clf.score(test_x, test_y))
print("Confusion Matrix:\n ", confusion_matrix(test_y, pred_y))

print(classification_report(test_y, pred_y, digits=3))

XGBoostError: [11:57:44] /Users/runner/miniforge3/conda-bld/xgboost-split_1660208952489/work/src/metric/metric.cc:49: Unknown metric function f1
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000297233c0c dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x000000029733e9e0 xgboost::Metric::Create(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, xgboost::GenericParameter const*) + 140
  [bt] (2) 3   libxgboost.dylib                    0x000000029731578c xgboost::LearnerConfiguration::ConfigureMetrics(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > > const&) + 220
  [bt] (3) 4   libxgboost.dylib                    0x000000029730b850 xgboost::LearnerConfiguration::Configure() + 1108
  [bt] (4) 5   libxgboost.dylib                    0x00000002972490f8 XGBoosterBoostedRounds + 116
  [bt] (5) 6   libffi.8.dylib                      0x000000010192004c ffi_call_SYSV + 76
  [bt] (6) 7   libffi.8.dylib                      0x000000010191d74c ffi_call_int + 1208
  [bt] (7) 8   _ctypes.cpython-38-darwin.so        0x00000001018f851c _ctypes_callproc + 1196
  [bt] (8) 9   _ctypes.cpython-38-darwin.so        0x00000001018f2780 PyCFuncPtr_call + 1168



In [172]:
# 분류모델 통합 평가: 혼동행렬, 정확도, 정밀도, 재현율, F1, AUC 등
def eval_class_model(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    AUC = roc_auc_score(y_test, y_pred)
    
    print('오차행렬:\n', confusion, '\n')
    print('정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1    : {:.4f}'.format(F1))
    print('AUC   : {:.4f}'.format(AUC))

In [95]:
pred_y = lgbm_clf.predict(test_x)
get_eval(test_y, pred_y)



오차행렬
[[19092   105]
 [  709 18506]]
정확도: 0.9788087056128293, 정밀도: 0.9943581752726882, 재현율: 0.9631017434296123, F1: 0.9784804102997938, AUC: 0.9788160694019448


In [107]:
lgbm_clf.score(train_x, train_y)

0.9844911074911298

In [96]:
df_importance = pd.DataFrame()
df_importance['Feature'] = train_x.columns
df_importance['Importance'] = lgbm_clf.feature_importances_

df_importance.sort_values('Importance', ascending=False, inplace=True)
df_importance.round(3).head(10)

Unnamed: 0,Feature,Importance
1,HSHD_INFR_INCM,734
39,TOT_LNIF_AMT,704
44,SPTCT_OCCR_MDIF,618
43,CRDT_OCCR_MDIF,603
33,AGE,523
46,CTCD_OCCR_MDIF,517
24,MAX_MON_PREM,515
26,FMLY_TOT_PREM,507
0,CUST_JOB_INCM,502
40,TOT_CLIF_AMT,491


In [80]:
df

Unnamed: 0,CUST_ID,OCCP_NAME_G,CUST_JOB_INCM,HSHD_INFR_INCM,ACTL_FMLY_NUM,CUST_FMLY_NUM,LAST_CHLD_AGE,CRDT_LOAN_CNT,MIN_CNTT_DATE,TOT_CRLN_AMT,...,BNK_LNIF_AMT,CPT_LNIF_AMT,CRDT_OCCR_MDIF,SPTCT_OCCR_MDIF,CRDT_CARD_CNT,CTCD_OCCR_MDIF,CB_GUIF_CNT,CB_GUIF_AMT,Auto_fail_cnt,Category
0,1,공무원,5400,7700,4,1,24.0,0,0,0,...,9001,0,1,0,2,13,3,420001,1,1
1,2,자영업,5500,8100,4,2,29.0,0,0,0,...,24001,0,0,0,2,121,0,0,0,1
2,3,주부,3600,4900,4,1,34.0,0,0,0,...,0,3001,1,25,4,121,0,0,0,4
3,4,학생,3600,10100,2,1,0.0,0,0,0,...,0,3001,1,25,4,61,0,0,0,4
4,5,공무원,4800,4800,4,1,14.0,0,0,0,...,21001,0,1,0,1,97,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95275,102248,자영업,5500,12900,3,3,34.0,0,0,0,...,57001,0,37,109,5,121,0,0,1,4
95276,102249,운전직,4600,9800,5,2,19.0,0,0,0,...,54001,0,85,0,6,121,0,0,0,1
95277,102250,자영업,4800,10400,4,2,14.0,0,0,0,...,27001,0,1,0,2,121,0,0,1,1
95278,102251,사무직,4200,4200,1,1,0.0,0,0,0,...,0,0,1,1,3,121,0,0,0,4
