### kaggle 데이터 다운

In [None]:
# Fetch the competition files with the Kaggle CLI (shell escape).
# NOTE(review): the load cell below reads ./kaggle_data/santander-customer-satisfaction/train.csv,
# so the download presumably has to be unpacked into that folder first — confirm.
!kaggle competitions download -c santander-customer-satisfaction

### 목표
- 370개의 피처로 구성된 데이터 세트를 기반으로 고객 만족 여부를 예측
- 클래스 레이블 명 : TARGET => 1이면 불만을 가진 고객, 0이면 만족한 고객

대부분 만족하는 데이터이며, 불만족인 데이터는 일부일 것이라, 정확도 수치보다는 ROC-AUC가 적합

### 라이브러리 불러오기

In [49]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib

import warnings

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

from hyperopt import hp

In [2]:
# Silence all warnings for the remainder of the notebook run.
warnings.filterwarnings('ignore')

# Read the competition training set into a DataFrame.
train_csv_path = './kaggle_data/santander-customer-satisfaction/train.csv'
cust_df = pd.read_csv(train_csv_path)

In [3]:
# Quick look at the freshly loaded data.
# `tmp` keeps the pre-cleaning shape so the later "final check" cell can print it
# next to the shape after preprocessing.
tmp = cust_df.shape
print(f'shape: {tmp}')
print(f'columns : {cust_df.columns}')
print(f'top 3, {cust_df.head(3)}')
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print(f'...') appended a stray "None" line to the output.
cust_df.info()

shape: (76020, 371)
columns : Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)
top 3,    ID  var3  var15  imp_ent_var16_ult1  imp_op_var39_comer_ult1  \
0   1     2     23                 0.0                      0.0   
1   3     2     34                 0.0                      0.0   
2   4     2     23                 0.0                      0.0   

   imp_op_var39_comer_ult3  imp_op_var40_comer_ult1  imp_op_var40_comer_ult3  \
0                      0.0                      0.0                      0

In [4]:
# Share (in %) of rows whose TARGET is 0 — the satisfied customers — rounded to
# two decimals. Assuming TARGET only holds 0 and 1, the share of TARGET == 1
# (dissatisfied customers) is 100 minus this value.
round(
    cust_df['TARGET'].value_counts()[0] / cust_df['TARGET'].count() * 100
    , 2)

96.04

In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
cust_df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


### 데이터 전처리


In [6]:
### var3 contains the outlier value -999999; find the most frequent value among
### the remaining rows so it can be used as the replacement.
VAR3_OUTLIER = -999999
var3_mode = cust_df.loc[cust_df['var3'] != VAR3_OUTLIER, 'var3'].mode().iloc[0]

### Replace the outlier with the mode. The result is assigned back to the column
### instead of calling replace(..., inplace=True) on the selected Series: the
### in-place call on a column selection is chained assignment, which does not
### update the DataFrame under pandas Copy-on-Write.
cust_df['var3'] = cust_df['var3'].replace(VAR3_OUTLIER, var3_mode)

### ID is only a row identifier, so drop it. errors='ignore' keeps the cell
### re-runnable after ID has already been removed.
cust_df = cust_df.drop(columns='ID', errors='ignore')

In [7]:
### Final check: print the shape captured before preprocessing (`tmp`) next to
### the current shape of cust_df.
print(f'''
      원래 형태 : {tmp}
      전처리 후 형태 : {cust_df.shape}
    ''')


      원래 형태 : (76020, 371)
      전처리 후 형태 : (76020, 370)
    


In [14]:
# Preview every column except the last one (positional slice of cust_df.columns).
cust_df[cust_df.columns[:-1]]

Unnamed: 0,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000
1,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000
2,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000
3,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000
4,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,2,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.490000
76016,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.520000
76017,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.150000
76018,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.160000


In [19]:
### Split the frame into features and label.
### TARGET is removed by name rather than by slicing off the last column, so the
### split stays correct even if the column order changes.
X_feature = cust_df.drop(columns='TARGET')
y_label = cust_df['TARGET']

In [20]:
### Hold out 20% of the rows as the test set (fixed seed for repeatable splits).
X_train, X_test, y_train, y_test = train_test_split(
    X_feature,
    y_label,
    test_size=0.2,
    random_state=0,
)

In [27]:
### Check the split: print the row count of each train/test piece.
print(f'''
      X_train : {X_train.shape[0]}
      X_test : {X_test.shape[0]}
      y_train : {y_train.shape[0]}
      y_test : {y_test.shape[0]}
      '''
)


      X_train : 60816
      X_test : 15204
      y_train : 60816
      y_test : 15204
      


In [31]:
### Split the training data once more into train (70%) and validation (30%);
### the validation part is the evaluation set used for early stopping.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
)

In [33]:
### Up to 500 boosting rounds; random_state is pinned so that repeated runs of
### this example produce the same predictions.
xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    random_state=156,
)

xgb_clf

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=156, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)

In [35]:
### Train with AUC as the evaluation metric and an early-stopping window of 100 rounds.
### Both the training and the validation split are passed as evaluation sets.
### NOTE(review): newer XGBoost releases may expect early_stopping_rounds / eval_metric
### on the constructor instead of fit() — confirm against the installed version.
evaluation_sets = [(X_tr, y_tr), (X_val, y_val)]
xgb_clf.fit(
    X_tr,
    y_tr,
    early_stopping_rounds=100,
    eval_metric='auc',
    eval_set=evaluation_sets,
)

### Class probabilities for the held-out test set, shown as the cell output.
predict_proba = xgb_clf.predict_proba(X_test)
predict_proba

[0]	validation_0-auc:0.82179	validation_1-auc:0.80068
[1]	validation_0-auc:0.83092	validation_1-auc:0.80941
[2]	validation_0-auc:0.83207	validation_1-auc:0.80903
[3]	validation_0-auc:0.83288	validation_1-auc:0.80889
[4]	validation_0-auc:0.83414	validation_1-auc:0.80924
[5]	validation_0-auc:0.83524	validation_1-auc:0.80907
[6]	validation_0-auc:0.83568	validation_1-auc:0.81005
[7]	validation_0-auc:0.83741	validation_1-auc:0.81088
[8]	validation_0-auc:0.83896	validation_1-auc:0.81305
[9]	validation_0-auc:0.83949	validation_1-auc:0.81363
[10]	validation_0-auc:0.83908	validation_1-auc:0.81277
[11]	validation_0-auc:0.83913	validation_1-auc:0.81260
[12]	validation_0-auc:0.84009	validation_1-auc:0.81325
[13]	validation_0-auc:0.84081	validation_1-auc:0.81329
[14]	validation_0-auc:0.84196	validation_1-auc:0.81380
[15]	validation_0-auc:0.84394	validation_1-auc:0.81540
[16]	validation_0-auc:0.84414	validation_1-auc:0.81573
[17]	validation_0-auc:0.84437	validation_1-auc:0.81577
[18]	validation_0-au

array([[0.9978479 , 0.00215211],
       [0.98383725, 0.01616276],
       [0.9932553 , 0.00674468],
       ...,
       [0.87325156, 0.12674847],
       [0.9850331 , 0.01496693],
       [0.9854666 , 0.01453341]], dtype=float32)

In [47]:
# ROC-AUC on the test set, scored with the second probability column (index 1).
positive_class_proba = predict_proba[:, 1]
roc_auc_score(y_test, positive_class_proba)

0.842853493090032

### 베이지안 최적화 기반으로 XGBoost의 하이퍼 파라미터 튜닝

In [None]:
### Hyperopt search space for the XGBoost hyperparameters:
### - max_depth: 5 to 15 in steps of 1
### - min_child_weight: 1 to 6 in steps of 1
### - colsample_bytree: uniformly distributed between 0.5 and 0.95
### - learning_rate: uniformly distributed between 0.01 and 0.2
### Each hp.* label matches its dictionary key; min_child_weight previously reused
### the 'colsample_bytree' label and range and omitted quniform's step argument.

xgb_search_space = {
    'max_depth': hp.quniform('max_depth', 5, 15, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
}

In [50]:
### Objective for hyperopt's fmin(): cross-validates an XGBClassifier built from
### search_space and returns -1 * the mean ROC-AUC.
def objective_func(search_space):
    """Return the negated mean 3-fold ROC-AUC for one hyperparameter candidate.

    Args:
        search_space: mapping with the keys 'max_depth', 'min_child_weight',
            'colsample_bytree' and 'learning_rate'.

    Returns:
        -1 * mean of the per-fold validation ROC-AUC scores. The sign is flipped
        because the optimiser minimises the objective.

    Reads the notebook-level X_train / y_train.
    """
    # max_depth and min_child_weight are cast to int before reaching XGBoost.
    model = XGBClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        colsample_bytree=search_space['colsample_bytree'],
        learning_rate=search_space['learning_rate'],
    )

    # One validation ROC-AUC per fold.
    fold_scores = []

    # 3-fold split of X_train into fitting and hold-out rows.
    for fit_idx, holdout_idx in KFold(n_splits=3).split(X_train):
        X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
        X_holdout, y_holdout = X_train.iloc[holdout_idx], y_train.iloc[holdout_idx]

        # Fit on this fold with a 30-round early-stopping window, tracking AUC.
        model.fit(
            X_fit,
            y_fit,
            early_stopping_rounds=30,
            eval_metric='auc',
            eval_set=[(X_fit, y_fit), (X_holdout, y_holdout)],
        )

        # Score the hold-out rows with the probability of class 1.
        positive_proba = model.predict_proba(X_holdout)[:, 1]
        fold_scores.append(roc_auc_score(y_holdout, positive_proba))

    # Negate the average so that a higher AUC means a lower objective value.
    return -1 * np.mean(fold_scores)

### 마지막