In [1]:
import xgboost as xgb
print(xgb.__version__)

1.7.5


In [2]:
import xgboost as xgb
from xgboost import plot_importance
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 데이터 추출
dataset = load_breast_cancer()
dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [4]:
features = dataset.data # 데이터 추출 저장
labels = dataset.target # 답 추출 저장

In [5]:
# 데이터프레임 생성
cancer_df = pd.DataFrame(data=features, columns=dataset.feature_names)
cancer_df['target'] = labels
cancer_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [6]:
# 불균형 데이터셋인지 확인(레이블)
print(dataset.target_names)
print(cancer_df['target'].value_counts())

['malignant' 'benign']
target
1    357
0    212
Name: count, dtype: int64


In [7]:
# 데이터프레임에서 데이터와 답을 분리
X_features = cancer_df.iloc[:,:-1] # 데이터만 추출
y_label = cancer_df.iloc[:,-1]

In [8]:
# 학습데이터/테스트데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
      X_features
    , y_label
    , test_size=0.2 # 8:2 비율 분리
    , random_state=156
)

In [9]:
# 학습데이터로 학습데이터와 검증데이터 분리
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=156
)

In [10]:
print(X_train.shape, X_test.shape)
print(X_tr.shape, X_val.shape)

(455, 30) (114, 30)
(409, 30) (46, 30)


In [11]:
dtr = xgb.DMatrix(data=X_tr, label=y_tr)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [12]:
params = {
      'max_depth' : 3
    , 'eta' : 0.05
    , 'objective' : 'binary:logistic' # 이진분류
    , 'eval_metric' : 'logloss'
}
num_rounds = 1000 # 모델의 개수

In [13]:
# 학습데이터, 검증데이터 지정
eval_list = [(dtr,'train'),(dval,'eval')]
xgb_model = xgb.train(
      params=params # 파라미터 설정
    , dtrain=dtr # 학습데이터 설정
    , num_boost_round=num_rounds
    , early_stopping_rounds=100
    , evals=eval_list 
)

[0]	train-logloss:0.65016	eval-logloss:0.66183
[1]	train-logloss:0.61131	eval-logloss:0.63609
[2]	train-logloss:0.57563	eval-logloss:0.61144
[3]	train-logloss:0.54310	eval-logloss:0.59204
[4]	train-logloss:0.51323	eval-logloss:0.57329
[5]	train-logloss:0.48447	eval-logloss:0.55037
[6]	train-logloss:0.45796	eval-logloss:0.52930


[7]	train-logloss:0.43436	eval-logloss:0.51534
[8]	train-logloss:0.41150	eval-logloss:0.49718
[9]	train-logloss:0.39027	eval-logloss:0.48154
[10]	train-logloss:0.37128	eval-logloss:0.46990
[11]	train-logloss:0.35254	eval-logloss:0.45474
[12]	train-logloss:0.33528	eval-logloss:0.44229
[13]	train-logloss:0.31892	eval-logloss:0.42961
[14]	train-logloss:0.30439	eval-logloss:0.42065
[15]	train-logloss:0.29000	eval-logloss:0.40958
[16]	train-logloss:0.27651	eval-logloss:0.39887
[17]	train-logloss:0.26389	eval-logloss:0.39050
[18]	train-logloss:0.25210	eval-logloss:0.38254
[19]	train-logloss:0.24123	eval-logloss:0.37393
[20]	train-logloss:0.23076	eval-logloss:0.36789
[21]	train-logloss:0.22091	eval-logloss:0.36017
[22]	train-logloss:0.21155	eval-logloss:0.35421
[23]	train-logloss:0.20263	eval-logloss:0.34683
[24]	train-logloss:0.19434	eval-logloss:0.34111
[25]	train-logloss:0.18637	eval-logloss:0.33634
[26]	train-logloss:0.17875	eval-logloss:0.33082
[27]	train-logloss:0.17167	eval-logloss:0.3

In [14]:
# 예측 추출
pred = xgb_model.predict(dtest)
print(np.round(pred[:10],3))

[0.943 0.006 0.669 0.037 0.99  1.    0.999 1.    0.998 0.   ]


In [15]:
preds = [1 if x > 0.5 else 0 for x in pred]
preds[:10]

[1, 0, 1, 0, 1, 1, 1, 1, 1, 0]

In [17]:
# 사이킷런 사용한 xgboost
from xgboost import XGBClassifier

xgb_wrapper = XGBClassifier(
      n_estimators=400
    , learning_rate=0.05
    , max_depth=3
    , eval_metric='logloss'
)

In [20]:
xgb_wrapper.fit(X_train, y_train, verbose=False)

In [21]:
# 예측값 추출
w_preds = xgb_wrapper.predict(X_test)
w_preds

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])

In [22]:
# 예측 확률 추출
w_preds_proba = xgb_wrapper.predict_proba(X_test)
w_preds_proba[:10,:]

array([[9.3345761e-02, 9.0665424e-01],
       [9.9703813e-01, 2.9618922e-03],
       [7.7277362e-02, 9.2272264e-01],
       [8.5381806e-01, 1.4618194e-01],
       [9.2908144e-03, 9.9070919e-01],
       [1.0728836e-04, 9.9989271e-01],
       [4.6670437e-04, 9.9953330e-01],
       [1.0513663e-03, 9.9894863e-01],
       [4.3853521e-03, 9.9561465e-01],
       [9.9982631e-01, 1.7371235e-04]], dtype=float32)

In [29]:
# 행 ALL , 열 1
w_preds_proba = xgb_wrapper.predict_proba(X_test)[:,1]

In [32]:
# 오차행렬, 정확도, 정밀도, 재현율, f1점수, AUC 확인
from sklearn.metrics import confusion_matrix # 오차행렬 (원래답, 예측값)
from sklearn.metrics import accuracy_score # 분류에서의 정확도 (원래답, 예측값)
from sklearn.metrics import precision_score # 정밀도 점수 (원래답, 예측값)
from sklearn.metrics import recall_score # 재현율 점수 (원래답, 예측값)
from sklearn.metrics import f1_score # (원래답, 예측값)
from sklearn.metrics import roc_auc_score # (원래답, 예측확률)

def get_clf_eval(y_test, preds, preds_proba):
    # 오차행렬
    confusion = confusion_matrix(y_test, preds)
    # 정확도 점수
    accuracy = accuracy_score(y_test, preds)
    # 정밀도 점수
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, preds_proba)
    print('오차행렬')
    print(confusion)
    print(f'정확도 : {accuracy:.4f}, 정밀도 : {precision:.4f},\
            재현율 : {recall:.4f}, F1 : {f1:.4f}, \
            AUC : {roc_auc:.4f}')


In [33]:
get_clf_eval(y_test, w_preds, w_preds_proba)

오차행렬
[[34  3]
 [ 1 76]]
정확도 : 0.9649, 정밀도 : 0.9620,            재현율 : 0.9870, F1 : 0.9744,             AUC : 0.9954


In [35]:
# 조기 중단 처리
from xgboost import XGBClassifier
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.05, max_depth=3)
evals = [(X_tr, y_tr),(X_val,y_val)]
xgb_wrapper.fit(
      X_tr, y_tr
    , early_stopping_rounds=50
    , eval_metric='logloss'
    , eval_set=evals
    , verbose=True
)

[0]	validation_0-logloss:0.65016	validation_1-logloss:0.66183
[1]	validation_0-logloss:0.61131	validation_1-logloss:0.63609
[2]	validation_0-logloss:0.57563	validation_1-logloss:0.61144
[3]	validation_0-logloss:0.54310	validation_1-logloss:0.59204
[4]	validation_0-logloss:0.51323	validation_1-logloss:0.57329
[5]	validation_0-logloss:0.48447	validation_1-logloss:0.55037
[6]	validation_0-logloss:0.45796	validation_1-logloss:0.52930
[7]	validation_0-logloss:0.43436	validation_1-logloss:0.51534
[8]	validation_0-logloss:0.41150	validation_1-logloss:0.49718
[9]	validation_0-logloss:0.39027	validation_1-logloss:0.48154
[10]	validation_0-logloss:0.37128	validation_1-logloss:0.46990
[11]	validation_0-logloss:0.35254	validation_1-logloss:0.45474
[12]	validation_0-logloss:0.33528	validation_1-logloss:0.44229
[13]	validation_0-logloss:0.31892	validation_1-logloss:0.42961
[14]	validation_0-logloss:0.30439	validation_1-logloss:0.42065
[15]	validation_0-logloss:0.29000	validation_1-logloss:0.40958
[1

In [36]:
# 예측값, 예측확률 추출
ws50_preds = xgb_wrapper.predict(X_test)
ws50_preds_proba = xgb_wrapper.predict_proba(X_test)[:,1]

In [37]:
get_clf_eval(y_test, ws50_preds, ws50_preds_proba)

오차행렬
[[34  3]
 [ 2 75]]
정확도 : 0.9561, 정밀도 : 0.9615,            재현율 : 0.9740, F1 : 0.9677,             AUC : 0.9933
