In [17]:
import pandas as pd
import numpy as np

np.random.seed(1234)

n_samples = 1000
data = {
    'age': np.random.randint(20, 60, size=n_samples),
    'salary': np.random.randint(30000, 120000, size=n_samples),
    'owns_house': np.random.randint(0, 2, size=n_samples),
}

df = pd.DataFrame(data)

df['defaulted'] = (
    (df['age'] < 30) & 
    (df['salary'] < 60000) & 
    (df['owns_house'] == 0)
).astype(int)

df.head()

Unnamed: 0,age,salary,owns_house,defaulted
0,39,114753,0,0
1,58,88650,0,0
2,32,31193,0,0
3,44,68343,1,0
4,35,37214,0,0


In [18]:
from sklearn.model_selection import train_test_split

X = df[['age', 'salary', 'owns_house']]
y = df['defaulted']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from xgboost import XGBClassifier

# n_estimators = 트리의 수

# 모델 학습
model = XGBClassifier(random_state=1234, max_depth=5, n_estimators=30)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=None, n_estimators=30,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=1234, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [20]:
# 예측
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [28]:
y_proba = model.predict_proba(X_test)[:, 1]  # 양성 클래스(1)의 확률
y_proba

array([0.0257317 , 0.03081029, 0.03081029, 0.0257317 , 0.04803712,
       0.0257317 , 0.0257317 , 0.0257317 , 0.0257317 , 0.0257317 ,
       0.0257317 , 0.03081029, 0.9578316 , 0.0257317 , 0.0257317 ,
       0.0257317 , 0.0257317 , 0.0257317 , 0.03081029, 0.0257317 ,
       0.0257317 , 0.0257317 , 0.03081029, 0.03081029, 0.0257317 ,
       0.0257317 , 0.03081029, 0.0257317 , 0.03081029, 0.0257317 ,
       0.0257317 , 0.0257317 , 0.0257317 , 0.72680855, 0.03081029,
       0.0257317 , 0.03081029, 0.04803712, 0.0257317 , 0.03081029,
       0.0257317 , 0.03081029, 0.0257317 , 0.04803712, 0.03081029,
       0.03081029, 0.0257317 , 0.0257317 , 0.0257317 , 0.0257317 ,
       0.0257317 , 0.0257317 , 0.0257317 , 0.04803712, 0.0257317 ,
       0.0257317 , 0.0257317 , 0.0257317 , 0.0257317 , 0.0257317 ,
       0.0257317 , 0.0257317 , 0.0257317 , 0.0257317 , 0.03081029,
       0.0257317 , 0.0257317 , 0.03081029, 0.0257317 , 0.0257317 ,
       0.0257317 , 0.0257317 , 0.0257317 , 0.03081029, 0.02573

In [29]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# ROC-AUC (Receiver Operating Characteristic - Area Under Curve)
# 분류 모델이 양성 클래스(1)를 얼마나 잘 구별하는지를 측정하는 지표
# 모델이 출력한 확률 값을 이용해서 모든 임계값에서의 성능을 평가

acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("Accuracy :", accuracy_score(y_test, y_pred))
print("Recall :", recall_score(y_test, y_pred))
print("Precision :", precision_score(y_test, y_pred))

# pos_label=1 => 1이 양성 클래스라고 간주 (default)
print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))

print("ROC-AUC Score :", roc_auc)

Accuracy : 0.995
Recall : 0.75
Precision : 1.0
F1 Score: 0.8571428571428571
ROC-AUC Score : 0.9636479591836734
