In [1]:
# 라이브러리 import
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report

from xgboost import XGBClassifier


In [2]:
# 데이터 불러오기 & X, y 준비

df = pd.read_csv("stroke.csv")
target_col = "stroke" 

# X: 입력 변수(피처), y: 타깃(stroke 여부)
X = df.drop(columns=[target_col])
y = df[target_col]


In [3]:

# train / test 데이터 나누기
#  - test_size : 테스트 데이터 비율 (예: 0.2 = 20%)
#  - stratify=y : 클래스 비율을 train/test에 비슷하게 맞추기 위해 사용
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  # 뇌졸중/비-뇌졸중 비율 유지
)


In [5]:

# 양성 클래스 비율 & scale_pos_weight 계산
#  - 여기서 '1'이 뇌졸중 발생(stroke=1)이라고 가정
#  - pos_ratio: y_train에서 1의 비율
#  - scale_pos_weight: (음성 비율 / 양성 비율)
#    -> 불균형 데이터에서 양성 클래스의 중요도를 높이는 역할

# y_train에서 양성(1)의 비율
pos_ratio = y_train.sum() / len(y_train)

# 음성(0)의 비율은 1 - pos_ratio
neg_ratio = 1 - pos_ratio

# scale_pos_weight 계산
scale_pos = neg_ratio / pos_ratio

print(f"양성(1) 비율: {pos_ratio:.4f}")
print(f"음성(0) 비율: {neg_ratio:.4f}")
print(f"계산된 scale_pos_weight: {scale_pos:.4f}")


양성(1) 비율: 0.0487
음성(0) 비율: 0.9513
계산된 scale_pos_weight: 19.5427


In [17]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.44-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
Downloading alembic-1.16.5-py3-none-any.whl (247 kB)
Downloading sqlalchemy-2.0.44-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m35.6 MB/s[0m  [33

In [19]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    
    params = {
        "learning_rate": trial.suggest_float("lr", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "n_estimators": trial.suggest_int("n_estimators", 300, 900),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 5),
        "gamma": trial.suggest_float("gamma", 0, 1),
        "scale_pos_weight": scale_pos,
        "eval_metric": "auc",
        "random_state": 42,
        "n_jobs": -1
    }
    
    model = XGBClassifier(**params)
    auc = cross_val_score(model, X_train, y_train,
                          cv=3,
                          scoring="roc_auc").mean()
    
    return auc

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print(study.best_params)
print(study.best_value)


[I 2025-12-05 11:44:16,070] A new study created in memory with name: no-name-c4490de8-b3ec-470b-be50-8837acd1911c
[I 2025-12-05 11:48:59,656] Trial 0 finished with value: 0.8166846102329313 and parameters: {'lr': 0.03551347295699043, 'max_depth': 6, 'n_estimators': 407, 'subsample': 0.8991460061913226, 'colsample_bytree': 0.8914981241344369, 'min_child_weight': 1, 'gamma': 0.776867485713345}. Best is trial 0 with value: 0.8166846102329313.
[I 2025-12-05 11:53:55,463] Trial 1 finished with value: 0.8015278158710024 and parameters: {'lr': 0.06245778190755447, 'max_depth': 3, 'n_estimators': 642, 'subsample': 0.7996376756593817, 'colsample_bytree': 0.8688222809137849, 'min_child_weight': 1, 'gamma': 0.7026937165949332}. Best is trial 0 with value: 0.8166846102329313.
[I 2025-12-05 11:59:10,783] Trial 2 finished with value: 0.811275174885822 and parameters: {'lr': 0.04325031403462434, 'max_depth': 3, 'n_estimators': 744, 'subsample': 0.6608118764874814, 'colsample_bytree': 0.88489092063574

KeyboardInterrupt: 

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    
    # Optuna가 탐색할 하이퍼파라미터 범위 정의
    params = {
        "learning_rate": trial.suggest_float("lr", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 3, 6),
        "n_estimators": trial.suggest_int("n_estimators", 300, 900),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 5),
        "gamma": trial.suggest_float("gamma", 0, 1),
        "scale_pos_weight": scale_pos,  # 불균형 데이터 가중치
        "eval_metric": "auc",
        "random_state": 42,
        "n_jobs": -1,
        
        # 🔻 여기부터 GPU 관련 옵션
        # GPU에서 학습하는 트리 메서드
        "tree_method": "gpu_hist",
        # 예측도 GPU에서 수행
        "predictor": "gpu_predictor"
    }
    
    # 위에서 정의한 params로 XGBoost 모델 생성
    model = XGBClassifier(**params)
    
    # 3-fold 교차검증으로 ROC-AUC 평균 계산
    auc = cross_val_score(
        model,
        X_train,
        y_train,
        cv=3,
        scoring="roc_auc"
    ).mean()
    
    return auc

# Optuna Study 생성 (AUC를 최대화하는 방향)
study = optuna.create_study(direction="maximize")

# 하이퍼파라미터 탐색 실행 (trial 수는 필요에 따라 조절 가능)
study.optimize(objective, n_trials=50)

# 최적 파라미터와 그때의 AUC 출력
print("Best params:", study.best_params)
print("Best AUC:", study.best_value)


[I 2025-12-05 13:23:20,542] A new study created in memory with name: no-name-16e91edf-ceda-43a3-93c0-adcc61682400

    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

[I 2025-12-05 13:23:32,936] Trial 0 finished with value: 0.801916328912721 and parameters: {'lr': 0.058419643637317824, 'max_depth': 6, 'n_estimators': 828, 'subsample': 0.6427282745539856, 'colsample_bytree': 0.7446933788346353, 'min_child_weight': 5, 'gamma': 0.502439643425009}. Best is trial 0 with value

Best params: {'lr': 0.010002154204346032, 'max_depth': 4, 'n_estimators': 537, 'subsample': 0.6055566573624598, 'colsample_bytree': 0.8979287201002785, 'min_child_weight': 5, 'gamma': 0.7209404101015859}
Best AUC: 0.8377032622337969


In [25]:
# baseline XGBoost 모델 정의
#  - 핵심: scale_pos_weight를 우리가 계산한 값으로 설정
#  - eval_metric="auc" : 내부 평가 지표를 AUC로 사용
#  - 나머지 파라미터는 최대한 기본값 + 약간만 설정
#    (추후 튜닝을 위해 지금은 심플하게 유지)

xgb_baseline = XGBClassifier(
    random_state=42,        # 재현성을 위해 고정
    n_estimators=300,       # 기본값(100)보다 약간 늘려줌
    learning_rate=0.1,      # 기본적인 학습률
    max_depth=4,            # 너무 깊지 않게 설정 (과적합 완화)
    scale_pos_weight=scale_pos,  # **불균형 조정의 핵심**
    eval_metric="auc",      # AUC 기준으로 학습 모니터링
    n_jobs=-1               # 가능한 모든 코어 사용
)


In [26]:
# 모델 학습
#  - X_train, y_train으로 학습
xgb_baseline.fit(X_train, y_train)

In [28]:
# baseline 성능 확인 (ROC-AUC 중심)
#  - predict_proba 로 1(양성)일 확률을 가져와서 AUC 계산
#  - classification_report는 대략적인 성능 느낌 확인용

# 1) 확률 예측 (양성 클래스 확률만 사용)
y_proba = xgb_baseline.predict_proba(X_test)[:, 1]

# 2) ROC-AUC 계산
auc = roc_auc_score(y_test, y_proba)
print(f"✅ Baseline XGBoost (scale_pos_weight 적용) ROC-AUC: {auc:.4f}")

# 3) 디폴트 threshold(0.5) 기준으로 분류 결과 보기
y_pred_default = (y_proba >= 0.5).astype(int)
print("\n[기본 threshold=0.5 기준 분류 결과]")
print(classification_report(y_test, y_pred_default, digits=3))

# 3) 디폴트 threshold(0.2) 기준으로 분류 결과 보기
y_pred_default = (y_proba >= 0.2).astype(int)
print("\n[기본 threshold=0.2 기준 분류 결과]")
print(classification_report(y_test, y_pred_default, digits=3))


✅ Baseline XGBoost (scale_pos_weight 적용) ROC-AUC: 0.8142

[기본 threshold=0.5 기준 분류 결과]
              precision    recall  f1-score   support

           0      0.964     0.949     0.956       972
           1      0.242     0.320     0.276        50

    accuracy                          0.918      1022
   macro avg      0.603     0.634     0.616      1022
weighted avg      0.929     0.918     0.923      1022


[기본 threshold=0.2 기준 분류 결과]
              precision    recall  f1-score   support

           0      0.975     0.829     0.896       972
           1      0.149     0.580     0.237        50

    accuracy                          0.817      1022
   macro avg      0.562     0.705     0.566      1022
weighted avg      0.934     0.817     0.864      1022



In [None]:
from xgboost import XGBClassifier

# 최적 파라미터(Optuna 결과)
best_params = {
    "learning_rate": 0.010002154204346032,
    "max_depth": 4,
    "n_estimators": 537,
    "subsample": 0.6055566573624598,
    "colsample_bytree": 0.8979287201002785,
    "min_child_weight": 5,
    "gamma": 0.7209404101015859,
}

# scale_pos_weight는 Optuna가 고정한 값 사용
# (천천히 튜닝할 때 같이 넣는 게 좋기 때문에)
best_params["scale_pos_weight"] = scale_pos

# 기타 고정 설정
best_params["eval_metric"] = "auc"
best_params["random_state"] = 42
best_params["n_jobs"] = -1


# 최종 모델 생성
model_optuna = XGBClassifier(**best_params)

# 학습
model_optuna.fit(X_train, y_train)


In [30]:
from sklearn.metrics import precision_recall_curve
import numpy as np

precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)

best_thr = thresholds[np.argmax(f1_scores)]
print(f"F1 기준 최적 threshold: {best_thr:.4f}")


F1 기준 최적 threshold: 0.5288


In [40]:
y_pred_opt = (y_proba >= 0.4).astype(int)
print(classification_report(y_test, y_pred_opt, digits=3))


              precision    recall  f1-score   support

           0      0.967     0.922     0.944       972
           1      0.200     0.380     0.262        50

    accuracy                          0.895      1022
   macro avg      0.583     0.651     0.603      1022
weighted avg      0.929     0.895     0.910      1022

