In [1]:
import lightgbm as lgb

lgb.__version__

'4.6.0'

[1] 기본 학습<hr>

In [3]:
## 모듈 로딩
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

## 1) 데이터
X, y = load_breast_cancer(return_X_y=True)

## 2) Train/Test 분리
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 3) 조기종료용 Validation 분리
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

## 4) 모델
model = lgb.LGBMClassifier(
    n_estimators=100,      # 크게 주고 early stopping으로 자동 결정
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)

## 5) 콜백: 조기종료 + 로그 끄기(원하면 period=100 등으로)
callbacks = [
    lgb.early_stopping(stopping_rounds=50),   # patience=50
    lgb.log_evaluation(period=0),             # 학습 로그 숨김
]

## 6) 학습: eval_set이 있어야 조기종료가 동작
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="binary_logloss",
    callbacks=callbacks
)

print("best_iteration_:", model.best_iteration_)  # early stopping 결과 :contentReference[oaicite:1]{index=1}

## 7) 평가
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)

print("Accuracy:", acc)
print(classification_report(y_test, pred))


[LightGBM] [Info] Number of positive: 228, number of negative: 136
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3648
[LightGBM] [Info] Number of data points in the train set: 364, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.156025
best_iteration_: 100
Accuracy: 0.956140350877193
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        42
           1       0.97      0.96      0.97        72

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96



[2] GridSearchCV + LightGBM(LGBMClassifier) <hr>

- 스케일링/인코딩 같은 전처리 들어가면 Pipeline + GridSearchCV :데이터 누수 방지
- 트리계열인 LightGBM은 스케일링이 보통 필요 없지만, 결측치 처리/범주형 인코딩은 Pipeline 유리

- [정석] Pipeline + GridSearchCV 전처리+모델 튜닝

In [3]:
## 모듈 로딩
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## 1) 데이터(DataFrame 유지)
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

## 2) 학습용 테스트용 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split( X, y, 
                                                    test_size=0.2,
                                                    random_state=42, 
                                                    stratify=y )

## 3) 파이프라인: 전처리 -> 모델
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # 예시 전처리
    ("clf", lgb.LGBMClassifier(objective="binary", random_state=42, n_jobs=-1))
])

## 4) 그리드: 파이프라인 안의 step 이름으로 접근 (clf__ 파라미터)
param_grid = {
    "clf__learning_rate": [0.03, 0.05, 0.1],
    "clf__num_leaves": [15, 31, 63],
    "clf__max_depth": [-1, 4, 6],
    "clf__min_child_samples": [10, 20, 50],
    "clf__subsample": [0.8, 0.9, 1.0],
    "clf__colsample_bytree": [0.8, 0.9, 1.0],
    "clf__reg_alpha": [0.0, 0.1],
    "clf__reg_lambda": [0.0, 1.0],
    "clf__n_estimators": [400, 800, 1500],   # (early stopping 없이) 적당히 후보로 둠
}

## 5) 커스텀 교차 검증 및 하이퍼파라미터 튜닝 진행

##- 사용자 정의 KFold 생성
cv   = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
##- GridSearchCV 인스턴스 생성
grid = GridSearchCV(    pipe,
                        param_grid=param_grid,
                        scoring="accuracy",
                        cv=cv,
                        n_jobs=-1,
                        verbose=1,
                        refit=True,
                    )

##- 교차검증 + 하이퍼파라미터 튜닝 진행
grid.fit(X_train, y_train)

##- 최고 점수 및 파라미터 출력
print("Best CV score:", grid.best_score_)
print("Best params:", grid.best_params_)

##- 테스트 평가는 grid.best_estimator_로 바로 가능
best_pipe = grid.best_estimator_
print("Test score:", best_pipe.score(X_test, y_test))


Fitting 5 folds for each of 8748 candidates, totalling 43740 fits
[LightGBM] [Info] Number of positive: 285, number of negative: 170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000504 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691
Best CV score: 0.9758241758241759
Best params: {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.1, 'clf__max_depth': -1, 'clf__min_child_samples': 50, 'clf__n_estimators': 800, 'clf__num_leaves': 15, 'clf__reg_alpha': 0.0, 'clf__reg_lambda': 1.0, 'clf__subsample': 0.8}
Test score: 0.956140350877193




[3] LightGBM + GridSearchCV + EarlyStopping <hr>

- 1단계(그리드서치): CV(교차검증)로 하이퍼파라미터 후보를 고르고
- 2단계(최종  학습): 선택된 파라미터로 별도 검증셋(eval_set) 을 두고 Early Stopping 적용

* 이유
    - GridSearchCV 내부 CV 폴드마다 eval_set을 따로 넣어 early stopping을 깔끔하게 돌리기 어렵기 때문
    - 기술적으로 억지로 넣을 수도 있지만 추천 흐름은 아님
    - GridSearchCV는 “전체 데이터로 refit”을 수행함! 

In [None]:
## --------------------------------------------------
## 모듈 로딩
## --------------------------------------------------
import pandas as pd
import lightgbm as lgb

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

## --------------------------------------------------
## 0) 데이터 (DataFrame 유지: feature-name 경고 방지)
## --------------------------------------------------
data = load_breast_cancer(as_frame=True)
X = data.data          # pandas DataFrame
y = data.target        # pandas Series

## --------------------------------------------------
## 1) Train / Valid / Test 분리
##    - GridSearch는 Train만 사용 (CV로 검증)
##    - EarlyStopping은 Valid 사용
## --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

## --------------------------------------------------
## 2) (1단계) GridSearchCV: 구조/규제 파라미터 탐색
##    - 여기서는 early stopping을 빼고 "공정하게" CV로 비교
## --------------------------------------------------
base = lgb.LGBMClassifier(
    objective="binary",
    random_state=42,
    n_estimators=800,     # early stopping 없이도 충분히 크게 잡고
    n_jobs=-1
)

param_grid = {
    "learning_rate": [0.03, 0.05, 0.1],
    "num_leaves": [15, 31, 63],            # leaf 복잡도
    "max_depth": [-1, 4, 6],               # -1이면 제한 없음 (num_leaves와 같이 조절) :contentReference[oaicite:2]{index=2}
    "min_child_samples": [10, 20, 50],     # leaf에 필요한 최소 샘플 수
    "subsample": [0.8, 0.9, 1.0],          # row 샘플링
    "colsample_bytree": [0.8, 0.9, 1.0],   # feature 샘플링
    "reg_alpha": [0.0, 0.1, 0.5],          # L1 규제
    "reg_lambda": [0.0, 1.0, 5.0],         # L2 규제
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=base,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    refit=True,  # best params로 X_tr 전체에 다시 학습 :contentReference[oaicite:3]{index=3}
)

grid.fit(X_tr, y_tr)

print("Best CV score:", grid.best_score_)
print("Best params:", grid.best_params_)

## --------------------------------------------------
## 3) (2단계) 최종 학습: Best params + EarlyStopping on (X_val, y_val)
##    - n_estimators를 크게 주고 early stopping으로 best_iteration 자동 결정
## --------------------------------------------------
final_model = lgb.LGBMClassifier(
    **grid.best_params_,
    objective="binary",
    random_state=42,
    n_estimators=5000,
    n_jobs=-1
)

callbacks = [
    lgb.early_stopping(stopping_rounds=50, first_metric_only=True),  # :contentReference[oaicite:4]{index=4}
    lgb.log_evaluation(period=0),
]

final_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="binary_logloss",
    callbacks=callbacks
)

print("best_iteration_:", final_model.best_iteration_)  # early stopping 결과 :contentReference[oaicite:5]{index=5}

## --------------------------------------------------
## 4) 테스트 평가
## --------------------------------------------------
pred = final_model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


best_iteration: 77
Accuracy: 0.9385964912280702
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.94      0.95        72

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

