In [10]:
!pip -q install -U optuna optuna-integration[xgboost] xgboost

In [26]:
import warnings

warnings.filterwarnings(
    "ignore", category=FutureWarning
)

import pandas as pd
import numpy as np
from pathlib import (
    Path,
)
from sklearn.preprocessing import (
    LabelEncoder,
)
from sklearn.model_selection import (
    TimeSeriesSplit,
)
import xgboost as xgb
import optuna

print("라이브러리 로드 완료.")

라이브러리 로드 완료.


In [27]:
# ===== 2. 경로 설정 =====

TRAIN_FP = (
    "train.csv"
)
SAMPLE_FP = "sample_submission.csv"

In [28]:
# ===== 3. 데이터 로드 및 전처리 =====
print("데이터 로딩 및 전처리 시작...")
train = pd.read_csv(TRAIN_FP)
train["영업일자"] = pd.to_datetime(
    train["영업일자"]
)

# 3.1. 이상치 처리 (IQR 기반)
print("이상치 처리 시작...")

def handle_outliers_iqr(df_group):
    # 매출이 0인 경우가 많을 수 있으므로, 0을 제외하고 분위수를 계산하여 더 현실적인 이상치 범위를 설정
    non_zero_sales = df_group[df_group["매출수량"] > 0]["매출수량"]
    if len(non_zero_sales) < 5:  # 데이터가 너무 적으면 이상치 처리를 건너뜁니다.
        return df_group

    # 데이터의 분포를 파악하기 위해 1사분위수(Q1)와 3사분위수(Q3)를 계산합니다.
    q1, q3 = non_zero_sales.quantile(0.25), non_zero_sales.quantile(0.75)
    iqr = q3 - q1

    lower_bound = max(
        0, q1 - 1.5 * iqr
    )  # 하한선 (매출이 음수일 수는 없으므로 0보다 작아지지 않게 함)
    upper_bound = q3 + 1.5 * iqr  # 상한선

    # 이상치를 정상 범위의 최대/최소값으로 대체(Clipping)합니다.
    df_group["매출수량"] = np.clip(df_group["매출수량"], lower_bound, upper_bound)
    return df_group


# 각 메뉴별로 그룹을 지어 이상치 처리 함수를 적용합니다.
train = train.groupby("영업장명_메뉴명", group_keys=False).apply(handle_outliers_iqr)
print("이상치 처리 완료.")

# 3.2. 테스트 데이터 로드
sample = pd.read_csv(SAMPLE_FP)
tests = {}
for i in range(10):
    name = f"TEST_{i:02d}"
    df = pd.read_csv( f"{name}.csv")
    df["영업일자"] = pd.to_datetime(df["영업일자"])
    tests[name] = df

데이터 로딩 및 전처리 시작...
이상치 처리 시작...
이상치 처리 완료.


  train = train.groupby("영업장명_메뉴명", group_keys=False).apply(handle_outliers_iqr)


In [29]:
# ===== 4. 특징 공학 (Feature Engineering) =====
print("특징 공학 시작...")

# 4.1. 라벨 인코딩
le = LabelEncoder()
train["item_id"] = le.fit_transform(
    train["영업장명_메뉴명"]
)  # 메뉴 이름을 모델이 이해할 수 있는 고유 숫자 ID로 변환


# 4.2. 날짜 특징 생성 함수
def make_date_feats(df):
    out = df.copy()
    # 기본 날짜 정보
    out["year"], out["month"], out["day"], out["weekday"] = (
        out["영업일자"].dt.year,
        out["영업일자"].dt.month,
        out["영업일자"].dt.day,
        out["영업일자"].dt.weekday,
    )
    out["is_weekend"] = (
        out["weekday"].isin([5, 6]).astype(int)
    )  # 주말 (토=5, 일=6)

    # 주기성 특징: 12월과 1월이 가깝다는 것을 모델에게 알려주기 위해 시계처럼 원형으로 변환
    out["month_sin"], out["month_cos"] = np.sin(
        2 * np.pi * out["month"] / 12.0
    ), np.cos(2 * np.pi * out["month"] / 12.0)
    out["wday_sin"], out["wday_cos"] = np.sin(2 * np.pi * out["weekday"] / 7.0), np.cos(
        2 * np.pi * out["weekday"] / 7.0
    )
    return out


train = make_date_feats(train)
train = train.sort_values(
    ["item_id", "영업일자"]
)  # Lag, Rolling 계산을 위해 아이템별, 날짜순으로 정렬

# 4.3. Lag & Rolling 특징 생성
# Lag: "어제는 몇 개 팔렸나?", "지난주 같은 요일에는 몇 개 팔렸나?"
for lag in [1, 7, 14, 28]:
    train[f"lag{lag}"] = train.groupby("item_id")["매출수량"].shift(lag)

# Rolling: "지난 7일간의 평균 매출은?", "매출 변동성은 어땠나?" (데이터 누수 방지를 위해 shift(1) 적용)
g = train.groupby("item_id")["매출수량"]
train["roll7_mean"], train["roll14_mean"], train["roll7_std"] = (
    g.shift(1).rolling(7).mean(),
    g.shift(1).rolling(14).mean(),
    g.shift(1).rolling(7).std(),
)

# 특징 생성 과정에서 생긴 결측치(NaN)가 있는 행은 학습에 사용할 수 없으므로 제거
train = train.dropna()
print("특징 공학 완료.")

# 4.4. 최종 학습 데이터 준비
feature_cols = [
    "year",
    "month",
    "day",
    "weekday",
    "is_weekend",
    "month_sin",
    "month_cos",
    "wday_sin",
    "wday_cos",
    "item_id",
    "lag1",
    "lag7",
    "lag14",
    "lag28",
    "roll7_mean",
    "roll14_mean",
    "roll7_std",
]
X, y = train[feature_cols], train["매출수량"].astype(
    float
)  # X: 문제지(특징), y: 정답지(매출수량)

특징 공학 시작...
특징 공학 완료.


In [31]:
# ===== 5. Optuna를 이용한 하이퍼파라미터 튜닝 =====
# "모델의 성능을 최대로 끌어올리기 위해, 최적의 설정값을 자동으로 찾아보자"
print("Optuna 하이퍼파라미터 튜닝 시작...")
try:
    import torch

    HAS_CUDA = torch.cuda.is_available()
except:
    HAS_CUDA = False

# TimeSeriesSplit 객체를 미리 생성하여 objective 함수와 최종 학습에서 공유
tscv = TimeSeriesSplit(n_splits=5)


# Optuna가 최적화할 목표(Objective) 함수를 정의합니다.
# 이 함수는 특정 하이퍼파라미터 조합으로 모델을 학습하고, 그 성능(RMSE)을 반환합니다.
def objective(trial):
    # 탐색할 하이퍼파라미터의 범위(Search Space)를 지정합니다.
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        "device": "cuda" if HAS_CUDA else "cpu",
        "seed": 42,
        # trial.suggest_... : Optuna가 이 범위 내에서 다음 시도해볼 값을 '제안'합니다.
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }
    rmses = []
    # 교차 검증을 통해 파라미터 조합의 성능을 안정적으로 평가합니다.
    for tr_idx, va_idx in tscv.split(X):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        dtr, dva = xgb.DMatrix(X_tr, label=y_tr), xgb.DMatrix(X_va, label=y_va)

        # Pruning Callback: 성능이 나쁠 것으로 예상되는 시도를 조기에 중단시켜 탐색 시간을 절약합니다.
        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "val-rmse")
        booster = xgb.train(
            params,
            dtr,
            num_boost_round=1000,
            evals=[(dva, "val")],
            early_stopping_rounds=50,
            callbacks=[pruning_callback],
            verbose_eval=False,
        )
        rmses.append(booster.best_score)

    # 교차 검증 결과의 평균 RMSE를 반환합니다. Optuna는 이 값을 '최소화'하는 방향으로 탐색합니다.
    return np.mean(rmses)


# Optuna 스터디(탐색 과정)를 생성하고 실행합니다.
study = optuna.create_study(
    direction="minimize", pruner=optuna.pruners.MedianPruner(n_warmup_steps=5)
)
study.optimize(
    objective, n_trials=50
)  # 50번의 다른 파라미터 조합으로 최적화를 시도합니다.

print("튜닝 완료!")
print(f"최적의 하이퍼파라미터: {study.best_params}")
print(f"최적 RMSE: {study.best_value}")

[I 2025-08-16 07:21:28,693] A new study created in memory with name: no-name-4b3d25d9-c6ab-4e06-8f8c-a8e65eb2b596


Optuna 하이퍼파라미터 튜닝 시작...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[115]	valid_0's rmse: 6.31541




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[222]	valid_0's rmse: 6.66058




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[204]	valid_0's rmse: 14.6192




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[218]	valid_0's rmse: 21.6524




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:21:45,727] Trial 0 finished with value: 20.465898371938557 and parameters: {'max_depth': 5, 'learning_rate': 0.023169044605017956, 'num_leaves': 240, 'feature_fraction': 0.6511660365947726, 'bagging_fraction': 0.6152682559628108, 'bagging_freq': 4, 'lambda_l1': 1.6456110894881002e-08, 'lambda_l2': 0.0003747271448118907, 'min_child_samples': 91}. Best is trial 0 with value: 20.465898371938557.


Early stopping, best iteration is:
[256]	valid_0's rmse: 53.0819
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	valid_0's rmse: 6.40207




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[55]	valid_0's rmse: 6.62392




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[42]	valid_0's rmse: 14.8661




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[54]	valid_0's rmse: 21.6309




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:21:50,753] Trial 1 finished with value: 20.261762012378707 and parameters: {'max_depth': 6, 'learning_rate': 0.09668281240710329, 'num_leaves': 193, 'feature_fraction': 0.8973124568456569, 'bagging_fraction': 0.7636706377944557, 'bagging_freq': 6, 'lambda_l1': 2.0682078566393236e-06, 'lambda_l2': 4.010693015868048e-08, 'min_child_samples': 93}. Best is trial 1 with value: 20.261762012378707.


Early stopping, best iteration is:
[90]	valid_0's rmse: 51.7858
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's rmse: 6.57137




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[147]	valid_0's rmse: 6.69537




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[61]	valid_0's rmse: 14.9274




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[115]	valid_0's rmse: 21.4602




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:21:57,868] Trial 2 finished with value: 20.033935544129417 and parameters: {'max_depth': 7, 'learning_rate': 0.05577687233322113, 'num_leaves': 17, 'feature_fraction': 0.829194037180028, 'bagging_fraction': 0.9225599426641631, 'bagging_freq': 4, 'lambda_l1': 0.02860010825572879, 'lambda_l2': 1.9808836291076764e-05, 'min_child_samples': 58}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[113]	valid_0's rmse: 50.5154
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	valid_0's rmse: 6.99696




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[11]	valid_0's rmse: 6.72428




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[8]	valid_0's rmse: 16.0632




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[8]	valid_0's rmse: 22.6393




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:04,072] Trial 3 finished with value: 20.86674997030671 and parameters: {'max_depth': 12, 'learning_rate': 0.29689319577172774, 'num_leaves': 175, 'feature_fraction': 0.7068190035944791, 'bagging_fraction': 0.7792572750627231, 'bagging_freq': 6, 'lambda_l1': 6.795940271831482e-07, 'lambda_l2': 0.1722851789975223, 'min_child_samples': 16}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[36]	valid_0's rmse: 51.91
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[110]	valid_0's rmse: 6.51003




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[140]	valid_0's rmse: 6.65841




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[150]	valid_0's rmse: 14.8681




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[240]	valid_0's rmse: 21.5836




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:14,077] Trial 4 finished with value: 20.356029138075222 and parameters: {'max_depth': 5, 'learning_rate': 0.026470820914681496, 'num_leaves': 45, 'feature_fraction': 0.8414633165734596, 'bagging_fraction': 0.9739995332651091, 'bagging_freq': 5, 'lambda_l1': 1.6718233442831628e-07, 'lambda_l2': 6.464442786102374e-05, 'min_child_samples': 94}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[194]	valid_0's rmse: 52.16


[I 2025-08-16 07:22:14,328] Trial 5 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:14,597] Trial 6 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:14,846] Trial 7 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[26]	valid_0's rmse: 6.43593




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[51]	valid_0's rmse: 6.62771




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[50]	valid_0's rmse: 15.3605




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[45]	valid_0's rmse: 21.7571




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:23,030] Trial 8 finished with value: 20.325717208531785 and parameters: {'max_depth': 9, 'learning_rate': 0.08015733671604541, 'num_leaves': 81, 'feature_fraction': 0.6776515327150422, 'bagging_fraction': 0.6903101747469155, 'bagging_freq': 5, 'lambda_l1': 3.5125865416102053e-06, 'lambda_l2': 0.52466174328084, 'min_child_samples': 17}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[45]	valid_0's rmse: 51.4474


[I 2025-08-16 07:22:23,388] Trial 9 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:23,827] Trial 10 pruned. Trial was pruned at iteration 32.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	valid_0's rmse: 6.465




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[23]	valid_0's rmse: 6.72948




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[33]	valid_0's rmse: 14.776




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[57]	valid_0's rmse: 21.5079




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:29,747] Trial 11 finished with value: 20.072982544350175 and parameters: {'max_depth': 7, 'learning_rate': 0.11922569814296682, 'num_leaves': 236, 'feature_fraction': 0.9325899666682373, 'bagging_fraction': 0.8565103022305699, 'bagging_freq': 3, 'lambda_l1': 0.002550708670334646, 'lambda_l2': 3.5355963907930964e-08, 'min_child_samples': 81}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[84]	valid_0's rmse: 50.8865


[I 2025-08-16 07:22:30,041] Trial 12 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:30,603] Trial 13 pruned. Trial was pruned at iteration 28.
[I 2025-08-16 07:22:31,049] Trial 14 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:31,566] Trial 15 pruned. Trial was pruned at iteration 26.
[I 2025-08-16 07:22:32,168] Trial 16 pruned. Trial was pruned at iteration 17.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:32,541] Trial 17 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 6.53246




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[35]	valid_0's rmse: 6.68048




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[28]	valid_0's rmse: 15.074




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[76]	valid_0's rmse: 21.5226




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:39,264] Trial 18 finished with value: 20.144585756029237 and parameters: {'max_depth': 8, 'learning_rate': 0.09446657452122927, 'num_leaves': 54, 'feature_fraction': 0.8942831660554093, 'bagging_fraction': 0.8324188678920543, 'bagging_freq': 2, 'lambda_l1': 0.04945824415252874, 'lambda_l2': 0.0007414667948105555, 'min_child_samples': 58}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[57]	valid_0's rmse: 50.9133


[I 2025-08-16 07:22:39,593] Trial 19 pruned. Trial was pruned at iteration 18.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:39,912] Trial 20 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	valid_0's rmse: 6.51658




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[34]	valid_0's rmse: 6.67127




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[26]	valid_0's rmse: 15.0317




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[68]	valid_0's rmse: 21.746




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:48,033] Trial 21 finished with value: 20.056387243249212 and parameters: {'max_depth': 8, 'learning_rate': 0.09989531341970011, 'num_leaves': 50, 'feature_fraction': 0.8920819782482251, 'bagging_fraction': 0.8282413682400896, 'bagging_freq': 2, 'lambda_l1': 0.04094480765243066, 'lambda_l2': 0.002274016432235264, 'min_child_samples': 59}. Best is trial 2 with value: 20.033935544129417.


Early stopping, best iteration is:
[57]	valid_0's rmse: 50.3163


[I 2025-08-16 07:22:48,398] Trial 22 pruned. Trial was pruned at iteration 19.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:48,763] Trial 23 pruned. Trial was pruned at iteration 14.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:49,108] Trial 24 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:49,392] Trial 25 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:49,805] Trial 26 pruned. Trial was pruned at iteration 15.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:50,221] Trial 27 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	valid_0's rmse: 6.46435




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[38]	valid_0's rmse: 6.70618




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[21]	valid_0's rmse: 15.0948




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[38]	valid_0's rmse: 21.7139




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:56,463] Trial 28 finished with value: 19.895719895441136 and parameters: {'max_depth': 7, 'learning_rate': 0.11661680317061691, 'num_leaves': 221, 'feature_fraction': 0.9700547166052165, 'bagging_fraction': 0.9096473419099177, 'bagging_freq': 2, 'lambda_l1': 0.001769591711201634, 'lambda_l2': 3.107981628749321e-05, 'min_child_samples': 44}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[56]	valid_0's rmse: 49.4994


[I 2025-08-16 07:22:56,865] Trial 29 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:57,246] Trial 30 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:22:57,825] Trial 31 pruned. Trial was pruned at iteration 16.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	valid_0's rmse: 6.47822




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[39]	valid_0's rmse: 6.68586




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[33]	valid_0's rmse: 14.8068




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[58]	valid_0's rmse: 21.4429




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:04,949] Trial 32 finished with value: 20.323909580546804 and parameters: {'max_depth': 8, 'learning_rate': 0.10047619468210378, 'num_leaves': 252, 'feature_fraction': 0.920324461237389, 'bagging_fraction': 0.8985944213673542, 'bagging_freq': 2, 'lambda_l1': 0.025736616024086015, 'lambda_l2': 0.0002451722435986704, 'min_child_samples': 87}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[60]	valid_0's rmse: 52.2058
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	valid_0's rmse: 6.4117




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[47]	valid_0's rmse: 6.81942




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[18]	valid_0's rmse: 15.0211




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[19]	valid_0's rmse: 21.9867




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:10,325] Trial 33 finished with value: 20.19530791207269 and parameters: {'max_depth': 6, 'learning_rate': 0.16966833165850315, 'num_leaves': 234, 'feature_fraction': 0.9183534900401398, 'bagging_fraction': 0.7842582759267183, 'bagging_freq': 4, 'lambda_l1': 0.002431081096590307, 'lambda_l2': 8.37703451433159e-08, 'min_child_samples': 59}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[25]	valid_0's rmse: 50.7376


[I 2025-08-16 07:23:10,828] Trial 34 pruned. Trial was pruned at iteration 15.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:11,234] Trial 35 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:11,530] Trial 36 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 6.40671




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[40]	valid_0's rmse: 6.63424




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[24]	valid_0's rmse: 14.9104




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[44]	valid_0's rmse: 21.7077




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:17,752] Trial 37 finished with value: 20.320752775661823 and parameters: {'max_depth': 8, 'learning_rate': 0.1533589448373118, 'num_leaves': 157, 'feature_fraction': 0.8567709607316198, 'bagging_fraction': 0.9487851063871329, 'bagging_freq': 2, 'lambda_l1': 3.6238698963019246e-05, 'lambda_l2': 6.800189723000287e-07, 'min_child_samples': 93}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[38]	valid_0's rmse: 51.9447


[I 2025-08-16 07:23:18,088] Trial 38 pruned. Trial was pruned at iteration 12.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:18,413] Trial 39 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:18,691] Trial 40 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	valid_0's rmse: 6.43901




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[36]	valid_0's rmse: 6.67248




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[22]	valid_0's rmse: 15.0193




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[72]	valid_0's rmse: 21.5836




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:25,984] Trial 41 finished with value: 19.977120901117665 and parameters: {'max_depth': 8, 'learning_rate': 0.09862487881156086, 'num_leaves': 50, 'feature_fraction': 0.8996927576752928, 'bagging_fraction': 0.8306190881893254, 'bagging_freq': 2, 'lambda_l1': 0.0421978613116575, 'lambda_l2': 0.0007497604792109703, 'min_child_samples': 50}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[56]	valid_0's rmse: 50.1712


[I 2025-08-16 07:23:26,318] Trial 42 pruned. Trial was pruned at iteration 8.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	valid_0's rmse: 6.4598




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[30]	valid_0's rmse: 6.69049




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[16]	valid_0's rmse: 15.081




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[21]	valid_0's rmse: 21.8358




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:31,018] Trial 43 finished with value: 20.139047913898946 and parameters: {'max_depth': 9, 'learning_rate': 0.1400599950931655, 'num_leaves': 82, 'feature_fraction': 0.908848352888028, 'bagging_fraction': 0.8520094762064746, 'bagging_freq': 1, 'lambda_l1': 0.020821148853852233, 'lambda_l2': 0.005142646841695294, 'min_child_samples': 52}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[36]	valid_0's rmse: 50.6282


[I 2025-08-16 07:23:31,309] Trial 44 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:31,617] Trial 45 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:31,962] Trial 46 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:32,250] Trial 47 pruned. Trial was pruned at iteration 5.


Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	valid_0's rmse: 6.48774




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[56]	valid_0's rmse: 6.61885




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[30]	valid_0's rmse: 15.227




Training until validation scores don't improve for 50 rounds




Early stopping, best iteration is:
[32]	valid_0's rmse: 21.9836




Training until validation scores don't improve for 50 rounds


[I 2025-08-16 07:23:38,940] Trial 48 finished with value: 20.144017118461687 and parameters: {'max_depth': 7, 'learning_rate': 0.10606505439779688, 'num_leaves': 169, 'feature_fraction': 0.9792501305145072, 'bagging_fraction': 0.8921362070076903, 'bagging_freq': 3, 'lambda_l1': 0.11119602731102596, 'lambda_l2': 0.0015588233545650275, 'min_child_samples': 27}. Best is trial 28 with value: 19.895719895441136.


Early stopping, best iteration is:
[61]	valid_0's rmse: 50.4029


[I 2025-08-16 07:23:39,264] Trial 49 pruned. Trial was pruned at iteration 13.


Training until validation scores don't improve for 50 rounds
튜닝 완료!
최적의 하이퍼파라미터: {'max_depth': 7, 'learning_rate': 0.11661680317061691, 'num_leaves': 221, 'feature_fraction': 0.9700547166052165, 'bagging_fraction': 0.9096473419099177, 'bagging_freq': 2, 'lambda_l1': 0.001769591711201634, 'lambda_l2': 3.107981628749321e-05, 'min_child_samples': 44}
최적 RMSE: 19.895719895441136


In [32]:
# ===== 6. 최적 파라미터로 최종 모델 학습 =====
print("최종 모델 학습 시작...")
best_params = study.best_params
best_params.update(
    {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        "device": "cuda" if HAS_CUDA else "cpu",
    }
)

# 최적 파라미터로 최적 학습 횟수(best_iteration)를 다시 찾습니다.
last_tr_idx, last_va_idx = list(tscv.split(X))[-1]
X_tr, X_va = X.iloc[last_tr_idx], X.iloc[last_va_idx]
y_tr, y_va = y.iloc[last_tr_idx], y.iloc[last_va_idx]
dtr, dva = xgb.DMatrix(X_tr, label=y_tr), xgb.DMatrix(X_va, label=y_va)
booster = xgb.train(
    best_params,
    dtr,
    num_boost_round=5000,
    evals=[(dva, "val")],
    early_stopping_rounds=100,
    verbose_eval=False,
)
best_iter = booster.best_iteration
print(f"최적 파라미터로 찾은 학습 횟수: {best_iter}")

# 모든 학습 데이터를 사용하여 최종 모델을 만듭니다.
dall = xgb.DMatrix(X, label=y)
final_model = xgb.train(
    best_params, dall, num_boost_round=best_iter, verbose_eval=False
)
print("최종 모델 학습 완료.")


최종 모델 학습 시작...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[56]	valid_0's rmse: 49.499
최적 파라미터로 찾은 학습 횟수: 56
최종 모델 학습 완료.


In [33]:
# ===== 7. 재귀 예측 및 제출 파일 생성 =====
print("재귀 예측 및 제출 파일 생성 시작...")
all_preds = []
full_history = train.copy()  # 재귀 예측에 사용할 전체 과거 데이터를 미리 복사해 둡니다.

for test_name, test_df in tests.items():
    test_df = test_df.copy()
    test_df["item_id"] = le.transform(test_df["영업장명_메뉴명"])
    test_df = make_date_feats(test_df)

    # 예측의 기반이 될 과거 데이터를 매번 새로 만듭니다. (학습 데이터 + 해당 테스트 데이터)
    history = pd.concat([full_history, test_df], ignore_index=True)
    history = history.sort_values(["item_id", "영업일자"])

    last_date = test_df["영업일자"].max()
    items = test_df["영업장명_메뉴명"].unique()

    preds_rows = []
    current_date = last_date
    for step in range(1, 8):  # 7일간 하루씩 예측을 반복합니다.
        target_date = current_date + pd.Timedelta(days=1)

        # 1. 예측할 날짜의 기본 프레임(뼈대) 생성
        frame = pd.DataFrame(
            {"영업일자": np.repeat(target_date, len(items)), "영업장명_메뉴명": items}
        )
        frame["item_id"] = le.transform(frame["영업장명_메뉴명"])
        frame = make_date_feats(frame)

        # 2. 업데이트된 'history'를 사용하여 Lag & Rolling 특징 계산
        temp_hist = history.copy()
        for lag in [1, 7, 14, 28]:
            lagged = temp_hist[["영업일자", "item_id", "매출수량"]].copy()
            lagged["영업일자"] = lagged["영업일자"] + pd.Timedelta(days=lag)
            frame = frame.merge(
                lagged.rename(columns={"매출수량": f"lag{lag}"}),
                on=["영업일자", "item_id"],
                how="left",
            )

        roll_base = temp_hist.sort_values(["item_id", "영업일자"]).copy()
        gb = roll_base.groupby("item_id")["매출수량"]
        roll_base["roll7_mean"] = gb.rolling(7).mean().reset_index(0, drop=True)
        roll_base["roll14_mean"] = gb.rolling(14).mean().reset_index(0, drop=True)
        roll_base["roll7_std"] = gb.rolling(7).std().reset_index(0, drop=True)
        roll_base["영업일자"] = roll_base["영업일자"] + pd.Timedelta(days=1)
        frame = frame.merge(
            roll_base[
                ["영업일자", "item_id", "roll7_mean", "roll14_mean", "roll7_std"]
            ],
            on=["영업일자", "item_id"],
            how="left",
        )

        frame[feature_cols] = frame[feature_cols].fillna(0)  # 결측치는 0으로 채움

        # 3. 모델로 예측 수행
        X_pred = frame[feature_cols]
        dpred = xgb.DMatrix(X_pred)
        yhat = final_model.predict(dpred)
        yhat = np.clip(yhat, 0, None)  # 매출이 음수가 나오지 않도록 0으로 조정
        frame["pred"] = yhat

        # 4. 예측값을 history에 추가하여 다음 날 예측에 사용 (재귀의 핵심)
        add_hist = frame[["영업일자", "item_id", "영업장명_메뉴명", "pred"]].rename(
            columns={"pred": "매출수량"}
        )
        history = pd.concat([history, add_hist], ignore_index=True)

        # 5. 최종 제출용으로 결과 저장
        frame_out = frame[["영업일자", "영업장명_메뉴명", "pred"]].copy()
        frame_out["영업일자"] = f"{test_name}+{step}일"
        preds_rows.append(frame_out)

        current_date = target_date  # 기준 날짜를 하루 뒤로 업데이트

    test_pred = pd.concat(preds_rows, ignore_index=True)
    wide = test_pred.pivot(index="영업일자", columns="영업장명_메뉴명", values="pred")
    all_preds.append(wide)

재귀 예측 및 제출 파일 생성 시작...


In [34]:
all_preds

[영업장명_메뉴명    느티나무 셀프BBQ_1인 수저세트  느티나무 셀프BBQ_BBQ55(단체)  느티나무 셀프BBQ_대여료 30,000원  \
 영업일자                                                                           
 TEST_00+1일           11.436474              3.104638                5.229937   
 TEST_00+2일            4.601531              4.908819                2.297441   
 TEST_00+3일            5.057312              2.979447                2.678973   
 TEST_00+4일            4.993417             14.626269                2.984063   
 TEST_00+5일            5.373074             14.963423                3.297331   
 TEST_00+6일            7.222497             11.357335                5.951017   
 TEST_00+7일           20.280096              6.671472               11.686394   
 
 영업장명_메뉴명    느티나무 셀프BBQ_대여료 60,000원  느티나무 셀프BBQ_대여료 90,000원  \
 영업일자                                                         
 TEST_00+1일                3.546378                0.694694   
 TEST_00+2일                1.435863                0.533181   
 TEST_00+3일      

In [35]:
# ===== 8. 최종 제출 파일 생성 =====
submission = pd.concat(all_preds)
submission = submission.reset_index().rename(columns={"index": "영업일자"})
submission = submission[sample.columns]  # 제출 샘플과 열 순서/이름을 정확히 일치시킴
out_path = "new_submission.csv"
submission.to_csv(out_path, index=False, encoding="utf-8-sig")

print(f"✅ 최종 제출 파일 저장 완료: {out_path}")


✅ 최종 제출 파일 저장 완료: new_submission.csv
