# Import

## Import Files

In [None]:
!pip install gdown



In [None]:
import gdown
import os

# Google Drive id of the archive and the local file name it is saved under.
file_id = '1_Xo2vU82JSSadBdD1Kb7iImHnEYdoFGh'
output = 'open.zip'

# Skip the download when the file is already present in the working directory.
if os.path.exists(output):
    print(f"'{output}' 파일이 이미 존재합니다. 다운로드를 건너뜁니다.")
else:
    print(f"'{output}' 파일이 없어 다운로드를 시작합니다.")
    gdown.download(id=file_id, output=output)

'open.zip' 파일이 이미 존재합니다. 다운로드를 건너뜁니다.


In [None]:
# !unzip -qq '/파일 경로/파일명.zip' -d '저장할 dir 위치 경로'
!unzip -qq '/content/open.zip' -d '/content/'

replace /content/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A


## Baseline

In [None]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from tqdm import tqdm
import holidays
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV

# Fix Random Seed & Set Hyperparameters

## Baseline

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU), and exports
    ``PYTHONHASHSEED``. When CUDA is available it also seeds the GPU generators and
    switches cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed: integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)        # current device
    torch.cuda.manual_seed_all(seed)    # every visible device
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed everything once with the notebook-wide seed.
set_seed(42)

In [None]:
"""
LOOKBACK=28: 과거 28일치 데이터를 보고
PREDICT=7: 미래 7일치를 예측
BATCH_SIZE=16: 한번에 16개의 data를 묶어 학습
EPOCHS=50: 전체 데이터를 50번 반복학습
"""
# Window sizes and training hyperparameters.
LOOKBACK = 28     # days of history used as input
PREDICT = 7       # days to forecast
BATCH_SIZE = 16   # samples per batch
EPOCHS = 50       # passes over the training data

# Use the GPU when one is available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Data Load

In [None]:
# Read the training CSV; the path is relative to the current working directory.
train = pd.read_csv('./train/train.csv')

In [None]:

import holidays

def dataPreProcessing(df):
    """Standardize column names and derive calendar and menu features.

    Args:
        df: DataFrame with either the raw Korean columns ('영업일자', '영업장명_메뉴명',
            '매출수량') or the already renamed 'date_time' / 'market_menu' columns.

    Returns:
        A new DataFrame with these columns added: weekday, weekend, month, dayofweek, day,
        holiday, vacation_season, store_name, menu_name, usage_type, is_premium and
        usage_type_encoded. The caller's DataFrame is never modified.
    """
    # 1. Normalize column names. rename() already returns a new frame; copy explicitly in the
    #    other branch so the column assignments below cannot leak into the caller's frame.
    if '영업일자' in df.columns:
        df = df.rename(columns={
            '영업일자': 'date_time',
            '영업장명_메뉴명': 'market_menu',
            '매출수량': 'sales_amount'
        })
    else:
        df = df.copy()

    # 2. Calendar features
    df['date_time'] = pd.to_datetime(df['date_time'])
    df['weekday'] = df['date_time'].dt.weekday
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['month'] = df['date_time'].dt.month
    df['dayofweek'] = df['date_time'].dt.dayofweek  # same values as 'weekday'; kept for existing consumers
    df['day'] = df['date_time'].dt.day

    # 3. Public holidays (1 when the date is contained in holidays.KR())
    kr_holidays = holidays.KR()
    df['holiday'] = df['date_time'].dt.date.apply(lambda d: int(d in kr_holidays))

    # 4. Vacation season (January, February, July, August)
    df['vacation_season'] = df['month'].isin([1, 2, 7, 8]).astype(int)

    # 5. Split "<store>_<menu>" at the first underscore
    df[['store_name', 'menu_name']] = df['market_menu'].str.split('_', n=1, expand=True)

    # 6. Heuristic usage type from menu-name keywords. Checks run in order, so the group
    #    keywords (e.g. '세트') take precedence over '2인' / '1인'.
    def classify_usage_type(menu):
        if pd.isna(menu): return '기타'
        if '어린이' in menu: return '어린이'
        elif re.search(r'단체|플래터|무제한|[3-9]인|인분|세트', str(menu)): return '단체'
        elif '2인' in menu: return '커플'
        elif '1인' in menu or '단품' in menu or 'Gls' in menu: return '1인'
        else: return '일반'

    df['usage_type'] = df['menu_name'].apply(classify_usage_type)

    # 7. Premium-menu flag. na=False maps missing names straight to False, avoiding the
    #    object-dtype fillna() downcast that newer pandas versions deprecate.
    df['is_premium'] = df['menu_name'].str.contains('한우|프리미엄|수제|특선|와인', na=False).astype(int)

    # 8. Integer encoding of usage_type (unknown labels fall back to 5)
    usage_map = {'1인': 0, '커플': 1, '단체': 2, '어린이': 3, '일반': 4, '기타': 5}
    df['usage_type_encoded'] = df['usage_type'].map(usage_map).fillna(5).astype(int)

    return df


In [None]:
# Per-menu time-series features
def add_ts_features(df, lags=(1,7,14), roll_window=7):
    """Add per-menu lag and trailing rolling statistics of ``sales_amount``.

    Args:
        df: frame with 'market_menu', 'date_time' and 'sales_amount' columns.
        lags: row offsets; each one adds a ``lag_<n>`` column.
        roll_window: window length of the trailing mean/std. The output columns are always
            named 'rolling_mean_7' / 'rolling_std_7', whatever the window length.

    Returns:
        A copy sorted by (market_menu, date_time) with the new columns. In every int/float
        column, +/-inf and NaN are replaced by 0.
    """
    df = df.sort_values(['market_menu','date_time']).copy()
    grouped = df.groupby('market_menu')['sales_amount']

    for lag in lags:
        df[f'lag_{lag}'] = grouped.shift(lag)

    # shift(1) keeps the current row's own sales out of its window. transform() returns a
    # result indexed exactly like df on every pandas version, whereas apply() followed by
    # reset_index(level=0) depends on whether apply() prepends the group key.
    df['rolling_mean_7'] = grouped.transform(
        lambda s: s.shift(1).rolling(roll_window, min_periods=1).mean())
    df['rolling_std_7'] = grouped.transform(
        lambda s: s.shift(1).rolling(roll_window, min_periods=1).std())

    # Minimal numeric clean-up: neutralize inf/NaN in int and float columns only.
    num_cols = [c for c in df.columns if df[c].dtype.kind in 'if']
    df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
    return df

In [None]:

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2); elements whose
    denominator is not positive (e.g. both values are 0) contribute 0.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    half_scale = (np.abs(actual) + np.abs(forecast)) / 2.0
    # Divide only where the denominator is positive; all other slots keep the initial 0.
    ratios = np.divide(
        np.abs(actual - forecast),
        half_scale,
        out=np.zeros_like(half_scale),
        where=half_scale > 0,
    )
    return ratios.mean() * 100.0

def add_days_since_last_sale(df):
    """Add ``days_since_last_sale``: days since the menu's previous positive-sales row.

    Only strictly earlier rows of the same menu are considered. If the current row's own
    sale were included, the feature would be 0 exactly when ``sales_amount > 0``, which
    hands the target to any model trained on it and cannot be reproduced for future rows
    whose sales are still unknown.

    Args:
        df: frame with 'market_menu', 'date_time' (datetime) and 'sales_amount'.

    Returns:
        A copy sorted by (market_menu, date_time) in which 'sales_amount' is coerced to a
        non-negative number (invalid values become 0) and 'days_since_last_sale' is an int
        column; 999 marks rows with no earlier positive sale.
    """
    # Sort and sanitize the label.
    df = df.sort_values(['market_menu','date_time']).copy()
    df['sales_amount'] = pd.to_numeric(df['sales_amount'], errors='coerce').fillna(0).clip(lower=0)

    menu = df['market_menu']
    # Date of each positive-sales row, NaT elsewhere.
    pos_date = df['date_time'].where(df['sales_amount'] > 0)
    # Shift by one row within each menu so a row never sees its own sale, then carry the
    # most recent positive date forward.
    prev_pos_date = pos_date.groupby(menu).shift(1)
    last_pos_date = prev_pos_date.groupby(menu).ffill()

    # Vectorized day difference; NaT (no earlier sale) becomes the 999 sentinel.
    delta = df['date_time'] - last_pos_date
    df['days_since_last_sale'] = delta.dt.days.fillna(999).astype(int)
    return df



In [None]:

# Standardize columns and derive calendar/menu features from the raw training table.
train_df = dataPreProcessing(train)
# Add the per-menu lag and rolling-window features.
train_feat = add_ts_features(train_df)

# Add the days-since-last-sale feature on top of the time-series features.
train_feat = add_days_since_last_sale(train_feat)
# The cell displays train_df (the preprocessed frame), not train_feat.
train_df

Unnamed: 0,date_time,market_menu,sales_amount,weekday,weekend,month,dayofweek,day,holiday,vacation_season,store_name,menu_name,usage_type,is_premium,usage_type_encoded
0,2023-01-01,느티나무 셀프BBQ_1인 수저세트,0,6,1,1,6,1,1,1,느티나무 셀프BBQ,1인 수저세트,단체,0,2
1,2023-01-02,느티나무 셀프BBQ_1인 수저세트,0,0,0,1,0,2,0,1,느티나무 셀프BBQ,1인 수저세트,단체,0,2
2,2023-01-03,느티나무 셀프BBQ_1인 수저세트,0,1,0,1,1,3,0,1,느티나무 셀프BBQ,1인 수저세트,단체,0,2
3,2023-01-04,느티나무 셀프BBQ_1인 수저세트,0,2,0,1,2,4,0,1,느티나무 셀프BBQ,1인 수저세트,단체,0,2
4,2023-01-05,느티나무 셀프BBQ_1인 수저세트,0,3,0,1,3,5,0,1,느티나무 셀프BBQ,1인 수저세트,단체,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102671,2024-06-11,화담숲카페_현미뻥스크림,12,1,0,6,1,11,0,0,화담숲카페,현미뻥스크림,일반,0,4
102672,2024-06-12,화담숲카페_현미뻥스크림,10,2,0,6,2,12,0,0,화담숲카페,현미뻥스크림,일반,0,4
102673,2024-06-13,화담숲카페_현미뻥스크림,14,3,0,6,3,13,0,0,화담숲카페,현미뻥스크림,일반,0,4
102674,2024-06-14,화담숲카페_현미뻥스크림,12,4,0,6,4,14,0,0,화담숲카페,현미뻥스크림,일반,0,4


In [None]:



# Hold out the last N rows of every market_menu series as validation, keeping time order.
def time_series_train_val_split(df, val_days=7):
    """Split each menu's series into a training head and a validation tail.

    Args:
        df: frame with 'market_menu' and 'date_time' columns.
        val_days: number of trailing rows per menu to hold out.

    Returns:
        (train_df, val_df). Menus with at most ``val_days`` rows go entirely to train.
        Either frame may be empty (with the input's columns) instead of raising.
    """
    train_list = []
    val_list = []

    for _, group in df.groupby('market_menu'):
        group = group.sort_values('date_time')
        # Too short to spare a hold-out, or no hold-out requested: keep everything in train.
        # (val_days == 0 would otherwise select the whole group via iloc[-0:].)
        if val_days <= 0 or len(group) <= val_days:
            train_list.append(group)
            continue

        train_list.append(group.iloc[:-val_days])
        val_list.append(group.iloc[-val_days:])

    # pd.concat raises on an empty list; fall back to an empty frame with the same columns.
    empty = df.iloc[0:0]
    train_df = pd.concat(train_list) if train_list else empty.copy()
    val_df = pd.concat(val_list) if val_list else empty.copy()
    return train_df, val_df


In [None]:
# Coerce sales to numbers, treat unparseable values as 0 and clip negatives to 0.
# NOTE(review): this runs after train_feat was already built from train_df in an earlier
# cell, so it does not affect the lag/rolling columns of train_feat - confirm that is intended.
train_df['sales_amount'] = (
    pd.to_numeric(train_df['sales_amount'], errors='coerce')
      .fillna(0)
      .clip(lower=0)
)

In [None]:
# Model input columns: calendar/menu flags plus the lag, rolling and recency features.
feature_cols = ['weekend','holiday','vacation_season','usage_type_encoded',
                'is_premium','lag_1','lag_7','lag_14','rolling_mean_7','rolling_std_7','days_since_last_sale']


# Hold out the last 7 rows of every menu as validation.
train_split, val_split = time_series_train_val_split(train_feat, val_days=7)

# Define and fit the baseline regressor.
model = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(train_split[feature_cols], train_split['sales_amount'])

# Score the hold-out rows; features here were computed from the actual past sales.
y_pred = model.predict(val_split[feature_cols])
print(f"Validation SMAPE: {smape(val_split['sales_amount'], y_pred):.2f}")



Validation SMAPE: 138.53


In [None]:
# Local CV over the most recent periods, respecting time order.
def time_series_local_cv(df, feature_cols, n_splits=5, val_days=7):
    """Expanding-window CV: fold i validates on the i-th most recent ``val_days`` block.

    For each fold the split point is the date ``val_days * (i + 1)`` positions from the end
    of the sorted unique dates. Rows before it train a fresh XGBRegressor, and the following
    ``val_days`` calendar days are scored with SMAPE.

    Args:
        df: feature frame containing 'market_menu', 'date_time', 'sales_amount' and
            every column in ``feature_cols``.
        feature_cols: model input columns.
        n_splits: maximum number of folds; stops early when history runs out.
        val_days: validation window length; must be at least 1.

    Returns:
        list of per-fold SMAPE scores (possibly shorter than ``n_splits``).

    Raises:
        ValueError: if ``val_days`` is smaller than 1.
    """
    if val_days < 1:
        raise ValueError("val_days must be >= 1")

    df = df.sort_values(['market_menu', 'date_time'])
    smape_scores = []

    # A DatetimeIndex always yields pd.Timestamp elements, so the Timedelta arithmetic and
    # .date() below do not depend on what Series.unique() returns in the installed pandas.
    unique_dates = pd.DatetimeIndex(df['date_time'].unique()).dropna().sort_values()

    for i in range(n_splits):
        offset = val_days * (i + 1)
        # At least one earlier date is needed for training; otherwise the index is out of
        # range or the training set is empty.
        if offset >= len(unique_dates):
            print(f"Fold {i+1} skipped: not enough history for a {offset}-day offset.")
            break
        split_point = unique_dates[-offset]

        train_split = df[df['date_time'] < split_point]
        val_split = df[(df['date_time'] >= split_point) &
                       (df['date_time'] < split_point + pd.Timedelta(days=val_days))]

        model = XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
        model.fit(train_split[feature_cols], train_split['sales_amount'])

        y_pred = model.predict(val_split[feature_cols])
        score = smape(val_split['sales_amount'], y_pred)
        smape_scores.append(score)

        print(f"Fold {i+1} | Validation Date from {split_point.date()} → SMAPE: {score:.2f}")

    avg_score = np.mean(smape_scores) if smape_scores else float('nan')
    print(f"\n📊 Average Local CV SMAPE (n={len(smape_scores)} folds): {avg_score:.2f}")
    return smape_scores


In [None]:
# Run the local CV on the engineered training features (5 folds, 7-day validation windows).
# The returned list of fold scores is displayed as the cell output.
time_series_local_cv(train_feat, feature_cols, n_splits=5, val_days=7)

Fold 1 | Validation Date from 2024-06-09 → SMAPE: 140.34
Fold 2 | Validation Date from 2024-06-02 → SMAPE: 132.85
Fold 3 | Validation Date from 2024-05-26 → SMAPE: 135.35
Fold 4 | Validation Date from 2024-05-19 → SMAPE: 129.40
Fold 5 | Validation Date from 2024-05-12 → SMAPE: 132.79

📊 Average Local CV SMAPE (n=5 folds): 134.15


[np.float64(140.34055895783737),
 np.float64(132.85075417186223),
 np.float64(135.34844496133425),
 np.float64(129.4039144348293),
 np.float64(132.7859433018116)]

# Define Model

## Baseline

# Train

## Baseline

# Prediction

## Baseline

## gagyeomkim

## KGY

In [None]:

# Step 6: predict on the validation split (model, val_split and feature_cols come from
# the baseline training cell).
y_pred = model.predict(val_split[feature_cols])

# Evaluate with SMAPE.
score = smape(val_split['sales_amount'], y_pred)
print(f"Validation SMAPE: {score:.2f}")


Validation SMAPE: 138.53


In [None]:
def build_test_features(train_df, raw_test_df, lags=(1,7,14), roll_window=7):
    """Build model features for a test block from the test rows alone.

    Args:
        train_df: not used by this function; kept in the signature for callers.
        raw_test_df: raw test frame; passed through ``dataPreProcessing``.
        lags: forwarded to ``add_ts_features``.
        roll_window: forwarded to ``add_ts_features``.

    Returns:
        A feature frame sorted by (market_menu, date_time). When the test frame has no
        'sales_amount' column, one filled with 0.0 is added before features are derived.
    """
    # Shared preprocessing (column renaming, calendar features, ...).
    prepped = dataPreProcessing(raw_test_df).copy()

    # Use the real sales when the block has them; otherwise fall back to zeros.
    if 'sales_amount' not in prepped.columns:
        prepped['sales_amount'] = 0.0

    featured = add_days_since_last_sale(
        add_ts_features(
            prepped.sort_values(['market_menu','date_time']),
            lags=lags,
            roll_window=roll_window,
        )
    )

    # Keep rows whose date and menu both occur in the preprocessed test frame.
    date_ok = featured['date_time'].isin(prepped['date_time'])
    menu_ok = featured['market_menu'].isin(prepped['market_menu'])
    return featured[date_ok & menu_ok].copy()



In [None]:
def block_cv_28to7(df_prepped, feature_cols, n_folds=5, ctx_days=28, horizon=7, gap=0):
    """
    Block CV: [ctx_days of context] -> [next `horizon` days], scored with an
    autoregressive rollout that never sees the validation targets.

    Fold i takes the block of ``ctx_days + gap + horizon`` dates that lies i blocks back
    from the end of the data. The model is trained only on rows dated before the context
    start. Inference features are built from the context window alone, and each predicted
    day is written back as ``sales_amount`` so later steps use predictions instead of the
    true values (no teacher forcing).

    Args:
        df_prepped: frame with 'market_menu', 'date_time', 'sales_amount' and the non-lag
            columns in ``feature_cols`` (expected to be the output of dataPreProcessing).
        feature_cols: model input columns.
        n_folds: maximum number of folds; the loop stops early when history runs out.
        ctx_days: context window length in days.
        horizon: number of days to forecast and score.
        gap: days skipped between the context end and the validation start.

    Returns:
        list of per-fold SMAPE values (possibly shorter than ``n_folds``).
    """
    df_prepped = df_prepped.sort_values(['market_menu','date_time']).copy()
    # A DatetimeIndex always yields pd.Timestamp elements, so the Timedelta arithmetic and
    # .date() calls below do not depend on what Series.unique() returns.
    uniq = pd.DatetimeIndex(df_prepped['date_time'].unique()).dropna().sort_values()
    scores = []
    need = ctx_days + horizon + gap

    for i in range(1, n_folds+1):
        # "<=" rather than "<": when the block starts on the very first date there is no
        # earlier row to train on, and fitting on an empty frame would fail.
        if len(uniq) <= need * i:
            break

        # NOTE(review): the block start is picked by position in the unique dates while
        # the windows below are calendar-day based; assumes one row per day - confirm.
        ctx_start = uniq[-need * i]
        ctx_end = ctx_start + pd.Timedelta(days=ctx_days-1)
        val_start = ctx_end + pd.Timedelta(days=1+gap)
        val_end   = val_start + pd.Timedelta(days=horizon-1)

        # -------------------------
        # 1) Training: only rows before the context start
        # -------------------------
        train_part = df_prepped[df_prepped['date_time'] < ctx_start].copy()
        tr_feat  = add_ts_features(train_part)
        tr_feat  = add_days_since_last_sale(tr_feat)
        tr_feat['sales_amount'] = pd.to_numeric(tr_feat['sales_amount'], errors='coerce').fillna(0).clip(lower=0)

        model = XGBRegressor(
            n_estimators=600, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
        )
        model.fit(tr_feat[feature_cols], tr_feat['sales_amount'])

        # -------------------------
        # 2) Inference: context window only
        # -------------------------
        ctx_only = df_prepped[(df_prepped['date_time'] >= ctx_start) &
                              (df_prepped['date_time'] <= ctx_end)].copy()

        # Features derived from the context alone.
        combo = ctx_only.sort_values(['market_menu','date_time']).copy()
        combo = add_ts_features(combo)
        combo = add_days_since_last_sale(combo)

        # Ground truth for scoring only; never used to build features.
        val_truth = df_prepped[(df_prepped['date_time'] >= val_start) &
                               (df_prepped['date_time'] <= val_end)][['market_menu','date_time','sales_amount']].copy()

        # Predict only menus present in both the context and the validation window.
        menus = sorted(set(combo['market_menu']) & set(val_truth['market_menu']))

        preds = []
        for h in range(horizon):
            fut_date = val_start + pd.Timedelta(days=h)

            # One placeholder row per menu for the future day, with calendar/menu features.
            fut = pd.DataFrame({
                'date_time': [fut_date]*len(menus),
                'market_menu': menus,
                'sales_amount': 0.0
            })
            fut = dataPreProcessing(fut)[[
                'date_time','market_menu','sales_amount',
                'weekend','holiday','vacation_season',
                'usage_type_encoded','is_premium'
            ]]

            # Append to context + earlier predictions, then refresh the lag/rolling features.
            combo = pd.concat([combo, fut], ignore_index=True, sort=False)
            combo = combo.sort_values(['market_menu','date_time'])
            combo = add_ts_features(combo)
            combo = add_days_since_last_sale(combo)

            mask = combo['date_time'].eq(fut_date)
            Xf = (combo.loc[mask, feature_cols]
                        .apply(pd.to_numeric, errors='coerce')
                        .fillna(0).astype(float))
            y_hat = model.predict(Xf).clip(0)

            # Record the prediction so the next step's lags use it instead of the truth.
            combo.loc[mask, 'sales_amount'] = y_hat

            preds.append(pd.DataFrame({
                'market_menu': combo.loc[mask, 'market_menu'].values,
                'date_time': fut_date,
                'pred': y_hat
            }))

        pred_df = pd.concat(preds, ignore_index=True)
        merged = val_truth.merge(pred_df, on=['market_menu','date_time'], how='left')
        # Truth rows without a prediction (menu absent from the context) are scored as 0.
        merged['pred'] = merged['pred'].fillna(0.0)

        s_score = smape(merged['sales_amount'].to_numpy(), merged['pred'].to_numpy())
        print(f"Fold{i}: ctx {ctx_start.date()}~{ctx_end.date()} → val {val_start.date()}~{val_end.date()} | SMAPE {s_score:.4f}")
        scores.append(s_score)

    if scores:
        print("Avg SMAPE:", np.mean(scores))
    return scores




In [None]:
from xgboost import XGBClassifier, XGBRegressor

from xgboost import XGBClassifier, XGBRegressor
from sklearn.calibration import CalibratedClassifierCV
import numpy as np
import pandas as pd

def block_cv_28to7_2stage(
    df_prepped, feature_cols, n_folds=5, ctx_days=28, horizon=7, gap=0,
    tau_grid=(0.3, 0.4, 0.5, 0.6), default_tau=0.5,
    calibrate=False,          # wrap the classifier in Platt (sigmoid) calibration
    use_soft=True,            # soft gating: y_hat * p_pos**alpha
    alpha=0.35                # soft-gating exponent
):
    """
    Two-stage block CV: [ctx_days of context] -> [next `horizon` days], autoregressive
    rollout with no access to validation targets while forecasting.

    Stage 1 is a classifier for P(sales > 0); stage 2 is a regressor on log1p(sales)
    trained with sample weights 1 / (1 + sales). Fold layout and training cut-off are the
    same as in the one-stage variant: train strictly before the context start, build
    inference features from the context window only.

    Args:
        df_prepped: frame with 'market_menu', 'date_time', 'sales_amount' and the non-lag
            columns in ``feature_cols`` (expected to be the output of dataPreProcessing).
        feature_cols: model input columns.
        n_folds: maximum number of folds; stops early when history runs out.
        ctx_days, horizon, gap: context length, forecast length and gap, in days.
        tau_grid: candidate thresholds tried per menu in hard mode.
        default_tau: threshold used in hard mode when a menu has no tuned value.
        calibrate: if True, probabilities come from CalibratedClassifierCV(sigmoid, cv=3).
        use_soft: if True the final prediction is ``y_hat * p**alpha``; otherwise
            predictions with ``p < tau`` are zeroed.
        alpha: exponent of the soft gate.

    Returns:
        list of per-fold SMAPE values (possibly shorter than ``n_folds``).
    """
    df_prepped = df_prepped.sort_values(['market_menu','date_time']).copy()
    # A DatetimeIndex always yields pd.Timestamp elements, so the Timedelta arithmetic and
    # .date() calls below do not depend on what Series.unique() returns.
    uniq = pd.DatetimeIndex(df_prepped['date_time'].unique()).dropna().sort_values()
    scores = []
    need = ctx_days + horizon + gap

    def _to_float(X):
        # Force every feature column to float, mapping unparseable values to 0.
        return X.apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)

    for i in range(1, n_folds+1):
        # "<=" rather than "<": when the block starts on the very first date there is no
        # earlier row to train on, and fitting on an empty frame would fail.
        if len(uniq) <= need * i:
            break

        ctx_start = uniq[-need * i]
        ctx_end = ctx_start + pd.Timedelta(days=ctx_days-1)
        val_start = ctx_end + pd.Timedelta(days=1+gap)
        val_end   = val_start + pd.Timedelta(days=horizon-1)

        # -------------------------
        # 1) Training: only rows before the context start
        # -------------------------
        train_part = df_prepped[df_prepped['date_time'] < ctx_start].copy()
        tr_feat = add_ts_features(train_part)
        tr_feat = add_days_since_last_sale(tr_feat)

        y = (pd.to_numeric(tr_feat['sales_amount'], errors='coerce')
               .fillna(0).clip(lower=0).to_numpy())
        X_tr = _to_float(tr_feat[feature_cols])
        y_pos = (y > 0).astype(int)

        # Classifier, weighted by the negative/positive ratio to offset class imbalance.
        pos_rate = (y > 0).mean()
        neg_pos  = (1 - pos_rate) / max(pos_rate, 1e-6)

        clf_base = XGBClassifier(
            n_estimators=500, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1,
            scale_pos_weight=neg_pos
        )
        # With cv=3 the calibrator fits its own clones of clf_base, so the base model is
        # fitted directly only when no calibration is requested.
        if calibrate:
            prob_model = CalibratedClassifierCV(clf_base, method='sigmoid', cv=3)
        else:
            prob_model = clf_base
        prob_model.fit(X_tr, y_pos)

        def proba(X, _model=prob_model):
            return _model.predict_proba(_to_float(X))[:, 1]

        # Regressor on log1p(sales); the weights de-emphasize large sales values.
        w = 1.0 / (1.0 + y)
        t = np.log1p(y)
        reg = XGBRegressor(
            n_estimators=800, max_depth=7, learning_rate=0.04,
            subsample=0.8, colsample_bytree=0.9, random_state=42, n_jobs=-1
        )
        reg.fit(X_tr, t, sample_weight=w)

        # -------------------------
        # 2) Inference: context window only (autoregressive rollout)
        # -------------------------
        ctx_only = df_prepped[(df_prepped['date_time'] >= ctx_start) &
                              (df_prepped['date_time'] <= ctx_end)].copy()

        combo = ctx_only.sort_values(['market_menu','date_time']).copy()
        combo = add_ts_features(combo)
        combo = add_days_since_last_sale(combo)

        # Ground truth for scoring only; never used to build features.
        val_truth = df_prepped[(df_prepped['date_time'] >= val_start) &
                               (df_prepped['date_time'] <= val_end)][['market_menu','date_time','sales_amount']].copy()
        menus = sorted(set(combo['market_menu']) & set(val_truth['market_menu']))

        rows = []
        for h in range(horizon):
            fut_date = val_start + pd.Timedelta(days=h)

            # One placeholder row per menu for the future day.
            fut = pd.DataFrame({
                'date_time': [fut_date]*len(menus),
                'market_menu': menus,
                'sales_amount': 0.0
            })
            fut = dataPreProcessing(fut)[[
                'date_time','market_menu','sales_amount',
                'weekend','holiday','vacation_season',
                'usage_type_encoded','is_premium'
            ]]

            combo = pd.concat([combo, fut], ignore_index=True, sort=False)
            combo = combo.sort_values(['market_menu','date_time'])
            combo = add_ts_features(combo)
            combo = add_days_since_last_sale(combo)

            mask = combo['date_time'].eq(fut_date)
            Xf = _to_float(combo.loc[mask, feature_cols])

            p_pos = proba(Xf)
            y_hat = np.expm1(reg.predict(Xf))
            y_hat = np.nan_to_num(y_hat, nan=0.0, posinf=0.0, neginf=0.0).clip(0)

            # Value written back for the next step's lag/rolling features.
            if use_soft:
                y_for_next = (y_hat * np.power(p_pos, alpha)).clip(0)
            else:
                y_for_next = y_hat

            combo.loc[mask, 'sales_amount'] = y_for_next

            rows.append(pd.DataFrame({
                'market_menu': combo.loc[mask, 'market_menu'].values,
                'date_time': fut_date,
                'p': p_pos,
                'yhat': y_hat
            }))

        preds_df = pd.concat(rows, ignore_index=True)
        merged = val_truth.merge(preds_df, on=['market_menu','date_time'], how='left')
        # Truth rows without a prediction are scored as 0 with probability 0.
        merged['yhat'] = merged['yhat'].fillna(0.0)
        merged['p']    = merged['p'].fillna(0.0)

        # -------------------------
        # 3) Final prediction: soft vs hard gating
        # -------------------------
        if use_soft:
            final_pred = (merged['yhat'].to_numpy() * np.power(merged['p'].to_numpy(), alpha)).clip(0)
        else:
            # Hard gating: pick the best tau per menu from tau_grid.
            # NOTE(review): tau is tuned on the same validation rows it is scored on, so
            # the hard-mode SMAPE is optimistic.
            best_tau = {}
            for menu, g in merged.groupby('market_menu'):
                y_true = g['sales_amount'].to_numpy()
                p      = g['p'].to_numpy()
                yhat   = g['yhat'].to_numpy()
                best_s, best_t = 1e9, default_tau
                for t_ in tau_grid:
                    pred_ = np.where(p < t_, 0.0, yhat).clip(0)
                    s_ = smape(y_true, pred_)
                    if s_ < best_s:
                        best_s, best_t = s_, t_
                best_tau[menu] = best_t

            taus = merged['market_menu'].map(best_tau).fillna(default_tau).to_numpy()
            final_pred = np.where(merged['p'].to_numpy() < taus, 0.0, merged['yhat'].to_numpy()).clip(0)

        s_score = smape(merged['sales_amount'].to_numpy(), final_pred)
        print(f"[2stage NL] Fold{i}: ctx {ctx_start.date()}~{ctx_end.date()} → val {val_start.date()}~{val_end.date()} | SMAPE {s_score:.4f}")
        scores.append(s_score)

    if scores:
        print("Avg SMAPE (2-stage, no-leak):", np.mean(scores))
    return scores




In [None]:
# Run both block-CV variants on the preprocessed training frame; each function prints its
# per-fold scores, so the returned lists are discarded.
print("\n=== Block CV (28→7, gap=0) / 1-stage ===")
_ = block_cv_28to7(train_df, feature_cols, n_folds=5, ctx_days=28, horizon=7, gap=0)

print("\n=== Block CV (28→7, gap=0) / 2-stage ===")
_ = block_cv_28to7_2stage(train_df, feature_cols, n_folds=5, ctx_days=28, horizon=7, gap=0)


=== Block CV (28→7, gap=0) / 1-stage ===
Fold1: ctx 2024-05-12~2024-06-08 → val 2024-06-09~2024-06-15 | SMAPE 110.9638
Fold2: ctx 2024-04-07~2024-05-04 → val 2024-05-05~2024-05-11 | SMAPE 135.8765
Fold3: ctx 2024-03-03~2024-03-30 → val 2024-03-31~2024-04-06 | SMAPE 123.2955
Fold4: ctx 2024-01-28~2024-02-24 → val 2024-02-25~2024-03-02 | SMAPE 129.9081
Fold5: ctx 2023-12-24~2024-01-20 → val 2024-01-21~2024-01-27 | SMAPE 124.5387
Avg SMAPE: 124.91651477793069

=== Block CV (28→7, gap=0) / 2-stage ===
[2stage NL] Fold1: ctx 2024-05-12~2024-06-08 → val 2024-06-09~2024-06-15 | SMAPE 128.0442
[2stage NL] Fold2: ctx 2024-04-07~2024-05-04 → val 2024-05-05~2024-05-11 | SMAPE 138.9935
[2stage NL] Fold3: ctx 2024-03-03~2024-03-30 → val 2024-03-31~2024-04-06 | SMAPE 141.2198
[2stage NL] Fold4: ctx 2024-01-28~2024-02-24 → val 2024-02-25~2024-03-02 | SMAPE 142.1096
[2stage NL] Fold5: ctx 2023-12-24~2024-01-20 → val 2024-01-21~2024-01-27 | SMAPE 145.6584
Avg SMAPE (2-stage, no-leak): 139.205116718116

In [None]:
def forecast_next7_for_block(
    train_df,
    raw_test_df,
    feature_cols,
    clf,
    reg,
    tau_by_menu=None,          # per-menu tau overrides; used only in hard mode
    default_tau=0.35,          # hard-mode fallback tau; a negative value turns gating off
    block_id="TEST_XX",
    gate_mode="soft",          # "soft" (default) | "hard" | "off"
    alpha=0.30,                # soft-gating exponent: y_soft = y_hat * p_pos**alpha
    prob_gamma=0.6,            # probability inflation: p <- 1 - (1 - p)**prob_gamma
    tau_floor_prob=0.0,        # the floor is applied only where p_pos >= this value
    floor_ratio=0.12,          # floor = floor_ratio * (0.45-quantile of positive context sales)
    scale_clip=(0.8, 12.0),    # (lower, upper) clip of the per-menu scale factor
    apply_scale=True,
    apply_floor=True
):
    """
    Forecast days +1..+7 after a context block with an autoregressive rollout.

    Args:
        train_df: not used by this function; kept in the signature for callers.
        raw_test_df: context block (raw or already renamed columns).
        feature_cols: model input columns.
        clf: fitted classifier exposing ``predict_proba`` (P(sales > 0) in column 1).
        reg: fitted regressor whose ``predict`` returns log1p(sales).
        tau_by_menu, default_tau: hard-gating thresholds; a negative ``default_tau``
            disables gating in every mode.
        block_id: prefix of the output 'token' values.
        gate_mode: "soft", "hard" or "off".
        alpha, prob_gamma, tau_floor_prob, floor_ratio, scale_clip, apply_scale,
            apply_floor: soft-mode post-processing controls, see the inline comments on
            the signature. ``scale_clip=None`` falls back to (0.8, 12.0).

    Returns:
        DataFrame[token, 영업장명_메뉴명, 매출수량] with one row per menu and day (+1..+7).

    Requires dataPreProcessing, add_ts_features and add_days_since_last_sale to be defined.
    """

    # 0) Prepare the context block.
    T = dataPreProcessing(raw_test_df).copy()
    if 'sales_amount' not in T.columns:
        T['sales_amount'] = 0.0

    # Features derived from the context alone.
    combo = T.sort_values(['market_menu','date_time']).copy()
    combo = add_ts_features(combo)
    combo = add_days_since_last_sale(combo)

    preds = []
    ctx_last = combo['date_time'].max()
    ctx_start = ctx_last - pd.Timedelta(days=27)   # 28-day window
    menus = T['market_menu'].drop_duplicates().tolist()

    # Total sales per context day. Context rows are never rewritten during the rollout,
    # so this is computed once here instead of once per forecast step.
    ctx_daily_totals = combo.groupby('date_time')['sales_amount'].sum()
    ctx_daily_dow = ctx_daily_totals.index.dayofweek

    # 1) Context-based corrections (soft mode only)
    #    - per-menu floor: floor_ratio * quantile of the menu's positive sales
    #    - per-menu scale: actual / predicted sum over the last 7 context days
    floor_map, scale_by_menu = {}, {}
    if gate_mode == "soft":
        ctx_hist = combo[(combo['date_time'] >= ctx_start) & (combo['date_time'] <= ctx_last)].copy()

        # (A) floor
        if apply_floor:
            floor_quantile = 0.45
            pos_q = (ctx_hist[ctx_hist['sales_amount'] > 0]
                     .groupby('market_menu')['sales_amount']
                     .quantile(floor_quantile).fillna(0.0))
            floor_map = (floor_ratio * pos_q).to_dict()

        # (B) scale over the last 7 context days. The window is selected on ctx_hist
        #     itself, so the boolean mask and the frame always share one index.
        ctx7 = ctx_hist[ctx_hist['date_time'].between(ctx_last - pd.Timedelta(days=6), ctx_last)]
        if apply_scale and len(ctx7):
            X_ctx7 = (ctx7[feature_cols]
                      .apply(pd.to_numeric, errors='coerce').fillna(0).astype(float))
            y_ctx7_hat = np.expm1(reg.predict(X_ctx7))
            y_ctx7_hat = np.nan_to_num(y_ctx7_hat, nan=0.0, posinf=0.0, neginf=0.0).clip(0)

            hat_sum_by_menu = pd.Series(y_ctx7_hat,
                                        index=ctx7['market_menu'].to_numpy()).groupby(level=0).sum()
            true_sum_by_menu = ctx7.groupby('market_menu')['sales_amount'].sum()
            scale_series = (true_sum_by_menu / (hat_sum_by_menu + 1e-6)).fillna(1.0)
            global_scale = float(true_sum_by_menu.sum() / (hat_sum_by_menu.sum() + 1e-6))
            # Shrink each menu's ratio halfway toward the global ratio.
            scale_series = 0.5*scale_series + 0.5*global_scale
            # Honor the scale_clip argument (previously a hard-coded 0.8 / 12.0).
            clip_lo, clip_hi = scale_clip if scale_clip is not None else (0.8, 12.0)
            scale_by_menu = scale_series.clip(lower=clip_lo, upper=clip_hi).to_dict()

    # 2) 7-step rollout
    for h in range(1, 8):
        fut_date = ctx_last + pd.Timedelta(days=h)
        # One placeholder row per menu for the future day, with the shared derived columns.
        fut = pd.DataFrame({
            'date_time': [fut_date]*len(menus),
            'market_menu': menus,
            'sales_amount': 0.0
        })
        fut = dataPreProcessing(fut)[[
            'date_time','market_menu','sales_amount',
            'weekend','holiday','vacation_season',
            'usage_type_encoded','is_premium','store_name','menu_name'
        ]]

        # Append to the running frame and rebuild the same features.
        combo = pd.concat([combo, fut], ignore_index=True, sort=False)
        combo = combo.sort_values(['market_menu','date_time'])
        combo = add_ts_features(combo)
        combo = add_days_since_last_sale(combo)

        # Predict only the rows just added.
        mask = (combo['date_time'] == fut_date)
        step_menus = combo.loc[mask, 'market_menu']
        Xf = combo.loc[mask, feature_cols].copy()
        Xf = Xf.apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)

        p_pos = clf.predict_proba(Xf)[:, 1]
        # Optional probability inflation (a smaller prob_gamma shrinks predictions less).
        if prob_gamma is not None and prob_gamma != 1.0:
            p_pos = 1.0 - np.power(1.0 - p_pos, prob_gamma)
        y_hat = np.expm1(reg.predict(Xf))
        y_hat = np.nan_to_num(y_hat, nan=0.0, posinf=0.0, neginf=0.0).clip(0)

        if gate_mode == "off" or (default_tau is not None and default_tau < 0):
            # Gating fully disabled.
            y_final = y_hat.copy()

        elif gate_mode == "hard":
            # Hard gating: zero out predictions whose probability is below tau.
            fallback_tau = default_tau if default_tau is not None else 0.35
            if tau_by_menu is None:
                tau_vec = np.full(len(Xf), fallback_tau)
            else:
                tau_vec = np.array([tau_by_menu.get(m, fallback_tau) for m in step_menus])
            y_final = np.where(p_pos < tau_vec, 0.0, y_hat).clip(0)

        else:
            # Soft gating + per-menu scale + floor
            # 1) soft gate
            y_soft = y_hat * np.power(p_pos, (alpha if alpha is not None else 0.0))

            # 2) per-menu scale
            if apply_scale and len(scale_by_menu):
                s_vec = np.array([scale_by_menu.get(m, 1.0) for m in step_menus])
                y_scaled = (y_soft * s_vec).clip(0)
            else:
                y_scaled = y_soft

            # 3) floor, skipped where the probability is below tau_floor_prob
            if apply_floor and len(floor_map):
                floor_vec = np.array([floor_map.get(m, 0.0) for m in step_menus])
                apply_mask = p_pos >= (tau_floor_prob if tau_floor_prob is not None else 0.0)
                y_final = np.where(apply_mask, np.maximum(y_scaled, floor_vec), y_scaled).clip(0)
            else:
                y_final = y_scaled

        # Day-level rescaling (all gate modes): match the day's total to the median total of
        # the same weekday within the context, with the factor limited to [0.5, 8.0].
        true_day_sums = ctx_daily_totals[ctx_daily_dow == fut_date.dayofweek]
        if len(true_day_sums):
            target_sum = float(true_day_sums.median())
            pred_sum = float(y_final.sum()) + 1e-6
            k = np.clip(target_sum / pred_sum, 0.5, 8.0)
            y_final = (y_final * k).clip(0)

        # Write the prediction back so the next step's lag/rolling features use it.
        combo.loc[mask, 'sales_amount'] = y_final

        # Collect the result rows for this day.
        preds.append(pd.DataFrame({
            'token': [f"{block_id}+{h}일"] * int(mask.sum()),
            '영업장명_메뉴명': step_menus.values,
            '매출수량': y_final
        }))

    return pd.concat(preds, ignore_index=True)


In [None]:
# 전체 train으로 최종 학습
tr_feat_full = add_ts_features(train_df)
tr_feat_full = add_days_since_last_sale(tr_feat_full)
# X/y 정리 (숫자형 강제)
Xf_full = tr_feat_full[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)
y_full  = (pd.to_numeric(tr_feat_full['sales_amount'], errors='coerce')
           .fillna(0).clip(lower=0).to_numpy())

# 분류기 (불균형 보정)
from xgboost import XGBClassifier, XGBRegressor
pos_rate = (y_full > 0).mean()
neg_pos  = (1 - pos_rate) / max(pos_rate, 1e-6)

clf = XGBClassifier(
    n_estimators=600, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1,
    scale_pos_weight=neg_pos
)
clf.fit(Xf_full, (y_full > 0).astype(int))

# 회귀기 (log1p + 가중치)
w_full = 1.0 / (1.0 + y_full)
reg = XGBRegressor(
    n_estimators=900, max_depth=7, learning_rate=0.04,
    subsample=0.8, colsample_bytree=0.9, random_state=42, n_jobs=-1
)
reg.fit(Xf_full, np.log1p(y_full), sample_weight=w_full)


In [None]:
def pick_tau_by_menu_last_block(train_df, feature_cols, tau_grid=np.arange(0.25, 0.76, 0.05)):
    """Select a per-menu gating threshold (tau) on a hold-out block cut from the end of ``train_df``.

    The 35th-from-last distinct ``date_time`` value starts a 28-calendar-day
    context window; the 7 calendar days after it form the validation window.
    A classifier (sale vs. no sale) and a regressor (log1p quantity) are fitted
    on rows strictly before the context window. For every ``market_menu`` in the
    validation window, each candidate tau zeroes the regressor's prediction
    wherever the classifier probability is below tau, and the tau with the
    lowest ``smape`` is kept (the first one wins ties).

    Parameters
    ----------
    train_df : DataFrame with at least ``market_menu``, ``date_time`` and
        ``sales_amount`` columns, plus whatever the feature builders need.
    feature_cols : column names fed to both models.
    tau_grid : iterable of candidate thresholds.

    Returns
    -------
    dict mapping each validation-window ``market_menu`` to its chosen tau.
    A menu keeps the fallback 0.5 when no candidate scores below the 1e9
    sentinel (for example when every score is NaN).

    Notes
    -----
    Indexing the date array fails with ``IndexError`` when ``train_df`` holds
    fewer than 35 distinct dates.
    """
    context_days, horizon_days = 28, 7

    frame = train_df.sort_values(['market_menu', 'date_time']).copy()
    distinct_dates = np.array(sorted(frame['date_time'].unique()))

    # Window boundaries are calendar-day offsets from the context start date.
    ctx_start = distinct_dates[-(context_days + horizon_days)]
    ctx_end = ctx_start + pd.Timedelta(days=context_days - 1)
    val_start = ctx_end + pd.Timedelta(days=1)
    val_end = val_start + pd.Timedelta(days=horizon_days - 1)

    def _numeric_matrix(feat_frame):
        # Same numeric coercion for the training and validation feature matrices.
        return (feat_frame[feature_cols]
                .apply(pd.to_numeric, errors='coerce')
                .fillna(0)
                .astype(float))

    stamps = frame['date_time']
    history = frame[stamps < ctx_start].copy()

    # Per-menu tail of the history so lag/rolling features have warm-up rows,
    # followed by the context rows and the validation rows.
    warmup = history.groupby('market_menu', as_index=False).tail(max(28, 14, 7))
    context_rows = frame[(stamps >= ctx_start) & (stamps <= ctx_end)]
    holdout_rows = frame[(stamps >= val_start) & (stamps <= val_end)]
    stitched = pd.concat(
        [warmup, context_rows, holdout_rows],
        ignore_index=True,
        sort=False,
    ).sort_values(['market_menu', 'date_time'])

    # ---- fit both models on the pre-context history ----
    train_feat = add_days_since_last_sale(add_ts_features(history))
    X_train = _numeric_matrix(train_feat)
    y_train = (pd.to_numeric(train_feat['sales_amount'], errors='coerce')
               .fillna(0)
               .clip(lower=0)
               .to_numpy())

    positive_share = (y_train > 0).mean()
    imbalance_ratio = (1 - positive_share) / max(positive_share, 1e-6)
    clf = XGBClassifier(
        n_estimators=600, max_depth=6, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1,
        scale_pos_weight=imbalance_ratio,
    )
    clf.fit(X_train, (y_train > 0).astype(int))

    row_weight = 1.0 / (1.0 + y_train)
    log_target = np.log1p(y_train)
    reg = XGBRegressor(
        n_estimators=900, max_depth=7, learning_rate=0.04,
        subsample=0.8, colsample_bytree=0.9, random_state=42, n_jobs=-1,
    )
    reg.fit(X_train, log_target, sample_weight=row_weight)

    # ---- score the validation window ----
    stitched_feat = add_days_since_last_sale(add_ts_features(stitched))
    in_validation = (stitched_feat['date_time'] >= val_start) & (stitched_feat['date_time'] <= val_end)
    val_block = stitched_feat[in_validation].copy()
    X_val = _numeric_matrix(val_block)

    sale_prob = clf.predict_proba(X_val)[:, 1]
    qty_hat = np.expm1(reg.predict(X_val))
    qty_hat = np.nan_to_num(qty_hat, nan=0.0, posinf=0.0, neginf=0.0).clip(0)

    # ---- per-menu threshold search ----
    tau_map = {}
    scored = val_block.assign(p=sale_prob, yhat=qty_hat)
    for menu, rows in scored.groupby('market_menu'):
        probs = rows['p'].to_numpy()
        quantities = rows['yhat'].to_numpy()
        actual = rows['sales_amount'].to_numpy()

        lowest_score, chosen_tau = 1e9, 0.5
        for candidate in tau_grid:
            gated = np.where(probs < candidate, 0.0, quantities).clip(0)
            score = smape(actual, gated)
            if score < lowest_score:  # strict: earliest candidate wins ties
                lowest_score, chosen_tau = score, candidate
        tau_map[menu] = chosen_tau
    return tau_map




τ keys: 1351


In [None]:
# Choose one gating threshold per menu from the hold-out block at the end of
# train_df. The result is stored in the notebook-global `tau_by_menu`, which
# infer_all_blocks attaches automatically when its gate_kwargs has no
# 'tau_by_menu' key.
tau_by_menu = pick_tau_by_menu_last_block(train_df, feature_cols)
# Report how many menus received a threshold (the label reads "number of per-menu tau").
print("메뉴별 τ 개수:", len(tau_by_menu))

메뉴별 τ 개수: 1351


# Submission

## Baseline

### KGY VERSION

In [None]:
# ------------------------------------------------------
# A. 블록별 예측 → full_pred_df (중복 없이 하나로)
# ------------------------------------------------------
def infer_all_blocks(test_glob_pattern,
                     train_df, feature_cols, clf, reg,
                     gate_kwargs):
    """Forecast the next 7 days for every test block file and stack the results.

    Parameters
    ----------
    test_glob_pattern : str
        Glob pattern for the test CSVs, e.g. ``'./test/TEST_*.csv'``. Matches
        are processed in sorted path order.
    train_df, feature_cols, clf, reg :
        Passed through unchanged to ``forecast_next7_for_block``.
    gate_kwargs : dict
        Gating/scaling keyword arguments forwarded to
        ``forecast_next7_for_block`` (e.g. ``{'gate_mode': 'soft', 'alpha': 0.5}``).
        The caller's dict is never mutated. If it has no ``'tau_by_menu'`` key
        and a notebook-global ``tau_by_menu`` exists, that global is attached.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all per-block predictions (columns
        ``token``, ``영업장명_메뉴명``, ``매출수량``), with a fresh integer index.

    Raises
    ------
    ValueError
        If ``test_glob_pattern`` matches no files. Without this check the
        failure would surface later as pandas' generic
        "No objects to concatenate" error, which hides the real cause
        (wrong path or wrong working directory).
    """
    test_files = sorted(glob.glob(test_glob_pattern))
    if not test_files:
        raise ValueError(
            f"No test files matched pattern {test_glob_pattern!r} "
            f"(current working directory: {os.getcwd()!r})"
        )

    # Best effort: attach the per-menu thresholds when the caller did not
    # supply them and the notebook has already computed the global.
    if 'tau_by_menu' not in gate_kwargs:
        try:
            gate_kwargs = {**gate_kwargs, 'tau_by_menu': tau_by_menu}
        except NameError:
            pass  # global not defined -> run without per-menu thresholds

    all_preds = []
    for path in test_files:
        raw_test = pd.read_csv(path)
        # File stem doubles as the block id, e.g. 'TEST_00'.
        block_id = os.path.splitext(os.path.basename(path))[0]

        pred7 = forecast_next7_for_block(
            train_df=train_df,
            raw_test_df=raw_test,
            feature_cols=feature_cols,
            clf=clf, reg=reg,
            block_id=block_id,
            **gate_kwargs
        )
        all_preds.append(pred7)

    return pd.concat(all_preds, ignore_index=True)


# ------------------------------------------------------
# B. full_pred_df → 제출 DF (벡터화, 조각화 경고 없음)
# ------------------------------------------------------
def build_submission(full_pred_df, sample_submission_path, save_path=None):
    """Reshape long-format predictions into the wide layout of the sample submission.

    Parameters
    ----------
    full_pred_df : DataFrame with columns ``token``, ``영업장명_메뉴명`` and ``매출수량``.
    sample_submission_path : path to the sample submission CSV; its ``영업일자``
        column fixes the row order and its remaining columns fix the menu order.
    save_path : optional output CSV path. When given, the parent directory is
        created if needed and the file is written as UTF-8 with BOM, six decimals.

    Returns
    -------
    (sub_out, wide_pred)
        ``sub_out`` is the submission-shaped frame (``영업일자`` first, then the
        menu columns as float64, with 0.0 for any token/menu pair that has no
        prediction). ``wide_pred`` is the unaligned token x menu pivot.

    Three summary statistics of the submission matrix are printed as a side effect.
    """
    date_col = '영업일자'
    template = pd.read_csv(sample_submission_path)
    row_tokens = template[date_col].astype(str)
    menu_cols = template.columns[template.columns != date_col].tolist()

    # Collapse duplicate (token, menu) rows by summing, then pivot to wide form.
    summed = full_pred_df.groupby(['token', '영업장명_메뉴명'], as_index=False)['매출수량'].sum()
    wide_pred = pd.pivot_table(
        summed,
        index='token',
        columns='영업장명_메뉴명',
        values='매출수량',
        aggfunc='sum',
        fill_value=0.0,
    )

    # A single reindex aligns rows and columns to the template in one shot.
    aligned = wide_pred.reindex(index=row_tokens, columns=menu_cols, fill_value=0.0)
    aligned = aligned.astype('float64')
    sub_out = aligned.reset_index().rename(columns={'token': date_col})

    matrix = sub_out[menu_cols].to_numpy(dtype='float64')
    summary = (
        ("[SUB] sum:", float(np.nansum(matrix))),
        ("[SUB] nonzero:", int(np.count_nonzero(matrix))),
        ("[SUB] zero_ratio:", float((matrix == 0).mean())),
    )
    for label, value in summary:
        print(label, value)

    if save_path is None:
        return sub_out, wide_pred

    target_dir = os.path.dirname(save_path) or '.'
    os.makedirs(target_dir, exist_ok=True)
    sub_out.to_csv(save_path, index=False, encoding='utf-8-sig', float_format='%.6f')
    print("💾 Saved:", save_path)
    return sub_out, wide_pred

# ======================================================
# 사용 예시
#   1) OFF로 스모크 체크 (원인 분리)
#   2) ON으로 최종 제출 생성
# ======================================================

# 1) Smoke run with gating switched off, to separate gating effects from
#    model effects. Per the original author's notes, alpha / tau_floor_prob /
#    floor_ratio / scale_clip are expected to be ignored when gate_mode='off'
#    (not verifiable from this cell).
RUN_OFF_CHECK = True
if RUN_OFF_CHECK:
    gate_off = {
        'gate_mode': 'off',
        'alpha': 0.5,
        'tau_floor_prob': 0.0,
        'floor_ratio': 0.10,
        'scale_clip': (0.5, 6.0),
        'apply_scale': True,
        'apply_floor': True,
    }
    full_off = infer_all_blocks(
        test_glob_pattern='./test/TEST_*.csv',
        train_df=train_df,
        feature_cols=feature_cols,
        clf=clf,
        reg=reg,
        gate_kwargs=gate_off,
    )
    sub_off, wide_off = build_submission(
        full_pred_df=full_off,
        sample_submission_path='./sample_submission.csv',
        save_path='./submission/xgb_off.csv',
    )

# 2) Final run with soft gating.
#    alpha: the original note suggested tuning within 0.5-0.8; the value used
#           here (0.30) lies below that range.
#    tau_floor_prob=0.0: original note says this always allows the floor.
#    floor_ratio: original note suggested tuning within 0.10-0.20.
gate_on = {
    'gate_mode': 'soft',
    'alpha': 0.30,
    'tau_floor_prob': 0.00,
    'floor_ratio': 0.15,
    'scale_clip': (0.8, 12.0),
    'prob_gamma': 0.60,
    'apply_scale': True,
    'apply_floor': True,
}

# Predict every test block (swap in gate_off here to reproduce the smoke run).
full_pred_df = infer_all_blocks(
    test_glob_pattern='./test/TEST_*.csv',
    train_df=train_df,
    feature_cols=feature_cols,
    clf=clf,
    reg=reg,
    gate_kwargs=gate_on,
)

sub_out, wide_pred = build_submission(
    full_pred_df=full_pred_df,
    sample_submission_path='./sample_submission.csv',
)

# Write the final submission explicitly (build_submission was called without save_path).
out_path = './submission/xgb_submission.csv'
os.makedirs('./submission', exist_ok=True)
sub_out.to_csv(out_path, index=False, encoding='utf-8-sig', float_format='%.6f')
print("✅ 저장 완료 →", out_path)



[SUB] sum: 24270.699296848597
[SUB] nonzero: 13510
[SUB] zero_ratio: 0.0
[SUB] day total min/mean/max: 200.22442292556866 346.72427566926564 615.6228600000005
💾 Saved: ./submission/xgb_off.csv
[SUB] sum: 38377.58721184859
[SUB] nonzero: 13510
[SUB] zero_ratio: 0.0
[SUB] day total min/mean/max: 256.84000000000003 548.2512458835512 1128.44
💾 Saved: ./submission/xgb_submission.csv
✅ 저장 완료 → ./submission/xgb_submission.csv


In [None]:
# Post-build sanity check of the submission frame: every column after the
# first one is treated as a menu column.
menu_values = sub_out.iloc[:, 1:]
vals = menu_values.to_numpy(dtype='float64')
totals = menu_values.sum(axis=1)  # per-row (per forecast day) total

# Overall volume and sparsity of the prediction matrix.
print("sum:", float(np.nansum(vals)))
print("nonzero:", int(np.count_nonzero(vals)))
print("zero_ratio:", float((vals == 0).mean()))

# Spread of the per-day totals, then the ten menus with the largest summed prediction.
print("day total min/mean/max:", totals.min(), totals.mean(), totals.max())
print(menu_values.sum(axis=0).sort_values(ascending=False).head(10))


sum: 39481.21865
nonzero: 10248
zero_ratio: 0.24145077720207253
day total min/mean/max: 130.8875 564.0174092857144 1265.1630500000012
영업장명_메뉴명
화담숲주막_해물파전            2805.671963
포레스트릿_꼬치어묵            1798.250575
카페테리아_단체식 18000(신)    1515.085000
미라시아_브런치(대인) 주말       1198.400000
카페테리아_단체식 13000(신)    1136.113950
연회장_Regular Coffee    1096.049225
포레스트릿_떡볶이             1095.447700
포레스트릿_생수              1070.093575
화담숲카페_아메리카노 ICE        810.083588
카페테리아_수제 등심 돈까스        797.683837
dtype: float64
