# Advanced 2-Stage ML (Target: 0.40+)

**전략:**
1. Feature 확장: 13개 → 35개
2. Multi-Model: XGBoost + LightGBM + CatBoost
3. Multiple Thresholds: 0.28, 0.30, 0.32, 0.34, 0.36
4. Meta-Ensemble: Stacking

**현재 Best: 0.3513**
**목표: 0.40+**

## 1. Setup

In [13]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

PAIR_MAX_LAG = 7
PAIR_MIN_NONZERO = 8
VAL_MIN_NONZERO = 2

PAIR_LABEL_CORR_THRESHOLD = 0.32
VAL_GT_CORR_THRESHOLD = 0.25

NEG_POS_RATIO = 1.5
PAIR_TOP_K = 3000

TRAIN_END_STR = "2024-12-01"
VAL_START_STR = "2025-01-01"
VAL_END_STR = "2025-04-01"

RANDOM_SEED = 42

print("Setup 완료 - Advanced Features 모드")

Setup 완료 - Advanced Features 모드


## 2. 유틸 함수 (기존 + 확장)

In [14]:
def safe_corr(a, b):
    mask = (~np.isnan(a)) & (~np.isnan(b))
    if mask.sum() < 3:
        return 0.0
    aa, bb = a[mask], b[mask]
    if np.std(aa) == 0 or np.std(bb) == 0:
        return 0.0
    return float(np.corrcoef(aa, bb)[0, 1])

def load_pivot(train_path="train.csv"):
    df = pd.read_csv(train_path)
    monthly = df.groupby(["item_id", "year", "month"], as_index=False)["value"].sum()
    monthly["ym"] = pd.to_datetime(
        monthly["year"].astype(str) + "-" + monthly["month"].astype(str).str.zfill(2) + "-01"
    )
    pivot = monthly.pivot(index="item_id", columns="ym", values="value")
    pivot = pivot.fillna(0).sort_index(axis=1)
    print("pivot shape:", pivot.shape)
    return pivot, df

def get_time_indices(pivot):
    months = list(pivot.columns)
    month_to_idx = {m: i for i, m in enumerate(months)}
    train_end = pd.to_datetime(TRAIN_END_STR)
    val_start = pd.to_datetime(VAL_START_STR)
    val_end = pd.to_datetime(VAL_END_STR)
    return (months, month_to_idx[train_end], month_to_idx[val_start], month_to_idx[val_end])

print("기본 유틸 함수 정의 완료")

기본 유틸 함수 정의 완료


## 3. Advanced Feature 생성 (35+ features)

In [15]:
def build_advanced_pair_features(pivot, upto_idx, max_lag=7, min_nonzero=8,
                                  corr_threshold_for_label=0.30, neg_pos_ratio=2.0):
    """35+ features로 확장된 pair feature matrix"""
    items = pivot.index.to_list()
    months = list(pivot.columns)
    
    sub_cols = months[:upto_idx + 1]
    pivot_sub = pivot[sub_cols]
    n_sub_months = pivot_sub.shape[1]
    
    rows_pos = []
    rows_neg = []
    
    for leader in tqdm(items, desc="build_advanced_features"):
        a = pivot_sub.loc[leader].values.astype(float)
        if np.count_nonzero(a) < min_nonzero:
            continue
        
        for follower in items:
            if leader == follower:
                continue
            
            b = pivot_sub.loc[follower].values.astype(float)
            if np.count_nonzero(b) < min_nonzero:
                continue
            
            # Lag별 correlation 계산
            lag_corrs = []
            best_corr = 0.0
            second_corr = 0.0
            third_corr = 0.0
            best_lag = None
            
            for lag in range(1, max_lag + 1):
                if n_sub_months <= lag:
                    lag_corrs.append(0.0)
                    continue
                
                c = safe_corr(a[:-lag], b[lag:])
                lag_corrs.append(c)
                
                if abs(c) > abs(best_corr):
                    third_corr = second_corr
                    second_corr = best_corr
                    best_corr = c
                    best_lag = lag
                elif abs(c) > abs(second_corr):
                    third_corr = second_corr
                    second_corr = c
                elif abs(c) > abs(third_corr):
                    third_corr = c
            
            if best_lag is None:
                continue
            
            lag_corrs = np.array(lag_corrs, dtype=float)
            
            # Rolling statistics for a and b
            a_rolling_3 = np.array([np.mean(a[max(0, i-2):i+1]) for i in range(len(a))])
            b_rolling_3 = np.array([np.mean(b[max(0, i-2):i+1]) for i in range(len(b))])
            a_rolling_6 = np.array([np.mean(a[max(0, i-5):i+1]) for i in range(len(a))])
            b_rolling_6 = np.array([np.mean(b[max(0, i-5):i+1]) for i in range(len(b))])
            
            # Trend features
            a_trend = (a[-1] - a[0]) / (len(a) + 1) if len(a) > 1 else 0
            b_trend = (b[-1] - b[0]) / (len(b) + 1) if len(b) > 1 else 0
            
            feats = {
                "leading_item_id": leader,
                "following_item_id": follower,
                
                # 기존 features (11개)
                "max_corr": float(best_corr),
                "best_lag": int(best_lag),
                "second_corr": float(second_corr),
                "third_corr": float(third_corr),
                "corr_stability": float(abs(best_corr - second_corr)),
                "corr_mean": float(np.mean(lag_corrs)),
                "corr_std": float(np.std(lag_corrs)),
                "corr_abs_mean": float(np.mean(np.abs(lag_corrs))),
                "nonzero_a": int(np.count_nonzero(a)),
                "nonzero_b": int(np.count_nonzero(b)),
                "sum_a": float(a.sum()),
                "sum_b": float(b.sum()),
                
                # Lag별 개별 correlation (7개)
                "lag1_corr": float(lag_corrs[0]) if len(lag_corrs) > 0 else 0.0,
                "lag2_corr": float(lag_corrs[1]) if len(lag_corrs) > 1 else 0.0,
                "lag3_corr": float(lag_corrs[2]) if len(lag_corrs) > 2 else 0.0,
                "lag4_corr": float(lag_corrs[3]) if len(lag_corrs) > 3 else 0.0,
                "lag5_corr": float(lag_corrs[4]) if len(lag_corrs) > 4 else 0.0,
                "lag6_corr": float(lag_corrs[5]) if len(lag_corrs) > 5 else 0.0,
                "lag7_corr": float(lag_corrs[6]) if len(lag_corrs) > 6 else 0.0,
                
                # Rolling statistics (8개)
                "a_rolling3_mean": float(np.mean(a_rolling_3)),
                "a_rolling3_std": float(np.std(a_rolling_3)),
                "b_rolling3_mean": float(np.mean(b_rolling_3)),
                "b_rolling3_std": float(np.std(b_rolling_3)),
                "a_rolling6_mean": float(np.mean(a_rolling_6)),
                "a_rolling6_std": float(np.std(a_rolling_6)),
                "b_rolling6_mean": float(np.mean(b_rolling_6)),
                "b_rolling6_std": float(np.std(b_rolling_6)),
                
                # Trend features (4개)
                "a_trend": float(a_trend),
                "b_trend": float(b_trend),
                "trend_ratio": float(b_trend / (abs(a_trend) + 1e-6)),
                "trend_diff": float(abs(b_trend - a_trend)),
                
                # Interaction features (5개)
                "ab_ratio": float(b.sum() / (a.sum() + 1)),
                "ab_corr_recent": float(safe_corr(a[-6:], b[-6:])),
                "max_min_ratio_a": float(a.max() / (a.min() + 1)),
                "max_min_ratio_b": float(b.max() / (b.min() + 1)),
                "corr_weighted": float(np.average(np.abs(lag_corrs), weights=range(len(lag_corrs), 0, -1))),
            }
            
            label = 1 if abs(best_corr) >= corr_threshold_for_label else 0
            
            if label == 1:
                rows_pos.append({**feats, "label": 1})
            else:
                rows_neg.append({**feats, "label": 0})
    
    df_pos = pd.DataFrame(rows_pos)
    df_neg = pd.DataFrame(rows_neg)
    print(f"pos pairs: {df_pos.shape}, neg pairs: {df_neg.shape}")
    
    if df_pos.empty:
        return pd.DataFrame()
    
    n_pos = len(df_pos)
    n_neg_keep = int(neg_pos_ratio * n_pos)
    if len(df_neg) > n_neg_keep:
        df_neg = df_neg.sample(n_neg_keep, random_state=RANDOM_SEED)
    
    df_all = pd.concat([df_pos, df_neg], axis=0).reset_index(drop=True)
    print(f"Total features: {len([c for c in df_all.columns if c not in ['leading_item_id', 'following_item_id', 'label']])}")
    return df_all

print("Advanced feature 함수 정의 완료")

Advanced feature 함수 정의 완료


## 4. Multi-Model Classifier (XGB + LightGBM + CatBoost)

In [16]:
def train_multi_model_classifier(df_pairs):
    """3개 모델로 앙상블 Classifier"""
    # Feature columns 추출
    feature_cols = [c for c in df_pairs.columns if c not in ['leading_item_id', 'following_item_id', 'label']]
    
    df = df_pairs.copy()
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], 0).fillna(0)
    
    X = df[feature_cols].values
    y = df["label"].values
    
    print(f"Training on {len(feature_cols)} features...")
    
    # XGBoost
    clf_xgb = XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.3,
        reg_lambda=0.8,
        min_child_weight=2,
        gamma=0.1,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        eval_metric="logloss"
    )
    clf_xgb.fit(X, y)
    print("XGBoost trained")
    
    # LightGBM
    clf_lgb = LGBMClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.3,
        reg_lambda=0.8,
        min_child_samples=20,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=-1
    )
    clf_lgb.fit(X, y)
    print("LightGBM trained")
    
    # CatBoost
    clf_cat = CatBoostClassifier(
        iterations=300,
        depth=5,
        learning_rate=0.05,
        subsample=0.85,
        reg_lambda=0.8,
        random_seed=RANDOM_SEED,
        verbose=0
    )
    clf_cat.fit(X, y)
    print("CatBoost trained")
    
    return [clf_xgb, clf_lgb, clf_cat], feature_cols

print("Multi-model classifier 함수 정의 완료")

Multi-model classifier 함수 정의 완료



## 5. Advanced Pair Scoring (Ensemble 예측)

In [17]:
def score_all_pairs_advanced(pivot, classifiers, feature_cols, max_lag=7, min_nonzero=8):
    """Advanced features + Multi-model ensemble로 모든 쌍 scoring"""
    items = pivot.index.to_list()
    months = list(pivot.columns)
    n_months = len(months)
    
    rows = []
    
    for leader in tqdm(items, desc="score_all_pairs_advanced"):
        a = pivot.loc[leader].values.astype(float)
        if np.count_nonzero(a) < min_nonzero:
            continue
        
        for follower in items:
            if leader == follower:
                continue
            
            b = pivot.loc[follower].values.astype(float)
            if np.count_nonzero(b) < min_nonzero:
                continue
            
            # Lag별 correlation
            lag_corrs = []
            best_corr = 0.0
            second_corr = 0.0
            third_corr = 0.0
            best_lag = None
            
            for lag in range(1, max_lag + 1):
                if n_months <= lag:
                    lag_corrs.append(0.0)
                    continue
                
                c = safe_corr(a[:-lag], b[lag:])
                lag_corrs.append(c)
                
                if abs(c) > abs(best_corr):
                    third_corr = second_corr
                    second_corr = best_corr
                    best_corr = c
                    best_lag = lag
                elif abs(c) > abs(second_corr):
                    third_corr = second_corr
                    second_corr = c
                elif abs(c) > abs(third_corr):
                    third_corr = c
            
            if best_lag is None:
                continue
            
            lag_corrs = np.array(lag_corrs, dtype=float)
            
            # Rolling statistics
            a_rolling_3 = np.array([np.mean(a[max(0, i-2):i+1]) for i in range(len(a))])
            b_rolling_3 = np.array([np.mean(b[max(0, i-2):i+1]) for i in range(len(b))])
            a_rolling_6 = np.array([np.mean(a[max(0, i-5):i+1]) for i in range(len(a))])
            b_rolling_6 = np.array([np.mean(b[max(0, i-5):i+1]) for i in range(len(b))])
            
            a_trend = (a[-1] - a[0]) / (len(a) + 1) if len(a) > 1 else 0
            b_trend = (b[-1] - b[0]) / (len(b) + 1) if len(b) > 1 else 0
            
            # Feature dictionary
            feats = {
                "max_corr": float(best_corr),
                "best_lag": int(best_lag),
                "second_corr": float(second_corr),
                "third_corr": float(third_corr),
                "corr_stability": float(abs(best_corr - second_corr)),
                "corr_mean": float(np.mean(lag_corrs)),
                "corr_std": float(np.std(lag_corrs)),
                "corr_abs_mean": float(np.mean(np.abs(lag_corrs))),
                "nonzero_a": int(np.count_nonzero(a)),
                "nonzero_b": int(np.count_nonzero(b)),
                "sum_a": float(a.sum()),
                "sum_b": float(b.sum()),
                "lag1_corr": float(lag_corrs[0]) if len(lag_corrs) > 0 else 0.0,
                "lag2_corr": float(lag_corrs[1]) if len(lag_corrs) > 1 else 0.0,
                "lag3_corr": float(lag_corrs[2]) if len(lag_corrs) > 2 else 0.0,
                "lag4_corr": float(lag_corrs[3]) if len(lag_corrs) > 3 else 0.0,
                "lag5_corr": float(lag_corrs[4]) if len(lag_corrs) > 4 else 0.0,
                "lag6_corr": float(lag_corrs[5]) if len(lag_corrs) > 5 else 0.0,
                "lag7_corr": float(lag_corrs[6]) if len(lag_corrs) > 6 else 0.0,
                "a_rolling3_mean": float(np.mean(a_rolling_3)),
                "a_rolling3_std": float(np.std(a_rolling_3)),
                "b_rolling3_mean": float(np.mean(b_rolling_3)),
                "b_rolling3_std": float(np.std(b_rolling_3)),
                "a_rolling6_mean": float(np.mean(a_rolling_6)),
                "a_rolling6_std": float(np.std(a_rolling_6)),
                "b_rolling6_mean": float(np.mean(b_rolling_6)),
                "b_rolling6_std": float(np.std(b_rolling_6)),
                "a_trend": float(a_trend),
                "b_trend": float(b_trend),
                "trend_ratio": float(b_trend / (abs(a_trend) + 1e-6)),
                "trend_diff": float(abs(b_trend - a_trend)),
                "ab_ratio": float(b.sum() / (a.sum() + 1)),
                "ab_corr_recent": float(safe_corr(a[-6:], b[-6:])),
                "max_min_ratio_a": float(a.max() / (a.min() + 1)),
                "max_min_ratio_b": float(b.max() / (b.min() + 1)),
                "corr_weighted": float(np.average(np.abs(lag_corrs), weights=range(len(lag_corrs), 0, -1))),
            }
            
            # Ensemble prediction (평균)
            x_vec = np.array([[feats[col] for col in feature_cols]], dtype=float)
            probs = []
            for clf in classifiers:
                prob = float(clf.predict_proba(x_vec)[0, 1])
                probs.append(prob)
            
            ensemble_prob = np.mean(probs)
            
            rows.append({
                "leading_item_id": leader,
                "following_item_id": follower,
                "best_lag": int(best_lag),
                "max_corr": float(best_corr),
                "corr_stability": float(abs(best_corr - second_corr)),
                "clf_prob": ensemble_prob
            })
    
    df = pd.DataFrame(rows)
    return df.reset_index(drop=True)

print("Advanced scoring 함수 정의 완료")

Advanced scoring 함수 정의 완료



## 6. Advanced Regression Dataset (30+ features)

In [18]:
def build_advanced_regression_dataset(pivot, pairs, target_start_idx, target_end_idx):
    """30+ features로 확장된 regression dataset"""
    months = list(pivot.columns)
    n_months = len(months)
    
    rows = []
    
    for row in tqdm(pairs.itertuples(index=False), desc="build_regression_dataset"):
        leader = row.leading_item_id
        follower = row.following_item_id
        lag = int(row.best_lag)
        
        a = pivot.loc[leader].values.astype(float)
        b = pivot.loc[follower].values.astype(float)
        
        for t in range(lag + 2, n_months - 1):
            target_idx = t + 1
            if target_idx < target_start_idx or target_idx > target_end_idx:
                continue
            
            if t - 2 < 0 or (t - lag - 1) < 0:
                continue
            
            # 기본 features
            b_t = b[t]
            b_t_1 = b[t - 1]
            b_t_2 = b[t - 2]
            a_t_lag = a[t - lag]
            a_t_lag_1 = a[t - lag - 1]
            
            b_ma3 = np.mean([b_t, b_t_1, b_t_2])
            if (t - lag - 2) >= 0:
                a_ma3 = np.mean([a_t_lag, a_t_lag_1, a[t - lag - 2]])
            else:
                a_ma3 = np.mean([a_t_lag, a_t_lag_1])
            
            b_change = (b_t - b_t_1) / (b_t_1 + 1)
            a_change = (a_t_lag - a_t_lag_1) / (a_t_lag_1 + 1)
            ab_ratio = b_t / (a_t_lag + 1)
            
            # Rolling features (더 많은 window)
            b_ma6 = np.mean(b[max(0, t-5):t+1]) if t >= 5 else np.mean(b[:t+1])
            a_ma6 = np.mean(a[max(0, t-lag-5):t-lag+1]) if t-lag >= 5 else np.mean(a[:t-lag+1])
            
            b_std3 = np.std([b_t, b_t_1, b_t_2])
            a_std3 = np.std([a_t_lag, a_t_lag_1]) if (t - lag - 2) < 0 else np.std([a_t_lag, a_t_lag_1, a[t - lag - 2]])
            
            # Trend features
            b_recent_trend = (b_t - b_t_2) / 2 if b_t_2 != 0 else 0
            a_recent_trend = (a_t_lag - a_t_lag_1) if a_t_lag_1 != 0 else 0
            
            # Acceleration
            b_accel = (b_t - b_t_1) - (b_t_1 - b_t_2) if b_t_2 != 0 else 0
            
            # Max/Min ratio
            b_max_recent = max(b[max(0, t-5):t+1])
            b_min_recent = min(b[max(0, t-5):t+1]) + 1
            b_max_min_ratio = b_max_recent / b_min_recent
            
            # Volatility
            b_volatility = np.std(b[max(0, t-5):t+1]) / (np.mean(b[max(0, t-5):t+1]) + 1)
            
            target = b[target_idx]
            
            rows.append({
                "leading_item_id": leader,
                "following_item_id": follower,
                
                # 기존 13개
                "b_t": b_t,
                "b_t_1": b_t_1,
                "b_t_2": b_t_2,
                "b_ma3": b_ma3,
                "b_change": b_change,
                "a_t_lag": a_t_lag,
                "a_t_lag_1": a_t_lag_1,
                "a_ma3": a_ma3,
                "a_change": a_change,
                "ab_value_ratio": ab_ratio,
                "max_corr": row.max_corr,
                "best_lag": lag,
                "corr_stability": row.corr_stability,
                
                # 새로운 features (17개)
                "b_ma6": b_ma6,
                "a_ma6": a_ma6,
                "b_std3": b_std3,
                "a_std3": a_std3,
                "b_recent_trend": b_recent_trend,
                "a_recent_trend": a_recent_trend,
                "b_accel": b_accel,
                "b_max_min_ratio": b_max_min_ratio,
                "b_volatility": b_volatility,
                "ab_ma_ratio": b_ma3 / (a_ma3 + 1),
                "ab_change_ratio": b_change / (abs(a_change) + 1e-6),
                "b_momentum": b_t / (b_ma3 + 1),
                "a_momentum": a_t_lag / (a_ma3 + 1),
                "cross_momentum": (b_t / (b_ma3 + 1)) * (a_t_lag / (a_ma3 + 1)),
                "b_relative_pos": (b_t - b_min_recent) / (b_max_recent - b_min_recent + 1),
                "trend_alignment": b_recent_trend * a_recent_trend,
                "value_gap": abs(b_t - a_t_lag),
                
                "target": target,
            })
    
    df = pd.DataFrame(rows)
    print(f"Regression dataset: {df.shape}, features: {len([c for c in df.columns if c not in ['leading_item_id', 'following_item_id', 'target']])}")
    return df

print("Advanced regression dataset 함수 정의 완료")

Advanced regression dataset 함수 정의 완료


## 7. Multi-Model Regressor

In [19]:
def train_multi_model_regressor(df_train):
    """XGB + LightGBM + CatBoost 앙상블 Regressor"""
    feature_cols = [c for c in df_train.columns if c not in ['leading_item_id', 'following_item_id', 'target']]
    
    df_train = df_train.replace([np.inf, -np.inf], 0).fillna(0)
    
    X = df_train[feature_cols].values
    y = df_train["target"].values
    
    print(f"Training regressors on {len(feature_cols)} features...")
    
    # XGBoost
    reg_xgb = XGBRegressor(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        min_child_weight=3,
        gamma=0.1,
        reg_alpha=0.3,
        reg_lambda=0.8,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )
    reg_xgb.fit(X, y)
    print("XGBoost regressor trained")
    
    # LightGBM
    reg_lgb = LGBMRegressor(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.3,
        reg_lambda=0.8,
        min_child_samples=20,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=-1
    )
    reg_lgb.fit(X, y)
    print("LightGBM regressor trained")
    
    # CatBoost
    reg_cat = CatBoostRegressor(
        iterations=300,
        depth=5,
        learning_rate=0.05,
        subsample=0.85,
        reg_lambda=0.8,
        random_seed=RANDOM_SEED,
        verbose=0
    )
    reg_cat.fit(X, y)
    print("CatBoost regressor trained")
    
    return [reg_xgb, reg_lgb, reg_cat], feature_cols

print("Multi-model regressor 함수 정의 완료")

Multi-model regressor 함수 정의 완료


## 8. 실행 및 Submission

In [20]:
# 데이터 로드
pivot, raw = load_pivot("train.csv")
months, train_end_idx, val_start_idx, val_end_idx = get_time_indices(pivot)
print(f"Train end: {train_end_idx}, Val: {val_start_idx}-{val_end_idx}")

pivot shape: (100, 43)
Train end: 35, Val: 36-39


In [21]:
# Classifier 학습 (Advanced Features)
print("Building advanced pair features...")
df_pairs = build_advanced_pair_features(pivot, train_end_idx, 
                                        neg_pos_ratio=1.5, 
                                        corr_threshold_for_label=0.32)
print(f"Pair dataset: {df_pairs.shape}")

print("\nTraining multi-model classifier ensemble...")
classifiers, clf_cols = train_multi_model_classifier(df_pairs)
print(f"Classifier features: {len(clf_cols)}")

Building advanced pair features...



build_advanced_features: 100%|██████████| 100/100 [00:37<00:00,  2.69it/s]



pos pairs: (3865, 39), neg pairs: (4507, 39)
Total features: 36
Pair dataset: (8372, 39)

Training multi-model classifier ensemble...
Training on 36 features...
XGBoost trained
XGBoost trained
LightGBM trained
LightGBM trained
CatBoost trained
Classifier features: 36
CatBoost trained
Classifier features: 36


In [22]:
# 모든 페어 스코어링 (Multi-model ensemble)
print("Scoring all pairs with multi-model ensemble...")
pairs_all = score_all_pairs_advanced(pivot, classifiers, clf_cols, train_end_idx)
print(f"Total pairs scored: {len(pairs_all)}")

# Top 3000 선택
pairs_top = pairs_all.sort_values("clf_prob", ascending=False).head(3000).copy()
print(f"Top K pairs selected: {len(pairs_top)}")
print(f"Prob range: {pairs_top['clf_prob'].min():.4f} - {pairs_top['clf_prob'].max():.4f}")

Scoring all pairs with multi-model ensemble...


score_all_pairs_advanced: 100%|██████████| 100/100 [02:39<00:00,  1.60s/it]

Total pairs scored: 8556
Top K pairs selected: 3000
Prob range: 0.9997 - 0.9999





In [23]:
# Regressor 학습 (Advanced Features)
print("Building advanced regression dataset...")
df_train = build_advanced_regression_dataset(pivot, pairs_top, 0, train_end_idx)
print(f"Regression dataset: {df_train.shape}")

print("\nTraining multi-model regressor ensemble...")
regressors, reg_cols = train_multi_model_regressor(df_train)
print(f"Regressor features: {len(reg_cols)}")

Building advanced regression dataset...


build_regression_dataset: 3000it [00:12, 240.82it/s]



Regression dataset: (21774, 33), features: 30
Regression dataset: (21774, 33)

Training multi-model regressor ensemble...
Training regressors on 30 features...
XGBoost regressor trained
XGBoost regressor trained
LightGBM regressor trained
LightGBM regressor trained
CatBoost regressor trained
Regressor features: 30
CatBoost regressor trained
Regressor features: 30


In [24]:
# Submission 생성 (Multi-model ensemble predictions)
print("Generating predictions with multi-model ensemble...")

# Test 데이터 생성
df_test = build_advanced_regression_dataset(pivot, pairs_top, 
                                           train_end_idx + 1, 
                                           val_end_idx)

X_test = df_test[reg_cols]

# 3개 모델의 예측 평균
preds = []
for i, model in enumerate(regressors):
    pred = model.predict(X_test)
    preds.append(pred)
    print(f"Model {i+1} pred range: {pred.min():.4f} - {pred.max():.4f}")

# Ensemble averaging
y_pred = np.mean(preds, axis=0)
print(f"Ensemble pred range: {y_pred.min():.4f} - {y_pred.max():.4f}")

# Submission 파일 생성
df_test["pred_b"] = y_pred
submission = df_test[["month_id", "series_a", "series_b", "pred_b"]].copy()
submission.columns = ["month_id", "series_a", "series_b", "value"]

submission.to_csv("submission_advanced.csv", index=False)
print(f"\n✅ Submission saved: submission_advanced.csv ({len(submission)} rows)")

Generating predictions with multi-model ensemble...


build_regression_dataset: 3000it [00:03, 763.81it/s] 



Regression dataset: (10809, 33), features: 30
Model 1 pred range: 16522.8809 - 111831136.0000
Model 2 pred range: 12867.5175 - 115383688.6426
Model 3 pred range: -128354.4894 - 109054335.6789
Ensemble pred range: -6540.3101 - 109456864.1072


KeyError: "['month_id', 'series_a', 'series_b'] not in index"

## 9. 모델 비교 및 분석

**Advanced Model 특징:**
- Classifier: 35+ features (lag-specific, rolling, trend, interaction)
- Regressor: 30+ features (momentum, volatility, cross features)
- Multi-model ensemble: XGBoost + LightGBM + CatBoost
- Ensemble averaging for robust predictions

**Best Single Model (improved_model.ipynb):**
- Score: 0.3513
- Classifier: 13 features
- Regressor: 13 features
- Single XGBoost model

**Expected Improvement:**
- Feature engineering: 더 풍부한 feature set으로 패턴 포착
- Model diversity: 3개 모델의 앙상블로 overfitting 감소
- Target score: 0.40+