In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.family'] = 'Malgun Gothic' # 한글 폰트 설정
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import torch #cuda용 
import warnings # 경고 메시지 무시
warnings.filterwarnings('ignore')
from datetime import datetime
import math
from tqdm import tqdm

In [3]:
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"PyTorch CUDA 버전: {torch.version.cuda}")
    print(f"GPU 이름: {torch.cuda.get_device_name(0)}")
else:
    print("CPU 버전이 설치됨")

CUDA 사용 가능: True
PyTorch CUDA 버전: 11.8
GPU 이름: NVIDIA GeForce GTX 1650


In [4]:
building = pd.read_csv('../data/building_info.csv')
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [5]:
building.head()

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,호텔,82912.71,77586.0,-,-,-
1,2,상용,40658.9,30392.82,-,-,-
2,3,병원,560431.0,418992.0,278.58,-,-
3,4,호텔,41813.29,23715.71,-,-,-
4,5,학교,403749.39,248507.0,1983.05,1025,250


In [6]:
building.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   건물번호          100 non-null    int64  
 1   건물유형          100 non-null    object 
 2   연면적(m2)       100 non-null    float64
 3   냉방면적(m2)      100 non-null    float64
 4   태양광용량(kW)     100 non-null    object 
 5   ESS저장용량(kWh)  100 non-null    object 
 6   PCS용량(kW)     100 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 5.6+ KB


In [5]:
train.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.0,5794.8
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.0,5591.85
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.0,5338.17
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.0,4554.42
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.0,3602.25


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  204000 non-null  object 
 1   건물번호           204000 non-null  int64  
 2   일시             204000 non-null  object 
 3   기온(°C)         204000 non-null  float64
 4   강수량(mm)        204000 non-null  float64
 5   풍속(m/s)        204000 non-null  float64
 6   습도(%)          204000 non-null  float64
 7   일조(hr)         204000 non-null  float64
 8   일사(MJ/m2)      204000 non-null  float64
 9   전력소비량(kWh)     204000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 15.6+ MB


In [6]:
test.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(°C),강수량(mm),풍속(m/s),습도(%)
0,1_20240825 00,1,20240825 00,26.5,0.0,0.7,80.0
1,1_20240825 01,1,20240825 01,26.1,0.0,0.0,80.0
2,1_20240825 02,1,20240825 02,25.9,0.0,0.3,83.0
3,1_20240825 03,1,20240825 03,25.7,0.0,1.1,83.0
4,1_20240825 04,1,20240825 04,25.5,0.0,1.0,86.0


In [7]:
train['is_train'] = 1
test['is_train'] = 0

In [8]:
train = train.drop(['일조(hr)', '일사(MJ/m2)'], axis=1)

In [9]:
combined = pd.concat([train, test], ignore_index=True)

In [10]:
print(f"전체 데이터 크기: {combined.shape}")
print(f"train 데이터 개수: {combined[combined['is_train']==1].shape[0]}")
print(f"test 데이터 개수: {combined[combined['is_train']==0].shape[0]}")

전체 데이터 크기: (220800, 9)
train 데이터 개수: 204000
test 데이터 개수: 16800


In [11]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220800 entries, 0 to 220799
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   num_date_time  220800 non-null  object 
 1   건물번호           220800 non-null  int64  
 2   일시             220800 non-null  object 
 3   기온(°C)         220800 non-null  float64
 4   강수량(mm)        220800 non-null  float64
 5   풍속(m/s)        220800 non-null  float64
 6   습도(%)          220800 non-null  float64
 7   전력소비량(kWh)     204000 non-null  float64
 8   is_train       220800 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 15.2+ MB


In [12]:
def smape(y_true, y_pred):
    """SMAPE 계산 함수"""
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def time_features(data):
    """시간 관련 피처 생성"""
    date = pd.to_datetime(data['일시'])
    
    data['hour'] = date.dt.hour
    data['dow'] = date.dt.weekday
    data['month'] = date.dt.month
    data['week'] = date.dt.isocalendar().week.astype(np.int32)
    data['day'] = date.dt.day
    
    # 순환적 시간 변수
    data['sin_time'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['cos_time'] = np.cos(2 * np.pi * data['hour'] / 24)
    
    # 휴일 변수 (주말: 1, 평일: 0)
    data['holiday'] = data.apply(lambda x: 0 if x['dow'] < 5 else 1, axis=1)
    
    return data

def weather_features(data):
    """기상 관련 복합 지수 계산"""
    # THI (Temperature-Humidity Index): 체감온도 지수
    data['THI'] = 9/5*data['기온(°C)'] - 0.55*(1-data['습도(%)']/100)*(9/5*data['습도(%)'] - 26) + 32
    
    # WC (Wind Chill): 풍속냉각지수
    data['WC'] = 13.12 + 0.6215*data['기온(°C)'] - 13.947*data['풍속(m/s)']**0.16 + 0.486*data['기온(°C)']*data['풍속(m/s)']**0.16
    
    # 강수량이 있는 시점 표시
    data['weather'] = (data['강수량(mm)'] > 0).astype(int)
    
    return data

def temp_stats_features(data):
    """온도 통계 피처 생성"""
    # 건물별, 일별 온도 통계
    daily_stats = data.groupby(['건물번호', 'day', 'month'])['기온(°C)'].agg(['mean', 'max', 'min']).reset_index()
    daily_stats.columns = ['건물번호', 'day', 'month', 'avg_temp', 'max_temp', 'min_temp']
    daily_stats['temp_diff'] = daily_stats['max_temp'] - daily_stats['min_temp']
    
    data = pd.merge(data, daily_stats, on=['건물번호', 'day', 'month'], how='left')
    
    return data

def power_stats_features(train_data, test_data):
    """과거 전력 사용량 통계 피처 생성"""
    # 건물별, 시간별, 요일별 평균
    power_stats = train_data.groupby(['건물번호', 'hour', 'dow'])['전력소비량(kWh)'].agg(['mean', 'std']).reset_index()
    power_stats.columns = ['건물번호', 'hour', 'dow', 'power_hour_dow_mean', 'power_hour_dow_std']
    
    train_data = pd.merge(train_data, power_stats, on=['건물번호', 'hour', 'dow'], how='left')
    test_data = pd.merge(test_data, power_stats, on=['건물번호', 'hour', 'dow'], how='left')
    
    # 건물별, 시간별, 휴일별 평균
    power_holiday_stats = train_data.groupby(['건물번호', 'hour', 'holiday'])['전력소비량(kWh)'].agg(['mean', 'std']).reset_index()
    power_holiday_stats.columns = ['건물번호', 'hour', 'holiday', 'power_holiday_mean', 'power_holiday_std']
    
    train_data = pd.merge(train_data, power_holiday_stats, on=['건물번호', 'hour', 'holiday'], how='left')
    test_data = pd.merge(test_data, power_holiday_stats, on=['건물번호', 'hour', 'holiday'], how='left')
    
    return train_data, test_data

def feature_engineering(data, is_train=True):
    """전체 피처 엔지니어링 파이프라인"""
    # 결측치 처리
    data = data.fillna(method='ffill').fillna(0)
    
    # 피처 생성
    data = time_features(data)
    data = weather_features(data)
    data = temp_stats_features(data)
    
    return data

In [13]:

def get_models():
    """GPU 사용 모델 정의"""
    models = {
        'XGBoost': xgb.XGBRegressor(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method='gpu_hist',
            gpu_id=0,
            random_state=42,
            verbosity=0
        ),
        'LightGBM': lgb.LGBMRegressor(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            device='gpu',
            random_state=42,
            verbose=-1
        ),
        'CatBoost': cb.CatBoostRegressor(
            iterations=500,
            depth=6,
            learning_rate=0.1,
            l2_leaf_reg=3,
            task_type='GPU',
            random_state=42,
            verbose=False
        )
    }
    return models

In [14]:

def cross_validation_models(X_train, y_train):
    """KFold 교차검증으로 모델 성능 평가"""
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    results = {model_name: [] for model_name in get_models().keys()}
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        print(f"\n=== Fold {fold+1}/5 ===")
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        models = get_models()
        
        for model_name, model in models.items():
            print(f"{model_name} 학습중...")
            model.fit(X_tr, y_tr)
            pred = model.predict(X_val)
            score = smape(y_val, pred)
            results[model_name].append(score)
            print(f"{model_name} SMAPE: {score:.4f}")
    
    print("\n=== 최종 결과 ===")
    for model_name, scores in results.items():
        print(f"{model_name} - 평균: {np.mean(scores):.4f}, 표준편차: {np.std(scores):.4f}")
    
    return results

def train_individual_building_models(train_data, test_data, feature_cols):
    """건물별 개별 모델 학습"""
    building_predictions = {}
    building_scores = []
    
    for building_num in tqdm(range(1, 101), desc="건물별 모델 학습"):
        # 해당 건물 데이터 추출
        building_train = train_data[train_data['건물번호'] == building_num].copy()
        building_test = test_data[test_data['건물번호'] == building_num].copy()
        
        if len(building_train) == 0:
            continue
            
        # 피처와 타겟 분리
        X = building_train[feature_cols]
        y = building_train['전력소비량(kWh)']
        X_test = building_test[feature_cols]
        
        # train/validation 분할 (시계열 특성 고려)
        split_point = int(len(X) * 0.8)
        X_train_b, X_val_b = X[:split_point], X[split_point:]
        y_train_b, y_val_b = y[:split_point], y[split_point:]
        
        # XGBoost 모델 학습
        model = xgb.XGBRegressor(
            n_estimators=500,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.9,
            colsample_bytree=0.8,
            random_state=42,
            verbosity=0
        )
        
        model.fit(X_train_b, y_train_b)
        
        # 검증 점수 계산
        val_pred = model.predict(X_val_b)
        score = smape(y_val_b, val_pred)
        building_scores.append(score)
        
        # 테스트 예측
        test_pred = model.predict(X_test)
        building_predictions[building_num] = test_pred
    
    print(f"\n건물별 모델 평균 SMAPE: {np.mean(building_scores):.4f}")
    
    return building_predictions, building_scores

In [39]:
def advanced_ensemble_pipeline(combined_data, drop_cols):
    """고급 앙상블 + 메타모델 + 슈퍼메타피쳐 파이프라인"""
    import os
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Ridge
    from sklearn.preprocessing import StandardScaler
    
    # GPU 사용 확인
    print("=== GPU 사용 확인 ===")
    try:
        # XGBoost GPU 확인
        import xgboost as xgb
        print(f"XGBoost version: {xgb.__version__}")
        
        # LightGBM GPU 확인
        import lightgbm as lgb
        print(f"LightGBM version: {lgb.__version__}")
        
        # CatBoost GPU 확인
        import catboost as cb
        print(f"CatBoost version: {cb.__version__}")
        
        # CUDA 사용 가능 확인
        import torch
        if torch.cuda.is_available():
            print(f"CUDA available: {torch.cuda.get_device_name(0)}")
        else:
            print("CUDA not available - falling back to CPU")
    except Exception as e:
        print(f"GPU 확인 중 오류: {e}")
    
    # 1. 데이터 분리 및 기본 피처 엔지니어링
    print("=== 1단계: 데이터 준비 ===")
    train_data = combined_data[combined_data['is_train']==1].copy()
    test_data = combined_data[combined_data['is_train']==0].copy()
    
    train_data = feature_engineering(train_data, is_train=True)
    test_data = feature_engineering(test_data, is_train=False)
    train_data, test_data = power_stats_features(train_data, test_data)
    
    feature_cols = [col for col in train_data.columns if col not in drop_cols and col != '전력소비량(kWh)']
    print(f"기본 피처 수: {len(feature_cols)}")
    
    # 2. Level 1: GPU 모델들 (5-fold)
    print("\n=== 2단계: Level 1 모델 학습 ===")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # GPU 모델들 가져오기 (GPU 설정 강화)
    gpu_models = get_models()
    
    # GPU 설정 검증 및 강화
    try:
        # XGBoost GPU 설정 검증
        if hasattr(gpu_models['XGBoost'], 'tree_method'):
            print(f"XGBoost tree_method: {gpu_models['XGBoost'].tree_method}")
        
        # LightGBM GPU 설정 검증  
        if hasattr(gpu_models['LightGBM'], 'device'):
            print(f"LightGBM device: {gpu_models['LightGBM'].device}")
            
        # CatBoost GPU 설정 검증
        if hasattr(gpu_models['CatBoost'], 'task_type'):
            print(f"CatBoost task_type: {gpu_models['CatBoost'].task_type}")
            
    except Exception as e:
        print(f"GPU 모델 설정 확인 중 오류: {e}")
    
    level1_models = {
        'xgb': gpu_models['XGBoost'],
        'lgb': gpu_models['LightGBM'], 
        'cat': gpu_models['CatBoost'],
        'rf': RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42, n_jobs=-1)
    }
    
    # Level 1 예측값 저장용
    oof_predictions = pd.DataFrame(index=train_data.index)
    test_predictions = pd.DataFrame()
    
    X_train = train_data[feature_cols]
    y_train = train_data['전력소비량(kWh)']
    X_test = test_data[feature_cols]
    
    for name, model in level1_models.items():
        print(f"Learning {name}...")
        oof_pred = np.zeros(len(X_train))
        test_pred = np.zeros(len(X_test))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            model.fit(X_tr, y_tr)
            oof_pred[val_idx] = model.predict(X_val)
            test_pred += model.predict(X_test) / 5
        
        oof_predictions[f'{name}_pred'] = oof_pred
        test_predictions[f'{name}_pred'] = test_pred
        
        score = smape(y_train, oof_pred)
        print(f"{name} SMAPE: {score:.4f}")
    
    # 3. Level 1 앙상블 점수
    ensemble_pred = oof_predictions.mean(axis=1)
    ensemble_score = smape(y_train, ensemble_pred)
    print(f"\nLevel 1 앙상블 SMAPE: {ensemble_score:.4f}")
    
    # 4. 슈퍼메타피쳐 생성
    print("\n=== 3단계: 슈퍼메타피쳐 생성 ===")
    
    # 예측값 통계 피쳐
    oof_predictions['pred_mean'] = oof_predictions.mean(axis=1)
    oof_predictions['pred_std'] = oof_predictions.std(axis=1)
    oof_predictions['pred_max'] = oof_predictions.max(axis=1)
    oof_predictions['pred_min'] = oof_predictions.min(axis=1)
    oof_predictions['pred_range'] = oof_predictions['pred_max'] - oof_predictions['pred_min']
    
    test_predictions['pred_mean'] = test_predictions.mean(axis=1)
    test_predictions['pred_std'] = test_predictions.std(axis=1)
    test_predictions['pred_max'] = test_predictions.max(axis=1)
    test_predictions['pred_min'] = test_predictions.min(axis=1)
    test_predictions['pred_range'] = test_predictions['pred_max'] - test_predictions['pred_min']
    
    # 건물별 예측 패턴 피쳐
    building_meta_train = []
    building_meta_test = []
    
    for building_num in range(1, 101):
        # Train 건물별 메타피쳐
        building_mask_train = train_data['건물번호'] == building_num
        if building_mask_train.sum() > 0:
            building_preds = oof_predictions[building_mask_train]
            building_meta = {
                'building_pred_volatility': building_preds.std(axis=0).mean(),
                'building_pred_consistency': 1 / (building_preds.std(axis=1).mean() + 1e-8),
                'building_pred_bias': building_preds.mean(axis=1).std()
            }
            building_meta_train.extend([building_meta] * building_mask_train.sum())
        
        # Test 건물별 메타피쳐  
        building_mask_test = test_data['건물번호'] == building_num
        if building_mask_test.sum() > 0:
            building_test_preds = test_predictions[building_mask_test]
            building_meta = {
                'building_pred_volatility': building_test_preds.std(axis=0).mean() if len(building_test_preds) > 0 else 0,
                'building_pred_consistency': 1 / (building_test_preds.std(axis=1).mean() + 1e-8) if len(building_test_preds) > 0 else 1,
                'building_pred_bias': building_test_preds.mean(axis=1).std() if len(building_test_preds) > 0 else 0
            }
            building_meta_test.extend([building_meta] * building_mask_test.sum())
    
    # 메타피쳐 DataFrame으로 변환
    building_meta_train_df = pd.DataFrame(building_meta_train, index=train_data.index)
    building_meta_test_df = pd.DataFrame(building_meta_test, index=test_data.index)
    
    # 5. Level 2: 메타모델 학습
    print("\n=== 4단계: Level 2 메타모델 학습 ===")
    
    # 메타피쳐 결합
    meta_features_train = pd.concat([
        oof_predictions, 
        building_meta_train_df,
        train_data[['건물번호', 'hour', 'dow', 'month']].reset_index(drop=True)
    ], axis=1)
    
    meta_features_test = pd.concat([
        test_predictions,
        building_meta_test_df, 
        test_data[['건물번호', 'hour', 'dow', 'month']].reset_index(drop=True)
    ], axis=1)
    
    print(f"메타피쳐 수: {len(meta_features_train.columns)}")
    
    # 메타모델들 (GPU 설정 재확인)
    meta_models = {
        'ridge': Ridge(alpha=1.0),
        'rf_meta': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1),
        'xgb_meta': xgb.XGBRegressor(
            n_estimators=300, max_depth=4, learning_rate=0.1,
            tree_method='gpu_hist', gpu_id=0, 
            random_state=42, verbosity=0
        ),
        'lgb_meta': lgb.LGBMRegressor(
            n_estimators=300, max_depth=4, learning_rate=0.1,
            device='gpu', random_state=42, verbose=-1
        ), 
        'cat_meta': cb.CatBoostRegressor(
            iterations=200, depth=4, learning_rate=0.1,
            task_type='GPU', random_state=42, verbose=False
        )
    }
    
    # 메타모델 학습 및 예측
    meta_oof = pd.DataFrame(index=train_data.index)
    meta_test = pd.DataFrame()
    
    scaler = StandardScaler()
    meta_features_train_scaled = pd.DataFrame(
        scaler.fit_transform(meta_features_train), 
        columns=meta_features_train.columns,
        index=meta_features_train.index
    )
    meta_features_test_scaled = pd.DataFrame(
        scaler.transform(meta_features_test),
        columns=meta_features_test.columns, 
        index=meta_features_test.index
    )
    
    for name, meta_model in meta_models.items():
        print(f"Learning meta model {name}...")
        meta_oof_pred = np.zeros(len(meta_features_train_scaled))
        meta_test_pred = np.zeros(len(meta_features_test_scaled))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(meta_features_train_scaled)):
            X_tr = meta_features_train_scaled.iloc[train_idx]
            X_val = meta_features_train_scaled.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            meta_model.fit(X_tr, y_tr)
            meta_oof_pred[val_idx] = meta_model.predict(X_val)
            meta_test_pred += meta_model.predict(meta_features_test_scaled) / 5
        
        meta_oof[f'{name}_pred'] = meta_oof_pred
        meta_test[f'{name}_pred'] = meta_test_pred
        
        score = smape(y_train, meta_oof_pred)
        print(f"{name} meta SMAPE: {score:.4f}")
    
    # 6. 최종 앙상블 (성능 기반 동적 가중치)
    print("\n=== 5단계: 최종 앙상블 ===")
    
    # 각 메타모델 성능 수집
    meta_scores = {}
    for col in meta_oof.columns:
        score = smape(y_train, meta_oof[col])
        meta_scores[col] = score
        print(f"{col} SMAPE: {score:.4f}")
    
    # 성능 기반 가중치 계산 (성능이 좋을수록 높은 가중치)
    scores_array = np.array(list(meta_scores.values()))
    # 역수를 취해서 낮은 SMAPE가 높은 가중치를 가지도록
    inverse_scores = 1 / (scores_array + 1e-8)
    weights = inverse_scores / inverse_scores.sum()
    
    print(f"동적 가중치: {dict(zip(meta_scores.keys(), weights))}")
    
    final_oof = (meta_oof * weights).sum(axis=1)
    final_test = (meta_test * weights).sum(axis=1)
    
    final_score = smape(y_train, final_oof)
    print(f"최종 앙상블 SMAPE: {final_score:.4f}")
    
    # Level 1 앙상블과 비교
    if final_score > ensemble_score:
        print(f"Level 1 앙상블이 더 좋음. Level 1 사용")
        final_test = test_predictions.mean(axis=1)
        final_score = ensemble_score
    
    # 7. 건물별 개별 모델 (최적 모델 자동 선택)
    print("\n=== 6단계: 건물별 개별 모델 ===")
    building_predictions = {}
    building_scores = []
    building_best_models = {}
    
    # 건물별 모델 후보군 (GPU 설정 명시적으로)
    building_model_candidates = {
        'xgb': lambda: xgb.XGBRegressor(
            n_estimators=500, max_depth=6, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.8,
            tree_method='gpu_hist', gpu_id=0,
            early_stopping_rounds=50, random_state=42, verbosity=0
        ),
        'lgb': lambda: lgb.LGBMRegressor(
            n_estimators=500, max_depth=6, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.8,
            device='gpu', random_state=42, verbose=-1
        ),
        'cat': lambda: cb.CatBoostRegressor(
            iterations=300, depth=6, learning_rate=0.05,
            l2_leaf_reg=3, task_type='GPU',
            random_state=42, verbose=False
        )
    }
    
    for building_num in tqdm(range(1, 101), desc="건물별 모델"):
        building_train = train_data[train_data['건물번호'] == building_num].copy()
        building_test = test_data[test_data['건물번호'] == building_num].copy()
        
        if len(building_train) == 0:
            continue
        
        X_building = building_train[feature_cols]
        y_building = building_train['전력소비량(kWh)']
        X_test_building = building_test[feature_cols]
        
        # 80:20 분할
        split_point = int(len(X_building) * 0.8)
        X_train_b, X_val_b = X_building[:split_point], X_building[split_point:]
        y_train_b, y_val_b = y_building[:split_point], y_building[split_point:]
        
        # 각 모델 성능 비교
        best_score = float('inf')
        best_model = None
        best_model_name = None
        
        for model_name, model_func in building_model_candidates.items():
            try:
                model = model_func()
                
                if model_name in ['xgb']:
                    model.fit(X_train_b, y_train_b, 
                             eval_set=[(X_val_b, y_val_b)], verbose=False)
                else:
                    model.fit(X_train_b, y_train_b)
                
                val_pred = model.predict(X_val_b)
                score = smape(y_val_b, val_pred)
                
                if score < best_score:
                    best_score = score
                    best_model = model
                    best_model_name = model_name
            except:
                continue
        
        if best_model is not None:
            building_scores.append(best_score)
            building_best_models[building_num] = best_model_name
            
            test_pred = best_model.predict(X_test_building)
            building_predictions[building_num] = test_pred
    
    building_avg_score = np.mean(building_scores) if building_scores else final_score
    print(f"건물별 모델 평균 SMAPE: {building_avg_score:.4f}")
    
    # 사용된 모델 통계
    model_usage = {}
    for model_name in building_best_models.values():
        model_usage[model_name] = model_usage.get(model_name, 0) + 1
    print(f"건물별 최적 모델 사용 현황: {model_usage}")
    
    # 8. 최종 예측 결합 (앙상블 vs 건물별 모델 선택)
    print("\n=== 7단계: 최종 예측 결합 ===")
    final_predictions = []
    
    for idx, row in test_data.iterrows():
        building_num = row['건물번호']
        
        # 건물별 모델 예측값
        if building_num in building_predictions:
            test_idx = test_data[test_data['건물번호'] == building_num].index.get_loc(idx)
            building_pred = building_predictions[building_num][test_idx]
        else:
            building_pred = final_test.iloc[test_data.index.get_loc(idx)]
        
        # 앙상블 예측값
        ensemble_pred_val = final_test.iloc[test_data.index.get_loc(idx)]
        
        # 건물별 성능이 앙상블보다 좋으면 건물별 모델 사용
        building_score = building_scores[building_num-1] if building_num <= len(building_scores) else final_score
        
        if building_score < final_score * 0.95:  # 5% 이상 좋으면 건물별 모델
            final_pred = building_pred
        else:
            final_pred = (ensemble_pred_val * 0.7 + building_pred * 0.3)  # 조합
        
        final_predictions.append({
            'num_date_time': row['num_date_time'],
            'answer': final_pred
        })
    
    # 9. 제출파일 생성
    print("\n=== 8단계: 제출파일 생성 ===")
    submission_df = pd.DataFrame(final_predictions)
    submission_df = submission_df.sort_values('num_date_time').reset_index(drop=True)
    
    # 파일 저장
    os.makedirs('../submission', exist_ok=True)
    now = datetime.now()
    filename = f"{now.strftime('%Y%m%d_%H%M')}_Advanced_Ensemble_SMAPE_{final_score:.4f}.csv"
    filepath = os.path.join('../submission', filename)
    
    submission_df.to_csv(filepath, index=False)
    
    print(f"\n=== 최종 결과 ===")
    print(f"Level 1 앙상블 SMAPE: {ensemble_score:.4f}")
    print(f"Level 2 메타모델 SMAPE: {final_score:.4f}")
    print(f"건물별 모델 평균 SMAPE: {building_avg_score:.4f}")
    print(f"제출파일 저장: {filepath}")
    
    return {
        'submission_df': submission_df,
        'final_score': final_score,
        'level1_models': level1_models,
        'meta_models': meta_models,
        'building_scores': building_scores,
        'filepath': filepath
    }

In [40]:
# 1. 먼저 combined 데이터가 준비되어 있는지 확인
print(combined.shape)
print(combined.columns.tolist())

(220800, 9)
['num_date_time', '건물번호', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '전력소비량(kWh)', 'is_train']


In [41]:
# 1. 데이터 분리 및 피처 엔지니어링
drop_cols = ['num_date_time', '일시', 'is_train']
train_data = combined[combined['is_train']==1].copy()
test_data = combined[combined['is_train']==0].copy()

train_data = feature_engineering(train_data, is_train=True)
test_data = feature_engineering(test_data, is_train=False)
train_data, test_data = power_stats_features(train_data, test_data)

feature_cols = [col for col in train_data.columns if col not in drop_cols and col != '전력소비량(kWh)']

In [43]:
# 고급 앙상블 파이프라인 실행
results = advanced_ensemble_pipeline(combined, drop_cols)

=== GPU 사용 확인 ===
XGBoost version: 3.0.2
LightGBM version: 4.6.0
CatBoost version: 1.2.8
CUDA available: NVIDIA GeForce GTX 1650
=== 1단계: 데이터 준비 ===
기본 피처 수: 24

=== 2단계: Level 1 모델 학습 ===
XGBoost tree_method: gpu_hist
LightGBM device: gpu
Learning xgb...
xgb SMAPE: 5.7418
Learning lgb...
lgb SMAPE: 5.9409
Learning cat...
cat SMAPE: 7.9685
Learning rf...
rf SMAPE: 8.0214

Level 1 앙상블 SMAPE: 6.3094

=== 3단계: 슈퍼메타피쳐 생성 ===

=== 4단계: Level 2 메타모델 학습 ===
메타피쳐 수: 16
Learning meta model ridge...
ridge meta SMAPE: 5.7895
Learning meta model rf_meta...
rf_meta meta SMAPE: 9.6621
Learning meta model xgb_meta...
xgb_meta meta SMAPE: 5.0352
Learning meta model lgb_meta...
lgb_meta meta SMAPE: 5.0560
Learning meta model cat_meta...
cat_meta meta SMAPE: 5.9555

=== 5단계: 최종 앙상블 ===
ridge_pred SMAPE: 5.7895
rf_meta_pred SMAPE: 9.6621
xgb_meta_pred SMAPE: 5.0352
lgb_meta_pred SMAPE: 5.0560
cat_meta_pred SMAPE: 5.9555
동적 가중치: {'ridge_pred': np.float64(0.205498660427877), 'rf_meta_pred': np.float64(0.12

건물별 모델: 100%|██████████| 100/100 [06:11<00:00,  3.71s/it]


건물별 모델 평균 SMAPE: 6.4778
건물별 최적 모델 사용 현황: {'xgb': 35, 'cat': 34, 'lgb': 31}

=== 7단계: 최종 예측 결합 ===

=== 8단계: 제출파일 생성 ===

=== 최종 결과 ===
Level 1 앙상블 SMAPE: 6.3094
Level 2 메타모델 SMAPE: 5.4334
건물별 모델 평균 SMAPE: 6.4778
제출파일 저장: ../submission\20250723_1349_Advanced_Ensemble_SMAPE_5.4334.csv


In [45]:
def improved_advanced_ensemble_pipeline(combined_data, drop_cols):
    """개선된 고급 앙상블 파이프라인 - 성능 최적화"""
    import os
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Ridge, ElasticNet
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import KFold
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    import pandas as pd
    import numpy as np
    from tqdm import tqdm
    
    print("=== 개선된 고급 앙상블 파이프라인 ===")
    
    # 데이터 준비
    print("=== 1단계: 데이터 준비 ===")
    train_data = combined_data[combined_data['is_train']==1].copy()
    test_data = combined_data[combined_data['is_train']==0].copy()
    
    train_data = feature_engineering(train_data, is_train=True)
    test_data = feature_engineering(test_data, is_train=False)
    train_data, test_data = power_stats_features(train_data, test_data)
    
    feature_cols = [col for col in train_data.columns if col not in drop_cols and col != '전력소비량(kWh)']
    print(f"피처 수: {len(feature_cols)}")
    
    # Level 1: 성능 검증된 모델들만 + 최적화된 파라미터
    print("\n=== 2단계: Level 1 최적화 모델 학습 ===")
    kf = KFold(n_splits=7, shuffle=True, random_state=42)  # 7-fold로 증가
    
    level1_models = {
        'xgb': xgb.XGBRegressor(
            n_estimators=600,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.5,
            reg_lambda=0.5,
            min_child_weight=5,
            tree_method='gpu_hist',
            random_state=42,
            verbosity=0
        ),
        'lgb': lgb.LGBMRegressor(
            n_estimators=600,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.85,
            colsample_bytree=0.85,
            reg_alpha=0.5,
            reg_lambda=0.5,
            num_leaves=128,
            min_child_samples=80,
            device='gpu',
            random_state=42,
            verbose=-1
        ),
        'cat': cb.CatBoostRegressor(
            iterations=500,
            depth=6,
            learning_rate=0.05,
            l2_leaf_reg=3,
            task_type='GPU',
            random_state=42,
            verbose=False
        )
        # RandomForest 제거 (성능이 떨어짐)
    }
    
    # Level 1 예측값 저장
    oof_predictions = pd.DataFrame(index=train_data.index)
    test_predictions = pd.DataFrame()
    
    X_train = train_data[feature_cols]
    y_train = train_data['전력소비량(kWh)']
    X_test = test_data[feature_cols]
    
    for name, model in level1_models.items():
        print(f"Learning {name}...")
        oof_pred = np.zeros(len(X_train))
        test_pred = np.zeros(len(X_test))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            model.fit(X_tr, y_tr)
            oof_pred[val_idx] = model.predict(X_val)
            test_pred += model.predict(X_test) / 7  # 7-fold
        
        oof_predictions[f'{name}_pred'] = oof_pred
        test_predictions[f'{name}_pred'] = test_pred
        
        score = smape(y_train, oof_pred)
        print(f"{name} SMAPE: {score:.4f}")
    
    # Level 1 앙상블
    ensemble_pred = oof_predictions.mean(axis=1)
    ensemble_score = smape(y_train, ensemble_pred)
    print(f"Level 1 앙상블 SMAPE: {ensemble_score:.4f}")
    
    # 강화된 메타피쳐 생성
    print("\n=== 3단계: 강화된 메타피쳐 생성 ===")
    
    # 기본 통계 메타피쳐
    oof_predictions['pred_mean'] = oof_predictions.mean(axis=1)
    oof_predictions['pred_std'] = oof_predictions.std(axis=1)
    oof_predictions['pred_max'] = oof_predictions.max(axis=1)
    oof_predictions['pred_min'] = oof_predictions.min(axis=1)
    oof_predictions['pred_range'] = oof_predictions['pred_max'] - oof_predictions['pred_min']
    
    test_predictions['pred_mean'] = test_predictions.mean(axis=1)
    test_predictions['pred_std'] = test_predictions.std(axis=1)
    test_predictions['pred_max'] = test_predictions.max(axis=1)
    test_predictions['pred_min'] = test_predictions.min(axis=1)
    test_predictions['pred_range'] = test_predictions['pred_max'] - test_predictions['pred_min']
    
    # 고급 메타피쳐 (안전하게)
    oof_predictions['pred_agreement'] = oof_predictions['pred_range'] / (oof_predictions['pred_mean'] + 1e-8)
    oof_predictions['pred_confidence'] = 1 / (oof_predictions['pred_std'] + 1e-8)
    
    test_predictions['pred_agreement'] = test_predictions['pred_range'] / (test_predictions['pred_mean'] + 1e-8)
    test_predictions['pred_confidence'] = 1 / (test_predictions['pred_std'] + 1e-8)
    
    # Level 2: 다양한 메타모델
    print("\n=== 4단계: Level 2 메타모델 ===")
    
    # 메타피쳐 결합
    meta_features_train = pd.concat([
        oof_predictions, 
        train_data[['건물번호', 'hour', 'dow', 'month']].reset_index(drop=True)
    ], axis=1)
    
    meta_features_test = pd.concat([
        test_predictions,
        test_data[['건물번호', 'hour', 'dow', 'month']].reset_index(drop=True)
    ], axis=1)
    
    # NaN 처리
    meta_features_train = meta_features_train.fillna(0).replace([np.inf, -np.inf], 0)
    meta_features_test = meta_features_test.fillna(0).replace([np.inf, -np.inf], 0)
    
    print(f"메타피쳐 수: {len(meta_features_train.columns)}")
    
    # 성능 검증된 메타모델들
    meta_models = {
        'ridge1': Ridge(alpha=0.5),
        'ridge2': Ridge(alpha=1.0),
        'ridge3': Ridge(alpha=2.0),
        'elastic': ElasticNet(alpha=1.0, l1_ratio=0.3),
        'xgb_meta': xgb.XGBRegressor(
            n_estimators=300,
            max_depth=4,
            learning_rate=0.1,
            tree_method='gpu_hist',
            random_state=42,
            verbosity=0
        )
    }
    
    # 메타모델 학습
    meta_oof = pd.DataFrame(index=train_data.index)
    meta_test = pd.DataFrame()
    
    scaler = StandardScaler()
    meta_features_train_scaled = pd.DataFrame(
        scaler.fit_transform(meta_features_train), 
        columns=meta_features_train.columns,
        index=meta_features_train.index
    )
    meta_features_test_scaled = pd.DataFrame(
        scaler.transform(meta_features_test),
        columns=meta_features_test.columns, 
        index=meta_features_test.index
    )
    
    for name, meta_model in meta_models.items():
        print(f"Learning meta model {name}...")
        meta_oof_pred = np.zeros(len(meta_features_train_scaled))
        meta_test_pred = np.zeros(len(meta_features_test_scaled))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(meta_features_train_scaled)):
            X_tr = meta_features_train_scaled.iloc[train_idx]
            X_val = meta_features_train_scaled.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            meta_model.fit(X_tr, y_tr)
            meta_oof_pred[val_idx] = meta_model.predict(X_val)
            meta_test_pred += meta_model.predict(meta_features_test_scaled) / 7
        
        meta_oof[f'{name}_pred'] = meta_oof_pred
        meta_test[f'{name}_pred'] = meta_test_pred
        
        score = smape(y_train, meta_oof_pred)
        print(f"{name} meta SMAPE: {score:.4f}")
    
    # 최고 성능 메타모델 선택
    print("\n=== 5단계: 최고 성능 메타모델 선택 ===")
    
    meta_scores = {}
    for col in meta_oof.columns:
        score = smape(y_train, meta_oof[col])
        meta_scores[col] = score
    
    # 최고 성능 모델 찾기
    best_meta_model = min(meta_scores.keys(), key=lambda x: meta_scores[x])
    best_meta_score = meta_scores[best_meta_model]
    
    print(f"최고 메타모델: {best_meta_model} (SMAPE: {best_meta_score:.4f})")
    print(f"Level 1 앙상블: {ensemble_score:.4f}")
    
    # 최고 성능 선택
    if best_meta_score < ensemble_score:
        print(f"메타모델 사용: {best_meta_score:.4f}")
        final_test = meta_test[best_meta_model]
        final_score = best_meta_score
    else:
        print(f"Level 1 앙상블 사용: {ensemble_score:.4f}")
        final_test = test_predictions.mean(axis=1)
        final_score = ensemble_score
    
    # 제출파일 생성
    print("\n=== 6단계: 제출파일 생성 ===")
    
    final_predictions = []
    for idx, row in test_data.iterrows():
        test_row_idx = test_data.index.get_loc(idx)
        pred_value = final_test.iloc[test_row_idx]
        
        final_predictions.append({
            'num_date_time': row['num_date_time'],
            'answer': pred_value
        })
    
    submission_df = pd.DataFrame(final_predictions)
    submission_df = submission_df.sort_values('num_date_time').reset_index(drop=True)
    
    # 파일 저장
    os.makedirs('../submission', exist_ok=True)
    now = datetime.now()
    filename = f"{now.strftime('%Y%m%d_%H%M')}_Improved_Advanced_SMAPE_{final_score:.4f}.csv"
    filepath = os.path.join('../submission', filename)
    
    submission_df.to_csv(filepath, index=False)
    
    print(f"\n=== 최종 결과 ===")
    print(f"최종 SMAPE: {final_score:.4f}")
    print(f"제출파일 저장: {filepath}")
    
    return {
        'submission_df': submission_df,
        'final_score': final_score,
        'level1_models': level1_models,
        'meta_models': meta_models,
        'filepath': filepath
    }

In [46]:
results = improved_advanced_ensemble_pipeline(combined, drop_cols)

=== 개선된 고급 앙상블 파이프라인 ===
=== 1단계: 데이터 준비 ===
피처 수: 24

=== 2단계: Level 1 최적화 모델 학습 ===
Learning xgb...
xgb SMAPE: 5.2632
Learning lgb...
lgb SMAPE: 5.8264
Learning cat...
cat SMAPE: 8.3109
Level 1 앙상블 SMAPE: 6.1449

=== 3단계: 강화된 메타피쳐 생성 ===

=== 4단계: Level 2 메타모델 ===
메타피쳐 수: 14
Learning meta model ridge1...
ridge1 meta SMAPE: 5.2887
Learning meta model ridge2...
ridge2 meta SMAPE: 5.2869
Learning meta model ridge3...
ridge3 meta SMAPE: 5.2845
Learning meta model elastic...
elastic meta SMAPE: 16.4648
Learning meta model xgb_meta...
xgb_meta meta SMAPE: 5.1870

=== 5단계: 최고 성능 메타모델 선택 ===
최고 메타모델: xgb_meta_pred (SMAPE: 5.1870)
Level 1 앙상블: 6.1449
메타모델 사용: 5.1870

=== 6단계: 제출파일 생성 ===

=== 최종 결과 ===
최종 SMAPE: 5.1870
제출파일 저장: ../submission\20250723_1355_Improved_Advanced_SMAPE_5.1870.csv


In [44]:
def perfect_optuna_tuning(combined_data, drop_cols, n_trials=100):
    """과적합 방지 + 안정성 보장하는 완벽한 Optuna 튜닝"""
    import optuna
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
    import warnings
    warnings.filterwarnings('ignore')
    
    print("=== 완벽한 Optuna 하이퍼파라미터 튜닝 ===")
    
    # 데이터 준비
    train_data = combined_data[combined_data['is_train']==1].copy()
    train_data = feature_engineering(train_data, is_train=True)
    train_data, _ = power_stats_features(train_data, train_data)
    
    feature_cols = [col for col in train_data.columns if col not in drop_cols and col != '전력소비량(kWh)']
    X = train_data[feature_cols]
    y = train_data['전력소비량(kWh)']
    
    print(f"데이터 크기: {X.shape}")
    print(f"피처 수: {len(feature_cols)}")
    
    # 안정적인 CV 전략 (건물번호 기반 stratified)
    def get_cv_folds():
        # 건물번호별로 균등하게 분할
        building_groups = train_data['건물번호'].values
        from sklearn.model_selection import GroupKFold
        cv = GroupKFold(n_splits=5)
        return list(cv.split(X, y, groups=building_groups))
    
    cv_folds = get_cv_folds()
    
    # XGBoost 최적화 (안전한 범위)
    def xgb_objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 300, 800),  # 적당한 범위
            'max_depth': trial.suggest_int('max_depth', 4, 8),           # 과적합 방지
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),  # 안전한 범위
            'subsample': trial.suggest_float('subsample', 0.7, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.9),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.5, 3.0),     # 정규화 강화
            'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0),   # 정규화 강화
            'min_child_weight': trial.suggest_int('min_child_weight', 3, 10),
            'tree_method': 'gpu_hist',
            'random_state': 42,
            'verbosity': 0
        }
        
        model = xgb.XGBRegressor(**params)
        
        # 안정적인 CV 평가
        cv_scores = []
        for train_idx, val_idx in cv_folds:
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model.fit(X_tr, y_tr)
            pred = model.predict(X_val)
            score = smape(y_val, pred)
            cv_scores.append(score)
        
        # 평균 + 표준편차로 안정성 고려
        mean_score = np.mean(cv_scores)
        std_score = np.std(cv_scores)
        
        # 표준편차가 큰 경우 패널티 (불안정한 모델 배제)
        penalty = std_score * 0.1  # 표준편차에 비례한 패널티
        return mean_score + penalty
    
    # LightGBM 최적화 (안전한 범위)
    def lgb_objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 300, 800),
            'max_depth': trial.suggest_int('max_depth', 4, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
            'subsample': trial.suggest_float('subsample', 0.7, 0.9),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.9),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.5, 3.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0),
            'num_leaves': trial.suggest_int('num_leaves', 31, 255),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
            'device': 'gpu',
            'random_state': 42,
            'verbose': -1
        }
        
        model = lgb.LGBMRegressor(**params)
        
        cv_scores = []
        for train_idx, val_idx in cv_folds:
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model.fit(X_tr, y_tr)
            pred = model.predict(X_val)
            score = smape(y_val, pred)
            cv_scores.append(score)
        
        mean_score = np.mean(cv_scores)
        std_score = np.std(cv_scores)
        penalty = std_score * 0.1
        return mean_score + penalty
    
    # CatBoost 최적화 (안전한 범위)
    def cat_objective(trial):
        params = {
            'iterations': trial.suggest_int('iterations', 200, 600),
            'depth': trial.suggest_int('depth', 4, 8),
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
            'border_count': trial.suggest_int('border_count', 32, 128),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
            'task_type': 'GPU',
            'random_state': 42,
            'verbose': False
        }
        
        model = cb.CatBoostRegressor(**params)
        
        cv_scores = []
        for train_idx, val_idx in cv_folds:
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            model.fit(X_tr, y_tr)
            pred = model.predict(X_val)
            score = smape(y_val, pred)
            cv_scores.append(score)
        
        mean_score = np.mean(cv_scores)
        std_score = np.std(cv_scores)
        penalty = std_score * 0.1
        return mean_score + penalty
    
    # 각 모델별 최적화 실행
    results = {}
    
    print(f"\n=== XGBoost 최적화 (시도 횟수: {n_trials}) ===")
    study_xgb = optuna.create_study(direction='minimize', study_name='xgboost_perfect')
    study_xgb.optimize(xgb_objective, n_trials=n_trials, show_progress_bar=True)
    
    best_xgb_params = study_xgb.best_params
    best_xgb_score = study_xgb.best_value
    results['xgb'] = {'params': best_xgb_params, 'score': best_xgb_score}
    
    print(f"XGBoost 최고 점수: {best_xgb_score:.4f}")
    print(f"XGBoost 최적 파라미터: {best_xgb_params}")
    
    print(f"\n=== LightGBM 최적화 (시도 횟수: {n_trials}) ===")
    study_lgb = optuna.create_study(direction='minimize', study_name='lightgbm_perfect')
    study_lgb.optimize(lgb_objective, n_trials=n_trials, show_progress_bar=True)
    
    best_lgb_params = study_lgb.best_params
    best_lgb_score = study_lgb.best_value
    results['lgb'] = {'params': best_lgb_params, 'score': best_lgb_score}
    
    print(f"LightGBM 최고 점수: {best_lgb_score:.4f}")
    print(f"LightGBM 최적 파라미터: {best_lgb_params}")
    
    print(f"\n=== CatBoost 최적화 (시도 횟수: {n_trials}) ===")
    study_cat = optuna.create_study(direction='minimize', study_name='catboost_perfect')
    study_cat.optimize(cat_objective, n_trials=n_trials, show_progress_bar=True)
    
    best_cat_params = study_cat.best_params
    best_cat_score = study_cat.best_value
    results['cat'] = {'params': best_cat_params, 'score': best_cat_score}
    
    print(f"CatBoost 최고 점수: {best_cat_score:.4f}")
    print(f"CatBoost 최적 파라미터: {best_cat_params}")
    
    # 최종 결과 비교
    print(f"\n=== 최적화 결과 요약 ===")
    for model_name, result in results.items():
        print(f"{model_name}: {result['score']:.4f}")
    
    # 최고 성능 모델 찾기
    best_model = min(results.keys(), key=lambda x: results[x]['score'])
    print(f"\n최고 성능 모델: {best_model} (SMAPE: {results[best_model]['score']:.4f})")
    
    # 안정성 검증 (최고 모델로 추가 검증)
    print(f"\n=== 안정성 검증 ===")
    best_params = results[best_model]['params']
    
    if best_model == 'xgb':
        final_model = xgb.XGBRegressor(**best_params)
    elif best_model == 'lgb':
        final_model = lgb.LGBMRegressor(**best_params)
    else:
        final_model = cb.CatBoostRegressor(**best_params)
    
    # 10-fold 안정성 검증
    from sklearn.model_selection import KFold
    stability_cv = KFold(n_splits=10, shuffle=True, random_state=42)
    stability_scores = []
    
    for train_idx, val_idx in stability_cv.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        final_model.fit(X_tr, y_tr)
        pred = final_model.predict(X_val)
        score = smape(y_val, pred)
        stability_scores.append(score)
    
    stability_mean = np.mean(stability_scores)
    stability_std = np.std(stability_scores)
    
    print(f"10-fold 안정성 검증:")
    print(f"평균 SMAPE: {stability_mean:.4f}")
    print(f"표준편차: {stability_std:.4f}")
    print(f"변동계수: {stability_std/stability_mean:.3f}")
    
    if stability_std/stability_mean > 0.1:
        print("⚠️  높은 변동성 감지 - 파라미터 재조정 권장")
    else:
        print("✅ 안정적인 모델 확인")
    
    return {
        'results': results,
        'best_model': best_model,
        'best_params': best_params,
        'best_score': results[best_model]['score'],
        'stability_mean': stability_mean,
        'stability_std': stability_std
    }

In [None]:
# 사용법
results = perfect_optuna_tuning(combined_data, drop_cols, n_trials=50)
print(f"최고 모델: {results['best_model']}")
print(f"안정적 성능: {results['stability_mean']:.4f} ± {results['stability_std']:.4f}")