In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.family'] = 'Malgun Gothic' # 한글 폰트 설정
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import torch #cuda용 
import warnings # 경고 메시지 무시
warnings.filterwarnings('ignore')
from datetime import datetime
import math
from tqdm import tqdm

In [2]:
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"PyTorch CUDA 버전: {torch.version.cuda}")
    print(f"GPU 이름: {torch.cuda.get_device_name(0)}")
else:
    print("CPU 버전이 설치됨")

CUDA 사용 가능: True
PyTorch CUDA 버전: 11.8
GPU 이름: NVIDIA GeForce GTX 1650


In [14]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# 데이터 로드 및 분할
building = pd.read_csv('../data/building_info.csv')
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

train['is_train'] = 1
test['is_train'] = 0

# 전체 데이터 합치기
all_data = pd.concat([train, test], ignore_index=True)

# train과 test 분리
train_df = all_data[all_data['is_train'] == 1].drop('is_train', axis=1)
test_df = all_data[all_data['is_train'] == 0].drop('is_train', axis=1)

# 시간 특성 생성
date = pd.to_datetime(train_df['일시'])
train_df['hour'] = date.dt.hour
train_df['day'] = date.dt.weekday
train_df['month'] = date.dt.month
train_df['week'] = date.dt.isocalendar().week

date = pd.to_datetime(test_df['일시'])
test_df['hour'] = date.dt.hour
test_df['day'] = date.dt.weekday
test_df['month'] = date.dt.month
test_df['week'] = date.dt.isocalendar().week

# 건물정보 전처리
cols = ['태양광용량(kW)','ESS저장용량(kWh)','PCS용량(kW)']
for col in cols:
    building[col] = building[col].apply(lambda x: 0 if x == '-' else float(x))

# train 데이터 결측치 처리
train_ft = train_df.drop(columns=['num_date_time', '일시','일조(hr)', '일사(MJ/m2)'])
test_ft = test_df.drop(columns=['num_date_time', '일시'])

train_ft['강수량(mm)'] = train_ft['강수량(mm)'].fillna(0)

# 풍속 결측치 처리
for i in train_ft[train_ft['풍속(m/s)'].isnull()].index:
    month = train_ft.iloc[i]['month']
    hour = train_ft.iloc[i]['hour']
    num = train_ft.iloc[i]['건물번호']
    data_mean = train_ft[(train_ft['건물번호']==num)&(train_ft['month']==month) & (train_ft['hour']==hour)]['풍속(m/s)'].describe()['mean']
    train_ft.loc[i,'풍속(m/s)'] = data_mean

# 습도 결측치 처리
for i in train_ft[train_ft['습도(%)'].isnull()].index:
    month = train_ft.iloc[i]['month']
    hour = train_ft.iloc[i]['hour']
    num = train_ft.iloc[i]['건물번호']
    data_mean = train_ft[(train_ft['건물번호']==num)&(train_ft['month']==month) & (train_ft['hour']==hour)]['습도(%)'].describe()['mean']
    train_ft.loc[i,'습도(%)'] = data_mean

# 건물정보 합치기
train_ft = train_ft.merge(building, how='left', on='건물번호')
test_ft = test_ft.merge(building, how='left', on='건물번호')

# 범주형 인코딩
conv_dict = dict(zip(list(train_ft['건물유형'].unique()), range(len(train_ft['건물유형'].unique()))))
train_ft['건물유형'] = train_ft['건물유형'].map(conv_dict)
test_ft['건물유형'] = test_ft['건물유형'].map(conv_dict)

# 타겟 생성
target = train_df['전력소비량(kWh)']

# 공휴일 특성
train_ft['holiday'] = train_ft.apply(lambda x: 0 if x['day'] < 5 else 1, axis=1)
test_ft['holiday'] = test_ft.apply(lambda x: 0 if x['day'] < 5 else 1, axis=1)

# 시간 순환 특성
train_ft['sin_time'] = np.sin(2*np.pi*train_ft.hour/24)
train_ft['cos_time'] = np.cos(2*np.pi*train_ft.hour/24)
test_ft['sin_time'] = np.sin(2*np.pi*test_ft.hour/24)
test_ft['cos_time'] = np.cos(2*np.pi*test_ft.hour/24)

# 컬럼명 확인 및 디버깅
print("train_ft 컬럼들:")
print(train_ft.columns.tolist())
print("\ntest_ft 컬럼들:")
print(test_ft.columns.tolist())

# 가능한 기온 컬럼명들 확인
temp_cols = [col for col in train_ft.columns if '기온' in col or 'temp' in col.lower()]
humidity_cols = [col for col in train_ft.columns if '습도' in col or 'humid' in col.lower()]

print(f"\n기온 관련 컬럼: {temp_cols}")
print(f"습도 관련 컬럼: {humidity_cols}")

# 실제 컬럼명 사용
if temp_cols and humidity_cols:
    temp_col = temp_cols[0]  # 첫 번째 기온 컬럼
    humidity_col = humidity_cols[0]  # 첫 번째 습도 컬럼
    
    print(f"\n사용할 컬럼: 기온={temp_col}, 습도={humidity_col}")
    
    # 불쾌지수
    train_ft['THI'] = 9/5*train_ft[temp_col] - 0.55*(1-train_ft[humidity_col]/100)*(9/5*train_ft[temp_col]-26)+32
    test_ft['THI'] = 9/5*test_ft[temp_col] - 0.55*(1-test_ft[humidity_col]/100)*(9/5*test_ft[temp_col]-26)+32
    
    train_ft['THI'] = pd.cut(train_ft['THI'], bins=[0, 68, 75, 80, 200], labels=[1,2,3,4])
    train_ft['THI'] = train_ft['THI'].astype(int)
    test_ft['THI'] = pd.cut(test_ft['THI'], bins=[0, 68, 75, 80, 200], labels=[1,2,3,4])
    test_ft['THI'] = test_ft['THI'].astype(int)
    
    # CDH 함수
    def CDH(xs):
        ys = []
        for i in range(len(xs)):
            if i < 11:
                ys.append(np.sum(xs[:(i+1)]-26))
            else:
                ys.append(np.sum(xs[(i-11):(i+1)]-26))
        return np.array(ys)
    
    # CDH 특성 생성
    cdhs = np.array([])
    for num in range(1, 101, 1):
        temp = train_ft[train_ft['건물번호'] == num]
        cdh = CDH(temp[temp_col].values)
        cdhs = np.concatenate([cdhs, cdh])
    train_ft['CDH'] = cdhs
    
    cdhs = np.array([])
    for num in range(1, 101, 1):
        temp = test_ft[test_ft['건물번호'] == num]
        cdh = CDH(temp[temp_col].values)
        cdhs = np.concatenate([cdhs, cdh])
    test_ft['CDH'] = cdhs
else:
    print("기온 또는 습도 컬럼을 찾을 수 없습니다. 컬럼명을 직접 확인해주세요.")

# 요일 시간별 사용량 특성들
power_mean = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['건물번호', 'hour', 'day','month'], aggfunc=np.mean).reset_index()
power_mean = power_mean.drop_duplicates(subset=['건물번호', 'hour', 'day'], keep='first')

train_ft = train_ft.merge(power_mean[['건물번호', 'hour', 'day', '전력소비량(kWh)']], on=['건물번호', 'hour', 'day'], how='left', suffixes=('', '_mean'))
train_ft.rename(columns={'전력소비량(kWh)_mean': 'day_hour_mean'}, inplace=True)

test_ft = test_ft.merge(power_mean[['건물번호', 'hour', 'day', '전력소비량(kWh)']], on=['건물번호', 'hour', 'day'], how='left', suffixes=('', '_mean'))
test_ft.rename(columns={'전력소비량(kWh)': 'day_hour_mean'}, inplace=True)

# 월-시간별 평균
power_mean = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['건물번호', 'hour', 'day','month'], aggfunc=np.mean).reset_index()
power_mean = power_mean.drop_duplicates(subset=['건물번호', 'hour', 'month'], keep='first')

train_ft = train_ft.merge(power_mean[['건물번호', 'hour', 'month', '전력소비량(kWh)']], on=['건물번호', 'hour', 'month'], how='left', suffixes=('', '_mean'))
train_ft.rename(columns={'전력소비량(kWh)_mean': 'month_hour_mean'}, inplace=True)

test_ft = test_ft.merge(power_mean[['건물번호', 'hour', 'month', '전력소비량(kWh)']], on=['건물번호', 'hour', 'month'], how='left', suffixes=('', '_mean'))
test_ft.rename(columns={'전력소비량(kWh)': 'month_hour_mean'}, inplace=True)

# 시간별 평균/표준편차/중앙값
power_mean = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['건물번호', 'hour'], aggfunc=np.mean).reset_index()
train_ft = train_ft.merge(power_mean[['건물번호', 'hour', '전력소비량(kWh)']], on=['건물번호', 'hour'], how='left', suffixes=('', '_mean'))
train_ft.rename(columns={'전력소비량(kWh)_mean': 'hour_mean'}, inplace=True)

test_ft = test_ft.merge(power_mean[['건물번호', 'hour', '전력소비량(kWh)']], on=['건물번호', 'hour'], how='left', suffixes=('', '_mean'))
test_ft.rename(columns={'전력소비량(kWh)': 'hour_mean'}, inplace=True)

power_std = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['건물번호', 'hour'], aggfunc=np.std).reset_index()
train_ft = train_ft.merge(power_std[['건물번호', 'hour', '전력소비량(kWh)']], on=['건물번호', 'hour'], how='left', suffixes=('', '_std'))
train_ft.rename(columns={'전력소비량(kWh)_std': 'hour_std'}, inplace=True)

test_ft = test_ft.merge(power_std[['건물번호', 'hour', '전력소비량(kWh)']], on=['건물번호', 'hour'], how='left', suffixes=('', '_std'))
test_ft.rename(columns={'전력소비량(kWh)': 'hour_std'}, inplace=True)

power_std = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['건물번호', 'hour', 'day'], aggfunc=np.std).reset_index()
train_ft = train_ft.merge(power_std[['건물번호', 'hour', 'day', '전력소비량(kWh)']], on=['건물번호', 'hour', 'day'], how='left', suffixes=('', '_std'))
train_ft.rename(columns={'전력소비량(kWh)_std': 'day_hour_std'}, inplace=True)

test_ft = test_ft.merge(power_std[['건물번호', 'hour', 'day', '전력소비량(kWh)']], on=['건물번호', 'hour', 'day'], how='left', suffixes=('', '_std'))
test_ft.rename(columns={'전력소비량(kWh)': 'day_hour_std'}, inplace=True)

power_median = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['건물번호', 'hour', 'day'], aggfunc=np.median).reset_index()
train_ft = train_ft.merge(power_median[['건물번호', 'hour', 'day', '전력소비량(kWh)']], on=['건물번호', 'hour', 'day'], how='left', suffixes=('', '_median'))
train_ft.rename(columns={'전력소비량(kWh)_median': 'day_hour_median'}, inplace=True)

test_ft = test_ft.merge(power_median[['건물번호', 'hour', 'day', '전력소비량(kWh)']], on=['건물번호', 'hour', 'day'], how='left', suffixes=('', '_median'))
test_ft.rename(columns={'전력소비량(kWh)': 'day_hour_median'}, inplace=True)

# 불필요 컬럼 제거
train_ft = train_ft.drop(columns=['holiday', '건물유형', '강수량(mm)', '풍속(m/s)','전력소비량(kWh)'])
test_ft = test_ft.drop(columns=['holiday', '건물유형', '강수량(mm)', '풍속(m/s)'])

# 실제 존재하는 컬럼만 선택
print("최종 train_ft 컬럼들:")
print(train_ft.columns.tolist())

# 기본 컬럼들
base_cols = ['건물번호', 'hour', 'day', 'month', 'week', '연면적(m2)',
           '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'sin_time',
           'cos_time', 'day_hour_mean', 'month_hour_mean','hour_mean', 'hour_std','day_hour_std',
           'day_hour_median']

# 실제 존재하는 컬럼만 필터링
existing_cols = [col for col in base_cols if col in train_ft.columns]

# 기온, 습도, THI, CDH 컬럼 추가 (존재하는 경우만)
for col in train_ft.columns:
    if any(keyword in col for keyword in ['기온', '습도', 'THI', 'CDH', '이동평균']):
        if col not in existing_cols:
            existing_cols.append(col)

print(f"선택된 컬럼들: {existing_cols}")

train_ft = train_ft[existing_cols]
test_ft = test_ft[existing_cols]

train_ft['전력소비량(kWh)'] = train_df['전력소비량(kWh)']

# 클러스터링
train_ft['holiday'] = train_ft.apply(lambda x: 0 if x['day'] < 5 else 1, axis=1)
test_ft['holiday'] = test_ft.apply(lambda x: 0 if x['day'] < 5 else 1, axis=1)

weekday_mean = train_ft[train_ft.holiday==0].pivot_table(values='전력소비량(kWh)', index='건물번호', columns='hour', aggfunc='first')
weekend_mean = train_ft[train_ft.holiday==1].pivot_table(values='전력소비량(kWh)', index='건물번호', columns='hour', aggfunc='first')

tmp = pd.merge(weekday_mean, weekend_mean, how='left', on='건물번호')

SEED = 42
kmeans = KMeans(n_clusters=5, random_state=SEED, n_init='auto')
kmeans.fit(tmp)

cluster_ = kmeans.predict(tmp)
tmp['cluster'] = cluster_

train_ft = train_ft.merge(tmp[['cluster']], how='left', on='건물번호')
test_ft = test_ft.merge(tmp[['cluster']], how='left', on='건물번호')

train_ft = train_ft.drop(columns=['holiday'])
test_ft = test_ft.drop(columns=['holiday'])

power_mean = pd.pivot_table(train_ft, values='전력소비량(kWh)', index=['cluster', 'hour', 'day','month'], aggfunc=np.mean).reset_index()
power_mean = power_mean.drop_duplicates(subset=['cluster', 'hour', 'day'], keep='first')

train_ft = train_ft.merge(power_mean[['cluster', 'hour', 'day', '전력소비량(kWh)']], on=['cluster', 'hour', 'day'], how='left', suffixes=('', '_mean'))
train_ft.rename(columns={'전력소비량(kWh)_mean': 'cluster_day_hour_mean'}, inplace=True)

test_ft = test_ft.merge(power_mean[['cluster', 'hour', 'day', '전력소비량(kWh)']], on=['cluster', 'hour', 'day'], how='left', suffixes=('', '_mean'))
test_ft.rename(columns={'전력소비량(kWh)': 'cluster_day_hour_mean'}, inplace=True)

train_ft = train_ft.drop(columns=['전력소비량(kWh)'])

# 이동평균 (기온, 습도 컬럼이 있는 경우만)
if temp_cols and humidity_cols:
    temp_col = temp_cols[0]
    humidity_col = humidity_cols[0]
    
    # 일별 통계 특성 추가
    def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
        result_dict = {}
        grouped_temp = dataframe.groupby(['건물번호', 'month', 'day'])[target_column].agg(aggregation_func)
        
        for (building, month, day), value in grouped_temp.items():
            result_dict.setdefault(building, {}).setdefault(month, {})[day] = value
        
        dataframe[output_column] = [
            result_dict.get(row['건물번호'], {}).get(row['month'], {}).get(row['day'], None)
            for _, row in dataframe.iterrows()
        ]
    
    # 일별 기온 통계 추가
    calculate_day_values(train_ft, temp_col, 'day_max_temperature', 'max')
    calculate_day_values(train_ft, temp_col, 'day_mean_temperature', 'mean')
    calculate_day_values(train_ft, temp_col, 'day_min_temperature', 'min')
    train_ft['day_temperature_range'] = train_ft['day_max_temperature'] - train_ft['day_min_temperature']
    
    calculate_day_values(test_ft, temp_col, 'day_max_temperature', 'max')
    calculate_day_values(test_ft, temp_col, 'day_mean_temperature', 'mean')
    calculate_day_values(test_ft, temp_col, 'day_min_temperature', 'min')
    test_ft['day_temperature_range'] = test_ft['day_max_temperature'] - test_ft['day_min_temperature']
    
    # WCT(체감온도) 변수 추가 - 풍속 컬럼이 있다면
    if any('풍속' in col for col in train_ft.columns):
        windspeed_col = [col for col in train_ft.columns if '풍속' in col][0]
        # 원래 데이터에서 풍속 가져오기 (drop 전)
        train_windspeed = train_df['풍속(m/s)']
        test_windspeed = test_df['풍속(m/s)']
        
        train_ft['WCT'] = 13.12 + 0.6125*train_ft[temp_col] - 11.37*(train_windspeed**0.16) + 0.3965*(train_windspeed**0.16)*train_ft[temp_col]
        test_ft['WCT'] = 13.12 + 0.6125*test_ft[temp_col] - 11.37*(test_windspeed**0.16) + 0.3965*(test_windspeed**0.16)*test_ft[temp_col]
    
    # 시간 푸리에 변환 (기존과 다른 방식)
    train_ft['sin_hour2'] = np.sin(2 * np.pi * train_ft['hour']/23.0)
    train_ft['cos_hour2'] = np.cos(2 * np.pi * train_ft['hour']/23.0)
    test_ft['sin_hour2'] = np.sin(2 * np.pi * test_ft['hour']/23.0)
    test_ft['cos_hour2'] = np.cos(2 * np.pi * test_ft['hour']/23.0)
    
    # 날짜 푸리에 변환
    train_ft['sin_date'] = -np.sin(2 * np.pi * (train_ft['month']+train_ft['day']/31)/12)
    train_ft['cos_date'] = -np.cos(2 * np.pi * (train_ft['month']+train_ft['day']/31)/12)
    test_ft['sin_date'] = -np.sin(2 * np.pi * (test_ft['month']+test_ft['day']/31)/12)
    test_ft['cos_date'] = -np.cos(2 * np.pi * (test_ft['month']+test_ft['day']/31)/12)
    
    # 월 푸리에 변환  
    train_ft['sin_month'] = -np.sin(2 * np.pi * train_ft['month']/12.0)
    train_ft['cos_month'] = -np.cos(2 * np.pi * train_ft['month']/12.0)
    test_ft['sin_month'] = -np.sin(2 * np.pi * test_ft['month']/12.0)
    test_ft['cos_month'] = -np.cos(2 * np.pi * test_ft['month']/12.0)
    
    # 요일 푸리에 변환
    train_ft['sin_dayofweek'] = -np.sin(2 * np.pi * (train_ft['day']+1)/7.0)
    train_ft['cos_dayofweek'] = -np.cos(2 * np.pi * (train_ft['day']+1)/7.0)
    test_ft['sin_dayofweek'] = -np.sin(2 * np.pi * (test_ft['day']+1)/7.0)
    test_ft['cos_dayofweek'] = -np.cos(2 * np.pi * (test_ft['day']+1)/7.0)
    
    # 이동평균
    window_size = 96
    train_ft['기온_4일_이동평균'] = train_ft.groupby('건물번호')[temp_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    train_ft['습도_4일_이동평균'] = train_ft.groupby('건물번호')[humidity_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    
    window_size = 168
    train_ft['기온_7일_이동평균'] = train_ft.groupby('건물번호')[temp_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    train_ft['습도_7일_이동평균'] = train_ft.groupby('건물번호')[humidity_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    
    # 이동평균을 위해 train 데이터를 test에 추가
    for i in range(1, 101):
        test_ft = pd.concat([train_ft[train_ft['건물번호']==i], test_ft], axis=0)
    
    window_size = 96
    test_ft['기온_4일_이동평균'] = test_ft.groupby('건물번호')[temp_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    test_ft['습도_4일_이동평균'] = test_ft.groupby('건물번호')[humidity_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    
    window_size = 168
    test_ft['기온_7일_이동평균'] = test_ft.groupby('건물번호')[temp_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    test_ft['습도_7일_이동평균'] = test_ft.groupby('건물번호')[humidity_col].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    
    # 실제 test 데이터만 남기기 (train 길이만큼 제거)
    test_ft = test_ft.tail(len(test_df))

print("전처리 완료!")
print(f"Train shape: {train_ft.shape}")
print(f"Test shape: {test_ft.shape}")
print(f"Target shape: {target.shape}")

# 머신러닝 학습용 데이터 준비
X_train = train_ft.copy()
y_train = target.copy()
X_test = test_ft.copy()

print("\n머신러닝 학습 준비 완료!")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")  
print(f"X_test shape: {X_test.shape}")

# 결측치 확인
print(f"\nX_train 결측치: {X_train.isnull().sum().sum()}")
print(f"X_test 결측치: {X_test.isnull().sum().sum()}")
print(f"y_train 결측치: {y_train.isnull().sum()}")

train_ft 컬럼들:
['건물번호', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '전력소비량(kWh)', 'hour', 'day', 'month', 'week', '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'holiday', 'sin_time', 'cos_time']

test_ft 컬럼들:
['건물번호', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)', 'hour', 'day', 'month', 'week', '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'holiday', 'sin_time', 'cos_time']

기온 관련 컬럼: ['기온(°C)']
습도 관련 컬럼: ['습도(%)']

사용할 컬럼: 기온=기온(°C), 습도=습도(%)
최종 train_ft 컬럼들:
['건물번호', '기온(°C)', '습도(%)', 'hour', 'day', 'month', 'week', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'sin_time', 'cos_time', 'THI', 'CDH', 'day_hour_mean', 'month_hour_mean', 'hour_mean', 'hour_std', 'day_hour_std', 'day_hour_median']
선택된 컬럼들: ['건물번호', 'hour', 'day', 'month', 'week', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', 'sin_time', 'cos_time', 'day_hour_mean', 'month_hour_mean', 'hour_mean', 'hour_

In [25]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb

# SMAPE 계산 함수
def smape(y_true, y_pred):
    """SMAPE (Symmetric Mean Absolute Percentage Error) 계산"""
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def main_prediction(X_train, y_train, X_test, test_df):
    """XGBoost + LightGBM 전용 앙상블 파이프라인"""
    
    print("XGBoost + LightGBM 간단 앙상블 파이프라인 시작...")
    
    # 결측치 처리
    if X_train.isnull().sum().sum() > 0:
        X_train = X_train.fillna(X_train.median())
    if X_test.isnull().sum().sum() > 0:
        X_test = X_test.fillna(X_train.median())
    
    # GPU 사용 확인
    print("=== GPU 사용 확인 ===")
    try:
        print(f"XGBoost version: {xgb.__version__}")
        print(f"LightGBM version: {lgb.__version__}")
        
        import torch
        if torch.cuda.is_available():
            print(f"CUDA available: {torch.cuda.get_device_name(0)}")
        else:
            print("CUDA not available - falling back to CPU")
    except Exception as e:
        print(f"GPU 확인 중 오류: {e}")
    
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    
    # 1. Level 1: XGBoost + LightGBM (5-fold)
    print("\n=== Level 1: XGBoost + LightGBM ===")
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Level 1 모델들
    level1_models = {
        'xgb': xgb.XGBRegressor(
            n_estimators=1000,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method='gpu_hist',
            gpu_id=0,
            random_state=42,
            verbosity=0
        ),
        'lgb': lgb.LGBMRegressor(
            n_estimators=1000,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            device='gpu',
            random_state=42,
            verbose=-1
        )
    }
    
    # Level 1 예측값 저장용
    oof_predictions = pd.DataFrame(index=X_train.index)
    test_predictions = pd.DataFrame()
    
    for name, model in level1_models.items():
        print(f"Learning {name}...")
        oof_pred = np.zeros(len(X_train))
        test_pred = np.zeros(len(X_test))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            # 모델별 특별 처리
            if name == 'xgb':
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            else:  # lgb
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.log_evaluation(0)])
            
            oof_pred[val_idx] = model.predict(X_val)
            test_pred += model.predict(X_test) / 5
        
        oof_predictions[f'{name}_pred'] = oof_pred
        test_predictions[f'{name}_pred'] = test_pred
        
        score = smape(y_train, oof_pred)
        print(f"{name} SMAPE: {score:.4f}")
    
    # Level 1 앙상블 점수
    ensemble_pred = oof_predictions.mean(axis=1)
    ensemble_score = smape(y_train, ensemble_pred)
    print(f"Level 1 앙상블 SMAPE: {ensemble_score:.4f}")
    
    # 2. Level 2: XGBoost + LightGBM 메타모델
    print("\n=== Level 2: 메타모델 ===")
    
    # 메타피쳐 생성 (간단하게)
    meta_features_train = oof_predictions.copy()
    meta_features_train['pred_mean'] = oof_predictions.mean(axis=1)
    meta_features_train['pred_diff'] = oof_predictions['xgb_pred'] - oof_predictions['lgb_pred']
    
    meta_features_test = test_predictions.copy()
    meta_features_test['pred_mean'] = test_predictions.mean(axis=1)
    meta_features_test['pred_diff'] = test_predictions['xgb_pred'] - test_predictions['lgb_pred']
    
    # 기본 특성 추가 (있는 것만)
    basic_features = []
    for col in ['건물번호', 'hour', 'day', 'month']:
        if col in X_train.columns:
            basic_features.append(col)
    
    if basic_features:
        meta_features_train = pd.concat([meta_features_train, X_train[basic_features]], axis=1)
        meta_features_test = pd.concat([meta_features_test, X_test[basic_features]], axis=1)
    
    print(f"메타피쳐 수: {len(meta_features_train.columns)}")
    
    # 메타모델들 (XGBoost + LightGBM만)
    meta_models = {
        'xgb_meta': xgb.XGBRegressor(
            n_estimators=500, 
            max_depth=4, 
            learning_rate=0.1,
            subsample=0.9, 
            colsample_bytree=0.9,
            tree_method='gpu_hist', 
            gpu_id=0, 
            random_state=42, 
            verbosity=0
        ),
        'lgb_meta': lgb.LGBMRegressor(
            n_estimators=500, 
            max_depth=4, 
            learning_rate=0.1,
            subsample=0.9, 
            colsample_bytree=0.9,
            device='gpu', 
            random_state=42, 
            verbose=-1
        )
    }
    
    # 메타모델 학습 및 예측
    meta_oof = pd.DataFrame(index=X_train.index)
    meta_test = pd.DataFrame()
    
    # 스케일링
    scaler = StandardScaler()
    meta_features_train_scaled = pd.DataFrame(
        scaler.fit_transform(meta_features_train), 
        columns=meta_features_train.columns,
        index=meta_features_train.index
    )
    meta_features_test_scaled = pd.DataFrame(
        scaler.transform(meta_features_test),
        columns=meta_features_test.columns, 
        index=meta_features_test.index
    )
    
    for name, meta_model in meta_models.items():
        print(f"Learning meta model {name}...")
        meta_oof_pred = np.zeros(len(meta_features_train_scaled))
        meta_test_pred = np.zeros(len(meta_features_test_scaled))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(meta_features_train_scaled)):
            X_tr = meta_features_train_scaled.iloc[train_idx]
            X_val = meta_features_train_scaled.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            if 'xgb' in name:
                meta_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            else:  # lgb
                meta_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.log_evaluation(0)])
            
            meta_oof_pred[val_idx] = meta_model.predict(X_val)
            meta_test_pred += meta_model.predict(meta_features_test_scaled) / 5
        
        meta_oof[f'{name}_pred'] = meta_oof_pred
        meta_test[f'{name}_pred'] = meta_test_pred
        
        score = smape(y_train, meta_oof_pred)
        print(f"{name} meta SMAPE: {score:.4f}")
    
    # 3. 최종 앙상블 (성능 기반 가중치)
    print("\n=== 최종 앙상블 ===")
    
    # 각 메타모델 성능 수집
    meta_scores = {}
    for col in meta_oof.columns:
        score = smape(y_train, meta_oof[col])
        meta_scores[col] = score
        print(f"{col} SMAPE: {score:.4f}")
    
    # 성능 기반 가중치 계산
    scores_array = np.array(list(meta_scores.values()))
    inverse_scores = 1 / (scores_array + 1e-8)
    weights = inverse_scores / inverse_scores.sum()
    
    print(f"동적 가중치: {dict(zip(meta_scores.keys(), weights))}")
    
    final_oof = (meta_oof * weights).sum(axis=1)
    final_test = (meta_test * weights).sum(axis=1)
    
    final_score = smape(y_train, final_oof)
    print(f"최종 앙상블 SMAPE: {final_score:.4f}")
    
    # Level 1 앙상블과 비교
    if final_score > ensemble_score:
        print(f"Level 1 앙상블이 더 좋음. Level 1 사용")
        final_test = test_predictions.mean(axis=1)
        final_score = ensemble_score
    
    # 4. 최종 예측값 생성
    print("\n=== 최종 예측값 생성 ===")
    
    # 음수 예측값 제거
    final_predictions = [max(0, pred) for pred in final_test.tolist()]
    
    # 5. 제출파일 생성
    print("\n=== 제출파일 생성 ===")
    
    # 제출 DataFrame 생성
    submission_df = pd.DataFrame({
        'num_date_time': test_df['num_date_time'],
        'answer': final_predictions
    })
    
    # 파일 저장
    os.makedirs('../submission', exist_ok=True)
    now = datetime.now()
    filename = f"{now.strftime('%Y%m%d_%H%M')}_XGB_LGB_Simple_SMAPE_{final_score:.4f}.csv"
    filepath = os.path.join('../submission', filename)
    
    submission_df.to_csv(filepath, index=False)
    
    print(f"\n=== 최종 결과 ===")
    print(f"Level 1 앙상블 SMAPE: {ensemble_score:.4f}")
    print(f"Level 2 메타모델 SMAPE: {final_score:.4f}")
    print(f"제출파일 저장: {filepath}")
    
    return {
        'submission_df': submission_df,
        'final_score': final_score,
        'level1_models': level1_models,
        'meta_models': meta_models,
        'filepath': filepath
    }

# 사용법:
# result = main_prediction(X_train, y_train, X_test, test_df)
# submission = result['submission_df']
# print(f"최종 SMAPE: {result['final_score']:.4f}")

In [26]:
# 바로 실행
result = main_prediction(X_train, y_train, X_test, test_df)

# 결과 확인
print(f"최종 SMAPE: {result['final_score']:.4f}")
submission = result['submission_df']

XGBoost + LightGBM 간단 앙상블 파이프라인 시작...
=== GPU 사용 확인 ===
XGBoost version: 3.0.2
LightGBM version: 4.6.0
CUDA available: NVIDIA GeForce GTX 1650
Train shape: (204000, 40), Test shape: (16800, 40)

=== Level 1: XGBoost + LightGBM ===
Learning xgb...
xgb SMAPE: 4.6531
Learning lgb...
lgb SMAPE: 5.9317
Level 1 앙상블 SMAPE: 5.0575

=== Level 2: 메타모델 ===
메타피쳐 수: 8
Learning meta model xgb_meta...
xgb_meta meta SMAPE: 4.5391
Learning meta model lgb_meta...
lgb_meta meta SMAPE: 4.4418

=== 최종 앙상블 ===
xgb_meta_pred SMAPE: 4.5391
lgb_meta_pred SMAPE: 4.4418
동적 가중치: {'xgb_meta_pred': np.float64(0.4945851637801683), 'lgb_meta_pred': np.float64(0.5054148362198316)}
최종 앙상블 SMAPE: 4.4490

=== 최종 예측값 생성 ===

=== 제출파일 생성 ===

=== 최종 결과 ===
Level 1 앙상블 SMAPE: 5.0575
Level 2 메타모델 SMAPE: 4.4490
제출파일 저장: ../submission\20250725_0928_XGB_LGB_Simple_SMAPE_4.4490.csv
최종 SMAPE: 4.4490


In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb

# SMAPE 계산 함수
def smape(y_true, y_pred):
    """SMAPE (Symmetric Mean Absolute Percentage Error) 계산"""
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

def time_aware_preprocessing(train_df, test_df, building):
    """시간 순서를 지키는 전처리 - 개선된 버전"""
    print("=== 시간 순서 준수 전처리 (개선) ===")
    
    # 대회 기준 시점: 2024년 8월 24일 23:59:59
    cutoff_date = pd.to_datetime('2024-08-24 23:59:59')
    
    # 시간 컬럼 변환
    train_df['일시'] = pd.to_datetime(train_df['일시'])
    test_df['일시'] = pd.to_datetime(test_df['일시'])
    
    # 기준 시점 이후 데이터 체크
    future_data_in_train = train_df[train_df['일시'] > cutoff_date]
    if len(future_data_in_train) > 0:
        print(f"⚠️ Train에서 미래 데이터 {len(future_data_in_train)}개 발견 - 제거")
        train_df = train_df[train_df['일시'] <= cutoff_date]
    
    print(f"Train 데이터 기간: {train_df['일시'].min()} ~ {train_df['일시'].max()}")
    print(f"Test 데이터 기간: {test_df['일시'].min()} ~ {test_df['일시'].max()}")
    
    # 시간 피처 생성 (더 풍부하게)
    for df in [train_df, test_df]:
        df['hour'] = df['일시'].dt.hour
        df['day'] = df['일시'].dt.weekday
        df['month'] = df['일시'].dt.month
        df['week'] = df['일시'].dt.isocalendar().week
        df['day_of_month'] = df['일시'].dt.day
        
        # 순환 피처 (더 다양하게)
        df['sin_hour'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['cos_hour'] = np.cos(2 * np.pi * df['hour'] / 24)
        df['sin_day'] = np.sin(2 * np.pi * df['day'] / 7)
        df['cos_day'] = np.cos(2 * np.pi * df['day'] / 7)
        df['sin_month'] = np.sin(2 * np.pi * df['month'] / 12)
        df['cos_month'] = np.cos(2 * np.pi * df['month'] / 12)
        df['sin_week'] = np.sin(2 * np.pi * df['week'] / 52)
        df['cos_week'] = np.cos(2 * np.pi * df['week'] / 52)
        
        # 시간대 구분 (더 세밀하게)
        df['is_weekend'] = (df['day'] >= 5).astype(int)
        df['is_morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype(int)
        df['is_afternoon'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype(int)
        df['is_evening'] = ((df['hour'] >= 18) & (df['hour'] < 24)).astype(int)
        df['is_night'] = ((df['hour'] >= 0) & (df['hour'] < 6)).astype(int)
        
        # 계절성
        df['is_summer'] = ((df['month'] >= 6) & (df['month'] <= 8)).astype(int)
        df['is_winter'] = ((df['month'] == 12) | (df['month'] <= 2)).astype(int)
        
        # 업무시간
        df['is_business_hour'] = ((df['hour'] >= 9) & (df['hour'] <= 18) & (df['day'] < 5)).astype(int)
    
    # 날씨 파생 피처 (규칙 준수)
    for df in [train_df, test_df]:
        if '기온(C)' in df.columns:
            # 불쾌지수 (THI)
            if '습도(%)' in df.columns:
                df['THI'] = 9/5*df['기온(C)'] - 0.55*(1-df['습도(%)']/100)*(9/5*df['기온(C)']-26)+32
                df['THI_high'] = (df['THI'] > 80).astype(int)
                df['THI_low'] = (df['THI'] < 68).astype(int)
            
            # 온도 구간
            df['temp_very_hot'] = (df['기온(C)'] > 30).astype(int)
            df['temp_hot'] = ((df['기온(C)'] > 25) & (df['기온(C)'] <= 30)).astype(int)
            df['temp_moderate'] = ((df['기온(C)'] > 15) & (df['기온(C)'] <= 25)).astype(int)
            df['temp_cold'] = (df['기온(C)'] <= 15).astype(int)
        
        if '습도(%)' in df.columns:
            df['humidity_high'] = (df['습도(%)'] > 70).astype(int)
            df['humidity_low'] = (df['습도(%)'] < 40).astype(int)
    
    # 건물정보 처리 (더 세밀하게)
    building_cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
    for col in building_cols:
        building[col] = building[col].apply(lambda x: 0 if x == '-' else float(x))
    
    # 건물 특성 파생변수
    building['has_solar'] = (building['태양광용량(kW)'] > 0).astype(int)
    building['has_ess'] = (building['ESS저장용량(kWh)'] > 0).astype(int)
    building['has_pcs'] = (building['PCS용량(kW)'] > 0).astype(int)
    building['energy_self_sufficient'] = (building['has_solar'] & building['has_ess']).astype(int)
    
    # 건물 크기 구간
    building['building_size'] = pd.qcut(building['연면적(m2)'], q=5, labels=[0,1,2,3,4]).astype(int)
    building['cooling_ratio'] = building['냉방면적(m2)'] / building['연면적(m2)']
    
    # 건물유형 인코딩
    building_types = building['건물유형'].unique()
    building_type_map = {bt: i for i, bt in enumerate(building_types)}
    building['건물유형_encoded'] = building['건물유형'].map(building_type_map)
    
    # 건물정보 병합
    train_df = train_df.merge(building, on='건물번호', how='left')
    test_df = test_df.merge(building, on='건물번호', how='left')
    
    # 규칙 준수 통계 피처 (Train 데이터만 사용, 시점 이전)
    print("=== 규칙 준수 통계 피처 생성 ===")
    
    # 건물별 기본 통계 (전체 Train 기간)
    building_stats = train_df.groupby('건물번호')['전력소비량(kWh)'].agg([
        'mean', 'std', 'min', 'max', 'median'
    ]).reset_index()
    building_stats.columns = ['건물번호', 'building_power_mean', 'building_power_std', 
                             'building_power_min', 'building_power_max', 'building_power_median']
    
    # 건물별 시간대 통계
    hourly_stats = train_df.groupby(['건물번호', 'hour'])['전력소비량(kWh)'].mean().reset_index()
    hourly_stats.columns = ['건물번호', 'hour', 'building_hour_mean']
    
    # 건물별 요일 통계
    daily_stats = train_df.groupby(['건물번호', 'day'])['전력소비량(kWh)'].mean().reset_index()
    daily_stats.columns = ['건물번호', 'day', 'building_day_mean']
    
    # 통계 피처 병합
    for df in [train_df, test_df]:
        df = df.merge(building_stats, on='건물번호', how='left')
        df = df.merge(hourly_stats, on=['건물번호', 'hour'], how='left')
        df = df.merge(daily_stats, on=['건물번호', 'day'], how='left')
    
    # 전체 통계 (건물 상관없이)
    overall_hourly = train_df.groupby('hour')['전력소비량(kWh)'].mean().reset_index()
    overall_hourly.columns = ['hour', 'overall_hour_mean']
    
    overall_daily = train_df.groupby('day')['전력소비량(kWh)'].mean().reset_index()
    overall_daily.columns = ['day', 'overall_day_mean']
    
    for df in [train_df, test_df]:
        df = df.merge(overall_hourly, on='hour', how='left')
        df = df.merge(overall_daily, on='day', how='left')
    
    # 피처 선택
    feature_cols = [
        '건물번호', 'hour', 'day', 'month', 'week', 'day_of_month',
        'sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'sin_month', 'cos_month', 'sin_week', 'cos_week',
        'is_weekend', 'is_morning', 'is_afternoon', 'is_evening', 'is_night',
        'is_summer', 'is_winter', 'is_business_hour',
        '기온(C)', '습도(%)', 'THI', 'THI_high', 'THI_low',
        'temp_very_hot', 'temp_hot', 'temp_moderate', 'temp_cold',
        'humidity_high', 'humidity_low',
        '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)',
        'has_solar', 'has_ess', 'has_pcs', 'energy_self_sufficient',
        'building_size', 'cooling_ratio', '건물유형_encoded',
        'building_power_mean', 'building_power_std', 'building_power_min', 'building_power_max', 'building_power_median',
        'building_hour_mean', 'building_day_mean', 'overall_hour_mean', 'overall_day_mean'
    ]
    
    # 존재하는 컬럼만 선택
    available_cols = [col for col in feature_cols if col in train_df.columns]
    print(f"사용 가능 피처: {len(available_cols)}개")
    
    X_train = train_df[available_cols]
    y_train = train_df['전력소비량(kWh)']
    X_test = test_df[available_cols]
    
    # 결측치 처리
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)
    
    return X_train, y_train, X_test, test_df

def time_series_safe_prediction(X_train, y_train, X_test, test_df):
    """시계열 안전 예측 파이프라인"""
    
    print("시계열 안전 예측 파이프라인 시작...")
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    
    # TimeSeriesSplit 사용 (데이터 누수 방지)
    tscv = TimeSeriesSplit(n_splits=5)
    
    # XGBoost + LightGBM 모델
    models = {
        'xgb': xgb.XGBRegressor(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method='gpu_hist',
            gpu_id=0,
            random_state=42,
            verbosity=0
        ),
        'lgb': lgb.LGBMRegressor(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            device='gpu',
            random_state=42,
            verbose=-1
        )
    }
    
    # 시계열 교차검증으로 성능 측정
    print("\n=== 시계열 교차검증 ===")
    cv_scores = {}
    test_predictions = {}
    
    for name, model in models.items():
        print(f"Learning {name}...")
        cv_score_list = []
        test_pred_list = []
        
        for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            
            if name == 'xgb':
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            else:
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.log_evaluation(0)])
            
            val_pred = model.predict(X_val)
            cv_score = smape(y_val, val_pred)
            cv_score_list.append(cv_score)
            
            test_pred = model.predict(X_test)
            test_pred_list.append(test_pred)
        
        cv_scores[name] = np.mean(cv_score_list)
        test_predictions[name] = np.mean(test_pred_list, axis=0)
        
        print(f"{name} CV SMAPE: {cv_scores[name]:.4f} (±{np.std(cv_score_list):.4f})")
    
    # 단순 앙상블 (성능 기반 가중치)
    print("\n=== 앙상블 ===")
    
    # 성능이 좋은 모델에 더 높은 가중치
    scores = np.array(list(cv_scores.values()))
    weights = 1 / (scores + 1e-8)
    weights = weights / weights.sum()
    
    weight_dict = dict(zip(cv_scores.keys(), weights))
    print(f"앙상블 가중치: {weight_dict}")
    
    # 최종 예측
    final_predictions = sum(test_predictions[name] * weight for name, weight in weight_dict.items())
    
    # 음수 제거
    final_predictions = np.maximum(final_predictions, 0)
    
    # 앙상블 CV 점수 (추정)
    ensemble_score = sum(cv_scores[name] * weight for name, weight in weight_dict.items())
    print(f"앙상블 예상 SMAPE: {ensemble_score:.4f}")
    
    # 제출파일 생성
    print("\n=== 제출파일 생성 ===")
    
    submission_df = pd.DataFrame({
        'num_date_time': test_df['num_date_time'],
        'answer': final_predictions
    })
    
    # 파일 저장
    os.makedirs('../submission', exist_ok=True)
    now = datetime.now()
    filename = f"{now.strftime('%Y%m%d_%H%M')}_TimeSafe_SMAPE_{ensemble_score:.4f}.csv"
    filepath = os.path.join('../submission', filename)
    
    submission_df.to_csv(filepath, index=False)
    
    print(f"\n=== 최종 결과 ===")
    print(f"시계열 안전 SMAPE: {ensemble_score:.4f}")
    print(f"제출파일 저장: {filepath}")
    print(f"데이터 누수 없음 ✅")
    
    return {
        'submission_df': submission_df,
        'final_score': ensemble_score,
        'cv_scores': cv_scores,
        'filepath': filepath
    }

# 사용법
def safe_main_prediction(train_df, test_df, building):
    """데이터 누수 없는 안전한 예측"""
    
    # 1. 시간 순서를 지키는 전처리
    X_train, y_train, X_test, test_df = time_aware_preprocessing(train_df, test_df, building)
    
    # 2. 시계열 안전 예측
    result = time_series_safe_prediction(X_train, y_train, X_test, test_df)
    
    return result

# 사용법:
# result = safe_main_prediction(train_df, test_df, building)
# submission = result['submission_df']
# print(f"안전한 SMAPE: {result['final_score']:.4f}")

In [None]:
# 개선된 버전 실행
result = safe_main_prediction(train_df, test_df, building)
print(f"개선된 SMAPE: {result['final_score']:.4f}")

=== 시간 순서 준수 전처리 ===
Train 데이터 기간: 2024-06-01 00:00:00 ~ 2024-08-24 23:00:00
Test 데이터 기간: 2024-08-25 00:00:00 ~ 2024-08-31 23:00:00
사용 가능 피처: 19개
시계열 안전 예측 파이프라인 시작...
Train shape: (204000, 19), Test shape: (16800, 19)

=== 시계열 교차검증 ===
Learning xgb...
