# XGB Regressor + Tab_MLP

In [None]:
from scipy.sparse import load_npz
import numpy as np
import pyarrow as pa, pyarrow.parquet as pq
import pandas as pd
import xgboost as xgb
import os, math, gc, time
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pyarrow as pa, pyarrow.parquet as pq

# ============================================================
# Colab: Tab-MLP 학습 → test_itp 예측 → CSV 생성 → Google Drive 저장
# ============================================================

!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# ========== 0) 구글 드라이브 마운트 & 출력 경로 ==========
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

PROJECT_DIR = "/content/drive/MyDrive"
os.makedirs(PROJECT_DIR, exist_ok=True)

RUN_TAG = time.strftime("%Y%m%d_%H%M%S")  # 파일명에 붙일 타임스탬프

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cudnn.benchmark = True
print("DEVICE:", DEVICE)

# 최종 사용할 Feature (20개)
final_feats = [
    'humidity', 'wind_gust_spd', 'hour', 'doy', 'wind_spd_b',
    'ceiling', 'uv_idx', 'appr_temp', 'uv_cloud_adj', 'dow',
    'hour_sin', 'doy_sin', 'is_rain', 'rain', 'hour_cos',
    'doy_cos', 'snow', 'coord1', 'coord2', 'haze'
]

target_col = "nins"

# 보간 파라미터
MAX_GAP = 12  # 연속 결손 허용 길이
DAY_HOURS = (6, 18)  # 주간
print("Final feature count:", len(final_feats))

Mounted at /content/drive
DEVICE: cuda
Final feature count: 20


## XGB 예측값 불러오기

In [None]:
# Load the train / valid sets written by preprocessing_train.py.
# Per the original author's note, train and valid are split by station so that
# the time-series data does not leak across the two sets.
train, valid = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('X_train_t.parquet', 'X_valid_t.parquet')
)

# Separate the model inputs from the label column
X_train, X_valid = train[final_feats], valid[final_feats]
y_train, y_valid = train[target_col].values, valid[target_col].values

In [None]:
# Load the final, already-trained XGB model from disk
xgb_model_path = "xgb_full_final.json"

xgb_reg = xgb.XGBRegressor()
xgb_reg.load_model(xgb_model_path)

# Runtime settings applied on top of the loaded model
xgb_runtime_params = {
    "tree_method": "hist",
    "device": "cuda",
    "n_jobs": -1,
}
xgb_reg.set_params(**xgb_runtime_params)

def xgb_predict(df):
    """Predict with the module-level ``xgb_reg`` model.

    When the model exposes a ``best_iteration``, prediction is limited to the
    iterations ``[0, best_iteration + 1)``; otherwise ``predict`` is called
    without an iteration range.
    """
    best_it = getattr(xgb_reg, "best_iteration", None)
    if best_it is None:
        return xgb_reg.predict(df)
    return xgb_reg.predict(df, iteration_range=(0, best_it + 1))


In [None]:
# 1) XGB predictions for the train and valid splits, cast to float32
y_pred_tr_xgb, y_pred_va_xgb = (
    xgb_predict(split_features).astype(np.float32)
    for split_features in (X_train, X_valid)
)

# 2) Residual target for the NN: y - y_hat_xgb, shaped as a column vector (N, 1)
res_tr, res_va = (
    (y_true - y_hat).astype(np.float32).reshape(-1, 1)
    for y_true, y_hat in ((y_train, y_pred_tr_xgb), (y_valid, y_pred_va_xgb))
)

print("MAE XGB train :", mean_absolute_error(y_train, y_pred_tr_xgb))
print("MAE XGB valid :", mean_absolute_error(y_valid, y_pred_va_xgb))

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


MAE XGB train : 30.09756088256836
MAE XGB valid : 30.45993423461914


In [None]:
# NN inputs = original features plus the XGB prediction as one extra column.
# `assign` returns a new frame, so X_train / X_valid are left untouched.
X_train_nn = X_train.assign(xgb_pred=y_pred_tr_xgb)
X_valid_nn = X_valid.assign(xgb_pred=y_pred_va_xgb)

# Column order used by the NN (features followed by "xgb_pred")
feature_cols_nn = X_train_nn.columns.tolist()

## MLP modeling

data loader & 결측 확인

In [None]:
# 1) Scaling: statistics are fitted on the train matrix only and reused for valid
train_matrix = X_train_nn.values.astype(np.float32)
valid_matrix = X_valid_nn.values.astype(np.float32)

scaler = StandardScaler().fit(train_matrix)
X_train_np = scaler.transform(train_matrix)
X_valid_np = scaler.transform(valid_matrix)

class ArrayDataset(Dataset):
    """Dataset that serves rows of two NumPy arrays as ``(X[i], y[i])`` tensor pairs.

    ``X`` is wrapped with ``torch.from_numpy`` as-is (its dtype is kept);
    ``y`` is cast to float32 before being wrapped.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        targets = torch.from_numpy(y.astype(np.float32))
        self.X, self.y = features, targets

    def __len__(self):
        # Number of rows is taken from the feature tensor
        return self.X.size(0)

    def __getitem__(self, i):
        return (self.X[i], self.y[i])

# Datasets pair the scaled features with the residual targets
train_ds, valid_ds = (
    ArrayDataset(X_train_np, res_tr),
    ArrayDataset(X_valid_np, res_va),
)

# Training loader: shuffled, incomplete last batch dropped
train_loader = DataLoader(
    train_ds,
    batch_size=8192,
    shuffle=True,
    drop_last=True,
    num_workers=2,
    pin_memory=True,
)

# Validation loader: fixed order, every sample kept
valid_loader = DataLoader(
    valid_ds,
    batch_size=16384,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

In [None]:
# Check every NN input / target array for NaN and Inf values
for nan_label, checked_array in (
    ("X_train_np   NaN:", X_train_np),
    ("X_valid_np   NaN:", X_valid_np),
    ("res_tr       NaN:", res_tr),
    ("res_va       NaN:", res_va),
):
    print(nan_label, np.isnan(checked_array).any(), "Inf:", np.isinf(checked_array).any())

X_train_np   NaN: False Inf: False
X_valid_np   NaN: False Inf: False
res_tr       NaN: False Inf: False
res_va       NaN: False Inf: False


MLP class definition

In [None]:
# Residual Block
class ResidualBlock(nn.Module):
    """Two-layer MLP block with a skip connection.

    Computes ``lin2(dropout(gelu(lin1(x)))) + x``; both linear layers map
    ``d -> d`` so the input and output widths match.

    Args:
        d: feature width of the block.
        p: dropout probability applied after the activation.
    """

    def __init__(self, d, p=0.15):
        super().__init__()
        self.lin1 = nn.Linear(d, d)
        self.act = nn.GELU()
        self.drop = nn.Dropout(p)
        self.lin2 = nn.Linear(d, d)

    def forward(self, x):
        transformed = self.lin2(self.drop(self.act(self.lin1(x))))
        # Skip connection: add the untouched input back
        return transformed + x


# TabMLP_Attn
class TabMLP_Attn(nn.Module):
    """Tabular MLP + multi-head self-attention hybrid.

    Pipeline: BatchNorm -> input dropout -> Linear -> residual MLP blocks ->
    projection into ``n_tokens`` tokens -> LayerNorm -> self-attention ->
    LayerNorm -> mean over tokens -> GELU + Linear head with one output.

    Args:
        in_dim: number of input features.
        hidden: latent width of the MLP and the attention embedding size.
        blocks: number of stacked ``ResidualBlock`` modules.
        p: dropout probability inside each residual block.
        n_heads: number of heads of ``nn.MultiheadAttention``.
        n_tokens: number of tokens the latent vector is expanded into.
        in_drop: dropout probability applied to the normalised input.
        attn_drop: dropout probability passed to the attention module.
    """

    def __init__(self, in_dim, hidden=384, blocks=4, p=0.15, n_heads=4, n_tokens=8, in_drop=0.05, attn_drop=0.1):
        super().__init__()

        self.in_dim, self.hidden, self.n_tokens = in_dim, hidden, n_tokens

        # 1) Input normalisation, input dropout and the first linear layer
        self.in_bn = nn.BatchNorm1d(in_dim)
        self.in_drop = nn.Dropout(in_drop)
        self.fc_in = nn.Linear(in_dim, hidden)

        # 2) Residual MLP: `blocks` ResidualBlock modules applied in sequence
        residual_stack = [ResidualBlock(hidden, p=p) for _ in range(blocks)]
        self.blocks = nn.Sequential(*residual_stack)

        # 3) Latent -> tokens: (B, hidden) is projected to hidden * n_tokens values
        self.token_proj = nn.Linear(hidden, hidden * n_tokens)

        # 4) Attention block; batch_first=True means inputs are (B, seq, dim)
        self.attn_norm = nn.LayerNorm(hidden)
        self.attn = nn.MultiheadAttention(
            embed_dim=hidden,
            num_heads=n_heads,
            dropout=attn_drop,
            batch_first=True,
        )

        # 5) Post-attention normalisation and the final regression head
        self.post_attn_norm = nn.LayerNorm(hidden)
        self.head = nn.Sequential(nn.GELU(), nn.Linear(hidden, 1))

    def forward(self, x):
        # x: (B, in_dim) -> latent: (B, hidden)
        latent = self.blocks(self.fc_in(self.in_drop(self.in_bn(x))))

        # Expand the latent vector into a token sequence: (B, T, H)
        batch_size = latent.size(0)
        tokens = self.token_proj(latent).view(batch_size, self.n_tokens, self.hidden)

        # Self-attention over the normalised tokens (query = key = value)
        normed_tokens = self.attn_norm(tokens)
        attended, _attn_weights = self.attn(normed_tokens, normed_tokens, normed_tokens)

        # Normalise, average over the token axis, then map to one output: (B, 1)
        pooled = self.post_attn_norm(attended).mean(dim=1)
        return self.head(pooled)


In [None]:
# Hyperparameter definitions
hidden, blocks, p = 384, 3, 0.2      # original note: tune hidden within 256~512
n_heads, n_tokens = 4, 8
in_drop, attn_drop = 0.05, 0.1

# Input width = number of columns of the scaled training matrix
in_dim = X_train_np.shape[1]

model_kwargs = dict(
    in_dim=in_dim,
    hidden=hidden,
    blocks=blocks,
    p=p,
    n_heads=n_heads,
    n_tokens=n_tokens,
    in_drop=in_drop,
    attn_drop=attn_drop,
)
model = TabMLP_Attn(**model_kwargs).to(DEVICE)


In [None]:
# Check that no model parameter contains NaN or Inf values.
# The loop variable is deliberately not called `p`: that name holds the dropout
# hyperparameter defined in the model-construction cell and must not be overwritten.
for param_name, param in model.named_parameters():
    if torch.isnan(param).any() or torch.isinf(param).any():
        print("!! NaN/Inf in param:", param_name)

model train & valid

In [None]:
# Train the residual model with early stopping on the validation loss.
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=3e-4
)
best_val = float("inf")
patience = 2
bad = 0
best_state = None  # filled with a snapshot the first time validation improves

# AMP is disabled for training and validation
use_amp = False
scaler_amp = torch.amp.GradScaler(device="cuda", enabled=False)

for epoch in range(30):
    model.train()
    train_loss = 0.0
    n_train_seen = 0
    for xb, yb in train_loader:
        xb = xb.to(DEVICE).float()
        yb = yb.to(DEVICE).float()

        optimizer.zero_grad()
        with torch.amp.autocast("cuda", enabled=False):   # AMP fully off
            pred_res = model(xb)
            loss = criterion(pred_res, yb)

        loss.backward()
        optimizer.step()

        train_loss += loss.item() * xb.size(0)
        n_train_seen += xb.size(0)
    # Normalise by the samples actually seen: the train loader uses drop_last=True,
    # so dividing by len(train_ds) would under-report the mean loss.
    train_loss /= max(n_train_seen, 1)

    # Validation, also with AMP off
    model.eval()
    val_loss = 0.0
    n_valid_seen = 0
    with torch.no_grad(), torch.amp.autocast("cuda", enabled=False):
        for xb, yb in valid_loader:
            xb = xb.to(DEVICE).float()
            yb = yb.to(DEVICE).float()
            pred_res = model(xb)
            loss = criterion(pred_res, yb)
            val_loss += loss.item() * xb.size(0)
            n_valid_seen += xb.size(0)
    val_loss /= max(n_valid_seen, 1)

    print(f"[{epoch}] train={train_loss:.4f}, valid={val_loss:.4f}")

    if val_loss < best_val - 1e-4:
        best_val = val_loss
        bad = 0
        best_state = {
            # state_dict() returns references to the live parameters, so the tensors
            # are cloned; otherwise later epochs would overwrite the "best" weights.
            "model": {
                key: tensor.detach().cpu().clone()
                for key, tensor in model.state_dict().items()
            },
            "scaler_mean": scaler.mean_.copy(),
            "scaler_scale": scaler.scale_.copy(),
            "feature_cols_nn": list(feature_cols_nn),
        }
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping")
            break

if best_state is None:
    # Only reachable when the validation loss was never finite / never improved
    raise RuntimeError("Validation loss never improved; no best model state was recorded.")

# Restore the best weights so that the following inference cells use the best epoch,
# not whatever the last (possibly worse) epoch left in the model.
model.load_state_dict(best_state["model"])
model.eval()


[0] train=28.6217, valid=29.1141
[1] train=28.3529, valid=29.0209
[2] train=28.2144, valid=29.0090


In [None]:
# Build the NN input for the validation split: features + XGB prediction,
# aligned to the column order stored in best_state (missing columns become 0)
X_valid_nn_pred = X_valid.assign(xgb_pred=y_pred_va_xgb).reindex(
    columns=best_state["feature_cols_nn"], fill_value=0
)

# Rebuild a StandardScaler from the saved statistics instead of refitting
saved_scale = best_state["scaler_scale"]
scaler_inf = StandardScaler()
scaler_inf.mean_ = best_state["scaler_mean"]
scaler_inf.scale_ = saved_scale
scaler_inf.var_ = saved_scale ** 2
scaler_inf.n_features_in_ = len(best_state["feature_cols_nn"])

X_valid_np_pred = scaler_inf.transform(X_valid_nn_pred.values.astype(np.float32))

def predict_residual_np(X_np, batch=32768):
    """Run the module-level ``model`` over ``X_np`` in batches and return a 1-D array.

    Args:
        X_np: 2-D NumPy array of already-scaled NN inputs, one row per sample.
        batch: number of rows sent through the model per forward pass.

    Returns:
        1-D float32 NumPy array with one prediction per input row
        (an empty array when ``X_np`` has no rows).
    """
    preds = []
    # Inference must not use dropout / batch statistics; remember the previous
    # mode so the caller's model state is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad(), torch.amp.autocast("cuda", enabled=(DEVICE == "cuda")):
            for start in range(0, len(X_np), batch):
                xb = torch.from_numpy(X_np[start:start + batch]).to(DEVICE)
                # reshape(-1) keeps a 1-D result even for a batch of one row;
                # a bare squeeze() would yield a 0-d array and break concatenate.
                out = model(xb).float().reshape(-1)
                preds.append(out.cpu().numpy())
    finally:
        model.train(was_training)

    # Release cached GPU blocks once, rather than after every batch
    if DEVICE == "cuda":
        torch.cuda.empty_cache()

    if not preds:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(preds, axis=0)

# NN residual predictions for the validation split, shape (N_va,)
res_va_pred = predict_residual_np(X_valid_np_pred)

# Baseline: XGB alone
mae_xgb = mean_absolute_error(y_valid, y_pred_va_xgb)

# Stacking: XGB prediction + NN residual, clipped to the range [0, 1200]
y_valid_final = np.clip(y_pred_va_xgb + res_va_pred, 0, 1200)
mae_stack = mean_absolute_error(y_valid, y_valid_final)

print("MAE XGB      :", mae_xgb)
print("MAE Stacking :", mae_stack)

# test set 예측 후 CSV 생성

In [None]:

# ========== 1) Load the data ==========
# test_itp: the interpolated test dataset, read here from a Parquet file
TEST_ITP_PATH = "test_itp.parquet"
df_test = pd.read_parquet(TEST_ITP_PATH)

# Meta columns for the submission format: keep only those present in the file
meta_cols = [name for name in ("time", "pv_id", "type") if name in df_test.columns]
if meta_cols:
    sub_meta = df_test[meta_cols].copy()
else:
    sub_meta = pd.DataFrame()

# Model input columns: same names and order as the training features
feature_cols = final_feats

# Align test_itp to the training column order (absent columns are filled with 0),
# then coerce everything to numeric; values that fail to parse become 0
X_test = (
    df_test
    .reindex(columns=feature_cols, fill_value=0)
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

In [None]:
# 1) XGB prediction on the test features
y_pred_te_xgb = xgb_predict(X_test).astype(np.float32)

# 2) NN input = test features + XGB prediction, aligned to the stored column order
X_test_nn = X_test.assign(xgb_pred=y_pred_te_xgb).reindex(
    columns=best_state["feature_cols_nn"], fill_value=0
)
X_test_np = scaler_inf.transform(X_test_nn.values.astype(np.float32))

# 3) Residual predicted by the NN
res_te_pred = predict_residual_np(X_test_np)

# 4) Final prediction = XGB + residual, clipped to the range [0, 1200]
y_pred_final = np.clip(y_pred_te_xgb + res_te_pred, 0, 1200)


In [None]:
# ------------------------------------------
# 2) Attach the final predictions to the meta columns
# ------------------------------------------
submission = sub_meta.assign(nins=y_pred_final.astype(np.float32))

# Enforce the submission column order
submission = submission.loc[:, ['time', 'pv_id', 'type', 'nins']]

# ------------------------------------------
# 3) Save as CSV (current working directory)
# ------------------------------------------
save_path = "submission_XGBMLP.csv"
submission.to_csv(save_path, encoding='utf-8', index=False)
print("파일 저장됨:", save_path)


In [None]:
import shutil

# Copy the submission CSV from the working directory to Google Drive
drive_path = "/content/drive/MyDrive/OIBC/submission_XGBMLP.csv"

# Only the Drive root is created earlier in the notebook; make sure the target
# folder exists, otherwise shutil.copy raises FileNotFoundError.
os.makedirs(os.path.dirname(drive_path), exist_ok=True)

shutil.copy("submission_XGBMLP.csv", drive_path)
print("Google Drive로 저장 완료:", drive_path)
