# Iteration 0: Setup & Data Download

GPU-accelerated forecasting with CuPy and Polars.

In [None]:
import cupy as np
import sys
import os
import subprocess
import zipfile

# Check GPU
# Best-effort probe: any failure is reported on stdout instead of aborting.
try:
    # Print the interpreter path before touching the GPU runtime.
    print(f"Python: {sys.executable}")
    devs = np.cuda.runtime.getDeviceCount()
    cupy_version = np.__version__
    print(f"✓ GPU: {devs} device(s), CuPy {cupy_version}")
except Exception as e:
    print(f"✗ GPU Error: {e}")

# Download Data
def download_data(comp="ts-forecasting"):
    """Download and unpack a Kaggle competition archive into ``data/``.

    Returns immediately when ``data/train.parquet`` already exists. Otherwise
    runs ``kaggle competitions download -c <comp>``, extracts ``<comp>.zip``
    into ``data/`` and removes the archive.

    Credentials are intentionally not set here: the ``kaggle`` CLI runs with
    the caller's inherited environment, so authentication must be configured
    outside the notebook (e.g. ``KAGGLE_USERNAME``/``KAGGLE_KEY`` environment
    variables or the CLI's own config file).

    NOTE: an API key used to be hard-coded in this function. It must be
    treated as leaked and revoked; never commit credentials to source.

    Args:
        comp: Kaggle competition slug; also the stem of the downloaded zip.

    Returns:
        None. Failures are printed rather than raised.
    """
    if os.path.exists("data/train.parquet"):
        print("✓ Data exists.")
        return
    os.makedirs("data", exist_ok=True)
    archive = f"{comp}.zip"
    try:
        subprocess.run(["kaggle", "competitions", "download", "-c", comp], check=True)
        with zipfile.ZipFile(archive, "r") as z:
            z.extractall("data")
        os.remove(archive)
        print("✓ Downloaded.")
    except (subprocess.CalledProcessError, zipfile.BadZipFile, OSError) as e:
        # OSError also covers a missing `kaggle` executable or archive
        # (FileNotFoundError). KeyboardInterrupt/SystemExit still propagate.
        print(f"✗ Download failed: {e}")

# Fetch the competition data now; this returns immediately when
# data/train.parquet is already present.
download_data()

# Iteration 1: Imports & Utilities

In [None]:
import polars as pl
import warnings
import os
import gc
import lightgbm as lgb
import cupy as np
import numpy as np_cpu
from typing import Tuple, List, Dict

# Silence every Python warning from here on (applies process-wide).
warnings.filterwarnings("ignore")

def clear_memory():
    """Run the garbage collector and release CuPy's cached memory blocks.

    ``np`` is CuPy in this file. Releasing the default memory pool is
    best-effort: if it fails, the error is ignored so that a cleanup call
    never interrupts the pipeline.
    """
    gc.collect()
    try:
        np.get_default_memory_pool().free_all_blocks()
    except Exception:
        # Deliberately swallowed (cleanup only). Unlike a bare ``except``,
        # this lets KeyboardInterrupt/SystemExit propagate.
        pass

def gpu_to_cpu(x):
    """Return a host-side (NumPy) view/copy of ``x``.

    Args:
        x: ``None``, a Python/NumPy scalar, an object exposing a no-argument
            ``get()`` method (as CuPy arrays do), or anything accepted by
            ``numpy.asarray``.

    Returns:
        ``None`` for ``None``; scalars unchanged; ``x.get()`` when that call
        succeeds; otherwise ``numpy.asarray(x)``.
    """
    if x is None:
        return None
    if isinstance(x, (float, int, np_cpu.generic)):
        return x
    try:
        if hasattr(x, "get"):
            return x.get()
    except Exception:
        # Objects such as dicts have a ``get`` that requires arguments; fall
        # back to a plain NumPy conversion. KeyboardInterrupt/SystemExit are
        # no longer swallowed as they were by the former bare ``except``.
        pass
    return np_cpu.asarray(x)

def cpu_to_gpu(x):
    """Convert ``x`` with ``np.asarray`` (``np`` is CuPy in this file).

    ``None`` is passed through unchanged.
    """
    if x is None:
        return None
    return np.asarray(x)

def weighted_rmse_score(y_true, y_pred, weights) -> float:
    """Return ``1 - sqrt(sum(w*(y-p)^2) / (sum(w*y^2) + 1e-8))`` as a float.

    All three inputs are converted with ``np.asarray`` (``np`` is CuPy in
    this file); the scalar result is brought back to the host through
    ``gpu_to_cpu`` before the ``float`` conversion.
    """
    truth = np.asarray(y_true)
    pred = np.asarray(y_pred)
    w = np.asarray(weights)

    weighted_sq_err = np.sum(w * (truth - pred)**2)
    weighted_sq_truth = np.sum(w * truth**2)
    # The 1e-8 term keeps the ratio finite when the weighted target energy is 0.
    ratio = weighted_sq_err / (weighted_sq_truth + 1e-8)
    return float(gpu_to_cpu(1 - np.sqrt(ratio)))

def fast_eval(df_tr, df_va, feats, target="feature_ch", weight="feature_cg"):
    """Fit a small LightGBM regressor and score it on the validation frame.

    Feature nulls are filled with 0 and weight nulls with 1.0. The model
    (100 estimators, ``device="gpu"``, fixed seed) is fitted on ``df_tr`` with
    sample weights, and its predictions on ``df_va`` are scored with
    ``weighted_rmse_score``.

    Args:
        df_tr: Training frame.
        df_va: Validation frame.
        feats: Names of the feature columns to use.
        target: Name of the target column.
        weight: Name of the sample-weight column.

    Returns:
        The weighted RMSE score as a float.
    """
    def _as_arrays(frame):
        features = frame.select(feats).fill_null(0).to_numpy()
        labels = frame[target].to_numpy()
        sample_w = frame[weight].fill_null(1.0).to_numpy()
        return features, labels, sample_w

    train_x, train_y, train_w = _as_arrays(df_tr)
    valid_x, valid_y, valid_w = _as_arrays(df_va)

    model = lgb.LGBMRegressor(n_estimators=100, device="gpu", random_state=42, verbose=-1)
    model.fit(train_x, train_y, sample_weight=train_w)
    predictions = model.predict(valid_x)
    return weighted_rmse_score(cpu_to_gpu(valid_y), cpu_to_gpu(predictions), cpu_to_gpu(valid_w))

# Iteration A: Load Data & Baseline

In [None]:
def load_and_split_data(train_path="data/train.parquet", test_path="data/test.parquet", valid_ratio=0.2):
    """Read the parquet files and split training data by ``ts_index``.

    Float64 columns are downcast to Float32 and string columns are cast to
    Categorical. The last ``valid_ratio`` share of the ``ts_index`` range
    (rows with ``ts_index >= split_ts``) becomes the validation frame.

    Args:
        train_path: Path of the training parquet file.
        test_path: Path of the test parquet file.
        valid_ratio: Fraction of the ``ts_index`` span held out for validation.

    Returns:
        ``(train_df, valid_df, test_df, feats)`` where ``feats`` lists the
        training columns that are not identifiers, target, weight, time
        index or horizon.
    """
    print(f"Loading {train_path}...")

    def optimize(df):
        # Collect the casts in two groups so floats come first, strings second.
        float_casts = []
        text_casts = []
        for name, dtype in df.schema.items():
            if dtype == pl.Float64:
                float_casts.append(pl.col(name).cast(pl.Float32))
            if dtype == pl.Utf8 or dtype == pl.String:
                text_casts.append(pl.col(name).cast(pl.Categorical))
        return df.with_columns(float_casts + text_casts)

    train_full = optimize(pl.read_parquet(train_path))
    test_df = optimize(pl.read_parquet(test_path))

    # Time-based cut: everything from split_ts onwards is validation.
    max_ts = train_full["ts_index"].max()
    ts_span = max_ts - train_full["ts_index"].min()
    split_ts = max_ts - int(ts_span * valid_ratio)

    train_df = train_full.filter(pl.col("ts_index") < split_ts)
    valid_df = train_full.filter(pl.col("ts_index") >= split_ts)
    del train_full
    clear_memory()

    non_feature_cols = (
        "id", "code", "sub_code", "sub_category",
        "feature_ch", "feature_cg", "ts_index", "horizon",
    )
    feats = [name for name in train_df.columns if name not in non_feature_cols]
    return train_df, valid_df, test_df, feats

# Load the parquet files and make the time-based train/validation split.
train_df, valid_df, test_df, feature_cols = load_and_split_data()

# Baseline
# Constant prediction: the training-set mean of the target.
y_val = train_df["feature_ch"].mean()
# Validation targets and weights (null weights -> 1.0), converted with
# cpu_to_gpu and cached as module-level variables.
y_true_gpu = cpu_to_gpu(valid_df["feature_ch"].to_numpy())
w_gpu = cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy())
# Score an array filled with the training mean against the validation targets.
score_a = weighted_rmse_score(y_true_gpu, np.full_like(y_true_gpu, y_val), w_gpu)
print(f"Iteration A Score: {score_a:.4f} | Baseline=Mean | Features={len(feature_cols)}")

# Iteration B: Temporal Features

In [None]:
def create_temporal_features_pl(df, feats, group_cols=("code", "sub_code"), windows=(7, 30),
                                max_feats=30, batch_size=5):
    """Add per-group lag-1 and lagged rolling-mean columns for base features.

    The frame is sorted by ``group_cols`` + ``ts_index``. For each processed
    feature ``f`` the following Float32 columns are added, all computed
    within each group:

    * ``{f}_lag1`` - the previous row's value;
    * ``{f}_rm{w}`` - rolling mean over ``w`` rows of the lag-1 values
      (``min_periods=1``), for every ``w`` in ``windows``.

    Args:
        df: Input frame containing ``feats``, ``group_cols`` and ``ts_index``.
        feats: Candidate base feature names; only the first ``max_feats`` are used.
        group_cols: Columns identifying a series. Any sequence of names.
        windows: Rolling-window lengths in rows.
        max_feats: Cap on the number of base features processed (``None`` = all).
        batch_size: Number of base features added per ``with_columns`` call;
            memory is released via ``clear_memory`` between batches.

    Returns:
        The sorted frame with the new columns appended. Note that the row
        order differs from the input because of the sort.
    """
    # Immutable defaults in the signature; normalise to a list for sort/over.
    group_cols = list(group_cols)
    to_proc = feats[:max_feats]
    print(f"Creating temporal features for {len(to_proc)} base features...")

    df = df.sort(group_cols + ["ts_index"])
    for i in range(0, len(to_proc), batch_size):
        batch = to_proc[i:i+batch_size]
        exprs = []
        for f in batch:
            exprs.append(pl.col(f).shift(1).over(group_cols).alias(f"{f}_lag1").cast(pl.Float32))
            for w in windows:
                # shift(1) first so the window never includes the current row.
                exprs.append(pl.col(f).shift(1).rolling_mean(w, min_periods=1).over(group_cols).alias(f"{f}_rm{w}").cast(pl.Float32))
        df = df.with_columns(exprs)
        clear_memory()
    return df

# Prepare for feature creation
# Tag each frame so the concatenated frame can be split back afterwards.
train_df = train_df.with_columns(pl.lit("train").alias("set"))
valid_df = valid_df.with_columns(pl.lit("valid").alias("set"))
test_df = test_df.with_columns(pl.lit("test").alias("set"))

# Diagonal concat tolerates columns that exist in only some of the frames.
full_df = pl.concat([train_df, valid_df, test_df], how="diagonal")
del train_df, valid_df, test_df
full_df = create_temporal_features_pl(full_df, feature_cols)

# Split back
train_df = full_df.filter(pl.col("set") == "train")
valid_df = full_df.filter(pl.col("set") == "valid")
test_df = full_df.filter(pl.col("set") == "test")
del full_df
clear_memory()

# create_temporal_features_pl sorts the concatenated frame, so valid_df rows
# are no longer in the order they had when the validation targets/weights were
# cached. Refresh the cached arrays so later scoring stays row-aligned.
y_true_gpu = cpu_to_gpu(valid_df["feature_ch"].to_numpy())
w_gpu = cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy())

current_features = [c for c in train_df.columns if c not in ["id", "code", "sub_code", "sub_category", "feature_ch", "feature_cg", "ts_index", "horizon", "set"]]
score_b = fast_eval(train_df, valid_df, current_features)
print(f"Iteration B Score: {score_b:.4f} | Δ: {score_b - score_a:+.4f} | Features: {len(current_features)}")

# Iteration C: Weighted LightGBM

In [None]:
def train_horizon_model(df, feats, h):
    """Train a LightGBM booster on the rows of a single forecast horizon.

    Rows with ``horizon == h`` are split by time: the distinct ``ts_index``
    value at the 90 % position (and everything after it) forms the
    early-stopping validation set, earlier timestamps form the training set.
    Feature nulls are filled with 0 and weight nulls with 1.0.

    Args:
        df: Frame with ``horizon``, ``ts_index``, ``feature_ch`` (target),
            ``feature_cg`` (weight) and the ``feats`` columns.
        feats: Feature column names.
        h: Horizon value to select.

    Returns:
        The trained ``lgb.Booster`` (up to 500 rounds, early stopping at 50).
    """
    df_h = df.filter(pl.col("horizon") == h).sort("ts_index")
    # Compute the distinct timestamps once instead of twice.
    ts_values = df_h["ts_index"].unique().sort()
    split_ts = ts_values[int(len(ts_values) * 0.9)]

    tr = df_h.filter(pl.col("ts_index") < split_ts)
    va = df_h.filter(pl.col("ts_index") >= split_ts)

    def _dataset(part, reference=None):
        # Labels/weights are handed over as NumPy arrays: LightGBM documents
        # list / NumPy / pandas / pyarrow inputs, not Polars Series. This
        # also matches how fast_eval prepares its inputs.
        return lgb.Dataset(
            part.select(feats).fill_null(0).to_numpy(),
            part["feature_ch"].to_numpy(),
            weight=part["feature_cg"].fill_null(1.0).to_numpy(),
            reference=reference,
        )

    dtrain = _dataset(tr)
    dvalid = _dataset(va, reference=dtrain)

    m = lgb.train({"objective":"regression","metric":"rmse","learning_rate":0.05,"num_leaves":31,"device":"gpu","verbose":-1},
                  dtrain, num_boost_round=500, valid_sets=[dvalid], callbacks=[lgb.early_stopping(50), lgb.log_evaluation(False)])
    return m

print("Training Horizon models (Iteration C)...")
horizons = sorted(train_df["horizon"].unique().to_list())
models_c = {h: train_horizon_model(train_df, current_features, h) for h in horizons}

# Evaluation
# Predictions are scattered into a full-length NumPy buffer with a boolean
# row mask. Injecting a per-horizon pl.Series into when/then is not valid
# because its length is the subset size, not the frame height.
pred_c = np_cpu.zeros(valid_df.height, dtype=np_cpu.float64)
valid_horizon = valid_df["horizon"].to_numpy()
for h, m in models_c.items():
    mask = (pl.col("horizon") == h)
    rows = valid_horizon == h
    if rows.any():
        # filter() keeps row order, so preds line up with the True positions of rows.
        preds = m.predict(valid_df.filter(mask).select(current_features).fill_null(0).to_numpy())
        pred_c[rows] = preds
valid_df = valid_df.with_columns(pl.Series("pred_c", pred_c))

# Score against valid_df's own target/weight columns so truth, weights and
# predictions are guaranteed to share the same row order.
score_c = weighted_rmse_score(
    cpu_to_gpu(valid_df["feature_ch"].to_numpy()),
    cpu_to_gpu(pred_c),
    cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy()),
)
print(f"Iteration C Score: {score_c:.4f} | Δ: {score_c - score_b:+.4f}")

# Iteration D: PCA Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA
print("PCA Dimensionality Reduction (Iteration D)...")

# PCA input: at most the first 40 training columns whose names contain
# "_rm" or "_lag" (the suffix patterns used for the temporal features).
pca_feats = [c for c in train_df.columns if "_rm" in c or "_lag" in c][:40]

def get_pca(df_tr, df_va, df_te, cols, n=8):
    """Project ``cols`` of three frames onto ``n`` principal components.

    Nulls are filled with 0. The standardisation statistics (per-column mean
    and standard deviation, with zero deviations replaced by 1.0) and the PCA
    itself are fitted on the training frame only and then applied to the
    validation and test frames.

    Returns:
        ``(tr_pca, va_pca, te_pca)`` component arrays, one per input frame.
    """
    def _matrix(frame):
        return frame.select(cols).fill_null(0).to_numpy()

    train_matrix = _matrix(df_tr)
    mean = train_matrix.mean(axis=0)
    std = train_matrix.std(axis=0)
    # Constant columns would otherwise divide by zero.
    std[std == 0] = 1.0

    def _scale(matrix):
        return (matrix - mean) / std

    pca = PCA(n_components=n)
    tr_pca = pca.fit_transform(_scale(train_matrix))
    va_pca = pca.transform(_scale(_matrix(df_va)))
    te_pca = pca.transform(_scale(_matrix(df_te)))
    return tr_pca, va_pca, te_pca

tr_p, va_p, te_p = get_pca(train_df, valid_df, test_df, pca_feats)
pca_cols = [f"pca_{i}" for i in range(8)]


def _with_pca(frame, components):
    """Append the PCA components to ``frame`` as Float32 ``pca_*`` columns."""
    pca_frame = pl.DataFrame(components, schema=pca_cols).with_columns(pl.all().cast(pl.Float32))
    return pl.concat([frame, pca_frame], how="horizontal")


train_df = _with_pca(train_df, tr_p)
valid_df = _with_pca(valid_df, va_p)
test_df = _with_pca(test_df, te_p)

# Evaluate the previous feature set extended with the PCA columns.
features_d = current_features + pca_cols
score_d = fast_eval(train_df, valid_df, features_d)
print(f"Iteration D Score: {score_d:.4f} | Δ: {score_d - score_c:+.4f} | Features: {len(features_d)}")

# Iteration E: Target Encoding

In [None]:
def smoothed_enc(df, col, target="feature_ch", smoothing=10):
    """Smoothed expanding target encoding of ``col`` using only earlier rows.

    For every row, the encoding is
    ``(sum_prev + smoothing * g_mean) / (cnt_prev + smoothing)`` where
    ``sum_prev``/``cnt_prev`` are the sum and count of the non-null target
    values on the preceding rows of the same ``col`` group (in the frame's
    current row order) and ``g_mean`` is the mean of ``target`` over ``df``.

    Args:
        df: Frame containing ``col`` and ``target``; row order defines "earlier".
        col: Column to encode.
        target: Target column name.
        smoothing: Pseudo-count pulling sparse groups towards ``g_mean``.

    Returns:
        A Float32 Series with one encoded value per row of ``df``.
    """
    g_mean = df[target].mean()
    prior = pl.col(target).shift(1)
    base = df.select([col, target]).with_columns([
        # Fill nulls BEFORE accumulating: cum_sum leaves null at null
        # positions, so filling afterwards would reset the running sum to 0
        # on every row whose previous target is missing (e.g. test rows).
        prior.fill_null(0).cum_sum().over(col).alias("sum"),
        # Count exactly the values that contributed to the sum.
        prior.is_not_null().cast(pl.UInt32).cum_sum().over(col).alias("cnt"),
    ])
    return ((base["sum"] + smoothing * g_mean) / (base["cnt"] + smoothing)).cast(pl.Float32)

print("Target Encoding (Iteration E)...")
# Stack train, valid and test (in that order) so the encoding is computed in
# one pass and can be sliced back by row offsets.
enc_source_cols = ["code", "sub_code", "feature_ch"]
full = pl.concat(
    [frame.select(enc_source_cols) for frame in (train_df, valid_df, test_df)],
    how="vertical",
)

n_train, n_valid, n_test = train_df.height, valid_df.height, test_df.height
for c in ["code", "sub_code"]:
    enc = smoothed_enc(full, c)
    enc_name = f"{c}_enc"
    train_df = train_df.with_columns(enc.slice(0, n_train).alias(enc_name))
    valid_df = valid_df.with_columns(enc.slice(n_train, n_valid).alias(enc_name))
    test_df = test_df.with_columns(enc.slice(n_train + n_valid, n_test).alias(enc_name))

features_e = features_d + ["code_enc", "sub_code_enc"]
score_e = fast_eval(train_df, valid_df, features_e)
print(f"Iteration E Score: {score_e:.4f} | Δ: {score_e - score_d:+.4f} | Features: {len(features_e)}")

# Iteration F: Feature Selection

In [None]:
print("Feature Selection (Iteration F)...")

# Importance selection
# Fit one LightGBM model on all current features (no sample weights here)
# and rank the features by its importance scores.
m_sel = lgb.LGBMRegressor(n_estimators=100, device="gpu", random_state=42, verbose=-1)
X_np = train_df.select(features_e).fill_null(0).to_numpy()
y_np = train_df["feature_ch"].to_numpy()
m_sel.fit(X_np, y_np)

importance = pl.DataFrame({"f": features_e, "i": m_sel.feature_importances_})
importance = importance.sort("i", descending=True)
# Keep the 150 highest-ranked features (or all of them if there are fewer).
selected_feats = importance["f"].head(150).to_list()

score_f = fast_eval(train_df, valid_df, selected_feats)
print(f"Iteration F Score: {score_f:.4f} | Δ: {score_f - score_e:+.4f} | Top Features: {selected_feats[:5]}")

# Iteration G: Ensemble (LightGBM + XGBoost + CatBoost)

In [None]:
import xgboost as xgb
from catboost import CatBoostRegressor

print("Ensemble Training (Iteration G)...")
# Validation predictions are collected in a full-length NumPy buffer and
# written per horizon through a boolean row mask. A per-horizon pl.Series
# cannot be injected via when/then because its length is the subset size.
pred_g = np_cpu.zeros(valid_df.height, dtype=np_cpu.float64)
valid_horizon = valid_df["horizon"].to_numpy()
test_preds = []


def _blend(models, X):
    """Fixed-weight blend (0.4 / 0.4 / 0.2) of LightGBM, XGBoost and CatBoost."""
    m_lgb, m_xgb, m_cat = models
    return 0.4*m_lgb.predict(X) + 0.4*m_xgb.predict(X) + 0.2*m_cat.predict(X)


for h in horizons:
    tr = train_df.filter(pl.col("horizon") == h)
    va = valid_df.filter(pl.col("horizon") == h)
    te = test_df.filter(pl.col("horizon") == h)

    X_tr = tr.select(selected_feats).fill_null(0).to_numpy()
    y_tr, w_tr = tr["feature_ch"].to_numpy(), tr["feature_cg"].fill_null(1.0).to_numpy()
    X_va = va.select(selected_feats).fill_null(0).to_numpy()
    X_te = te.select(selected_feats).fill_null(0).to_numpy()

    # Simple weighted ensemble
    m1 = lgb.LGBMRegressor(n_estimators=500, device="gpu", verbose=-1).fit(X_tr, y_tr, sample_weight=w_tr)
    m2 = xgb.XGBRegressor(n_estimators=500, tree_method="hist", device="cuda").fit(X_tr, y_tr, sample_weight=w_tr)
    m3 = CatBoostRegressor(n_estimators=500, task_type="GPU", verbose=0).fit(X_tr, y_tr, sample_weight=w_tr)
    models = (m1, m2, m3)

    # Skip prediction on empty slices (a horizon may be absent from a split).
    if va.height > 0:
        # filter() keeps row order, so p_va lines up with the True mask positions.
        p_va = _blend(models, X_va)
        pred_g[valid_horizon == h] = p_va
    if te.height > 0:
        p_te = _blend(models, X_te)
        test_preds.append(te.select("id").with_columns(pl.Series("prediction", p_te)))
    clear_memory()

valid_df = valid_df.with_columns(pl.Series("pred_g", pred_g))

# Score against valid_df's own target/weight columns so truth, weights and
# predictions share the same row order.
score_g = weighted_rmse_score(
    cpu_to_gpu(valid_df["feature_ch"].to_numpy()),
    cpu_to_gpu(pred_g),
    cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy()),
)
# NOTE(review): test rows whose horizon never occurs in train_df get no
# prediction and are absent from the submission - confirm this cannot happen.
pl.concat(test_preds).write_csv("submission_final_polars.csv")
print(f"Iteration G Score: {score_g:.4f} | Δ: {score_g - score_f:+.4f} | Saved submission.")