# Step 0: Download Data


# GPU-Accelerated Time Series Forecasting

**Hardware**: RTX 3070 + CUDA 13.1 + CuPy 13.3.0 + LightGBM

**7-Step Pipeline**:
1. Load data & baseline metric
2. Temporal features (lags, rolling windows)
3. Horizon-specific LightGBM models
4. PCA dimensionality reduction
5. Smoothed target encoding
6. Feature selection & interactions
7. Ensemble predictions

**Key Design**: Matrix operations (GPU) â†’ CuPy. External libs (CPU) â†’ NumPy.


In [1]:
# GPU Setup & Initialization

import cupy as np
import numpy as np_cpu
import sys

print(f"Python: {sys.executable}")
print(f"CuPy Version: {np.__version__}")

# Check GPU
try:
    device_count = np.cuda.runtime.getDeviceCount()
    device = np.cuda.Device(0)
    cap = device.compute_capability
    print(f"âœ“ GPU Ready: {device_count} device(s), Compute Capability {cap}")
except Exception as e:
    print(f"âœ— GPU Error: {e}")

Python: /usr/bin/python3
CuPy Version: 13.6.0
âœ“ GPU Ready: 1 device(s), Compute Capability 75



# Step 1: Download Data


In [None]:
# Kaggle Data Download
import os
import subprocess

def download_kaggle_data(competition_name="ts-forecasting"):
    data_dir = "data"
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    
    # Check if files already exist
    if os.path.exists(os.path.join(data_dir, "train.parquet")) and \
       os.path.exists(os.path.join(data_dir, "test.parquet")):
        print("Data files already exist. Skipping download.")
        return

    print(f"Downloading data for competition '{competition_name}'...")
    try:
        # Colab/Linux: Ensure .kaggle directory exists for the CLI
        root_kaggle = os.path.expanduser("~/.kaggle")
        if not os.path.exists(root_kaggle):
            os.makedirs(root_kaggle)

        # Use the specific Token variable required for KGAT_ tokens
        # This fixes the '401 Unauthorized' and 'Exit Status 1' errors
        subprocess.run([
            "kaggle", "competitions", "download", "-c", competition_name
        ], check=True, env=os.environ)
        
        # Unzip
        import zipfile
        zip_path = f"{competition_name}.zip"
        if os.path.exists(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(data_dir)
            os.remove(zip_path)
            print(f"Downloaded and extracted to {data_dir}/")
        else:
            print("Zip file not found. Check Kaggle API output.")
    except Exception as e:
        print(f"Failed to download via Kaggle API: {e}")
        print("Tip: If in Colab, ensure you have accepted competition rules on Kaggle website.")

# Correct credentials for the new API Token format
os.environ["KAGGLE_API_TOKEN"] = "KGAT_ccc00b322d3c4b85f0036a23cc420469"

download_kaggle_data()

Downloading data for competition 'ts-forecasting'...
Failed to download via Kaggle API: Command '['kaggle', 'competitions', 'download', '-c', 'ts-forecasting']' returned non-zero exit status 1.
Ensure 'kaggle' is installed and credentials are set.



# Step 2: Imports & Metric


In [3]:
# Utilities & Metric

import polars as pl
import warnings
import os
import lightgbm as lgb
import cupy as np
import numpy as np_cpu
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from typing import Tuple, List, Dict
import requests

warnings.filterwarnings("ignore")

# GPU Memory Management
def clear_gpu_memory():
    """Clear CuPy memory pools to prevent OOM and mapping errors."""
    try:
        np.get_default_memory_pool().free_all_blocks()
        np.get_default_pinned_memory_pool().free_all_blocks()
    except Exception as e:
        print(f"Memory cleanup warning: {e}")

# GPU â†” CPU Conversion
def gpu_to_cpu(x):
    """CuPy GPU â†’ NumPy CPU (handles scalars + arrays)."""
    if x is None:
        return None
    try:
        if isinstance(x, (float, int, np_cpu.generic)):
            return x
        if hasattr(x, 'get'):
            return x.get()
        elif hasattr(x, 'item'):
            return x.item()
        else:
            return np_cpu.asarray(x)
    except Exception as e:
        return np_cpu.asarray(x)

def cpu_to_gpu(x):
    """NumPy CPU â†’ CuPy GPU."""
    if x is None: return None
    # Avoid redundant copies if already on GPU
    if hasattr(x, '__cuda_array_interface__'):
        return x
    return np.asarray(x)

# Weighted RMSE Skill Score
def weighted_rmse_score(y_true: np.ndarray, y_pred: np.ndarray, 
                        weights: np.ndarray) -> float:
    """GPU-accelerated metric. Returns Python float."""
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    weights = np.asarray(weights)
    
    score = 1 - np.sqrt(np.sum(weights * (y_true - y_pred) ** 2) / 
                        (np.sum(weights * y_true ** 2) + 1e-8))
    return float(gpu_to_cpu(score))

# Step 3: Load Data & Baseline (Iter A)

In [4]:

# Iteration A: Load Train & Test (Polars)

def load_and_split_data(
    train_path: str = "data/train.parquet",
    test_path: str = "data/test.parquet",
    target_col: str = "feature_ch",
    weight_col: str = "feature_cg",
    valid_ratio: float = 0.20,
) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, list]:
    print(f"Loading data from {train_path} and {test_path}")
    
    # Load Train
    if os.path.exists(train_path):
        train_full = pl.read_parquet(train_path)
    else:
        print("Warning: Train file not found. Creating dummy train.")
        train_full = pl.DataFrame({"id": ["tr1"], "ts_index": [0], "horizon": [1], target_col: [0.0], weight_col: [1.0]})
        
    # Load Test
    if os.path.exists(test_path):
        test_df = pl.read_parquet(test_path)
    else:
        print("Warning: Test file not found. Creating dummy test.")
        test_df = pl.DataFrame({"id": ["te1"], "ts_index": [100], "horizon": [1], target_col: [0.0], weight_col: [1.0]})

    print(f"Loaded Train: {train_full.height:,} rows, Test: {test_df.height:,} rows")

    # Time-based split on Train for internal validation
    max_ts = train_full["ts_index"].max()
    min_ts = train_full["ts_index"].min()
    split_ts = max_ts - int((max_ts - min_ts) * valid_ratio)
    
    train_df = train_full.filter(pl.col("ts_index") < split_ts)
    valid_df = train_full.filter(pl.col("ts_index") >= split_ts)

    print(f"Internal Validation split at ts_index >= {split_ts}")
    
    # Feature columns (exclude meta)
    exclude_cols = ["id", "code", "sub_code", "sub_category", target_col, weight_col, "ts_index", "horizon"]
    feature_cols = [c for c in train_full.columns if c not in exclude_cols]

    return train_df, valid_df, test_df, feature_cols

# Execute Iter A
train_df, valid_df, test_df, feature_cols = load_and_split_data()

# Baseline Calculation on valid_df
y_true_gpu = cpu_to_gpu(valid_df["feature_ch"].to_numpy())
weights_gpu = cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy())

train_mean_gpu = np.mean(cpu_to_gpu(train_df["feature_ch"].to_numpy()))
train_mean = float(train_mean_gpu)

y_pred_baseline = np.ones_like(y_true_gpu) * train_mean
baseline_score = weighted_rmse_score(y_true_gpu, y_pred_baseline, weights_gpu)
print(f"Baseline (Mean Prediction) Score on Validation: {baseline_score:.4f}")


Loading data from data/train.parquet and data/test.parquet
Loaded Train: 1 rows, Test: 1 rows
Internal Validation split at ts_index >= 0
Baseline (Mean Prediction) Score on Validation: nan


# Step 4: Temporal Features (Iter B)

In [5]:

# Iteration B: Smart Temporal Feature Engineering (Polars)

def create_temporal_features_pl(
    df: pl.DataFrame,
    feature_cols: List[str],
    group_cols: List[str] = ["code", "sub_code", "sub_category"],
    rolling_windows: List[int] = [3, 7, 14, 30],
) -> pl.DataFrame:
    print("Creating advanced temporal features with Polars...")
    df = df.sort(group_cols + ["ts_index"])
    features_to_process = feature_cols[:30] if len(feature_cols) > 30 else feature_cols
    
    exprs = []
    for feat in features_to_process:
        for lag in [1, 2, 3, 5, 10]:
            exprs.append(pl.col(feat).shift(lag).over(group_cols).alias(f"{feat}_lag_{lag}"))
        exprs.append((pl.col(feat) - pl.col(feat).shift(1)).over(group_cols).alias(f"{feat}_diff_1"))
        for window in rolling_windows:
            shifted = pl.col(feat).shift(1)
            exprs.append(shifted.rolling_mean(window_size=window, min_periods=1).over(group_cols).alias(f"{feat}_roll_mean_{window}"))
            exprs.append(shifted.rolling_std(window_size=window, min_periods=1).over(group_cols).alias(f"{feat}_roll_std_{window}"))
        shifted = pl.col(feat).shift(1)
        exprs.append((shifted.cum_sum() / shifted.cum_count()).over(group_cols).alias(f"{feat}_exp_mean").fill_nan(0))

    target = "feature_ch"
    exprs.append(pl.col(target).shift(1).mean().over(group_cols).alias("group_target_avg"))
    exprs.append(pl.col(target).shift(1).std().over(group_cols).alias("group_target_vol"))

    df = df.with_columns(exprs)
    return df

# Combine with tracking
train_df = train_df.with_columns(pl.lit("train").alias("source_set"))
valid_df = valid_df.with_columns(pl.lit("valid").alias("source_set"))
test_df = test_df.with_columns(pl.lit("test").alias("source_set"))

full_df = pl.concat([train_df, valid_df, test_df], how="diagonal") # Diagonal to handle missing target in test

# Create advanced features
full_df = create_temporal_features_pl(full_df, feature_cols)

# Re-split accurately
train_df = full_df.filter(pl.col("source_set") == "train")
valid_df = full_df.filter(pl.col("source_set") == "valid")
test_df = full_df.filter(pl.col("source_set") == "test")

current_features = [c for c in full_df.columns if c not in ["id", "code", "sub_code", "sub_category", "feature_ch", "feature_cg", "ts_index", "horizon", "source_set"]]
print(f"Total features after advanced Iter B: {len(current_features)}")


Creating advanced temporal features with Polars...


ColumnNotFoundError: code

# Iteration C: Weighted LightGBM

In [None]:
# Iteration C: Weighted LightGBM with Memory-Safe CV

def train_lgb_model_cv(df, features, target="feature_ch", weight="feature_cg", n_folds=3):
    """Memory-efficient Time-Series CV for LightGBM."""
    clear_gpu_memory()
    
    # Sort for time-series split
    df = df.sort("ts_index")
    ts_indices = df["ts_index"].unique().sort()
    
    cv_scores = []
    models = []
    
    # Simplified Expanding Window CV
    for i in range(1, n_folds + 1):
        split_idx = int(len(ts_indices) * (1 - 0.1 * i))
        split_ts = ts_indices[split_idx]
        
        train_fold = df.filter(pl.col("ts_index") < split_ts)
        valid_fold = df.filter(pl.col("ts_index") >= split_ts)
        
        if valid_fold.height == 0: continue
        
        print(f"  Fold {i}: Train={train_fold.height}, Valid={valid_fold.height}")
        
        # Data preparation (NO redundant GPU copies!)
        # LightGBM can handle NumPy directly and will move to GPU if 'device': 'gpu'
        X_tr = train_fold.select(features).fill_null(0).to_numpy()
        y_tr = train_fold[target].to_numpy()
        w_tr = train_fold[weight].fill_null(1.0).to_numpy()
        
        X_va = valid_fold.select(features).fill_null(0).to_numpy()
        y_va = valid_fold[target].to_numpy()
        w_va = valid_fold[weight].fill_null(1.0).to_numpy()
        
        train_data = lgb.Dataset(X_tr, label=y_tr, weight=w_tr)
        valid_data = lgb.Dataset(X_va, label=y_va, weight=w_va, reference=train_data)
        
        params = {
            "objective": "regression",
            "metric": "rmse",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "device": "gpu",
            "verbose": -1,
            "n_jobs": -1
        }
        
        model = lgb.train(
            params, train_data, num_boost_round=500,
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(False)]
        )
        
        preds = model.predict(X_va)
        score = weighted_rmse_score(cpu_to_gpu(y_va), cpu_to_gpu(preds), cpu_to_gpu(w_va))
        cv_scores.append(score)
        models.append(model)
        
        # Explicitly delete large objects and clear pool
        del X_tr, y_tr, w_tr, X_va, y_va, w_va, train_data, valid_data
        clear_gpu_memory()
        
    avg_score = sum(cv_scores) / len(cv_scores) if cv_scores else 0
    return models[-1], avg_score

# Train separate models for horizons
horizons = sorted(train_df["horizon"].unique().to_list())
best_models = {}

print("Training Horizon-specific Models with CV (Iter C)...")
for h in horizons:
    t_h = train_df.filter(pl.col("horizon") == h)
    if t_h.height == 0: continue
        
    print(f"Processing Horizon {h}...")
    model, cv_score = train_lgb_model_cv(t_h, current_features)
    best_models[h] = model
    print(f"âœ“ Horizon {h} CV Score: {cv_score:.4f}")

# Generate Predictions
preds_full = []
for h, model in best_models.items():
    sub_df = valid_df.filter(pl.col("horizon") == h)
    if sub_df.height > 0:
        preds = model.predict(sub_df.select(current_features).fill_null(0).to_numpy())
        temp_df = sub_df.select("id").with_columns(pl.Series(name="pred_iter_c_h", values=preds))
        preds_full.append(temp_df)

if preds_full:
    preds_all = pl.concat(preds_full)
    valid_df = valid_df.join(preds_all, on="id", how="left").with_columns(
        pl.col("pred_iter_c_h").fill_null(0).alias("pred_iter_c")
    )

# Overall evaluation
y_true_gpu = cpu_to_gpu(valid_df["feature_ch"].to_numpy())
pred_gpu = cpu_to_gpu(valid_df["pred_iter_c"].to_numpy())
weights_gpu = cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy())
overall_score_c = weighted_rmse_score(y_true_gpu, pred_gpu, weights_gpu)
print(f"Overall Iteration C Score: {overall_score_c:.4f}")

# Iteration D: PCA

In [None]:

# Iteration D: PCA (GPU-optimized & Memory-Safe)

print("Applying PCA (Iter D)...")
clear_gpu_memory()

# Select numeric features
pca_features = [c for c in train_df.columns if c.startswith("feature_") or "_roll_" in c]
pca_features = pca_features[:50]

# Load to GPU
X_train_gpu = cpu_to_gpu(train_df.select(pca_features).fill_null(0).to_numpy())
X_valid_gpu = cpu_to_gpu(valid_df.select(pca_features).fill_null(0).to_numpy())
X_test_gpu = cpu_to_gpu(test_df.select(pca_features).fill_null(0).to_numpy())

# Standardize
mean_gpu = np.mean(X_train_gpu, axis=0, keepdims=True)
std_gpu = np.std(X_train_gpu, axis=0, keepdims=True)
std_gpu = np.where(std_gpu == 0, 1.0, std_gpu)

X_train_scaled = (X_train_gpu - mean_gpu) / std_gpu
X_valid_scaled = (X_valid_gpu - mean_gpu) / std_gpu
X_test_scaled = (X_test_gpu - mean_gpu) / std_gpu

# PCA on CPU
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(gpu_to_cpu(X_train_scaled))
X_valid_pca = pca.transform(gpu_to_cpu(X_valid_scaled))
X_test_pca = pca.transform(gpu_to_cpu(X_test_scaled))

# Add back
pca_cols = [f"pca_{i}" for i in range(10)]
train_df = pl.concat([train_df, pl.DataFrame(X_train_pca, schema=pca_cols)], how="horizontal")
valid_df = pl.concat([valid_df, pl.DataFrame(X_valid_pca, schema=pca_cols)], how="horizontal")
test_df = pl.concat([test_df, pl.DataFrame(X_test_pca, schema=pca_cols)], how="horizontal")

features_d = current_features + pca_cols
print(f"PCA features added to all sets. Total: {len(features_d)}")
del X_train_gpu, X_valid_gpu, X_test_gpu, X_train_scaled, X_valid_scaled, X_test_scaled
clear_gpu_memory()


# Iteration E: Smoothed Target Encoding

In [None]:
# Iteration E: Smoothed Target Encoding (Polars Fast)

def create_smoothed_target_encoding_pl(
    df, col, target="feature_ch", weight="feature_cg", min_samples=10, smoothing=10
):
    df = df.sort([col, "ts_index"])
    global_mean = df[target].mean()
    
    return df.with_columns(
        (
            (pl.col(target).shift(1).cum_sum().over(col).fill_null(0) + smoothing * global_mean) 
            / 
            (pl.col(target).shift(1).cum_count().over(col).fill_null(0) + smoothing)
        ).alias(f"{col}_enc_smooth")
    )

print("Applying Smoothed Target Encoding (Iter E)...")
cols_in_common = [c for c in train_df.columns if c in valid_df.columns and c in test_df.columns]
full_df = pl.concat([
    train_df.select(cols_in_common), 
    valid_df.select(cols_in_common), 
    test_df.select(cols_in_common)
], how="diagonal")

for col in ["code", "sub_code", "sub_category"]:
    full_df = create_smoothed_target_encoding_pl(full_df, col)

# Re-split
train_df = full_df.filter(pl.col("source_set") == "train")
valid_df = full_df.filter(pl.col("source_set") == "valid")
test_df = full_df.filter(pl.col("source_set") == "test")

features_e = features_d + [f"{c}_enc_smooth" for c in ["code", "sub_code", "sub_category"]]


# Step 8: Feature Selection (Iter F)

In [None]:
# Iteration F: Advanced Feature Selection (Model-Based)

print("Applying Non-Linear Feature Selection (Iter F)...")

# 1. Create Interactions
new_cols = []
top_feats = [c for c in features_e if "_lag_1" in c or "_roll_mean_7" in c][:10]
for feat in top_feats:
    new_cols.append((pl.col(feat) * pl.col("horizon")).alias(f"{feat}_x_horizon"))

train_df = train_df.with_columns(new_cols)
valid_df = valid_df.with_columns(new_cols)
test_df = test_df.with_columns(new_cols)

all_candidates_f = features_e + [f"{feat}_x_horizon" for feat in top_feats]
all_candidates_f = [c for c in all_candidates_f if c in train_df.columns]

# 2. Use LightGBM Importance for Selection
X_sel_np = train_df.select(all_candidates_f).fill_null(0).to_numpy()
y_sel_np = train_df["feature_ch"].to_numpy()

sel_model = lgb.LGBMRegressor(n_estimators=100, device="gpu", random_state=42, verbose=-1)
sel_model.fit(X_sel_np, y_sel_np)

importance_df = pl.DataFrame({
    "feature": all_candidates_f,
    "importance": sel_model.feature_importances_
}).sort("importance", descending=True)

selected_features_f = importance_df.head(150)["feature"].to_list()
print(f"âœ“ Selected top {len(selected_features_f)} features")


# Iteration G: Ensemble

In [None]:
# Iteration G: The "Silver Bullet" Ensemble (Final Submission on test.parquet)

import xgboost as xgb
from catboost import CatBoostRegressor

print("Training Integrated GPU Ensemble & Generating Final Submission...")
clear_gpu_memory()

horizons = sorted(train_df["horizon"].unique().to_list())
preds_test = []
preds_valid = []

for h in horizons:
    t_h = train_df.filter(pl.col("horizon") == h)
    v_h = valid_df.filter(pl.col("horizon") == h)
    te_h = test_df.filter(pl.col("horizon") == h)
    
    if t_h.height == 0: continue
    
    print(f"Processing Horizon {h}...")
    
    X_train = t_h.select(selected_features_f).fill_null(0).to_numpy()
    y_train = t_h["feature_ch"].to_numpy()
    w_train = t_h["feature_cg"].fill_null(1.0).to_numpy()
    
    X_valid = v_h.select(selected_features_f).fill_null(0).to_numpy()
    X_test = te_h.select(selected_features_f).fill_null(0).to_numpy()
    
    # Simple weighted blend
    def train_and_predict(X_tr, y_tr, w_tr, X_val, X_te):
        # LGBM
        m1 = lgb.LGBMRegressor(n_estimators=500, device="gpu", random_state=42, verbose=-1)
        m1.fit(X_tr, y_tr, sample_weight=w_tr)
        p1_v, p1_t = m1.predict(X_val), m1.predict(X_te)
        
        # XGB
        m2 = xgb.XGBRegressor(n_estimators=500, tree_method="hist", device="cuda", random_state=42)
        m2.fit(X_tr, y_tr, sample_weight=w_tr)
        p2_v, p2_t = m2.predict(X_val), m2.predict(X_te)
        
        # CatBoost
        m3 = CatBoostRegressor(n_estimators=500, task_type="GPU", random_state=42, verbose=0)
        m3.fit(X_tr, y_tr, sample_weight=w_tr)
        p3_v, p3_t = m3.predict(X_val), m3.predict(X_te)
        
        clear_gpu_memory()
        v_res = (0.4 * p1_v + 0.4 * p2_v + 0.2 * p3_v)
        t_res = (0.4 * p1_t + 0.4 * p2_t + 0.2 * p3_t)
        return v_res, t_res

    p_val, p_test = train_and_predict(X_train, y_train, w_train, X_valid, X_test)
    
    preds_valid.append(v_h.select("id").with_columns(pl.Series("prediction", p_val)))
    preds_test.append(te_h.select("id").with_columns(pl.Series("prediction", p_test)))

# Save Submission
if preds_test:
    sub = pl.concat(preds_test)
    sub.write_csv("submission_final_polars.csv")
    print(f"âœ“ Saved submission with {sub.height} rows")

# Validation Score
if preds_valid:
    val_res = pl.concat(preds_valid)
    val_merged = valid_df.join(val_res, on="id")
    y_true = cpu_to_gpu(val_merged["feature_ch"].to_numpy())
    y_pred = cpu_to_gpu(val_merged["prediction"].to_numpy())
    weights = cpu_to_gpu(val_merged["feature_cg"].fill_null(1.0).to_numpy())
    score = weighted_rmse_score(y_true, y_pred, weights)
    print(f"ðŸš€ Final Validation Skill Score: {score:.4f}")
