# Step 0: GPU Setup


# GPU-Accelerated Time Series Forecasting

**Hardware**: RTX 3070 + CUDA 13.1 + CuPy 13.3.0 + LightGBM

**7-Step Pipeline**:
1. Load data & baseline metric
2. Temporal features (lags, rolling windows)
3. Horizon-specific LightGBM models
4. PCA dimensionality reduction
5. Smoothed target encoding
6. Feature selection & interactions
7. Ensemble predictions

**Key Design**: Matrix operations (GPU) → CuPy. External libs (CPU) → NumPy.


In [None]:
# GPU Setup & Initialization

import cupy as np
import numpy as np_cpu
import sys

# Report the interpreter path and the CuPy version in use.
# NOTE: in this file `np` is CuPy (see the imports above); NumPy is `np_cpu`.
print(f"Python: {sys.executable}")
print(f"CuPy Version: {np.__version__}")

# Check GPU
# Ask the CUDA runtime how many devices exist and read device 0's compute
# capability. Any failure is printed rather than raised, so the rest of the
# notebook keeps running even without a usable GPU.
try:
    device_count = np.cuda.runtime.getDeviceCount()
    device = np.cuda.Device(0)
    cap = device.compute_capability
    print(f"âœ“ GPU Ready: {device_count} device(s), Compute Capability {cap}")
except Exception as e:
    print(f"âœ— GPU Error: {e}")


# Step 1: Download Data


In [None]:

# Kaggle Data Download (Cross-Platform)
import os
import subprocess
import sys
import zipfile
import platform

def _extract_and_remove(zip_path, dest_dir):
    """Extract the archive at *zip_path* into *dest_dir*, then delete the archive."""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dest_dir)
    os.remove(zip_path)


def download_kaggle_data(competition_name="ts-forecasting"):
    """Download a Kaggle competition archive and unpack it into ``data/``.

    The download is skipped when both ``data/train.parquet`` and
    ``data/test.parquet`` already exist. Failures are reported on stdout and
    never raised, so the caller can continue (later steps fall back to dummy
    frames when the files are missing).

    Args:
        competition_name: Kaggle competition slug passed to
            ``kaggle competitions download -c``.

    Returns:
        None.
    """
    data_dir = "data"
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # Check if files already exist
    train_exists = os.path.exists(os.path.join(data_dir, "train.parquet"))
    test_exists = os.path.exists(os.path.join(data_dir, "test.parquet"))

    if train_exists and test_exists:
        print("âœ“ Data files already exist in 'data/'. Skipping download.")
        return

    print(f"[{platform.system()}] Downloading data for competition '{competition_name}'...")

    # Configure Credentials
    # NOTE(review): an API token is hard-coded below and therefore lives in
    # version control. It should be revoked and loaded from the environment
    # or ~/.kaggle instead. Left in place here so existing runs keep working.
    env = os.environ.copy()
    token = "KGAT_ccc00b322d3c4b85f0036a23cc420469"
    env["KAGGLE_USERNAME"] = "dummy_user"
    env["KAGGLE_KEY"] = token
    env["KAGGLE_API_TOKEN"] = token

    # Detect Kaggle Executable Path
    kaggle_cmd = "kaggle"
    try:
        # Check if 'kaggle' is in PATH
        subprocess.run([kaggle_cmd, "--version"], capture_output=True, check=True)
    except (OSError, subprocess.CalledProcessError):
        # Only "executable missing / not runnable" (OSError) and "non-zero
        # exit" are expected here; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        # If not, try common venv locations
        if platform.system() == "Windows":
            venv_kaggle = os.path.join(".venv", "Scripts", "kaggle.exe")
        else:
            venv_kaggle = os.path.join(".venv", "bin", "kaggle")

        if os.path.exists(venv_kaggle):
            kaggle_cmd = venv_kaggle
        else:
            print("! Warning: 'kaggle' CLI not found in PATH or .venv. Attempting default 'kaggle' anyway.")

    try:
        # 1. Download (the archive lands in the current working directory)
        subprocess.run([
            kaggle_cmd, "competitions", "download", "-c", competition_name
        ], check=True, env=env, capture_output=True, text=True)

        # 2. Extract
        zip_path = f"{competition_name}.zip"
        if os.path.exists(zip_path):
            _extract_and_remove(zip_path, data_dir)
            print(f"âœ“ Downloaded and extracted to {data_dir}/")
        else:
            # Check for alternative zip names: fall back to the first *.zip
            # found in the working directory.
            zips = [f for f in os.listdir('.') if f.endswith('.zip')]
            if zips:
                _extract_and_remove(zips[0], data_dir)
                print(f"âœ“ Extracted {zips[0]} to {data_dir}/")
            else:
                print("âœ— Zip file not found after download.")

    except subprocess.CalledProcessError as e:
        print("âœ— Failed to download via Kaggle API.")
        print(f"Error: {e.stderr if e.stderr else e.stdout}")
        print("Tip: If error is 403, ensure you accepted competition rules on Kaggle website.")
    except Exception as e:
        # Best-effort boundary: report anything else (missing CLI, bad zip, ...)
        print(f"âœ— General Error: {e}")

download_kaggle_data()



# Step 2: Imports & Metric


In [None]:
# Utilities & Metric

import polars as pl
import warnings
import os
import lightgbm as lgb
import cupy as np
import numpy as np_cpu
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from typing import Tuple, List, Dict
import requests

warnings.filterwarnings("ignore")

# GPU Memory Management
def clear_gpu_memory():
    """Free all blocks held by CuPy's default memory pool and default pinned memory pool.

    Best-effort: any exception is printed as a warning and not re-raised.
    """
    pool_getters = ("get_default_memory_pool", "get_default_pinned_memory_pool")
    try:
        # Resolve each getter lazily so a failure on the second pool still
        # happens only after the first pool has been released.
        for getter_name in pool_getters:
            getattr(np, getter_name)().free_all_blocks()
    except Exception as e:
        print(f"Memory cleanup warning: {e}")

# GPU ↔ CPU Conversion
def gpu_to_cpu(x):
    """Convert a CuPy (GPU) value to its NumPy/CPU counterpart.

    ``None`` passes through. Python ``float``/``int`` and NumPy scalars are
    returned unchanged. Otherwise the first available of ``x.get()`` and
    ``x.item()`` is used, and ``np_cpu.asarray(x)`` is the final fallback,
    including when either of those calls raises.
    """
    if x is None:
        return None
    try:
        already_host_scalar = isinstance(x, (float, int, np_cpu.generic))
        if already_host_scalar:
            converted = x
        elif hasattr(x, 'get'):
            # CuPy arrays expose .get() to copy device data to the host.
            converted = x.get()
        elif hasattr(x, 'item'):
            converted = x.item()
        else:
            converted = np_cpu.asarray(x)
    except Exception:
        converted = np_cpu.asarray(x)
    return converted

def cpu_to_gpu(x):
    """Move a NumPy/CPU value onto the GPU as a CuPy array.

    ``None`` and objects that already expose ``__cuda_array_interface__`` are
    returned as-is (no redundant copy). Everything else goes through
    ``np.asarray``, where ``np`` is CuPy in this file.
    """
    already_on_device = hasattr(x, '__cuda_array_interface__')
    if x is None or already_on_device:
        return x
    return np.asarray(x)

# Weighted RMSE Skill Score
def weighted_rmse_score(y_true: np.ndarray, y_pred: np.ndarray, 
                        weights: np.ndarray) -> float:
    """Weighted RMSE skill score, computed with CuPy and returned as a Python float.

    score = 1 - sqrt( sum(w * (y_true - y_pred)^2) / (sum(w * y_true^2) + 1e-8) )

    The 1e-8 term keeps the denominator non-zero when the weighted target
    energy is zero.
    """
    y_true, y_pred, weights = (np.asarray(a) for a in (y_true, y_pred, weights))

    residual_energy = np.sum(weights * (y_true - y_pred) ** 2)
    target_energy = np.sum(weights * y_true ** 2) + 1e-8
    skill = 1 - np.sqrt(residual_energy / target_energy)
    return float(gpu_to_cpu(skill))

# Step 3: Load Data & Baseline (Iter A)

In [None]:

# Iteration A: Load Train & Test (Polars) - Ultra-Lean Memory Strategy
import gc
from typing import Tuple, List

def load_and_split_data(
    train_path: str = "data/train.parquet",
    test_path: str = "data/test.parquet",
    target_col: str = "feature_ch",
    weight_col: str = "feature_cg",
    valid_ratio: float = 0.20,
    ) -> Tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame, list]:
    """Load train/test parquet files, shrink their dtypes, and split train by time.

    Float64 columns are cast to Float32 and string columns to Categorical.
    When a file is missing, a one-row dummy frame is used instead. The last
    ``valid_ratio`` share of the ``ts_index`` range of the training data
    becomes the validation set.

    Returns:
        ``(train_df, valid_df, test_df, feature_cols)`` where ``feature_cols``
        lists the training columns that are not identifiers, the target, the
        weight, ``ts_index`` or ``horizon``.
    """
    print(f"Loading data from {train_path} and {test_path} (Ultra-Lean)...")

    def _shrink(frame):
        # Float64 -> Float32 casts first, then string -> Categorical casts.
        casts = [
            pl.col(name).cast(pl.Float32)
            for name, dtype in frame.schema.items()
            if dtype == pl.Float64
        ]
        casts += [
            pl.col(name).cast(pl.Categorical)
            for name, dtype in frame.schema.items()
            if dtype == pl.Utf8 or dtype == pl.String
        ]
        if not casts:
            return frame
        return frame.with_columns(casts)

    def _read_or_placeholder(path, missing_msg, placeholder_id, placeholder_ts):
        # Fall back to a single dummy row so downstream cells still run.
        if not os.path.exists(path):
            print(missing_msg)
            frame = pl.DataFrame({"id": [placeholder_id], "ts_index": [placeholder_ts], "horizon": [1], target_col: [0.0], weight_col: [1.0]})
        else:
            frame = pl.read_parquet(path)
        return _shrink(frame)

    train_full = _read_or_placeholder(
        train_path, "Warning: Train file not found. Creating dummy train.", "tr1", 0
    )
    test_df = _read_or_placeholder(
        test_path, "Warning: Test file not found. Creating dummy test.", "te1", 100
    )

    print(f"Loaded Train: {train_full.height:,} rows, Test: {test_df.height:,} rows (Floats -> 32bit, Strings -> Categorical)")

    # Time-based split: validation covers ts_index >= split_ts.
    ts_max = train_full["ts_index"].max()
    ts_min = train_full["ts_index"].min()
    split_ts = ts_max - int((ts_max - ts_min) * valid_ratio)

    ts = pl.col("ts_index")
    train_df = train_full.filter(ts < split_ts)
    valid_df = train_full.filter(ts >= split_ts)

    # Drop the unsplit frame before returning to keep peak memory down.
    del train_full
    gc.collect()

    print(f"Internal Validation split at ts_index >= {split_ts}")

    non_feature = {"id", "code", "sub_code", "sub_category", target_col, weight_col, "ts_index", "horizon"}
    feature_cols = [name for name in train_df.columns if name not in non_feature]

    return train_df, valid_df, test_df, feature_cols

# Execute Iter A
# Uses the default paths/columns: target "feature_ch", weight "feature_cg".
train_df, valid_df, test_df, feature_cols = load_and_split_data()

# Baseline Prediction (GPU)
# Reference score for later iterations: predict the training-set mean of the
# target for every validation row and score it with the weighted metric.
y_true_gpu = cpu_to_gpu(valid_df["feature_ch"].to_numpy())
# Missing weights are treated as 1.0.
weights_gpu = cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy())
train_mean = float(np.mean(cpu_to_gpu(train_df["feature_ch"].to_numpy())))

# Constant prediction with the same shape as the validation target.
y_pred_baseline = np.ones_like(y_true_gpu) * train_mean
baseline_score = weighted_rmse_score(y_true_gpu, y_pred_baseline, weights_gpu)
print(f"Baseline (Mean Prediction) Score on Validation: {baseline_score:.4f}")


# Step 4: Temporal Features (Iter B)

In [None]:

# Iteration B: Smart Temporal Feature Engineering (Polars) 
def create_temporal_features_pl(
    df: pl.DataFrame,
    feature_cols: List[str],
    group_cols: "List[str] | None" = None,
    rolling_windows: "List[int] | None" = None,
    max_features: int = 15,
    batch_size: int = 5,
) -> pl.DataFrame:
    """Add per-group lag and rolling-mean features for the leading feature columns.

    For each of the first ``max_features`` entries of ``feature_cols`` this adds
    ``{feat}_lag1``, ``{feat}_lag2`` and, for every window ``w``,
    ``{feat}_rm{w}`` (rolling mean of the 1-step-shifted series, so the
    current row's own value is never included). All new columns are Float32.

    Args:
        df: Input frame; must contain ``ts_index`` and the feature columns.
        feature_cols: Candidate feature names, in priority order.
        group_cols: Columns identifying one series. Defaults to
            ``["code", "sub_code"]``; names absent from ``df`` are ignored.
        rolling_windows: Rolling-mean window sizes. Defaults to ``[7, 30]``.
        max_features: How many leading features to process (memory cap).
        batch_size: Number of features materialised per ``with_columns`` call.

    Returns:
        ``df`` sorted by the existing group columns then ``ts_index``, with
        the new feature columns appended.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # None sentinels instead of mutable list defaults shared across calls.
    if group_cols is None:
        group_cols = ["code", "sub_code"]
    if rolling_windows is None:
        rolling_windows = [7, 30]
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # Cap the number of processed features for RAM safety.
    features_to_process = feature_cols[:max_features]
    print(f"Creating features for {len(features_to_process)} features...")

    # Sort so that shift/rolling operate in time order inside each group.
    group_cols_existing = [c for c in group_cols if c in df.columns]
    df = df.sort(group_cols_existing + ["ts_index"])

    def _per_group(expr):
        # With no grouping column present, treat the whole frame as one series
        # rather than calling .over() with an empty partition list.
        return expr.over(group_cols_existing) if group_cols_existing else expr

    for i in range(0, len(features_to_process), batch_size):
        batch = features_to_process[i:i+batch_size]
        print(f"  Batch {i//batch_size + 1}: {batch}")

        exprs = []
        for feat in batch:
            exprs.append(_per_group(pl.col(feat).shift(1)).alias(f"{feat}_lag1").cast(pl.Float32))
            exprs.append(_per_group(pl.col(feat).shift(2)).alias(f"{feat}_lag2").cast(pl.Float32))
            for w in rolling_windows:
                # NOTE(review): newer Polars releases rename `min_periods` to
                # `min_samples` - confirm against the installed version.
                exprs.append(_per_group(pl.col(feat).shift(1).rolling_mean(w, min_periods=1)).alias(f"{feat}_rm{w}").cast(pl.Float32))

        # Materialise one batch at a time to bound peak memory.
        df = df.with_columns(exprs)
        gc.collect()

    return df

# Start Fresh
gc.collect()
# Tag every row with the split it came from so the three frames can be
# combined for feature engineering and separated again afterwards.
train_df = train_df.with_columns(pl.lit("train").alias("source_set").cast(pl.Categorical))
valid_df = valid_df.with_columns(pl.lit("valid").alias("source_set").cast(pl.Categorical))
test_df = test_df.with_columns(pl.lit("test").alias("source_set").cast(pl.Categorical))

# Combine all splits into one frame so lags and rolling means for
# validation/test rows can look back into earlier rows of the same group.
full_df = pl.concat([train_df, valid_df, test_df], how="diagonal")
# Drop the per-split frames immediately; only the combined frame is kept.
del train_df, valid_df, test_df
gc.collect()

# NOTE: this sorts full_df by the group columns and ts_index, so the split
# frames recovered below follow that order, not the original file order.
full_df = create_temporal_features_pl(full_df, feature_cols)

# Peel one split off at a time and shrink full_df after each step, so the
# combined frame and all three splits are never fully held at once.
print("Splitting Destructively...")
train_df = full_df.filter(pl.col("source_set") == "train")
full_df = full_df.filter(pl.col("source_set") != "train")
gc.collect()

valid_df = full_df.filter(pl.col("source_set") == "valid")
full_df = full_df.filter(pl.col("source_set") != "valid")
gc.collect()

test_df = full_df.filter(pl.col("source_set") == "test")
del full_df
gc.collect()

# Everything except identifiers, target, weight, time/horizon and the split
# tag is a model feature from here on.
current_features = [c for c in train_df.columns if c not in ["id", "code", "sub_code", "sub_category", "feature_ch", "feature_cg", "ts_index", "horizon", "source_set"]]
print(f"Total features after Iter B: {len(current_features)}")


# Iteration C: Weighted LightGBM

In [None]:
# Iteration C: Weighted LightGBM with Memory-Safe CV

def train_lgb_model_cv(df, features, target="feature_ch", weight="feature_cg", n_folds=3):
    """Time-ordered hold-out CV for a weighted LightGBM regressor.

    Fold ``i`` (1-based) trains on rows whose ``ts_index`` lies below the
    value at position ``int(n_unique_ts * (1 - 0.1 * i))`` of the sorted
    unique timestamps and validates on the remaining, later rows. Each later
    fold therefore holds out a larger tail and trains on less data.

    Args:
        df: Polars frame containing ``ts_index``, ``features``, ``target``
            and ``weight``.
        features: Feature column names.
        target: Target column name.
        weight: Sample-weight column name; nulls are treated as 1.0.
        n_folds: Number of folds to attempt. Folds whose split point would
            leave no training data are skipped.

    Returns:
        ``(model, avg_score)``: the booster from the last fold that was
        trained and the mean ``weighted_rmse_score`` over the trained folds.

    Raises:
        ValueError: if no fold could be trained (too few distinct
            ``ts_index`` values for the requested splits).
    """
    clear_gpu_memory()

    # Sort for time-series split
    df = df.sort("ts_index")
    ts_indices = df["ts_index"].unique().sort()

    # Loop-invariant training configuration.
    params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "device": "gpu",
        "verbose": -1,
        "n_jobs": -1
    }

    cv_scores = []
    last_model = None

    for i in range(1, n_folds + 1):
        split_idx = int(len(ts_indices) * (1 - 0.1 * i))
        if split_idx <= 0:
            # Index 0 leaves an empty training fold and a negative index would
            # silently wrap to the end of the series. split_idx only shrinks
            # with i, so no later fold can be valid either.
            break
        split_ts = ts_indices[split_idx]

        train_fold = df.filter(pl.col("ts_index") < split_ts)
        valid_fold = df.filter(pl.col("ts_index") >= split_ts)

        if train_fold.height == 0 or valid_fold.height == 0:
            continue

        print(f"  Fold {i}: Train={train_fold.height}, Valid={valid_fold.height}")

        # Data preparation: plain NumPy arrays are handed to LightGBM, with
        # no intermediate CuPy copies.
        X_tr = train_fold.select(features).fill_null(0).to_numpy()
        y_tr = train_fold[target].to_numpy()
        w_tr = train_fold[weight].fill_null(1.0).to_numpy()

        X_va = valid_fold.select(features).fill_null(0).to_numpy()
        y_va = valid_fold[target].to_numpy()
        w_va = valid_fold[weight].fill_null(1.0).to_numpy()

        train_data = lgb.Dataset(X_tr, label=y_tr, weight=w_tr)
        valid_data = lgb.Dataset(X_va, label=y_va, weight=w_va, reference=train_data)

        model = lgb.train(
            params, train_data, num_boost_round=500,
            valid_sets=[valid_data],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(False)]
        )

        preds = model.predict(X_va)
        score = weighted_rmse_score(cpu_to_gpu(y_va), cpu_to_gpu(preds), cpu_to_gpu(w_va))
        cv_scores.append(score)
        last_model = model

        # Explicitly delete large objects and clear pool
        del X_tr, y_tr, w_tr, X_va, y_va, w_va, train_data, valid_data
        clear_gpu_memory()

    if last_model is None:
        raise ValueError(
            "train_lgb_model_cv: no CV fold could be trained "
            f"({len(ts_indices)} distinct ts_index values, n_folds={n_folds})"
        )

    avg_score = sum(cv_scores) / len(cv_scores)
    return last_model, avg_score

# Train separate models for horizons
horizons = sorted(train_df["horizon"].unique().to_list())
best_models = {}

print("Training Horizon-specific Models with CV (Iter C)...")
for h in horizons:
    t_h = train_df.filter(pl.col("horizon") == h)
    if t_h.height == 0: continue
        
    print(f"Processing Horizon {h}...")
    model, cv_score = train_lgb_model_cv(t_h, current_features)
    best_models[h] = model
    print(f"âœ“ Horizon {h} CV Score: {cv_score:.4f}")

# Generate Predictions
# Each horizon's model scores only the validation rows of that horizon;
# the per-horizon results are then stitched back together by "id".
preds_full = []
for h, model in best_models.items():
    sub_df = valid_df.filter(pl.col("horizon") == h)
    if sub_df.height > 0:
        preds = model.predict(sub_df.select(current_features).fill_null(0).to_numpy())
        temp_df = sub_df.select("id").with_columns(pl.Series(name="pred_iter_c_h", values=preds))
        preds_full.append(temp_df)

if preds_full:
    preds_all = pl.concat(preds_full)
    # Rows whose horizon has no model get a prediction of 0.
    valid_df = valid_df.join(preds_all, on="id", how="left").with_columns(
        pl.col("pred_iter_c_h").fill_null(0).alias("pred_iter_c")
    )
else:
    # No horizon produced predictions. The evaluation below still reads
    # "pred_iter_c", so create it with the same 0 fallback used above.
    valid_df = valid_df.with_columns(pl.lit(0.0).alias("pred_iter_c"))

# Overall evaluation
y_true_gpu = cpu_to_gpu(valid_df["feature_ch"].to_numpy())
pred_gpu = cpu_to_gpu(valid_df["pred_iter_c"].to_numpy())
weights_gpu = cpu_to_gpu(valid_df["feature_cg"].fill_null(1.0).to_numpy())
overall_score_c = weighted_rmse_score(y_true_gpu, pred_gpu, weights_gpu)
print(f"Overall Iteration C Score: {overall_score_c:.4f}")

# Iteration D: PCA

In [None]:

# Iteration D: PCA (GPU-optimized & Memory-Safe)

print("Applying PCA (Iter D)...")
clear_gpu_memory()

# Select numeric features (Updated for new names: _rm, _lag)
# The target ("feature_ch") and the sample weight ("feature_cg") also start
# with "feature_", so they are excluded explicitly. Otherwise the target
# could be projected into the PCA components and leak into the model inputs.
pca_excluded = {"feature_ch", "feature_cg"}
pca_features = [
    c for c in train_df.columns
    if c not in pca_excluded
    and (c.startswith("feature_") or "_rm" in c or "_lag" in c)
]
pca_features = pca_features[:40] # Reduced to 40 for stability

print(f"  Using {len(pca_features)} features for PCA")

# Load to GPU (Batch process if needed, but 40 cols should fit)
def get_scaled_gpu(df, cols, mean=None, std=None):
    """Standardise ``df[cols]`` on the GPU and return ``(X_scaled, mean, std)``.

    Nulls are replaced by 0 before scaling. When ``mean`` is not given, the
    per-column mean and standard deviation are computed from this data, with
    zero deviations replaced by 1.0 to avoid division by zero. Otherwise the
    supplied ``mean``/``std`` are applied unchanged.
    """
    X = cpu_to_gpu(df.select(cols).fill_null(0).to_numpy())
    fit_statistics = mean is None
    if fit_statistics:
        mean = np.mean(X, axis=0, keepdims=True)
        raw_std = np.std(X, axis=0, keepdims=True)
        std = np.where(raw_std == 0, 1.0, raw_std)
    return (X - mean) / std, mean, std

# Fit the scaling statistics on the training split only; they are reused
# for the validation and test splits below.
X_train_scaled, m, s = get_scaled_gpu(train_df, pca_features)

# PCA on CPU (sklearn)
pca = PCA(n_components=8) # Reduced components for RAM
X_train_pca = pca.fit_transform(gpu_to_cpu(X_train_scaled))
del X_train_scaled
gc.collect()

# Validation: same scaling statistics, already-fitted PCA
X_valid_scaled = get_scaled_gpu(valid_df, pca_features, m, s)[0]
X_valid_pca = pca.transform(gpu_to_cpu(X_valid_scaled))
del X_valid_scaled
gc.collect()

# Test
X_test_scaled = get_scaled_gpu(test_df, pca_features, m, s)[0]
X_test_pca = pca.transform(gpu_to_cpu(X_test_scaled))
del X_test_scaled
clear_gpu_memory()

# Add back as Float32
pca_cols = [f"pca_{i}" for i in range(8)]


def _pca_frame(components):
    """Wrap a PCA output matrix as a Float32 Polars frame named by pca_cols."""
    return pl.DataFrame(components, schema=pca_cols).with_columns([pl.all().cast(pl.Float32)])


train_df = pl.concat([train_df, _pca_frame(X_train_pca)], how="horizontal")
valid_df = pl.concat([valid_df, _pca_frame(X_valid_pca)], how="horizontal")
test_df = pl.concat([test_df, _pca_frame(X_test_pca)], how="horizontal")

features_d = current_features + pca_cols
print(f"âœ“ PCA features added. Total candidates: {len(features_d)}")


# Iteration E: Smoothed Target Encoding

In [None]:

# Iteration E: Smoothed Target Encoding (RAM-Safe Hstack)

def create_smoothed_target_encoding_pl(
    df, col, target="feature_ch", smoothing=10
):
    """Add ``{col}_enc``: a smoothed, leakage-free running target mean per category.

    For every row the encoding is

        (sum of earlier targets in the same ``col`` group + smoothing * global_mean)
        / (count of earlier non-null targets in that group + smoothing)

    "Earlier" means preceding rows in the frame's current order, so the frame
    must already be in time order within each group. The result is Float32.

    Args:
        df: Polars frame containing ``col`` and ``target``.
        col: Categorical column to encode. If absent, ``df`` is returned
            unchanged.
        target: Target column name.
        smoothing: Pseudo-count pulling the encoding toward the global mean.

    Returns:
        ``df`` with the extra ``{col}_enc`` column, or ``df`` itself when
        ``col`` is missing.
    """
    if col not in df.columns: return df
    global_mean = df[target].mean()

    # Use shift(1) to prevent leakage: a row never sees its own target.
    prev_target = pl.col(target).shift(1)

    # cum_sum leaves a null wherever its input is null. Filling nulls only
    # after accumulating would reset the running sum to 0 on every row whose
    # previous target is missing (e.g. unlabeled test rows). Filling before
    # accumulating keeps the running sum intact.
    running_sum = prev_target.fill_null(0).cum_sum().over(col)
    # cum_count counts non-null values only, so missing targets do not
    # inflate the denominator.
    running_count = prev_target.cum_count().over(col).fill_null(0)

    return df.with_columns(
        (
            (running_sum + smoothing * global_mean)
            /
            (running_count + smoothing)
        ).alias(f"{col}_enc").cast(pl.Float32)
    )

print("Applying Smoothed Target Encoding (Iter E)...")

# Build one lean frame (only the columns the encoder needs) spanning all
# three splits, stacked in train -> valid -> test order.
cols_to_keep = ["source_set", "feature_ch", "code", "sub_code", "ts_index"]
full_df = pl.concat(
    [frame.select(cols_to_keep) for frame in (train_df, valid_df, test_df)],
    how="diagonal",
)

for col in ("code", "sub_code"):
    full_df = create_smoothed_target_encoding_pl(full_df, col)
    gc.collect()

# Transfer encodings back column-wise. This relies on filter() preserving
# row order, so each slice lines up with the split it was taken from.
print("  Transferring encodings via hstack...")
enc_cols = ["code_enc", "sub_code_enc"]


def _encodings_for(split_name):
    """Return the encoding columns for the rows tagged with *split_name*."""
    return full_df.filter(pl.col("source_set")==split_name).select(enc_cols)


train_df = train_df.hstack(_encodings_for("train"))
valid_df = valid_df.hstack(_encodings_for("valid"))
test_df = test_df.hstack(_encodings_for("test"))

del full_df
gc.collect()

features_e = features_d + enc_cols
print(f"âœ“ Target encoding complete. Total candidates: {len(features_e)}")


# Step 8: Feature Selection (Iter F)

In [None]:
# Iteration F: Advanced Feature Selection (Model-Based)

print("Applying Non-Linear Feature Selection (Iter F)...")

# 1. Create Interactions
# Iter B names its temporal columns "{feat}_lag1" and "{feat}_rm{w}". The
# previous filter looked for "_lag_1" / "_roll_mean_7", matched nothing, and
# so never created an interaction. Match the names that actually exist.
top_feats = [c for c in features_e if c.endswith(("_lag1", "_rm7"))][:10]
new_cols = [
    (pl.col(feat) * pl.col("horizon")).alias(f"{feat}_x_horizon")
    for feat in top_feats
]

train_df = train_df.with_columns(new_cols)
valid_df = valid_df.with_columns(new_cols)
test_df = test_df.with_columns(new_cols)

all_candidates_f = features_e + [f"{feat}_x_horizon" for feat in top_feats]
# Keep only candidates that really exist in the training frame.
all_candidates_f = [c for c in all_candidates_f if c in train_df.columns]

# 2. Use LightGBM Importance for Selection
X_sel_np = train_df.select(all_candidates_f).fill_null(0).to_numpy()
y_sel_np = train_df["feature_ch"].to_numpy()

sel_model = lgb.LGBMRegressor(n_estimators=100, device="gpu", random_state=42, verbose=-1)
sel_model.fit(X_sel_np, y_sel_np)

# Rank candidates by the fitted model's importance and keep the top 150.
importance_df = pl.DataFrame({
    "feature": all_candidates_f,
    "importance": sel_model.feature_importances_
}).sort("importance", descending=True)

selected_features_f = importance_df.head(150)["feature"].to_list()
print(f"âœ“ Selected top {len(selected_features_f)} features")


# Iteration G: Ensemble

In [None]:
# Iteration G: The "Silver Bullet" Ensemble (Final Submission on test.parquet)

import xgboost as xgb
from catboost import CatBoostRegressor

print("Training Integrated GPU Ensemble & Generating Final Submission...")
clear_gpu_memory()


# Simple weighted blend
def train_and_predict(X_tr, y_tr, w_tr, X_val, X_te):
    """Fit LightGBM, XGBoost and CatBoost, then blend their predictions.

    Each model is trained with sample weights and scores both the validation
    and the test matrix. The blend is 0.4 * LGBM + 0.4 * XGB + 0.2 * CatBoost.

    Returns:
        ``(validation_predictions, test_predictions)``.
    """
    # Factories are called one at a time so each model is built only when
    # it is about to be fitted (LGBM, then XGB, then CatBoost).
    model_factories = (
        lambda: lgb.LGBMRegressor(n_estimators=500, device="gpu", random_state=42, verbose=-1),
        lambda: xgb.XGBRegressor(n_estimators=500, tree_method="hist", device="cuda", random_state=42),
        lambda: CatBoostRegressor(n_estimators=500, task_type="GPU", random_state=42, verbose=0),
    )

    val_preds, test_preds = [], []
    for make_model in model_factories:
        estimator = make_model()
        estimator.fit(X_tr, y_tr, sample_weight=w_tr)
        val_preds.append(estimator.predict(X_val))
        test_preds.append(estimator.predict(X_te))

    clear_gpu_memory()
    lgb_v, xgb_v, cat_v = val_preds
    lgb_t, xgb_t, cat_t = test_preds
    v_res = (0.4 * lgb_v + 0.4 * xgb_v + 0.2 * cat_v)
    t_res = (0.4 * lgb_t + 0.4 * xgb_t + 0.2 * cat_t)
    return v_res, t_res


def _design_matrix(frame):
    """Selected features of *frame* as a NumPy matrix with nulls set to 0."""
    return frame.select(selected_features_f).fill_null(0).to_numpy()


horizons = sorted(train_df["horizon"].unique().to_list())
preds_test = []
preds_valid = []

# One ensemble per forecast horizon, each scoring only its own rows.
for h in horizons:
    is_h = pl.col("horizon") == h
    t_h = train_df.filter(is_h)
    v_h = valid_df.filter(is_h)
    te_h = test_df.filter(is_h)

    if t_h.height == 0: continue

    print(f"Processing Horizon {h}...")

    X_train = _design_matrix(t_h)
    y_train = t_h["feature_ch"].to_numpy()
    w_train = t_h["feature_cg"].fill_null(1.0).to_numpy()

    X_valid = _design_matrix(v_h)
    X_test = _design_matrix(te_h)

    p_val, p_test = train_and_predict(X_train, y_train, w_train, X_valid, X_test)

    preds_valid.append(v_h.select("id").with_columns(pl.Series("prediction", p_val)))
    preds_test.append(te_h.select("id").with_columns(pl.Series("prediction", p_test)))

# Save Submission
if preds_test:
    sub = pl.concat(preds_test)
    sub.write_csv("submission_final_polars.csv")
    print(f"âœ“ Saved submission with {sub.height} rows")

# Validation Score
if preds_valid:
    val_res = pl.concat(preds_valid)
    # Inner join on "id": only validation rows that received a prediction
    # take part in the score.
    val_merged = valid_df.join(val_res, on="id")
    y_true = cpu_to_gpu(val_merged["feature_ch"].to_numpy())
    y_pred = cpu_to_gpu(val_merged["prediction"].to_numpy())
    weights = cpu_to_gpu(val_merged["feature_cg"].fill_null(1.0).to_numpy())
    score = weighted_rmse_score(y_true, y_pred, weights)
    print(f"ðŸš€ Final Validation Skill Score: {score:.4f}")
