In [7]:
# Cell 1 — Imports & Config

import os, gc, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection      import KFold
from sklearn.preprocessing       import RobustScaler
from sklearn.linear_model        import (
    HuberRegressor, RANSACRegressor, TheilSenRegressor,
    Lasso, ElasticNet, Ridge
)
from sklearn.cross_decomposition import PLSRegression

from xgboost                     import XGBRegressor
from lightgbm                    import LGBMRegressor
from scipy.stats                 import pearsonr

warnings.filterwarnings("ignore")

class Config:
    """Central configuration for the notebook: paths, feature lists and learners.

    All values are class attributes, read as ``Config.X`` by the other cells.
    ``load_data`` may reassign ``RAW_FEATS`` and ``ALL_FEATURES`` at runtime
    when it drops columns that are entirely infinite.
    """

    # Paths
    TRAIN_PATH   = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH    = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUB_IN       = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
    SUB_OUT_DIR  = "."

    # Raw columns read from the parquet files
    RAW_FEATS    = [
        "X863","X856","X598","X862","X385","X852","X603","X860","X674",
        "X415","X345","X855","X174","X302","X178","X168","X612",
        "buy_qty","sell_qty","volume","X888","X421","X333",
        "bid_qty","ask_qty"
    ]

    # Microstructure features (computed in feature_engineering)
    MICRO_FEATS  = [
        "volume_weighted_sell", "buy_sell_ratio",
        "selling_pressure",     "effective_spread_proxy"
    ]
    # Extra "robust" features (computed in feature_engineering)
    ROBUST_FEATS = [
        "log_volume", "bid_ask_imbalance",
        "order_flow_imbalance", "liquidity_ratio"
    ]

    # Full list used for modeling
    ALL_FEATURES = RAW_FEATS + MICRO_FEATS + ROBUST_FEATS

    LABEL_COL    = "label"
    N_FOLDS      = 3
    RANDOM_STATE = 42

    # Base of the time-decay weights: before normalisation the oldest row
    # gets weight DECAY_FACTOR and the newest row gets weight 1.
    DECAY_FACTOR = 0.9

    # PLS cannot extract more components than there are features; the former
    # hard-coded 50 exceeded the 33 modelling features, so every PLS fit was
    # rejected by scikit-learn. Cap the requested value at the feature count.
    PLS_COMPONENTS = min(50, len(ALL_FEATURES))

    # XGBoost settings
    # NOTE(review): tree_method is "hist" and no "device" key is set; whether
    # training actually runs on the GPU depends on the installed xgboost
    # version ("predictor"/"gpu_id" are legacy keys) -- confirm.
    XGB_PARAMS = {
        "tree_method":      "hist",
        "predictor":        "gpu_predictor",
        "gpu_id":           0,
        "colsample_bylevel":0.4778,
        "colsample_bynode": 0.3628,
        "colsample_bytree": 0.7107,
        "gamma":            1.7095,
        "learning_rate":    0.02213,
        "max_depth":        20,
        "max_leaves":       12,
        "min_child_weight": 16,
        "n_estimators":     1667,
        "subsample":        0.06567,
        "reg_alpha":        39.3524,
        "reg_lambda":       75.4484,
        "verbosity":        0,
        "random_state":     RANDOM_STATE,
        "n_jobs":           -1
    }

    # LightGBM GPU settings
    LGBM_PARAMS = {
        "n_estimators":      500,
        "learning_rate":     0.03,
        "num_leaves":        31,
        "min_child_samples": 50,
        "subsample":         0.8,
        "colsample_bytree":  0.8,
        "reg_alpha":         10,
        "reg_lambda":        10,
        "device":            "gpu",
        "verbosity":         -1,
        "random_state":      RANDOM_STATE,
        "n_jobs":            -1
    }

    # All learners including a Ridge baseline. Each entry carries:
    #   name       - key used in the prediction dictionaries and CSV names
    #   Estimator  - class instantiated as Estimator(**params)
    #   params     - constructor keyword arguments
    #   need_scale - whether inputs go through RobustScaler before fitting
    LEARNERS = [
        {"name":"xgb",     "Estimator":XGBRegressor,   "params":XGB_PARAMS,  "need_scale":False},
        {"name":"lgbm",    "Estimator":LGBMRegressor,  "params":LGBM_PARAMS, "need_scale":False},
        {"name":"ridge",   "Estimator":Ridge,          "params":{},          "need_scale":True},
        {"name":"huber",   "Estimator":HuberRegressor, "params":{"epsilon":1.5,"alpha":0.01,"max_iter":500},    "need_scale":True},
        {"name":"ransac",  "Estimator":RANSACRegressor,"params":{"min_samples":0.7,"max_trials":100,"random_state":RANDOM_STATE},"need_scale":True},
        {"name":"theilsen","Estimator":TheilSenRegressor,"params":{"max_subpopulation":10000,"random_state":RANDOM_STATE},"need_scale":True},
        {"name":"lasso",   "Estimator":Lasso,           "params":{"alpha":0.001,"max_iter":1000},                "need_scale":True},
        {"name":"elastic","Estimator":ElasticNet,       "params":{"alpha":0.001,"l1_ratio":0.5,"max_iter":1000}, "need_scale":True},
        {"name":"pls",     "Estimator":PLSRegression,   "params":{"n_components":PLS_COMPONENTS},               "need_scale":True},
    ]


In [8]:
# Cell 2 — Feature Engineering & Helpers

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Add the derived microstructure/robust columns and impute missing values.

    The frame is modified in place and also returned. It must contain the
    columns ``buy_qty``, ``sell_qty``, ``volume``, ``bid_qty`` and ``ask_qty``.

    After the derived columns are added, every +/-inf in the frame is turned
    into NaN and each column containing NaN is filled with that column's own
    median (or 0 when the median itself is NaN, i.e. the column is all-NaN).
    The imputation covers every column of ``df``, not only the new ones.
    """
    eps = 1e-8  # keeps the denominators away from an exact zero

    buy, sell = df["buy_qty"], df["sell_qty"]
    bid, ask = df["bid_qty"], df["ask_qty"]
    vol = df["volume"]

    # Microstructure
    df["volume_weighted_sell"]   = sell * vol
    df["buy_sell_ratio"]         = buy / (sell + eps)
    df["selling_pressure"]       = sell / (vol + eps)
    df["effective_spread_proxy"] = np.abs(buy - sell) / (vol + eps)

    # Robust transforms
    df["log_volume"]             = np.log1p(vol)
    df["bid_ask_imbalance"]      = (bid - ask) / (bid + ask + eps)
    df["order_flow_imbalance"]   = (buy - sell) / (buy + sell + eps)
    df["liquidity_ratio"]        = (bid + ask) / (vol + eps)

    # Replace infs with NaN, then median-impute column by column.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in df.columns:
        if df[col].isna().any():
            med = df[col].median()
            # Assign the filled column back: `df[col].fillna(..., inplace=True)`
            # is chained assignment and leaves `df` untouched under pandas
            # Copy-on-Write.
            df[col] = df[col].fillna(0 if pd.isna(med) else med)

    return df

def create_time_decay_weights(n: int, decay: float=Config.DECAY_FACTOR) -> np.ndarray:
    """Return ``n`` sample weights that grow from the oldest to the newest row.

    Row ``i`` gets the raw weight ``decay ** (1 - i / (n - 1))``, so the first
    row is weighted ``decay`` and the last row ``1``. The weights are then
    rescaled so that they sum to ``n`` (mean weight 1).

    Parameters
    ----------
    n : number of rows, assumed to be in chronological order.
    decay : base of the exponential; defaults to ``Config.DECAY_FACTOR``.

    Returns
    -------
    1-D float array of length ``n`` (empty when ``n <= 0``).
    """
    if n <= 0:
        return np.zeros(0, dtype=float)
    if n == 1:
        # A single row has no position in time; the formula would compute 0/0.
        return np.ones(1, dtype=float)

    position = np.arange(n) / (n - 1)      # 0.0 for the oldest row, 1.0 for the newest
    raw = decay ** (1.0 - position)
    return raw * n / raw.sum()

def get_model_slices(n_samples: int):
    """Describe the three training windows used by the ensemble.

    Each window is a dict with a ``name`` and a ``cutoff``: the index of the
    first row that belongs to the window. ``full_data`` starts at row 0, the
    other two skip the first 25% and 50% of ``n_samples`` (truncated to int).
    """
    # (window name, fraction of the leading rows that is skipped)
    skipped_fraction = (
        ("full_data", None),
        ("last_75pct", 0.25),
        ("last_50pct", 0.50),
    )
    windows = []
    for label, fraction in skipped_fraction:
        start = 0 if fraction is None else int(fraction * n_samples)
        windows.append({"name": label, "cutoff": start})
    return windows

def train_single_model(X_tr, y_tr, X_val, y_val, X_test, learner, sw=None):
    """Fit one learner and return its validation and test predictions.

    Parameters
    ----------
    X_tr, y_tr : training features and targets.
    X_val, X_test : feature sets to predict on.
    y_val : accepted for call-site symmetry; not used here.
    learner : dict with the keys ``"Estimator"`` (class), ``"params"``
        (constructor kwargs) and ``"need_scale"`` (bool).
    sw : optional per-row sample weights aligned with ``X_tr``. They are
        forwarded only when the estimator's ``fit`` declares a
        ``sample_weight`` parameter.

    Returns
    -------
    (val_predictions, test_predictions) as returned by ``model.predict``.
    """
    import inspect  # function-scope import keeps this helper self-contained

    # Scale if needed; the scaler is fitted on the training rows only.
    if learner["need_scale"]:
        scaler = RobustScaler()
        X_tr   = scaler.fit_transform(X_tr)
        X_val  = scaler.transform(X_val)
        X_test = scaler.transform(X_test)

    model = learner["Estimator"](**learner["params"])

    # Decide from the real signature whether `fit` takes sample weights.
    # `fit.__code__.co_varnames` is unreliable: for a decorated `fit` it lists
    # the wrapper's variables, and it also contains plain local variables.
    # `inspect.signature` follows `functools.wraps` and reports parameters only.
    fit_kwargs = {}
    if sw is not None:
        try:
            fit_parameters = inspect.signature(model.fit).parameters
        except (TypeError, ValueError):
            fit_parameters = {}
        if "sample_weight" in fit_parameters:
            fit_kwargs["sample_weight"] = sw

    model.fit(X_tr, y_tr, **fit_kwargs)
    return model.predict(X_val), model.predict(X_test)


In [9]:
# Cell 3 — Load Data (auto-detect paths), Drop Inf, Clip Labels

import os, glob

def load_data():
    """Locate the dataset, read it, engineer features and split off the label.

    The dataset directory is searched under ``/kaggle/input`` with a few glob
    patterns; the first match (in sorted order) that actually contains
    ``train.parquet`` wins. If none matches, the explicit paths in ``Config``
    (``TRAIN_PATH``, ``TEST_PATH``, ``SUB_IN``) are used when they exist.

    Side effects: ``Config.RAW_FEATS`` and ``Config.ALL_FEATURES`` are
    reassigned when raw columns that are entirely +/-inf in train are dropped.

    Returns
    -------
    (train_features, y, test_features, submission_template), where ``y`` is the
    label clipped to [-10, 10] as a NumPy array and both feature frames have a
    fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If neither the auto-detected directory nor the ``Config`` paths exist.
    """
    # 0) Auto-detect the dataset directory under /kaggle/input. glob returns
    #    matches in arbitrary order, so sort them and insist on the train file
    #    being present instead of blindly taking the first hit.
    patterns = (
        "/kaggle/input/*drw*crypto*",
        "/kaggle/input/*drw-crypto*",
        "/kaggle/input/*crypto*",
    )
    candidates = [path for pattern in patterns for path in sorted(glob.glob(pattern))]
    base = next(
        (d for d in candidates if os.path.isfile(os.path.join(d, "train.parquet"))),
        None,
    )

    # 1) Build the file paths (fall back to the configured ones)
    if base is not None:
        train_path = os.path.join(base, "train.parquet")
        test_path  = os.path.join(base, "test.parquet")
        sub_path   = os.path.join(base, "sample_submission.csv")
    elif os.path.isfile(Config.TRAIN_PATH):
        train_path, test_path, sub_path = Config.TRAIN_PATH, Config.TEST_PATH, Config.SUB_IN
    else:
        raise FileNotFoundError("Could not find the DRW crypto dataset under /kaggle/input")

    # 2) Read only the RAW_FEATS + label
    train = pd.read_parquet(train_path, columns=Config.RAW_FEATS + [Config.LABEL_COL])
    test  = pd.read_parquet(test_path,  columns=Config.RAW_FEATS)
    sub   = pd.read_csv(sub_path)

    # 3) Drop any RAW_FEATS that are entirely infinite in train
    all_inf  = train.isin([np.inf, -np.inf]).all()
    drop_inf = [c for c in Config.RAW_FEATS if all_inf.get(c, False)]
    if drop_inf:
        train.drop(columns=drop_inf, inplace=True)
        test.drop(columns=drop_inf,  inplace=True)
        Config.RAW_FEATS = [c for c in Config.RAW_FEATS if c not in drop_inf]

    # 4) Feature engineering (adds MICRO_FEATS + ROBUST_FEATS).
    #    NOTE(review): the label column is still part of `train` here, so any
    #    inf/NaN label is median-imputed together with the features, and the
    #    test frame is imputed with its own medians -- confirm both are intended.
    train = feature_engineering(train)
    test  = feature_engineering(test)

    # 5) Prepare label (clip extremes) and remove it from train
    y = train[Config.LABEL_COL].clip(-10, 10).values
    train.drop(columns=[Config.LABEL_COL], inplace=True)

    # 6) Finalize the feature list
    Config.ALL_FEATURES = Config.RAW_FEATS + Config.MICRO_FEATS + Config.ROBUST_FEATS

    print(f"Train shape: {train.shape}, Test shape: {test.shape}, Submission shape: {sub.shape}")
    return train.reset_index(drop=True), y, test.reset_index(drop=True), sub

# Load everything once: engineered train/test feature frames, the clipped
# label array `y`, and the sample-submission template.
train_df, y, test_df, submission_df = load_data()


FileNotFoundError: Could not find the DRW crypto dataset under /kaggle/input

In [None]:
# Cell 4 — Train & OOF / Test Predictions

def train_and_evaluate(train_df, y, test_df):
    """Fit every learner on every data slice inside an unshuffled K-fold loop.

    For each fold and each slice from ``get_model_slices`` the training rows
    are the fold's training indices at or after the slice cutoff, weighted by
    ``create_time_decay_weights`` computed over the slice. Each learner in
    ``Config.LEARNERS`` is fitted through ``train_single_model``; a learner
    that raises is reported and skipped (best effort).

    Out-of-fold predictions are stored for validation rows inside the slice;
    validation rows before the cutoff reuse the ``full_data`` prediction of
    the same learner. Test predictions are averaged over the fits that
    actually succeeded.

    Parameters
    ----------
    train_df, test_df : frames containing ``Config.ALL_FEATURES``.
    y : NumPy array of targets aligned with ``train_df`` rows.

    Returns
    -------
    (oof_preds, test_preds, slices) where both prediction objects are
    ``{learner_name: {slice_name: 1-D array}}``.
    """
    n       = len(train_df)
    slices  = get_model_slices(n)
    feats   = Config.ALL_FEATURES

    learner_names = [lr["name"] for lr in Config.LEARNERS]
    slice_names   = [sl["name"] for sl in slices]

    # storage
    oof_preds  = {ln: {sn: np.zeros(n) for sn in slice_names} for ln in learner_names}
    test_preds = {ln: {sn: np.zeros(len(test_df)) for sn in slice_names} for ln in learner_names}
    # number of successful fits per (learner, slice); used to average test preds
    fit_counts = {ln: {sn: 0 for sn in slice_names} for ln in learner_names}

    # Fold-invariant inputs, computed once.
    X_test        = test_df[feats]
    slice_weights = {sl["name"]: create_time_decay_weights(n - sl["cutoff"]) for sl in slices}
    kf            = KFold(n_splits=Config.N_FOLDS, shuffle=False)

    for fold, (tr_idx, val_idx) in enumerate(kf.split(train_df), 1):
        print(f"\n=== Fold {fold}/{Config.N_FOLDS} ===")
        X_val = train_df.iloc[val_idx][feats]
        y_val = y[val_idx]

        for sl in slices:
            name, cut = sl["name"], sl["cutoff"]
            tr_sel = tr_idx[tr_idx >= cut]          # training rows inside the slice
            if tr_sel.size == 0:
                continue

            X_tr = train_df.iloc[tr_sel][feats]
            y_tr = y[tr_sel]
            sw   = slice_weights[name][tr_sel - cut]  # weights are indexed relative to the cutoff

            print(f"  Slice {name}: train size={len(X_tr)}")

            in_slice = val_idx >= cut               # validation rows covered by this slice

            for learner in Config.LEARNERS:
                lname = learner["name"]
                try:
                    val_p, test_p = train_single_model(
                        X_tr, y_tr, X_val, y_val, X_test, learner, sw
                    )
                    # Some estimators return (n, 1) predictions; flatten so the
                    # assignments below cannot fail half-way through.
                    val_p  = np.ravel(val_p)
                    test_p = np.ravel(test_p)

                    # OOF
                    if in_slice.any():
                        oof_preds[lname][name][val_idx[in_slice]] = val_p[in_slice]
                    # carry-forward older rows from the full-data model
                    if cut > 0 and (~in_slice).any():
                        older = val_idx[~in_slice]
                        oof_preds[lname][name][older] = oof_preds[lname]["full_data"][older]

                    # test accumulation
                    test_preds[lname][name] += test_p
                    fit_counts[lname][name] += 1

                except Exception as e:
                    print(f"    ! {learner['name']} failed: {e}")

        gc.collect()

    # Average test preds over the fits that succeeded. Dividing by N_FOLDS
    # would shrink the prediction whenever a fit failed or a slice was skipped.
    for lname in test_preds:
        for sname in test_preds[lname]:
            n_fits = fit_counts[lname][sname]
            if n_fits:
                test_preds[lname][sname] /= n_fits

    return oof_preds, test_preds, slices

# Run the full fold x slice x learner training loop and keep its outputs as
# notebook-level variables for the ensembling cell.
oof_preds, test_preds, model_slices = train_and_evaluate(train_df, y, test_df)


In [None]:
# Cell 5 — Ensemble & Submissions

def create_submissions(oof_preds, test_preds, submission_template, y_true=None):
    """Score each learner, build the ensembles and write the submission CSVs.

    For every learner the slice predictions are averaged and the Pearson
    correlation between that OOF average and the labels is computed. Learners
    with a NaN or non-positive correlation are dropped. One CSV is written per
    kept learner, plus a correlation-weighted ensemble
    (``submission_weighted.csv``) and an unweighted one
    (``submission_simple.csv``). Files go to ``Config.SUB_OUT_DIR``.

    Parameters
    ----------
    oof_preds, test_preds : ``{learner: {slice: 1-D array}}`` dictionaries.
    submission_template : DataFrame copied for each file; its ``prediction``
        column is overwritten.
    y_true : labels aligned with the OOF arrays. When omitted, the
        notebook-level variable ``y`` is used.

    Returns
    -------
    dict mapping submission name to its OOF Pearson correlation.

    Raises
    ------
    ValueError
        If no learner has a positive OOF correlation.
    """
    if y_true is None:
        y_true = y  # fall back to the notebook-level labels

    out_dir = Config.SUB_OUT_DIR
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    def avg(pred_dict):
        # element-wise mean over the dictionary's arrays
        return np.mean(list(pred_dict.values()), axis=0)

    def write_csv(filename, predictions):
        # copy the template so the caller's frame is never modified
        frame = submission_template.copy()
        frame["prediction"] = predictions
        frame.to_csv(os.path.join(out_dir, filename), index=False)

    subs = {}

    # 1) per-learner scores; keep only positively correlated learners
    all_oof, all_test, scores = {}, {}, {}
    for name in oof_preds:
        o = avg(oof_preds[name])
        t = avg(test_preds[name])
        r = pearsonr(y_true, o)[0]
        if not np.isnan(r) and r > 0:
            scores[name] = r
            all_oof[name] = o
            all_test[name] = t
            print(f"{name:12s} OOF r = {r:.4f}")

    if not scores:
        # Without this guard the weighted ensemble below degenerates to the
        # scalar 0 and fails inside pearsonr with an unrelated-looking error.
        raise ValueError("No learner achieved a positive OOF correlation; nothing to ensemble")
    total = sum(scores.values())

    # save individual
    for name in all_test:
        write_csv(f"{name}.csv", all_test[name])
        subs[name] = scores[name]

    # weighted full ensemble (weights proportional to OOF correlation)
    weights = {n: sc / total for n, sc in scores.items()}
    w_oof = sum(weights[n] * all_oof[n] for n in weights)
    w_tst = sum(weights[n] * all_test[n] for n in weights)
    w_r   = pearsonr(y_true, w_oof)[0]
    print(f"weighted_ensemble OOF r = {w_r:.4f}, weights={weights}")

    write_csv("submission_weighted.csv", w_tst)
    subs["weighted_ensemble"] = w_r

    # simple full ensemble (plain mean of the kept learners)
    write_csv("submission_simple.csv", avg(all_test))
    subs["simple_ensemble"] = pearsonr(y_true, avg(all_oof))[0]

    return subs

# Reuse the submission template returned by load_data() (`submission_df`).
# Re-reading it from the hard-coded Config.SUB_IN could fail after training
# has finished whenever the dataset was found at an auto-detected path.
submission_scores = create_submissions(oof_preds, test_preds, submission_df)
print("\nDone, scores:", submission_scores)


In [None]:
# Cell 6 — DONE
# Final status line: reports the configured output directory (Config.SUB_OUT_DIR).
print("All submission CSVs written to", Config.SUB_OUT_DIR) 
