# In [None]:
import os
import numpy as np
import pandas as pd
import polars as pl
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings('ignore')

# ---- import your metric ----
from kaggle_evaluation import score as competition_score
try:
    from kaggle_evaluation import ParticipantVisibleError
except ImportError:
    # Only a failed import should trigger the fallback; any other error is a
    # real problem and must surface instead of being swallowed.
    class ParticipantVisibleError(Exception):
        """Stand-in used when kaggle_evaluation does not export the class.

        NOTE(review): when this fallback is active, exceptions raised by the
        real metric under a different class will NOT be caught by
        ``except ParticipantVisibleError`` -- confirm where the metric defines it.
        """



def to_allocation(preds: np.ndarray, k: float) -> np.ndarray:
    """
    Map raw model outputs to allocations in [0, 2] with a scaled sigmoid.

    Computes ``2 * sigmoid(k * preds)``, evaluated as ``1 + tanh(k * preds / 2)``.
    The two forms are algebraically identical, but tanh saturates for large
    magnitudes whereas ``np.exp(-preds * k)`` overflows (RuntimeWarning) when
    ``preds * k`` is a large negative number.

    Args:
        preds: Raw predictions (array-like).
        k: Sigmoid slope; tuned on validation.

    Returns:
        Array of the same shape as ``preds`` with values in [0, 2].
        NaN inputs stay NaN.
    """
    scaled = np.asarray(preds) * k
    # The clip is a safety net against rounding; tanh already lies in [-1, 1].
    return np.clip(1.0 + np.tanh(0.5 * scaled), 0.0, 2.0)


def eval_with_metric(solution_pl: pl.DataFrame, allocations: np.ndarray) -> float:
    """
    Score a vector of allocations with the competition metric.

    The metric is called as
      score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str)
    with 'date_id' passed as the row-id column name.

    Args:
        solution_pl: Frame holding 'date_id', 'forward_returns' and 'risk_free_rate'.
        allocations: One allocation per row of ``solution_pl``.

    Returns:
        The metric value as a float, or -1e9 when the metric raises
        ParticipantVisibleError.
    """
    metric_cols = ["date_id", "forward_returns", "risk_free_rate"]
    solution = solution_pl.select(metric_cols).to_pandas().copy()

    # Per the original author's note, 'date_id' is only there to satisfy the
    # metric's signature.
    submission = pd.DataFrame({
        "date_id": solution["date_id"].values,
        "prediction": allocations,
    })

    try:
        result = competition_score(solution, submission, row_id_column_name="date_id")
        return float(result)
    except ParticipantVisibleError:
        # Metric guards (e.g., bounds, zero std): treat as a very poor score so
        # that tuning simply moves on to the next candidate.
        return -1e9


def tune_k_on_val(val_df: pl.DataFrame, raw_preds: np.ndarray) -> tuple[float, float]:
    """
    Pick the sigmoid slope k that maximizes the metric on a validation frame.

    Every value of a fixed grid is tried in order; the first one reaching the
    highest score wins. If no candidate scores above -1e9, the first grid value
    is returned together with -1e9.

    Returns:
        (best_k, best_score)
    """
    candidates = (5, 10, 20, 40, 80, 120)
    winner = candidates[0]
    top_score = -1e9
    for slope in candidates:
        current = eval_with_metric(val_df, to_allocation(raw_preds, slope))
        if current > top_score:
            winner, top_score = slope, current
    return winner, top_score




# Largest lag the history buffer is sized for; create_features() currently
# hard-codes lags 1, 3 and 5, so this must stay >= the largest of those.
MAX_LAG = 5
# Window lengths (in rows) of the rolling mean/std features.
ROLL_WINDOWS = [10, 30]
MAX_ROLL = max(ROLL_WINDOWS)
# Number of trailing raw rows kept so the newest row's features can be
# recomputed at inference time.
FE_HISTORY = MAX_ROLL + MAX_LAG + 5  # cushion

# Columns that are neither engineered nor used as model inputs.
SPECIAL_COLS = {"date_id", "forward_returns", "risk_free_rate", "target", "is_scored"}

def create_features(df: pl.DataFrame) -> pl.DataFrame:
    """
    Add past-only lag and rolling features for every non-special column.

    For each column not listed in SPECIAL_COLS the following are appended:
      - Lags 1, 3, 5             -> '<col>_lag_<n>'
      - Rolling mean/std over each window in ROLL_WINDOWS, shifted by one row
        so the current value is never used
                                 -> '<col>_roll_mean_<w>', '<col>_roll_std_<w>'

    Rows are NOT dropped; the resulting nulls are left for XGBoost to handle.

    All expressions are collected first and applied in a single
    ``with_columns`` call instead of one call per lag/window, which avoids
    rebuilding the frame hundreds of times. Column order matches the
    sequential version (per feature: lags, then mean/std per window).

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new frame with the original columns followed by the engineered ones.
    """
    features_to_engineer = [c for c in df.columns if c not in SPECIAL_COLS]

    exprs = []
    for feature in features_to_engineer:
        col = pl.col(feature)
        # lags
        exprs.extend(
            col.shift(lag).alias(f'{feature}_lag_{lag}') for lag in (1, 3, 5)
        )
        # rolling (shifted to avoid leakage)
        for w in ROLL_WINDOWS:
            exprs.append(col.rolling_mean(w).shift(1).alias(f'{feature}_roll_mean_{w}'))
            exprs.append(col.rolling_std(w).shift(1).alias(f'{feature}_roll_std_{w}'))

    if not exprs:
        # Nothing to engineer: still hand back a separate frame object.
        return df.clone()
    return df.with_columns(exprs)




print("Loading training and test data...")
train_path = "train.csv"
test_path  = "test.csv"

full_train_df, test_df = pl.read_csv(train_path), pl.read_csv(test_path)

# The model predicts market_forward_excess_returns; it is called "target" from here on.
full_train_df = full_train_df.rename({"market_forward_excess_returns": "target"})

# Everything except date_id becomes Float64; strict=False turns values that
# cannot be cast into nulls instead of raising.
num_cols = [name for name in full_train_df.columns if name != "date_id"]
full_train_df = full_train_df.with_columns(
    pl.col(num_cols).cast(pl.Float64, strict=False)
)

print("Engineering features on training set...")
processed_df = create_features(full_train_df)

# Model inputs: every column that is neither special nor date_id.
_NON_FEATURE_COLS = SPECIAL_COLS | {"date_id"}
FEATURES = [name for name in processed_df.columns if name not in _NON_FEATURE_COLS]
TARGET_COL = "target"



# Validation uses the last 252 rows by default.
# With USE_MULTI_BLOCK the last 3 blocks (3 x 252) are evaluated instead,
# to reduce overfitting to a single regime.
VALID_BLOCK = 252
USE_MULTI_BLOCK = True  # set False to tune on last block only

def get_blocks(df_proc: pl.DataFrame):
    """
    Build expanding-window train/validation folds as row-index ranges.

    Each fold is a dict with half-open ``(start, end)`` tuples:
      - "train_slice": always starts at row 0 and ends where validation starts
      - "val_slice":   VALID_BLOCK consecutive rows

    When USE_MULTI_BLOCK is set and the frame has at least
    3 * VALID_BLOCK + 1000 rows, three consecutive folds covering the last
    3 * VALID_BLOCK rows are returned, oldest first. Otherwise a single fold
    over the last VALID_BLOCK rows is returned.
    """
    n_rows = df_proc.height

    def _fold(val_start, val_end):
        # Train on everything before the validation block.
        return {
            "train_slice": (0, val_start),
            "val_slice": (val_start, val_end),
        }

    enough_rows = n_rows >= 3 * VALID_BLOCK + 1000
    if not (USE_MULTI_BLOCK and enough_rows):
        return [_fold(n_rows - VALID_BLOCK, n_rows)]

    # Validation block ends, from the oldest block to the most recent one.
    block_ends = [n_rows - back * VALID_BLOCK for back in (2, 1, 0)]
    return [_fold(end - VALID_BLOCK, end) for end in block_ends]

# Validation folds for the engineered training frame; computed once here and
# iterated by objective() on every trial.
VAL_BLOCKS = get_blocks(processed_df)

def slice_np(df_proc: pl.DataFrame, feature_cols, target_col, start, end):
    """
    Extract rows ``[start, end)`` as NumPy arrays.

    Returns:
        (X, y) where X holds ``feature_cols`` as a 2-D array and y holds
        ``target_col`` flattened to 1-D.
    """
    window = df_proc.slice(start, end - start)
    features = window.select(feature_cols).to_numpy()
    target = window.select(target_col).to_numpy().ravel()
    return features, target



def trial_to_params(trial: optuna.Trial):
    """
    Translate an Optuna trial into XGBRegressor keyword arguments.

    Fixed settings come first, followed by the tuned hyperparameters in the
    order they are suggested.
    """
    params = {
        "objective": 'reg:squarederror',
        "tree_method": 'hist',
        "n_jobs": -1,
        "random_state": 42,
    }

    # search space (tight to avoid overfit to one year)
    params["n_estimators"] = trial.suggest_int("n_estimators", 400, 900)
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.02, 0.15, log=True)
    params["max_depth"] = trial.suggest_int("max_depth", 3, 8)
    params["subsample"] = trial.suggest_float("subsample", 0.6, 1.0)
    params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    params["reg_alpha"] = trial.suggest_float("reg_alpha", 0.0, 3.0)
    params["reg_lambda"] = trial.suggest_float("reg_lambda", 0.0, 3.0)
    return params

def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective: mean competition metric over all validation folds.

    For each fold in VAL_BLOCKS an XGBRegressor with the trial's parameters is
    fitted on the fold's training rows. Its validation predictions are then
    mapped to allocations with the best k found by tune_k_on_val, and that
    best score is recorded.

    NOTE(review): k is tuned on the same rows the score is reported on, so the
    returned value is an optimistic estimate.
    """
    params = trial_to_params(trial)
    metric_cols = ["date_id", "forward_returns", "risk_free_rate"]

    def _score_fold(fold):
        (tr_s, tr_e), (va_s, va_e) = fold["train_slice"], fold["val_slice"]

        X_tr, y_tr = slice_np(processed_df, FEATURES, TARGET_COL, tr_s, tr_e)
        X_va, _ = slice_np(processed_df, FEATURES, TARGET_COL, va_s, va_e)

        # Fit on this fold's training rows, then predict its validation rows.
        model = xgb.XGBRegressor(**params)
        model.fit(X_tr, y_tr, verbose=False)
        raw_preds = model.predict(X_va)

        # Minimal solution frame (date_id + fwd + rf) the metric needs.
        sol_va = processed_df.slice(va_s, va_e - va_s).select(metric_cols)

        # Only the best score is kept; the k that achieved it is discarded.
        _, fold_score = tune_k_on_val(sol_va, raw_preds)
        return fold_score

    # Average across folds (or the single fold).
    return float(np.mean([_score_fold(fold) for fold in VAL_BLOCKS]))

# Run Optuna
N_TRIALS = 30  # keep modest to reduce overfit and runtime
print(f"Starting Optuna tuning... (trials={N_TRIALS}, multi_block={USE_MULTI_BLOCK})")
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler explicitly: the model is already seeded (random_state=42),
# but an unseeded sampler would still make every run explore different
# hyperparameters. TPE is Optuna's default single-objective sampler, so only
# the seed changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=N_TRIALS)

# Replaying the best (frozen) trial through trial_to_params rebuilds the full
# keyword dict, fixed settings included.
best_params = trial_to_params(study.best_trial)
print("Best Optuna params:", {k: v for k, v in best_params.items() if k not in ["objective", "tree_method", "n_jobs", "random_state"]})

# For reporting (and for the k used at inference): refit with the best params
# on the most recent fold and tune k on that fold's validation rows.
last_blk = VAL_BLOCKS[-1]
(tr_s, tr_e), (va_s, va_e) = last_blk["train_slice"], last_blk["val_slice"]

X_tr, y_tr = slice_np(processed_df, FEATURES, TARGET_COL, tr_s, tr_e)
X_va, y_va = slice_np(processed_df, FEATURES, TARGET_COL, va_s, va_e)

tmp_model = xgb.XGBRegressor(**best_params)
tmp_model.fit(X_tr, y_tr, verbose=False)
raw_val_preds = tmp_model.predict(X_va)

_final_block_solution = processed_df.slice(va_s, va_e - va_s).select(
    ["date_id", "forward_returns", "risk_free_rate"]
)
best_k_final, val_score_final = tune_k_on_val(_final_block_solution, raw_val_preds)
print(f"Tuned k on validation (final block): best_k={best_k_final}, val_score={val_score_final:.4f}")



# Refit on every available training row using the tuned hyperparameters.
X_full, y_full = slice_np(processed_df, FEATURES, TARGET_COL, 0, processed_df.height)
print(f"Refitting on full dataset: {X_full.shape[0]} rows...")
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_full, y_full, verbose=False)
print("Final model trained.")

# Seed the rolling history with the last raw (un-engineered) training rows so
# lag/rolling features can be computed for the first test rows.
HISTORY_BUFFER = full_train_df.tail(FE_HISTORY).clone()
print(f"Prepared HISTORY_BUFFER with last {FE_HISTORY} raw rows.")
print(f"FEATURES count: {len(FEATURES)} | BEST_K: {best_k_final}")



print("Generating predictions on test.csv ...")

# Standardize incoming test column names (match training schema)
rename_map = {
    'lagged_forward_returns': 'forward_returns',
    'lagged_risk_free_rate': 'risk_free_rate',
    'lagged_market_forward_excess_returns': 'target'
}

# Cast test floats
float_cols_test = [c for c in test_df.columns if c != "date_id"]
test_df = test_df.with_columns(pl.col(float_cols_test).cast(pl.Float64, strict=False))

# --- Row-independent preparation, done once instead of once per row ---------
# Rename to the training names and drop the runtime-only column.
test_rows = test_df.rename({k: v for k, v in rename_map.items() if k in test_df.columns})
if 'is_scored' in test_rows.columns:
    test_rows = test_rows.drop('is_scored')

# A vertical concat needs the same columns in the same order as the history
# buffer. Columns the test file lacks are added as typed nulls (they reach
# XGBoost as NaN), and columns unknown to training are dropped. This replaces
# the old per-row "fill missing feature with 0.0" loop, which could never run
# because selecting a missing column raised first.
history_schema = HISTORY_BUFFER.schema
missing_cols = [c for c in HISTORY_BUFFER.columns if c not in test_rows.columns]
if missing_cols:
    test_rows = test_rows.with_columns(
        [pl.lit(None, dtype=history_schema[c]).alias(c) for c in missing_cols]
    )
test_rows = test_rows.select(HISTORY_BUFFER.columns)

allocations = []
date_ids = []

# Walk the test rows in file order, one at a time, so each prediction only
# sees rows up to and including the current one.
for i in range(test_rows.height):
    row = test_rows.slice(i, 1)

    # Append, keep tail for FE
    HISTORY_BUFFER = pl.concat([HISTORY_BUFFER, row], how="vertical").tail(FE_HISTORY)

    # Feature engineering on the tail; the last row is the current test row
    feat_tail = create_features(HISTORY_BUFFER)
    latest = feat_tail.tail(1)

    # Predict and map to allocation (no pandas round-trip needed)
    raw_pred = float(final_model.predict(latest.select(FEATURES).to_numpy())[0])
    alloc = float(to_allocation(np.array([raw_pred]), best_k_final)[0])

    allocations.append(alloc)
    date_ids.append(int(row['date_id'][0]))

# Save submission (your local scorer expects 'prediction')
submission = pd.DataFrame({
    'date_id': date_ids,
    'prediction': allocations
})
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv (columns: date_id, prediction)")



# Sanity check: re-score the final validation block with the chosen k.
val_alloc = to_allocation(raw_val_preds, best_k_final)
_check_solution = processed_df.slice(va_s, va_e - va_s).select(
    ["date_id", "forward_returns", "risk_free_rate"]
)
val_score_check = eval_with_metric(_check_solution, val_alloc)
print(f"Validation metric (final block, best params): {val_score_check:.4f}")
