# Data Analysis

In [None]:
import pandas as pd
from functools import reduce
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import ParameterGrid


In [None]:
# Snapshot date; it selects the data/<DATE>/ folder that the inputs are read from.
DATE = "2025-02-28"

# Every input file carries a "date" column that is parsed to datetimes on load.
df_prices = pd.read_csv(f"data/{DATE}/prices.csv", parse_dates=["date"])
df_cash_rates = pd.read_csv(f"data/{DATE}/cash_rate.csv", parse_dates=["date"])
df_signals = pd.read_csv(f"data/{DATE}/signals.csv", parse_dates=["date"])
df_volumes = pd.read_csv(f"data/{DATE}/volumes.csv", parse_dates=["date"])

dfs = [df_prices, df_cash_rates, df_signals, df_volumes]

# Left-join the tables one after another on "date", starting from the prices
# table, so the row set is driven by the first table in `dfs`.
df = reduce(
    lambda acc, nxt: acc.merge(nxt, on="date", how="left"),
    dfs,
)
# Put the merged table in chronological order with a fresh 0..n-1 index.
df = df.sort_values("date").reset_index(drop=True)

def add_periodic_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with sine/cosine encodings of calendar cycles.

    The ``date`` column is converted with ``pd.to_datetime`` and four calendar
    positions are derived from it: day of week, day of year, month and ISO
    week number. Each position is mapped to a ``<prefix>_sin`` and
    ``<prefix>_cos`` column using the period listed below. The input frame is
    not modified.
    """
    out = df.copy()
    stamps = pd.to_datetime(out["date"])

    # (column prefix, position within the cycle, cycle length)
    cycles = (
        ("dow", stamps.dt.dayofweek, 7),
        ("doy", stamps.dt.dayofyear, 365.25),
        ("month", stamps.dt.month, 12),
        ("woy", stamps.dt.isocalendar().week.astype(int), 52.18),
    )

    for prefix, position, period in cycles:
        out[f"{prefix}_sin"] = np.sin(2 * np.pi * position / period)
        out[f"{prefix}_cos"] = np.cos(2 * np.pi * position / period)

    return out

def add_instrument_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with per-instrument price and volume features.

    For each of ``INSTRUMENT_1`` .. ``INSTRUMENT_10`` the price column is cast
    to float and the following columns are added, all prefixed with the
    instrument name:

    * ``_ret1`` / ``_logret1``: one-row simple and log returns
    * ``_mom5`` / ``_mom20`` / ``_mom60``: percentage change over 5/20/60 rows
    * ``_vol20`` / ``_vol60``: rolling standard deviation of ``_ret1``
    * ``_ma_ratio20`` / ``_ma_ratio60``: price relative to its rolling mean, minus 1

    When a ``<instrument>_vol`` column exists, ``_logvol`` (``log1p`` of that
    column), ``_vol_z20`` (20-row z-score of ``_logvol``) and ``_vol_chg5``
    (5-row difference of ``_logvol``) are added as well. The input frame is
    not modified.
    """
    out = df.copy()
    momentum_horizons = (5, 20, 60)
    rolling_windows = (20, 60)

    for k in range(1, 11):
        name = f"INSTRUMENT_{k}"
        price = out[name].astype(float)
        simple_ret = price.pct_change()

        out[f"{name}_ret1"] = simple_ret
        out[f"{name}_logret1"] = np.log(price).diff()

        for horizon in momentum_horizons:
            out[f"{name}_mom{horizon}"] = price.pct_change(horizon)
        for window in rolling_windows:
            out[f"{name}_vol{window}"] = simple_ret.rolling(window).std()
        for window in rolling_windows:
            out[f"{name}_ma_ratio{window}"] = price / price.rolling(window).mean() - 1.0

        volume_col = f"{name}_vol"
        if volume_col not in out.columns:
            continue

        log_volume = np.log1p(out[volume_col].astype(float))
        window20 = log_volume.rolling(20)
        out[f"{name}_logvol"] = log_volume
        # The small constant keeps the z-score finite when the window is flat.
        out[f"{name}_vol_z20"] = (log_volume - window20.mean()) / (window20.std() + 1e-12)
        out[f"{name}_vol_chg5"] = log_volume.diff(5)

    return out

# Calendar encodings are added first, then the per-instrument features.
df = add_instrument_features(add_periodic_date_features(df))


In [None]:
# Quick sanity check: row count, then the column/dtype/non-null summary.
print(df.shape[0])
df.info()


In [None]:
# Names of the ten instrument price columns.
INSTRUMENTS = ["INSTRUMENT_" + str(k) for k in range(1, 11)]

# Calendar encodings in the order dow, doy, month, woy, each as sin then cos.
DATE_FEATURES = [
    f"{cycle}_{wave}"
    for cycle in ("dow", "doy", "month", "woy")
    for wave in ("sin", "cos")
]

# Candidate tenor columns; one is kept only when it is present in `df` and
# more than half of its values are non-missing.
MACRO_FEATURES = [
    tenor
    for tenor in (
        "1mo", "1.5month", "2mo", "3mo", "4mo", "6mo",
        "1yr", "2yr", "3yr", "5yr", "7yr", "10yr", "20yr", "30yr",
    )
    if tenor in df.columns and df[tenor].notna().mean() > 0.5
]


In [None]:
def softmax(x: np.ndarray):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiating so that large inputs do
    not overflow, and a tiny constant is added to the denominator.
    """
    exps = np.exp(x - np.max(x))
    denom = exps.sum() + 1e-12
    return exps / denom

def cap_and_renorm(w: pd.Series, cap=0.25) -> pd.Series:
    """Return non-negative weights that sum to one with no weight above ``cap``.

    Negative entries are floored at zero and the weights are normalised to sum
    to one. Any weight above ``cap`` is then pinned at ``cap`` and the excess
    is redistributed proportionally across the remaining weights; this is
    repeated until nothing exceeds the cap. (Clipping once and renormalising
    is not enough: dividing by a sum below one pushes weights back above the
    cap.)

    Args:
        w: Raw weights indexed by asset.
        cap: Maximum weight per asset. ``None`` disables the cap.

    Returns:
        A float Series with the same index as ``w``. Equal weights are
        returned when the input has no positive mass, or when the cap cannot
        be met at all (``cap * len(w) < 1``), since equal weights minimise
        the largest position. An empty input yields an empty Series.
    """
    n = len(w)
    if n == 0:
        return w.astype(float)

    vals = np.clip(w.to_numpy(dtype=float, copy=True), 0.0, None)
    total = float(np.nansum(vals))
    if total <= 0:
        return pd.Series(1.0 / n, index=w.index)
    vals = vals / total

    if cap is None:
        return pd.Series(vals, index=w.index, name=w.name)
    if cap * n < 1.0:
        return pd.Series(1.0 / n, index=w.index)

    tol = 1e-12
    capped = np.zeros(n, dtype=bool)
    # Each pass pins at least one more weight, so n passes always suffice.
    for _ in range(n):
        over = (vals > cap + tol) & ~capped
        if not over.any():
            break
        capped |= over
        vals[capped] = cap

        free = ~capped & ~np.isnan(vals)
        remaining = max(1.0 - cap * int(capped.sum()), 0.0)
        free_sum = float(vals[free].sum())
        if free_sum > 0:
            vals[free] *= remaining / free_sum
        else:
            # The uncapped weights carry no mass: share the remainder evenly.
            if free.any():
                vals[free] = remaining / int(free.sum())
            break

    return pd.Series(vals, index=w.index, name=w.name)

def build_feature_cols(df: pd.DataFrame, inst: str):
    """List the feature columns used to model instrument ``inst``.

    The result is ``DATE_FEATURES``, then ``MACRO_FEATURES``, then every
    column of ``df`` that is either the instrument's own column or starts
    with ``"<inst>_"`` (in ``df`` column order). Names that are not columns
    of ``df`` are dropped.
    """
    prefix = f"{inst}_"
    instrument_cols = []
    for name in df.columns:
        if name == inst or name.startswith(prefix):
            instrument_cols.append(name)

    candidates = DATE_FEATURES + MACRO_FEATURES + instrument_cols
    return [name for name in candidates if name in df.columns]

def make_labels_next_return(df: pd.DataFrame, inst: str):
    """Return the simple return from each row to the following row.

    The value at row ``i`` is ``price[i + 1] / price[i] - 1`` for column
    ``inst``; the last row has no successor and is therefore NaN.
    """
    price = df[inst].astype(float)
    following = price.shift(-1)
    return following / price - 1.0


In [None]:
def train_predict_scores(
    df: pd.DataFrame,
    min_train: int = 750,
    val_size: int = 252,
):
    """Tune, validate and fit one XGBoost next-row-return model per instrument.

    For every name in ``INSTRUMENTS`` the rows with a known next-row return
    are split chronologically: the last ``val_size`` rows form the hold-out
    set and everything before them is the training set. Every combination in
    the hyper-parameter grid is fitted on the training set and scored by the
    Pearson correlation between its hold-out predictions and the realised
    returns. The best combination is then refitted on all labelled rows and
    used to score the most recent row of ``df``.

    The reported validation metrics come from the model that was fitted on
    the training split only. The refitted final model has already seen the
    hold-out rows, so scoring it on them would be an in-sample measurement.
    The metrics are still those of the best grid point and are therefore
    optimistic.

    Args:
        df: Feature table with a ``date`` column; it is sorted by date here.
        min_train: Minimum number of training rows required, in addition to
            ``val_size``, before a model is fitted for an instrument.
        val_size: Number of most recent labelled rows held out for validation.

    Returns:
        ``(fitted_models, preds, validation_df)``: a dict of final models
        keyed by instrument (instruments with too little data are absent), a
        Series of latest-row predictions (0.0 for instruments with too little
        data), and a per-instrument validation table sorted by ``val_corr``.

    Raises:
        ValueError: If ``val_size`` is not positive.

    Also prints and ``display``s the validation table and its column means.
    """
    if val_size <= 0:
        raise ValueError("val_size must be a positive number of rows")

    df = df.sort_values("date").reset_index(drop=True)

    param_grid = {
        "n_estimators": [150, 300],
        "max_depth": [3, 4],
        "learning_rate": [0.03, 0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.8],
        "reg_lambda": [1.0, 3.0],
    }
    # Settings shared by every candidate model and by the final refit.
    base_kwargs = {
        "objective": "reg:squarederror",
        "tree_method": "hist",
        "random_state": 0,
        "n_jobs": 1,
    }

    preds = {}
    validation_rows = []
    fitted_models = {}

    for inst in INSTRUMENTS:
        feat_cols = build_feature_cols(df, inst)
        # The last row has no next-row return; it is the row that gets scored.
        X_all = df.iloc[:-1][feat_cols].copy()
        y_all = make_labels_next_return(df, inst).iloc[:-1]

        mask = ~y_all.isna()
        X = X_all.loc[mask]
        y = y_all.loc[mask]

        if len(X) < min_train + val_size:
            # Not enough history: neutral score and an empty validation row.
            preds[inst] = 0.0
            validation_rows.append(
                {
                    "instrument": inst,
                    "n_features": len(feat_cols),
                    "n_train": 0,
                    "n_val": 0,
                    "best_params": None,
                    "val_rmse": np.nan,
                    "val_corr": np.nan,
                    "val_hit_rate": np.nan,
                }
            )
            continue

        X_train = X.iloc[:-val_size]
        y_train = y.iloc[:-val_size]
        X_val = X.iloc[-val_size:]
        y_val = y.iloc[-val_size:]

        best_params = None
        best_score = -np.inf
        best_val_pred = None

        for params in ParameterGrid(param_grid):
            model = XGBRegressor(**base_kwargs, **params)
            model.fit(X_train.values, y_train.values)

            val_pred = model.predict(X_val.values)
            val_corr = pd.Series(val_pred).corr(pd.Series(y_val.values))
            # A constant prediction has undefined correlation; score it as 0.
            val_corr = 0.0 if pd.isna(val_corr) else float(val_corr)

            if val_corr > best_score:
                best_score = val_corr
                best_params = params
                best_val_pred = val_pred

        # Hold-out metrics of the winning train-only model.
        val_rmse = float(np.sqrt(np.mean((best_val_pred - y_val.values) ** 2)))
        val_corr = best_score
        val_hit_rate = float(((best_val_pred > 0) == (y_val.values > 0)).mean())

        # Refit on every labelled row so the live model uses the latest data.
        final_model = XGBRegressor(**base_kwargs, **best_params)
        final_model.fit(X.values, y.values)

        validation_rows.append(
            {
                "instrument": inst,
                "n_features": len(feat_cols),
                "n_train": len(X_train),
                "n_val": len(X_val),
                "best_params": best_params,
                "val_rmse": val_rmse,
                "val_corr": val_corr,
                "val_hit_rate": val_hit_rate,
            }
        )

        x_last = df.loc[df.index[-1], feat_cols].astype(float)
        preds[inst] = float(final_model.predict(x_last.to_numpy().reshape(1, -1))[0])
        fitted_models[inst] = final_model

    validation_df = pd.DataFrame(validation_rows).sort_values("val_corr", ascending=False)

    print("Validation results by instrument:")
    display(
        validation_df[
            [
                "instrument",
                "n_features",
                "n_train",
                "n_val",
                "val_rmse",
                "val_corr",
                "val_hit_rate",
                "best_params",
            ]
        ].round(4)
    )

    print("\nAverage validation metrics:")
    display(
        validation_df[["val_rmse", "val_corr", "val_hit_rate"]]
        .mean()
        .to_frame("mean")
        .T.round(4)
    )

    return fitted_models, pd.Series(preds), validation_df

# Fit the per-instrument models on the feature table and keep the fitted models,
# the latest-row predictions and the per-instrument validation summary.
models, pred, validation_df = train_predict_scores(df)


In [None]:
def construct_weights(
    pred: pd.Series,
    beta=5.0,
    cap=0.25,
    smooth_alpha=0.35,
    prev_weights: pd.Series | None = None,
):
    """Turn predicted scores into capped portfolio weights over ``INSTRUMENTS``.

    Scores are scaled by ``beta``, passed through ``softmax`` and then through
    ``cap_and_renorm``. When ``prev_weights`` is supplied, the new weights are
    blended with the normalised previous weights
    (``smooth_alpha * new + (1 - smooth_alpha) * previous``) and capped again.

    Args:
        pred: Predicted score per instrument. When its index contains
            instrument names it is aligned by label; otherwise its values are
            taken positionally in ``INSTRUMENTS`` order.
        beta: Softmax temperature multiplier applied to the scores.
        cap: Per-instrument cap passed to ``cap_and_renorm``.
        smooth_alpha: Share of the new weights in the blend.
        prev_weights: Previous weights, or ``None`` to skip smoothing.

    Returns:
        A Series of weights indexed by ``INSTRUMENTS``.
    """
    if pred.index.isin(INSTRUMENTS).any():
        # Align by label so a reordered or partial input cannot be mislabelled.
        scores = pred.reindex(INSTRUMENTS)
    else:
        # Unlabelled input: keep the positional interpretation.
        scores = pd.Series(pred.to_numpy(), index=INSTRUMENTS)
    # One missing score would turn every softmax weight into NaN, so treat a
    # missing score as a neutral zero.
    scores = scores.astype(float).fillna(0.0)

    w = pd.Series(softmax(beta * scores.to_numpy()), index=INSTRUMENTS)
    w = cap_and_renorm(w, cap=cap)

    if prev_weights is not None:
        prev = prev_weights.reindex(INSTRUMENTS).astype(float).fillna(0.0)
        prev_total = float(prev.sum())
        # Previous weights with no mass cannot be normalised; skip the blend
        # rather than divide by zero.
        if prev_total > 0:
            w = smooth_alpha * w + (1.0 - smooth_alpha) * (prev / prev_total)
            w = cap_and_renorm(w, cap=cap)

    return w

def write_submission(weights: pd.Series, team_name: str, round_n: int, out_path="."):
    """Write the weights to ``<out_path>/<team_name>_round_<round_n>.csv``.

    The file has two columns, ``asset`` (the index of ``weights``) and
    ``weight`` (its values), and no index column. Returns the file name
    without the directory.
    """
    fname = f"{team_name}_round_{round_n}.csv"
    destination = f"{out_path}/{fname}"

    columns = {"asset": weights.index, "weight": weights.values}
    pd.DataFrame(columns).to_csv(destination, index=False)

    return fname

# No previous-round weights are supplied, so construct_weights skips smoothing.
prev = None
w = construct_weights(pred=pred, prev_weights=prev)
fname = write_submission(weights=w, team_name="HiddenLabel", round_n=1, out_path=".")
print(f"Wrote: {fname}")
print(w)


In [None]:
def fit_best_model_for_instrument(
    df: pd.DataFrame,
    inst: str,
    end_idx: int,
    min_train: int = 750,
    val_size: int = 252,
):
    """Grid-search and fit an XGBoost model for ``inst`` on rows before ``end_idx``.

    Only the first ``end_idx`` rows of ``df`` with a known next-row return are
    used. The last ``val_size`` of them are held out; each grid point is
    fitted on the rest and scored by the Pearson correlation between its
    hold-out predictions and the realised returns (an undefined correlation
    counts as 0). The best grid point is refitted on all usable rows.

    Returns:
        ``None`` when fewer than ``min_train + val_size`` usable rows exist
        (or no grid point was selected); otherwise a dict with keys
        ``model``, ``feature_cols``, ``best_params`` and ``val_corr`` (the
        winning hold-out correlation).
    """
    feature_names = build_feature_cols(df, inst)
    history = df.iloc[:end_idx][feature_names].copy()
    labels = make_labels_next_return(df, inst).iloc[:end_idx]

    has_label = labels.notna()
    X = history.loc[has_label]
    y = labels.loc[has_label]

    if len(X) < min_train + val_size:
        return None

    # Chronological split: the most recent val_size rows are the hold-out set.
    X_train, X_val = X.iloc[:-val_size], X.iloc[-val_size:]
    y_train, y_val = y.iloc[:-val_size], y.iloc[-val_size:]

    search_space = {
        "n_estimators": [150, 300],
        "max_depth": [3, 4],
        "learning_rate": [0.03, 0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.8],
        "reg_lambda": [1.0, 3.0],
    }

    def _new_model(hyper):
        # Same fixed settings for every candidate and for the final refit.
        return XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            random_state=0,
            n_jobs=1,
            **hyper,
        )

    def _holdout_corr(hyper):
        candidate = _new_model(hyper)
        candidate.fit(X_train.values, y_train.values)
        predicted = candidate.predict(X_val.values)
        corr = pd.Series(predicted).corr(pd.Series(y_val.values))
        return 0.0 if pd.isna(corr) else float(corr)

    best_params, best_score = None, -np.inf
    for hyper in ParameterGrid(search_space):
        score = _holdout_corr(hyper)
        # Strict comparison: on ties the earlier grid point is kept.
        if score > best_score:
            best_params, best_score = hyper, score

    if best_params is None:
        return None

    final_model = _new_model(best_params)
    final_model.fit(X.values, y.values)

    return {
        "model": final_model,
        "feature_cols": feature_names,
        "best_params": best_params,
        "val_corr": best_score,
    }

def backtest_model(
    df: pd.DataFrame,
    start_idx: int = 1500,
    retrain_every: int = 63,
    beta: float = 5.0,
    filter_bad_instruments: bool = True,
):
    """Walk-forward backtest of the per-instrument models.

    Starting at row ``start_idx`` and stepping by ``retrain_every`` rows, a
    model is fitted for every instrument on the rows before the rebalance row
    ``t`` (via ``fit_best_model_for_instrument``) and used to predict the
    return from row ``t`` to row ``t + 1``. Predictions are turned into
    softmax weights (temperature ``beta``) and compared with an equal-weight
    portfolio and an equal-weighted top-3-by-prediction portfolio.

    Instruments without a prediction, or (when ``filter_bad_instruments`` is
    true) whose model had a non-positive hold-out correlation, are excluded at
    that rebalance: they get zero weight and are left out of the hit rate and
    the rank correlation. A rebalance with no eligible instrument holds
    nothing and earns a zero return.

    Args:
        df: Feature table with a ``date`` column; it is sorted by date here.
        start_idx: First rebalance row.
        retrain_every: Number of rows between rebalances.
        beta: Softmax multiplier applied to the predictions.
        filter_bad_instruments: Whether to exclude instruments whose model had
            a non-positive hold-out correlation.

    Returns:
        A dict with keys ``predictions``, ``actuals``, ``selected``,
        ``ic_by_date``, ``portfolio_returns``, ``equity_curve`` and
        ``metrics``.

    Raises:
        ValueError: If ``retrain_every`` is not positive or the schedule
            contains no rebalance row.
    """
    if retrain_every <= 0:
        raise ValueError("retrain_every must be a positive number of rows")

    df = df.sort_values("date").reset_index(drop=True)

    rebalance_idx = list(range(start_idx, len(df) - 1, retrain_every))
    if not rebalance_idx:
        raise ValueError(
            f"no rebalance rows: start_idx={start_idx} leaves nothing to test "
            f"in a frame of {len(df)} rows"
        )

    next_returns = pd.DataFrame({inst: make_labels_next_return(df, inst) for inst in INSTRUMENTS})

    pred_rows = []
    selected_rows = []

    for t in rebalance_idx:
        row = {"date": df.loc[t, "date"]}
        selected = []

        for inst in INSTRUMENTS:
            fitted = fit_best_model_for_instrument(df, inst, end_idx=t)
            if fitted is None:
                row[inst] = np.nan
                continue

            x_test = df.loc[t, fitted["feature_cols"]].astype(float)
            row[inst] = float(fitted["model"].predict(x_test.to_numpy().reshape(1, -1))[0])

            if (not filter_bad_instruments) or (fitted["val_corr"] > 0):
                selected.append(inst)

        pred_rows.append(row)
        selected_rows.append({"date": df.loc[t, "date"], "selected": selected})

    pred_df = pd.DataFrame(pred_rows)
    selected_df = pd.DataFrame(selected_rows)

    actual_df = next_returns.iloc[rebalance_idx].reset_index(drop=True)
    actual_df.insert(0, "date", df["date"].iloc[rebalance_idx].reset_index(drop=True))

    pred_mat = pred_df[INSTRUMENTS].copy()
    act_mat = actual_df[INSTRUMENTS].copy()

    if filter_bad_instruments:
        # Blank out predictions of instruments that failed the filter.
        for i, selected in enumerate(selected_df["selected"]):
            excluded = [inst for inst in INSTRUMENTS if inst not in selected]
            pred_mat.loc[i, excluded] = np.nan

    ic_by_date = pred_mat.corrwith(act_mat, axis=1, method="spearman")

    pred_values = pred_mat.to_numpy(dtype=float)
    act_values = act_mat.to_numpy(dtype=float)

    # Hit rate over cells that have both a prediction and a realised return.
    # A NaN compares as "not > 0" and would otherwise count as a bearish call.
    scored = ~np.isnan(pred_values) & ~np.isnan(act_values)
    if scored.any():
        hits = (pred_values > 0) == (act_values > 0)
        directional_accuracy = float(hits[scored].mean())
    else:
        directional_accuracy = np.nan

    # Row-wise softmax over the available predictions. Rows without any
    # prediction get all-zero weights instead of NaN from (-inf) - (-inf).
    has_pred = ~np.isnan(pred_values)
    row_max = np.where(has_pred, pred_values, -np.inf).max(axis=1, keepdims=True)
    row_max = np.where(np.isfinite(row_max), row_max, 0.0)
    shifted = np.where(has_pred, pred_values - row_max, 0.0)
    exp_scores = np.where(has_pred, np.exp(beta * shifted), 0.0)
    weights = exp_scores / (exp_scores.sum(axis=1, keepdims=True) + 1e-12)

    portfolio_returns = (weights * act_values).sum(axis=1)
    equal_weight_returns = act_mat.mean(axis=1).to_numpy()

    top3_mask = pred_mat.rank(axis=1, ascending=False, method="first") <= 3
    top3_weights = top3_mask.div(top3_mask.sum(axis=1), axis=0).fillna(0.0).to_numpy()
    top3_returns = (top3_weights * act_values).sum(axis=1)

    equity_curve = pd.Series((1 + portfolio_returns).cumprod(), index=pred_df["date"])
    # NOTE(review): every observation is a one-row return sampled once per
    # rebalance, so this factor treats the book as invested on
    # 252 / retrain_every rows per year. Confirm that is the intended
    # convention.
    ann_factor = np.sqrt(252 / retrain_every)

    strategy_curve = pd.Series((1 + portfolio_returns).cumprod())
    equal_weight_curve = pd.Series((1 + equal_weight_returns).cumprod())
    top3_curve = pd.Series((1 + top3_returns).cumprod())

    metrics = pd.DataFrame(
        {
            "n_test_points": [len(pred_df), len(pred_df), len(pred_df)],
            "mean_spearman_ic": [ic_by_date.mean(), np.nan, np.nan],
            "hit_rate": [directional_accuracy, np.nan, np.nan],
            "annualized_sharpe": [
                portfolio_returns.mean() / (portfolio_returns.std(ddof=1) + 1e-12) * ann_factor,
                equal_weight_returns.mean() / (equal_weight_returns.std(ddof=1) + 1e-12) * ann_factor,
                top3_returns.mean() / (top3_returns.std(ddof=1) + 1e-12) * ann_factor,
            ],
            "avg_period_return": [
                portfolio_returns.mean(),
                equal_weight_returns.mean(),
                top3_returns.mean(),
            ],
            "vol_period_return": [
                portfolio_returns.std(ddof=1),
                equal_weight_returns.std(ddof=1),
                top3_returns.std(ddof=1),
            ],
            "total_return": [
                strategy_curve.iloc[-1] - 1.0,
                equal_weight_curve.iloc[-1] - 1.0,
                top3_curve.iloc[-1] - 1.0,
            ],
            "max_drawdown": [
                (strategy_curve / strategy_curve.cummax() - 1.0).min(),
                (equal_weight_curve / equal_weight_curve.cummax() - 1.0).min(),
                (top3_curve / top3_curve.cummax() - 1.0).min(),
            ],
        },
        index=["strategy", "equal_weight", "top3_predicted"],
    )

    return {
        "predictions": pred_df,
        "actuals": actual_df,
        "selected": selected_df,
        "ic_by_date": ic_by_date,
        "portfolio_returns": pd.Series(portfolio_returns, index=pred_df["date"]),
        "equity_curve": equity_curve,
        "metrics": metrics,
    }

# Run the walk-forward backtest, then show the metrics table, the equity curve
# and the instruments that were selected at the final rebalance.
bt = backtest_model(
    df,
    start_idx=1500,
    retrain_every=63,
    beta=5.0,
    filter_bad_instruments=True,
)
display(bt["metrics"].round(4))
bt["equity_curve"].plot(figsize=(10, 4), title="Out-of-Sample Equity Curve")
print("Selected instruments on last rebalance:", bt["selected"]["selected"].iloc[-1])
