# Data Analysis

In [88]:
import pandas as pd
from functools import reduce
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import ParameterGrid


In [89]:
DATE = "2025-02-28"
prev_date = "2024-02-28"

df_prices = pd.read_csv(f"data/{DATE}/prices.csv", parse_dates=["date"])
df_cash_rates = pd.read_csv(f"data/{DATE}/cash_rate.csv", parse_dates=["date"])
df_signals = pd.read_csv(f"data/{DATE}/signals.csv", parse_dates=["date"])
df_volumes = pd.read_csv(f"data/{DATE}/volumes.csv", parse_dates=["date"])

dfs = [df_prices, df_cash_rates, df_signals, df_volumes]

df = (
    reduce(
        lambda left, right: pd.merge(left, right, on="date", how="left"),
        dfs,
    )
    .sort_values("date")
    .reset_index(drop=True)
)

def add_periodic_date_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    date = pd.to_datetime(df["date"])

    day_of_week = date.dt.dayofweek
    day_of_year = date.dt.dayofyear
    month = date.dt.month
    week_of_year = date.dt.isocalendar().week.astype(int)

    df["dow_sin"] = np.sin(2 * np.pi * day_of_week / 7)
    df["dow_cos"] = np.cos(2 * np.pi * day_of_week / 7)
    df["doy_sin"] = np.sin(2 * np.pi * day_of_year / 365.25)
    df["doy_cos"] = np.cos(2 * np.pi * day_of_year / 365.25)
    df["month_sin"] = np.sin(2 * np.pi * month / 12)
    df["month_cos"] = np.cos(2 * np.pi * month / 12)
    df["woy_sin"] = np.sin(2 * np.pi * week_of_year / 52.18)
    df["woy_cos"] = np.cos(2 * np.pi * week_of_year / 52.18)
    return df

def add_instrument_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    for inst in [f"INSTRUMENT_{i}" for i in range(1, 11)]:
        px = df[inst].astype(float)
        ret_1 = px.pct_change()
        log_ret_1 = np.log(px).diff()

        df[f"{inst}_ret1"] = ret_1
        df[f"{inst}_logret1"] = log_ret_1
        df[f"{inst}_mom5"] = px.pct_change(5)
        df[f"{inst}_mom20"] = px.pct_change(20)
        df[f"{inst}_mom60"] = px.pct_change(60)
        df[f"{inst}_vol20"] = ret_1.rolling(20).std()
        df[f"{inst}_vol60"] = ret_1.rolling(60).std()
        df[f"{inst}_ma_ratio20"] = px / px.rolling(20).mean() - 1.0
        df[f"{inst}_ma_ratio60"] = px / px.rolling(60).mean() - 1.0

        vol_col = f"{inst}_vol"
        if vol_col in df.columns:
            log_vol = np.log1p(df[vol_col].astype(float))
            df[f"{inst}_logvol"] = log_vol
            df[f"{inst}_vol_z20"] = (
                (log_vol - log_vol.rolling(20).mean())
                / (log_vol.rolling(20).std() + 1e-12)
            )
            df[f"{inst}_vol_chg5"] = log_vol.diff(5)

    return df

df = add_periodic_date_features(df)
df = add_instrument_features(df)


  df[f"{inst}_logret1"] = log_ret_1
  df[f"{inst}_mom5"] = px.pct_change(5)
  df[f"{inst}_mom20"] = px.pct_change(20)
  df[f"{inst}_mom60"] = px.pct_change(60)
  df[f"{inst}_vol20"] = ret_1.rolling(20).std()
  df[f"{inst}_vol60"] = ret_1.rolling(60).std()
  df[f"{inst}_ma_ratio20"] = px / px.rolling(20).mean() - 1.0
  df[f"{inst}_ma_ratio60"] = px / px.rolling(60).mean() - 1.0
  df[f"{inst}_logvol"] = log_vol
  df[f"{inst}_vol_z20"] = (
  df[f"{inst}_vol_chg5"] = log_vol.diff(5)
  df[f"{inst}_ret1"] = ret_1
  df[f"{inst}_logret1"] = log_ret_1
  df[f"{inst}_mom5"] = px.pct_change(5)
  df[f"{inst}_mom20"] = px.pct_change(20)
  df[f"{inst}_mom60"] = px.pct_change(60)
  df[f"{inst}_vol20"] = ret_1.rolling(20).std()
  df[f"{inst}_vol60"] = ret_1.rolling(60).std()
  df[f"{inst}_ma_ratio20"] = px / px.rolling(20).mean() - 1.0
  df[f"{inst}_ma_ratio60"] = px / px.rolling(60).mean() - 1.0
  df[f"{inst}_logvol"] = log_vol
  df[f"{inst}_vol_z20"] = (
  df[f"{inst}_vol_chg5"] = log_vol.diff(5)


In [90]:
print(len(df))
df.info()


2910
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2910 entries, 0 to 2909
Columns: 203 entries, date to INSTRUMENT_10_vol_chg5
dtypes: datetime64[ns](1), float64(192), int64(10)
memory usage: 4.5 MB


In [91]:
INSTRUMENTS = [f"INSTRUMENT_{i}" for i in range(1, 11)]
DATE_FEATURES = [
    "dow_sin", "dow_cos",
    "doy_sin", "doy_cos",
    "month_sin", "month_cos",
    "woy_sin", "woy_cos",
]
MACRO_FEATURES = [
    c for c in [
        "1mo", "1.5month", "2mo", "3mo", "4mo", "6mo",
        "1yr", "2yr", "3yr", "5yr", "7yr", "10yr", "20yr", "30yr",
    ]
    if c in df.columns and df[c].notna().mean() > 0.5
]


In [92]:
def softmax(x: np.ndarray):
    x = x - np.max(x)
    e = np.exp(x)
    return e / (e.sum() + 1e-12)

def cap_and_renorm(w: pd.Series, cap=0.25) -> pd.Series:
    w = w.clip(lower=0.0, upper=cap)
    s = float(w.sum())
    if s <= 0:
        return pd.Series(1.0 / len(w), index=w.index)
    return w / s

def build_feature_cols(df: pd.DataFrame, inst: str):
    own_cols = [
        col for col in df.columns
        if col == inst or col.startswith(f"{inst}_")
    ]
    cols = DATE_FEATURES + MACRO_FEATURES + own_cols
    return [col for col in cols if col in df.columns]

def make_labels_next_return(df: pd.DataFrame, inst: str):
    x = df[inst].astype(float)
    return x.shift(-1) / x - 1.0


In [93]:
def train_predict_scores(
    df: pd.DataFrame,
    min_train: int = 750,
    val_size: int = 252,
    verbose = True
):
    df = df.sort_values("date").reset_index(drop=True)

    param_grid = {
        "n_estimators": [150, 300],
        "max_depth": [4, 5],
        "learning_rate": [0.03, 0.05],
        "subsample": [1.0], #[0.8, 1.0],
        "colsample_bytree": [1.0], #[0.8, 1.0],
        "reg_lambda": [1.0, 3.0, 5.0],
    }

    preds = {}
    validation_rows = []
    fitted_models = {}

    for inst in INSTRUMENTS:
        feat_cols = build_feature_cols(df, inst)
        X_all = df.iloc[:-1][feat_cols].copy()
        y_all = make_labels_next_return(df, inst).iloc[:-1]

        mask = ~y_all.isna()
        X = X_all.loc[mask]
        y = y_all.loc[mask]

        if len(X) < min_train + val_size:
            preds[inst] = 0.0
            validation_rows.append(
                {
                    "instrument": inst,
                    "n_features": len(feat_cols),
                    "n_train": 0,
                    "n_val": 0,
                    "best_params": None,
                    "val_rmse": np.nan,
                    "val_corr": np.nan,
                    "val_hit_rate": np.nan,
                }
            )
            continue

        X_train = X.iloc[:-val_size]
        y_train = y.iloc[:-val_size]
        X_val = X.iloc[-val_size:]
        y_val = y.iloc[-val_size:]

        best_params = None
        best_score = -np.inf

        for params in ParameterGrid(param_grid):
            model = XGBRegressor(
                objective="reg:squarederror",
                tree_method="hist",
                random_state=0,
                n_jobs=4,
                **params,
            )
            model.fit(X_train.values, y_train.values)

            val_pred = model.predict(X_val.values)
            val_corr = pd.Series(val_pred).corr(pd.Series(y_val.values))
            val_corr = 0.0 if pd.isna(val_corr) else float(val_corr)

            if val_corr > best_score:
                best_score = val_corr
                best_params = params

        final_model = XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            random_state=0,
            n_jobs=1,
            **best_params,
        )
        final_model.fit(X.values, y.values)

        val_pred = final_model.predict(X_val.values)
        val_rmse = float(np.sqrt(np.mean((val_pred - y_val.values) ** 2)))
        val_corr = pd.Series(val_pred).corr(pd.Series(y_val.values))
        val_corr = 0.0 if pd.isna(val_corr) else float(val_corr)
        val_hit_rate = float(((val_pred > 0) == (y_val.values > 0)).mean())

        validation_rows.append(
            {
                "instrument": inst,
                "n_features": len(feat_cols),
                "n_train": len(X_train),
                "n_val": len(X_val),
                "best_params": best_params,
                "val_rmse": val_rmse,
                "val_corr": val_corr,
                "val_hit_rate": val_hit_rate,
            }
        )

        x_last = df.loc[df.index[-1], feat_cols].astype(float)
        preds[inst] = float(final_model.predict([x_last.values])[0])
        fitted_models[inst] = final_model

    validation_df = pd.DataFrame(validation_rows).sort_values("val_corr", ascending=False)
    confidence_series = (
        validation_df.set_index("instrument")["val_corr"]
        .reindex(INSTRUMENTS)
        .fillna(0.0)
    )
    if verbose:
        print("Validation results by instrument:")
        display(
            validation_df[
                [
                    "instrument",
                    "n_features",
                    "n_train",
                    "n_val",
                    "val_rmse",
                    "val_corr",
                    "val_hit_rate",
                    "best_params",
                ]
            ].round(4)
        )

    print("\nAverage validation metrics:")
    display(
        validation_df[["val_rmse", "val_corr", "val_hit_rate"]]
        .mean()
        .to_frame("mean")
        .T.round(4)
    )

    return fitted_models, pd.Series(preds), validation_df, best_params, confidence_series

models, pred, validation_df, best_params, confidence = train_predict_scores(df)

print("Best Parameters: ", best_params)


Validation results by instrument:


Unnamed: 0,instrument,n_features,n_train,n_val,val_rmse,val_corr,val_hit_rate,best_params
6,INSTRUMENT_7,38,2657,252,0.0047,0.8863,0.754,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
9,INSTRUMENT_10,38,2657,252,0.0236,0.8362,0.7778,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
0,INSTRUMENT_1,38,2657,252,0.0054,0.7842,0.75,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
1,INSTRUMENT_2,38,2657,252,0.0081,0.7803,0.6587,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
7,INSTRUMENT_8,38,2657,252,0.013,0.6564,0.6667,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
8,INSTRUMENT_9,38,2657,252,0.0224,0.654,0.7063,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
4,INSTRUMENT_5,38,2657,252,0.0065,0.5982,0.6151,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
2,INSTRUMENT_3,38,2657,252,0.0065,0.5563,0.6706,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
3,INSTRUMENT_4,38,2657,252,0.008,0.5229,0.6667,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."
5,INSTRUMENT_6,38,2657,252,0.0032,0.4757,0.5992,"{'colsample_bytree': 1.0, 'learning_rate': 0.0..."



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.0101,0.675,0.6865


Best Parameters:  {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300, 'reg_lambda': 3.0, 'subsample': 1.0}


In [94]:
print(confidence)

instrument
INSTRUMENT_1     0.784219
INSTRUMENT_2     0.780272
INSTRUMENT_3     0.556320
INSTRUMENT_4     0.522885
INSTRUMENT_5     0.598179
INSTRUMENT_6     0.475689
INSTRUMENT_7     0.886279
INSTRUMENT_8     0.656354
INSTRUMENT_9     0.654004
INSTRUMENT_10    0.836236
Name: val_corr, dtype: float64


In [100]:
def construct_weights(
    pred: pd.Series,
    confidence: float | pd.Series,
    beta=3.0,
    cap=0.25,
    smooth_alpha=1.0,
    prev_weights: pd.Series | None = None,
):
    scores = pred.copy()

    # allow scalar or per-asset confidence
    if isinstance(confidence, pd.Series):
        conf = confidence.reindex(INSTRUMENTS).fillna(0.0)
        logits = beta * conf * scores
    else:
        logits = beta * confidence * scores

    w = pd.Series(softmax(logits.values), index=INSTRUMENTS)
    w = cap_and_renorm(w, cap=cap)

    if prev_weights is not None:
        prev_weights = prev_weights.reindex(INSTRUMENTS).fillna(0.0)
        prev_weights = prev_weights / prev_weights.sum()
        w = smooth_alpha * w + (1.0 - smooth_alpha) * prev_weights
        w = cap_and_renorm(w, cap=cap)

    return w

def write_submission(weights: pd.Series, team_name: str, round_n: int, out_path="."):
    out = pd.DataFrame({"asset": weights.index, "weight": weights.values})
    fname = f"{team_name}_round_{round_n}.csv"
    out.to_csv(f"{out_path}/{fname}", index=False)
    return fname

w = construct_weights(pred, confidence)
fname = write_submission(w, team_name="HiddenLabel", round_n=2, out_path=".")
print("Wrote:", fname)
print(w)


Wrote: HiddenLabel_round_2.csv
INSTRUMENT_1     0.099683
INSTRUMENT_2     0.099412
INSTRUMENT_3     0.099464
INSTRUMENT_4     0.099526
INSTRUMENT_5     0.099746
INSTRUMENT_6     0.099823
INSTRUMENT_7     0.099038
INSTRUMENT_8     0.099904
INSTRUMENT_9     0.099974
INSTRUMENT_10    0.103430
dtype: float64


In [98]:
def make_forward_returns(df: pd.DataFrame, horizon: int) -> pd.DataFrame:
    out = {}
    for inst in INSTRUMENTS:
        px = df[inst].astype(float)
        out[inst] = px.shift(-horizon) / px - 1.0
    return pd.DataFrame(out)


def trailing_realized_vol(df: pd.DataFrame, end_idx: int, window: int = 20) -> pd.Series:
    vols = {}
    for inst in INSTRUMENTS:
        ret1 = df[inst].astype(float).pct_change()
        vols[inst] = ret1.iloc[:end_idx].tail(window).std()
    vols = pd.Series(vols).replace([np.inf, -np.inf], np.nan)
    return vols.fillna(vols.median()).clip(lower=1e-4)


def prediction_to_weights(
    pred_row: pd.Series,
    vol_row: pd.Series,
    beta: float = 3.0,
    cap: float = 0.25,
    selected: list[str] | None = None,
):
    scores = pred_row.copy()

    if selected is not None:
        excluded = [inst for inst in INSTRUMENTS if inst not in selected]
        scores.loc[excluded] = np.nan

    valid = scores.notna()
    if valid.sum() == 0:
        return pd.Series(1.0 / len(INSTRUMENTS), index=INSTRUMENTS)

    z = scores.loc[valid]
    z = (z - z.mean()) / (z.std(ddof=0) + 1e-12)
    z = z / vol_row.loc[valid]

    raw = pd.Series(0.0, index=INSTRUMENTS)
    scaled = beta * z
    scaled = scaled - scaled.max()
    raw.loc[valid] = np.exp(scaled)

    return cap_and_renorm(raw, cap=cap)


def backtest_model_fast(
    df: pd.DataFrame,
    start_idx: int = 1500,
    retrain_every: int = 120,
    holding_period: int = 120,
    beta: float = 3.0,
    cap: float = 0.25,
    filter_bad_instruments: bool = True,
    transaction_cost_bps: float = 0,
):
    df = df.sort_values("date").reset_index(drop=True)
    forward_returns = make_forward_returns(df, horizon=holding_period)

    pred_rows = []
    bt_rows = []
    prev_weights = pd.Series(0.0, index=INSTRUMENTS)

    for t in range(start_idx, len(df) - holding_period, retrain_every):
        fitted_models, pred, validation_df, best_params, _ = train_predict_scores(df.iloc[: t + 1].copy(), verbose = False)

        selected = validation_df.loc[validation_df["val_corr"] > 0, "instrument"].tolist()
        if not filter_bad_instruments:
            selected = INSTRUMENTS.copy()

        vol_row = trailing_realized_vol(df, end_idx=t, window=20)
        weights = prediction_to_weights(
            pred,
            vol_row,
            beta=beta,
            cap=cap,
            selected=selected,
        )

        realized = forward_returns.loc[t, INSTRUMENTS].astype(float)
        gross_return = float((weights * realized).sum())

        turnover = float(np.abs(weights - prev_weights).sum())
        tc = turnover * transaction_cost_bps / 10000.0
        net_return = gross_return - tc

        eq_weights = pd.Series(1.0 / len(INSTRUMENTS), index=INSTRUMENTS)
        eq_return = float((eq_weights * realized).sum())

        top3 = pred.rank(ascending=False, method="first") <= 3
        if filter_bad_instruments and len(selected) > 0:
            top3 = top3 & top3.index.to_series().isin(selected).values
        top3_weights = top3.astype(float)
        if top3_weights.sum() > 0:
            top3_weights = top3_weights / top3_weights.sum()
        else:
            top3_weights = eq_weights.copy()
        top3_return = float((top3_weights * realized).sum())

        pred_rows.append({"date": df.loc[t, "date"], **pred.to_dict()})
        bt_rows.append(
            {
                "date": df.loc[t, "date"],
                "selected_count": len(selected),
                "gross_return": gross_return,
                "net_return": net_return,
                "equal_weight_return": eq_return,
                "top3_return": top3_return,
                "turnover": turnover,
                "transaction_cost": tc,
                **{f"w_{inst}": weights[inst] for inst in INSTRUMENTS},
            }
        )

        prev_weights = weights.copy()

    pred_df = pd.DataFrame(pred_rows)
    bt_df = pd.DataFrame(bt_rows)

    actual_df = forward_returns.iloc[start_idx : len(df) - holding_period : retrain_every].reset_index(drop=True)
    actual_df.insert(
        0,
        "date",
        df["date"].iloc[start_idx : len(df) - holding_period : retrain_every].reset_index(drop=True),
    )

    pred_mat = pred_df[INSTRUMENTS]
    act_mat = actual_df[INSTRUMENTS]

    ic_by_date = pred_mat.corrwith(act_mat, axis=1, method="spearman")
    hit_rate = ((pred_mat.values > 0) == (act_mat.values > 0)).mean()

    strategy_curve = (1 + bt_df["net_return"]).cumprod()
    gross_curve = (1 + bt_df["gross_return"]).cumprod()
    equal_weight_curve = (1 + bt_df["equal_weight_return"]).cumprod()
    top3_curve = (1 + bt_df["top3_return"]).cumprod()

    ann_factor = np.sqrt(252 / holding_period)

    def sharpe(x: pd.Series) -> float:
        return float(x.mean() / (x.std(ddof=1) + 1e-12) * ann_factor)

    def max_dd(curve: pd.Series) -> float:
        return float((curve / curve.cummax() - 1.0).min())

    metrics = pd.DataFrame(
        {
            "n_test_points": [len(bt_df), len(bt_df), len(bt_df), len(bt_df)],
            "mean_spearman_ic": [ic_by_date.mean(), np.nan, np.nan, np.nan],
            "hit_rate": [hit_rate, np.nan, np.nan, np.nan],
            "annualized_sharpe": [
                sharpe(bt_df["net_return"]),
                sharpe(bt_df["gross_return"]),
                sharpe(bt_df["equal_weight_return"]),
                sharpe(bt_df["top3_return"]),
            ],
            "avg_period_return": [
                bt_df["net_return"].mean(),
                bt_df["gross_return"].mean(),
                bt_df["equal_weight_return"].mean(),
                bt_df["top3_return"].mean(),
            ],
            "vol_period_return": [
                bt_df["net_return"].std(ddof=1),
                bt_df["gross_return"].std(ddof=1),
                bt_df["equal_weight_return"].std(ddof=1),
                bt_df["top3_return"].std(ddof=1),
            ],
            "total_return": [
                strategy_curve.iloc[-1] - 1.0,
                gross_curve.iloc[-1] - 1.0,
                equal_weight_curve.iloc[-1] - 1.0,
                top3_curve.iloc[-1] - 1.0,
            ],
            "max_drawdown": [
                max_dd(strategy_curve),
                max_dd(gross_curve),
                max_dd(equal_weight_curve),
                max_dd(top3_curve),
            ],
            "avg_turnover": [
                bt_df["turnover"].mean(),
                bt_df["turnover"].mean(),
                0.0,
                np.nan,
            ],
        },
        index=["strategy_net", "strategy_gross", "equal_weight", "top3_predicted"],
    )

    return {
        "predictions": pred_df,
        "actuals": actual_df,
        "backtest_df": bt_df,
        "ic_by_date": ic_by_date,
        "equity_curve": pd.Series(strategy_curve.values, index=bt_df["date"]),
        "gross_curve": pd.Series(gross_curve.values, index=bt_df["date"]),
        "equal_weight_curve": pd.Series(equal_weight_curve.values, index=bt_df["date"]),
        "top3_curve": pd.Series(top3_curve.values, index=bt_df["date"]),
        "metrics": metrics,
    }


bt = backtest_model_fast(
    df,
)

display(bt["metrics"].round(4))

ax = bt["equity_curve"].plot(figsize=(10, 4), title="Fast Out-of-Sample Backtest", label="strategy_net")
bt["gross_curve"].plot(ax=ax, label="strategy_gross")
bt["equal_weight_curve"].plot(ax=ax, label="equal_weight")
bt["top3_curve"].plot(ax=ax, label="top3_predicted")
ax.legend()

display(bt["backtest_df"][["date", "selected_count", "gross_return", "net_return", "turnover"]].tail())



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.0106,0.7399,0.6929



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.0102,0.739,0.7079



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.0102,0.7446,0.7278



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.0114,0.7684,0.7369



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.0134,0.8038,0.754


KeyboardInterrupt: 