# Data Analysis

In [117]:
import pandas as pd
from functools import reduce
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import ParameterGrid



In [None]:
from functools import reduce
import pandas as pd

# Read every input table from the 2024-12-31 snapshot, parsing the shared
# "date" column as datetimes so the tables can be joined on it.
dfs = [
    pd.read_csv(csv_path, parse_dates=["date"])
    for csv_path in (
        "data/2024-12-31/prices.csv",
        "data/2024-12-31/cash_rate.csv",  # original author's marker here: "prev" (meaning unclear - TODO confirm)
        "data/2024-12-31/signals.csv",
        "data/2024-12-31/volumes.csv",
    )
]

# Left-join each later table onto the running result by date, so the rows of
# the first table (prices) define which dates are kept.
df = reduce(
    lambda left, right: left.merge(right, on="date", how="left"),
    dfs,
)
# Chronological order with a clean 0..n-1 index.
df = df.sort_values("date").reset_index(drop=True)

def add_periodic_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with sine/cosine encodings of calendar cycles.

    The ``date`` column is converted with ``pd.to_datetime`` and four cyclic
    positions are derived from it: day of week (period 7), day of year
    (period 365.25), month (period 12) and ISO week of year (period 52.18).
    Each position ``p`` with period ``T`` yields two columns,
    ``sin(2*pi*p/T)`` and ``cos(2*pi*p/T)``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``date`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with columns ``dow_sin``, ``dow_cos``, ``doy_sin``,
        ``doy_cos``, ``month_sin``, ``month_cos``, ``woy_sin``, ``woy_cos``
        added (or overwritten if already present).
    """
    out = df.copy()
    stamps = pd.to_datetime(out["date"])

    # (sine column, cosine column, position within the cycle, cycle length)
    cycles = [
        ("dow_sin", "dow_cos", stamps.dt.dayofweek, 7),
        ("doy_sin", "doy_cos", stamps.dt.dayofyear, 365.25),
        ("month_sin", "month_cos", stamps.dt.month, 12),
        ("woy_sin", "woy_cos", stamps.dt.isocalendar().week.astype(int), 52.18),
    ]

    for sin_col, cos_col, position, period in cycles:
        angle = 2 * np.pi * position / period
        out[sin_col] = np.sin(angle)
        out[cos_col] = np.cos(angle)

    return out

# Rebind `df` to a copy that also carries the sine/cosine calendar columns.
df = add_periodic_date_features(df)

In [119]:
# Sanity check: row count, then per-column dtypes and non-null counts.
print(len(df))
df.info()

2851
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2851 entries, 0 to 2850
Data columns (total 83 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   2851 non-null   datetime64[ns]
 1   INSTRUMENT_1           2851 non-null   float64       
 2   INSTRUMENT_2           2851 non-null   float64       
 3   INSTRUMENT_3           2851 non-null   float64       
 4   INSTRUMENT_4           2851 non-null   float64       
 5   INSTRUMENT_5           2851 non-null   float64       
 6   INSTRUMENT_6           2851 non-null   float64       
 7   INSTRUMENT_7           2851 non-null   float64       
 8   INSTRUMENT_8           2851 non-null   float64       
 9   INSTRUMENT_9           2851 non-null   float64       
 10  INSTRUMENT_10          2851 non-null   float64       
 11  1mo                    2000 non-null   float64       
 12  1.5month               0 non-null      float64       
 13

In [120]:
# Column names of the ten instruments: INSTRUMENT_1 ... INSTRUMENT_10.
INSTRUMENTS = [f"INSTRUMENT_{i}" for i in range(1, 10 + 1)]

# Cyclical calendar columns produced by add_periodic_date_features.
DATE_FEATURES = [
    "dow_sin",
    "dow_cos",
    "doy_sin",
    "doy_cos",
    "month_sin",
    "month_cos",
    "woy_sin",
    "woy_cos",
]

# Candidate columns (names look like rate tenors - TODO confirm) that are kept
# only when present in `df` and non-null on more than half of the rows.
MACRO_FEATURES = [
    tenor
    for tenor in (
        "1mo", "1.5month", "2mo", "3mo", "4mo", "6mo", "1yr",
        "2yr", "3yr", "5yr", "7yr", "10yr", "20yr", "30yr",
    )
    if tenor in df.columns and df[tenor].notna().mean() > 0.5
]


In [121]:
def softmax(x: np.ndarray):
    """Return the softmax of ``x`` computed over all of its elements.

    The global maximum is subtracted before exponentiating so that large
    inputs do not overflow, and ``1e-12`` is added to the denominator as a
    guard against division by zero.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    denom = exps.sum() + 1e-12
    return exps / denom

def cap_and_renorm(w: pd.Series, cap=0.25) -> pd.Series:
    """Clip weights to ``[0, cap]`` and rescale them to sum to one.

    A plain clip-then-divide can push weights back above ``cap`` (for example
    ``[0.7, 0.1, 0.1, 0.1, 0.0]`` with ``cap=0.25`` renormalises to
    ``0.4545`` for the first entry). After the initial clip and rescale, any
    weight still above the cap is therefore pinned to ``cap`` and the
    remaining budget is redistributed proportionally over the other entries,
    repeating until no entry exceeds the cap.

    Parameters
    ----------
    w : pd.Series
        Raw weights. Missing values are treated as zero.
    cap : float, default 0.25
        Maximum weight allowed for a single entry.

    Returns
    -------
    pd.Series
        Weights on ``w.index`` summing to one. Special cases:

        * empty input returns an empty float Series;
        * if nothing positive survives the clip, equal weights are returned;
        * if ``cap * len(w) < 1`` the cap cannot be honoured while summing to
          one, so the plain clip-and-rescale result is returned.
    """
    n = len(w)
    if n == 0:
        # Nothing to allocate; dividing by len(w) would raise ZeroDivisionError.
        return pd.Series(dtype=float, index=w.index)

    vals = w.fillna(0.0).clip(lower=0.0, upper=cap).to_numpy(dtype=float).copy()
    total = float(vals.sum())
    if total <= 0:
        return pd.Series(1.0 / n, index=w.index)
    vals = vals / total

    if cap * n < 1.0:
        # Infeasible cap: keep the sum-to-one guarantee and accept the overshoot.
        return pd.Series(vals, index=w.index, name=w.name)

    tol = 1e-12  # float slack so values equal to the cap are not re-capped
    locked = np.zeros(n, dtype=bool)
    for _ in range(n):  # each pass pins at least one more entry, so n passes suffice
        over = (vals > cap + tol) & ~locked
        if not over.any():
            break
        locked |= over
        vals[locked] = cap
        free = ~locked
        if not free.any():
            break
        budget = 1.0 - cap * int(locked.sum())
        free_total = float(vals[free].sum())
        if free_total > 0:
            vals[free] *= budget / free_total
        else:
            # All uncapped entries are zero: share the remainder equally.
            vals[free] = budget / int(free.sum())

    return pd.Series(vals, index=w.index, name=w.name)

def build_feature_cols(df: pd.DataFrame):
    """List the model input columns: every column of ``df`` except ``date``.

    Column order is preserved. No other filtering is applied, so whatever
    else is in the frame (including the instrument columns themselves) is
    returned as a feature.
    """
    feature_cols = []
    for name in df.columns:
        if name != "date":
            feature_cols.append(name)
    return feature_cols

def make_labels_next_return(df: pd.DataFrame, inst: str):
    """Return the one-row-ahead simple return of column ``inst``.

    For row ``t`` the label is ``value[t + 1] / value[t] - 1``. The final row
    has no successor, so its label is NaN.
    """
    level = df[inst].astype(float)
    next_level = level.shift(-1)
    return next_level / level - 1.0

In [122]:
from sklearn.model_selection import ParameterGrid

def _new_regressor(params: dict):
    """Build an unfitted XGBRegressor with the fixed settings plus ``params``."""
    return XGBRegressor(
        objective="reg:squarederror",
        tree_method="hist",
        random_state=0,
        n_jobs=1,
        **params,
    )


def _corr_or_zero(pred, actual) -> float:
    """Correlation of two 1-D arrays via ``Series.corr``; NaN is mapped to 0.0."""
    corr = pd.Series(pred).corr(pd.Series(actual))
    return 0.0 if pd.isna(corr) else float(corr)


def train_predict_scores(
    df: pd.DataFrame,
    min_train: int = 750,
    val_size: int = 252,
):
    """Fit one XGBoost regressor per instrument and predict its next return.

    For every name in ``INSTRUMENTS`` the label is the one-row-ahead return
    from ``make_labels_next_return`` and the features are all columns from
    ``build_feature_cols``. The last ``val_size`` labelled rows form a
    validation window; each combination in a small hyperparameter grid is
    trained on the rows before it, and the combination with the highest
    validation correlation is kept. A final model with those hyperparameters
    is refit on all labelled rows and used to score the last row of ``df``.

    A per-instrument validation table and its column means are printed and
    shown with ``display`` (an IPython/Jupyter built-in).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``date`` column, the instrument columns and any other
        feature columns. It is sorted by date internally.
    min_train : int, default 750
        Minimum number of training rows required in addition to ``val_size``.
    val_size : int, default 252
        Number of most recent labelled rows held out for validation.

    Returns
    -------
    tuple
        ``(model, preds)``. ``model`` is the refit final model of the last
        instrument that had enough history, or ``None`` if every instrument
        was skipped. ``preds`` is a Series of predicted next returns keyed by
        instrument, with 0.0 for skipped instruments.
    """
    df = df.sort_values("date").reset_index(drop=True)
    feat_cols = build_feature_cols(df)
    # The last row has no next-period label, so it is never a training row.
    X_all = df.iloc[:-1][feat_cols].copy()

    param_grid = {
        "n_estimators": [150, 300],
        "max_depth": [3, 4],
        "learning_rate": [0.03, 0.05],
        "subsample": [0.8],
        "colsample_bytree": [0.8],
        "reg_lambda": [1.0, 3.0],
    }

    preds = {}
    validation_rows = []
    # Returned to the caller. Previously the loop variable of the grid search
    # leaked out instead, which was an arbitrary, non-refit candidate and was
    # undefined when no instrument had enough history.
    last_final_model = None

    for inst in INSTRUMENTS:
        y_all = make_labels_next_return(df, inst).iloc[:-1]
        mask = ~y_all.isna()

        X = X_all.loc[mask]
        y = y_all.loc[mask]

        if len(X) < min_train + val_size:
            # Not enough history: neutral prediction and an empty report row.
            preds[inst] = 0.0
            validation_rows.append(
                {
                    "instrument": inst,
                    "n_train": 0,
                    "n_val": 0,
                    "best_params": None,
                    "val_rmse": np.nan,
                    "val_corr": np.nan,
                    "val_hit_rate": np.nan,
                }
            )
            continue

        # Chronological split: the most recent val_size rows are held out.
        X_train, y_train = X.iloc[:-val_size], y.iloc[:-val_size]
        X_val, y_val = X.iloc[-val_size:], y.iloc[-val_size:]

        best_params = None
        best_score = -np.inf
        best_val_pred = None

        for params in ParameterGrid(param_grid):
            candidate = _new_regressor(params)
            candidate.fit(X_train.values, y_train.values)

            candidate_pred = candidate.predict(X_val.values)
            score = _corr_or_zero(candidate_pred, y_val.values)

            # Strict '>' keeps the first of any tied candidates.
            if score > best_score:
                best_score = score
                best_params = params
                best_val_pred = candidate_pred

        val_rmse = float(np.sqrt(np.mean((best_val_pred - y_val.values) ** 2)))
        val_hit_rate = float(((best_val_pred > 0) == (y_val.values > 0)).mean())

        validation_rows.append(
            {
                "instrument": inst,
                "n_train": len(X_train),
                "n_val": len(X_val),
                "best_params": best_params,
                "val_rmse": val_rmse,
                "val_corr": best_score,
                "val_hit_rate": val_hit_rate,
            }
        )

        # Refit on all available history with the best hyperparameters
        final_model = _new_regressor(best_params)
        final_model.fit(X.values, y.values)
        last_final_model = final_model

        # Score the most recent row, whose next return is not yet known.
        x_last = df.loc[df.index[-1], feat_cols].astype(float)
        preds[inst] = float(final_model.predict([x_last.values])[0])

    validation_df = pd.DataFrame(validation_rows).sort_values("val_corr", ascending=False)

    print("Validation results by instrument:")
    display(
        validation_df[
            ["instrument", "n_train", "n_val", "val_rmse", "val_corr", "val_hit_rate", "best_params"]
        ].round(4)
    )

    print("\nAverage validation metrics:")
    display(
        validation_df[["val_rmse", "val_corr", "val_hit_rate"]].mean().to_frame("mean").T.round(4)
    )

    return last_final_model, pd.Series(preds)

# Run the per-instrument training; `pred` holds one predicted next return per instrument.
model, pred = train_predict_scores(df)

Validation results by instrument:


Unnamed: 0,instrument,n_train,n_val,val_rmse,val_corr,val_hit_rate,best_params
7,INSTRUMENT_8,2598,252,0.0147,0.1432,0.5833,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
9,INSTRUMENT_10,2598,252,0.0338,0.1358,0.5516,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
8,INSTRUMENT_9,2598,252,0.0258,0.1254,0.5397,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
4,INSTRUMENT_5,2598,252,0.0075,0.046,0.5516,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
6,INSTRUMENT_7,2598,252,0.0096,0.0342,0.5516,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
0,INSTRUMENT_1,2598,252,0.0077,0.0255,0.4881,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
3,INSTRUMENT_4,2598,252,0.0092,0.0204,0.5635,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
1,INSTRUMENT_2,2598,252,0.0104,0.0118,0.5,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
5,INSTRUMENT_6,2598,252,0.0035,0.0115,0.5317,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."
2,INSTRUMENT_3,2598,252,0.0077,0.0024,0.5317,"{'colsample_bytree': 0.8, 'learning_rate': 0.0..."



Average validation metrics:


Unnamed: 0,val_rmse,val_corr,val_hit_rate
mean,0.013,0.0556,0.5393


In [123]:
def construct_weights(pred: pd.Series,
                      beta=5.0,
                      cap=0.25,
                      smooth_alpha=0.35,
                      prev_weights: pd.Series | None = None):
    

    # Use predictions directly. Raw *_vol columns are traded volume, not return volatility.
    scores = pred.copy()

    w = pd.Series(softmax(beta * scores.values), index=INSTRUMENTS)
    w = cap_and_renorm(w, cap=cap)

    if prev_weights is not None:
        prev_weights = prev_weights.reindex(INSTRUMENTS).fillna(0.0)
        prev_weights = prev_weights / prev_weights.sum()
        w = smooth_alpha * w + (1.0 - smooth_alpha) * prev_weights
        w = cap_and_renorm(w, cap=cap)

    return w

def write_submission(weights: pd.Series, team_name: str, round_n: int, out_path="."):
    """Write the weights to ``<out_path>/<team_name>_round_<round_n>.csv``.

    The file has two columns, ``asset`` (the index of ``weights``) and
    ``weight`` (its values), and no row-index column.

    Returns
    -------
    str
        The file name only, without ``out_path``.
    """
    fname = f"{team_name}_round_{round_n}.csv"
    destination = f"{out_path}/{fname}"

    table = pd.DataFrame({"asset": weights.index, "weight": weights.values})
    table.to_csv(destination, index=False)

    return fname


In [124]:
# Optional: load the previously submitted weights so construct_weights blends
# the new allocation with them (controlled by its smooth_alpha argument), e.g.
# prev = pd.read_csv("myteam_round_3.csv").set_index("asset")["weight"]
prev = None  # None means no smoothing against earlier weights

# Build the weights from the predictions, write the submission CSV to the
# current directory, and echo the file name and weights.
w = construct_weights(pred, prev_weights=prev)
fname = write_submission(w, team_name="HiddenLabel", round_n=1, out_path=".")
print("Wrote:", fname)
print(w)

Wrote: HiddenLabel_round_1.csv
INSTRUMENT_1     0.099252
INSTRUMENT_2     0.097238
INSTRUMENT_3     0.100249
INSTRUMENT_4     0.099738
INSTRUMENT_5     0.099608
INSTRUMENT_6     0.099529
INSTRUMENT_7     0.101037
INSTRUMENT_8     0.100646
INSTRUMENT_9     0.101023
INSTRUMENT_10    0.101680
dtype: float64


In [125]:
# def backtest_model(
#     df: pd.DataFrame,
#     start_idx: int = 1500,
#     retrain_every: int = 21,
#     beta: float = 5.0,
# ):
#     feat_cols = build_feature_cols(df)
#     next_returns = pd.DataFrame(
#         {inst: df[inst].shift(-1) / df[inst] - 1.0 for inst in INSTRUMENTS}
#     )

#     pred_rows = []

#     for t in range(start_idx, len(df) - 1, retrain_every):
#         X_train_full = df.iloc[:t][feat_cols]
#         x_test = df.loc[t, feat_cols].astype(float)

#         row = {"date": df.loc[t, "date"]}

#         for inst in INSTRUMENTS:
#             y_train_full = next_returns[inst].iloc[:t]
#             mask = ~y_train_full.isna()

#             X_train = X_train_full.loc[mask]
#             y_train = y_train_full.loc[mask]

#             if len(X_train) < 200:
#                 row[inst] = np.nan
#                 continue

#             model = XGBRegressor(
#                 n_estimators=300,
#                 max_depth=4,
#                 learning_rate=0.05,
#                 subsample=0.8,
#                 colsample_bytree=0.8,
#                 reg_alpha=0.0,
#                 reg_lambda=1.0,
#                 objective="reg:squarederror",
#                 tree_method="hist",
#                 random_state=0,
#                 n_jobs=1,
#             )
#             model.fit(X_train.values, y_train.values)
#             row[inst] = float(model.predict([x_test.values])[0])

#         pred_rows.append(row)

#     pred_df = pd.DataFrame(pred_rows)

#     actual_df = next_returns.iloc[start_idx : len(df) - 1 : retrain_every].reset_index(drop=True)
#     actual_df.insert(
#         0,
#         "date",
#         df["date"].iloc[start_idx : len(df) - 1 : retrain_every].reset_index(drop=True),
#     )

#     pred_mat = pred_df[INSTRUMENTS]
#     act_mat = actual_df[INSTRUMENTS]

#     ic_by_date = pred_mat.corrwith(act_mat, axis=1, method="spearman")
#     directional_accuracy = ((pred_mat.values > 0) == (act_mat.values > 0)).mean()

#     scores = pred_mat.to_numpy()
#     scores = scores - np.nanmax(scores, axis=1, keepdims=True)
#     exp_scores = np.exp(beta * scores)
#     weights = exp_scores / np.nansum(exp_scores, axis=1, keepdims=True)

#     portfolio_returns = (weights * act_mat.to_numpy()).sum(axis=1)
#     equity_curve = pd.Series((1 + portfolio_returns).cumprod(), index=pred_df["date"])

#     ann_factor = np.sqrt(252 / retrain_every)
#     sharpe = portfolio_returns.mean() / (portfolio_returns.std(ddof=1) + 1e-12) * ann_factor

#     running_max = equity_curve.cummax()
#     drawdown = equity_curve / running_max - 1.0
#     max_drawdown = drawdown.min()

#     metrics = pd.Series(
#         {
#             "n_test_points": len(pred_df),
#             "mean_spearman_ic": ic_by_date.mean(),
#             "hit_rate": directional_accuracy,
#             "annualized_sharpe": sharpe,
#             "avg_period_return": portfolio_returns.mean(),
#             "vol_period_return": portfolio_returns.std(ddof=1),
#             "total_return": equity_curve.iloc[-1] - 1.0,
#             "max_drawdown": max_drawdown,
#         }
#     )

#     results = {
#         "predictions": pred_df,
#         "actuals": actual_df,
#         "ic_by_date": ic_by_date,
#         "portfolio_returns": pd.Series(portfolio_returns, index=pred_df["date"]),
#         "equity_curve": equity_curve,
#         "metrics": metrics,
#     }
#     return results


In [126]:
# bt = backtest_model(df, start_idx=1500, retrain_every=21, beta=5.0)

# print(bt["metrics"].round(4))

# bt["equity_curve"].plot(title="Out-of-Sample Equity Curve", figsize=(10, 4))
