In [None]:
# Install the third-party packages this notebook relies on; `-q` asks the installer for quiet output.
# pyarrow is not imported directly below — presumably it is the parquet engine behind pd.read_parquet.
%uv pip install numerapi pyarrow numerai-tools xgboost -q

In [None]:
import json
from numerapi import NumerAPI
import pandas as pd
import xgboost as xgb
import warnings
from numerai_tools.scoring import numerai_corr
import warnings
import numpy as np

# Introduction

Let's start by loading the training and validation datasets.

In [None]:
DATA_VERSION = "v5.1"
# Column of the benchmark-models file that serves as the reference prediction.
BENCHMARK_MODEL = "v51_lgbm_cyrusd20"
napi = NumerAPI()

# Feature metadata: maps feature-set names to lists of feature column names.
napi.download_dataset(f"{DATA_VERSION}/features.json")
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(f"{DATA_VERSION}/features.json") as features_file:
    feature_metadata = json.load(features_file)
feature_sets = feature_metadata["feature_sets"]
feature_set = feature_sets["all"]

# Only these columns are read from the train/validation parquet files.
columns_to_load = ["era", "target"] + feature_set

# Download the training data - this will take a few minutes
napi.download_dataset(f"{DATA_VERSION}/train.parquet")
train = pd.read_parquet(
    f"{DATA_VERSION}/train.parquet",
    columns=columns_to_load,
)

# Download validation data - this will take a few minutes
napi.download_dataset(f"{DATA_VERSION}/validation.parquet")
validation = pd.read_parquet(
    f"{DATA_VERSION}/validation.parquet",
    columns=columns_to_load,
)

# Download the validation benchmark models
napi.download_dataset(f"{DATA_VERSION}/validation_benchmark_models.parquet")
validation_benchmark_models = pd.read_parquet(
    f"{DATA_VERSION}/validation_benchmark_models.parquet",
)

# The benchmark prediction lives in the benchmark-models frame, not in `validation`
# (which was loaded with only era/target/features), so it must be read from there.
# NOTE(review): the assignment aligns on the row index — assumes both parquet files
# share the same index; confirm.
validation["benchmark_prediction"] = validation_benchmark_models[BENCHMARK_MODEL]
# Drop every row that has a missing value in any column (target, features or benchmark).
validation = validation.dropna()

We keep the last 100 eras as the test set for measuring performance.

We fit the model on the rest of the eras.

We remove the first 4 test eras to avoid data leakage.

In [None]:
# Number of trailing validation eras held out as the test set.
N_TEST_ERAS = 100
# Number of eras to skip right after the last training era.
N_EMBARGO_ERAS = 4

# Keep the last N_TEST_ERAS eras for the test set; the remaining validation eras go to training.
test_era = validation["era"].unique()[-N_TEST_ERAS:]
in_test = validation["era"].isin(test_era)
test = validation[in_test]
validation = validation[~in_test]

# Concatenate train and validation datasets for training.
# NOTE: this cell is not idempotent — re-running it appends the validation rows to
# `train` a second time. Re-run the data loading cell first if it must be repeated.
train = pd.concat([train, validation.drop("benchmark_prediction", axis=1)], ignore_index=True)


# Eras are 1 week apart, but targets look 20 days (or 4 weeks/eras) into the future,
# so we need to "embargo" the first 4 eras following our last train era to avoid "data leakage".
last_train_era = int(train["era"].unique()[-1])
# Offsets start at 1: offset 0 is the last train era itself, which is not in `test`,
# so starting at 0 would embargo only N_EMBARGO_ERAS - 1 test eras.
eras_to_embargo = [
    str(last_train_era + offset).zfill(4)
    for offset in range(1, N_EMBARGO_ERAS + 1)
]

# .copy() gives `test` its own data, so adding a column later does not write into a
# slice of `validation` (which would raise SettingWithCopyWarning).
test = test[~test["era"].isin(eras_to_embargo)].copy()

# Training

We observe the best model is the following one:

TODO complete here

Now we train on the full set of train and validation eras and run the model on the test eras to measure its real out-of-sample performance.

**NB:** Ideally we would run a rolling cross-validation, retraining every week; however, this is costly and we do not expect it to change the results much, so we train the model only once.

In [None]:
# Gradient-boosted regressor: many shallow-learning-rate trees, each built on a
# 10% column sample, trained with the histogram method on the GPU.
model = xgb.XGBRegressor(
    n_estimators=30000,
    learning_rate=0.001,
    max_depth=10,
    colsample_bytree=0.1,
    verbosity=0,
    # `random_state` is the documented estimator parameter for the random seed
    # (the original passed `seed` through **kwargs).
    random_state=42,
    tree_method="hist",
    device="cuda",
)

# Fit on every train + validation era, then score the held-out test eras.
model.fit(train[feature_set], train["target"])
test_predictions = model.predict(test[feature_set])
# `assign` returns a new frame instead of writing into `test`, which may be a
# filtered slice of `validation` (in-place assignment raises SettingWithCopyWarning).
test = test.assign(prediction=test_predictions)

## Evaluation


Numerai provides a benchmark model. We will use it to compare our performance on the usual metrics.

In [39]:
def evaluate_prediction(test, prediction_col, feature_cols=None, title="Cumulative Validation CORR"):
    """Score one prediction column era by era, print a summary and plot the cumulative curve.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame holding an "era" column, a "target" column, ``prediction_col`` and
        the feature columns.
    prediction_col : str
        Name of the column in ``test`` that holds the predictions to evaluate.
    feature_cols : sequence of str, optional
        Columns used for the feature-exposure computation. When omitted, every
        column of ``test`` is used except "era", "target", "prediction",
        "benchmark_prediction" and ``prediction_col``.
    title : str, optional
        Title of the cumulative per-era correlation plot.

    Returns
    -------
    The per-era correlations produced by applying ``numerai_corr`` to each era
    group, with missing values replaced by 0.
    """
    if feature_cols is None:
        # Every non-feature column this notebook adds next to the features. The other
        # model's prediction column must be excluded too, otherwise its correlation
        # with `prediction_col` would be reported as a feature exposure.
        non_feature_cols = {"era", "target", "prediction", "benchmark_prediction", prediction_col}
        feature_cols = [col for col in test.columns if col not in non_feature_cols]
    else:
        feature_cols = list(feature_cols)

    def _era_corr(era_df):
        # Drop incomplete rows jointly so predictions and targets stay row-aligned.
        scored = era_df[[prediction_col, "target"]].dropna()
        return numerai_corr(scored[[prediction_col]], scored["target"])

    # Compute per-era correlation
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        per_era_corr = test.groupby("era").apply(_era_corr, include_groups=False)
        per_era_corr = per_era_corr.fillna(0)

    # Compute performance metrics
    cumulative_corr = per_era_corr.cumsum()
    corr_mean = per_era_corr.mean().item()
    corr_std = per_era_corr.std(ddof=0).item()
    # A zero spread would make the ratio meaningless, so report NaN instead.
    corr_sharpe = corr_mean / corr_std if corr_std > 0 else np.nan
    # Largest drop of the cumulative curve below its running maximum.
    running_peak = cumulative_corr.expanding(min_periods=1).max()
    corr_max_drawdown = (running_peak - cumulative_corr).max().item()
    # Share of eras with a strictly positive correlation.
    corr_hit_rate = (per_era_corr > 0).mean().item()

    # Display performance metrics
    print("=" * 60)
    print("MODEL PERFORMANCE METRICS")
    print("=" * 60)
    print(f"Mean:           {corr_mean:>10.6f}")
    print(f"Std:            {corr_std:>10.6f}")
    print(f"Sharpe:         {corr_sharpe:>10.4f}")
    print(f"Max Drawdown:   {corr_max_drawdown:>10.6f}")
    print(f"Hit Rate:       {corr_hit_rate:>10.2%}")
    print()

    # Compute feature exposures: per era, the correlation of each feature with the
    # evaluated prediction column. Uses the `test` argument and `prediction_col`
    # (not a global frame or a hard-coded column name).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)
        feature_exposures = test.groupby("era").apply(
            lambda era_df: era_df[feature_cols].corrwith(era_df[prediction_col]),
            include_groups=False,
        )
        feature_exposures = feature_exposures.fillna(0)

    # Display feature exposure metrics
    # NOTE(review): this takes the signed maximum, so a strongly negative exposure is
    # not counted — confirm whether `.abs()` should be applied first.
    max_feature_exposure = feature_exposures.max(axis=1)
    print("FEATURE EXPOSURE")
    print("=" * 60)
    print(f"Max Feature Exposure - Mean: {max_feature_exposure.mean():.4f}")
    print(f"Max Feature Exposure - Std:  {max_feature_exposure.std():.4f}")
    print("=" * 60)
    print()

    # Plot the cumulative per-era correlation
    cumulative_corr.plot(
        title=title,
        kind="line",
        figsize=(8, 4),
        legend=False,
    )

    return per_era_corr

In [None]:
# Score our model's predictions on the test eras; the call prints the metric
# summary and returns the per-era correlations.
per_era_corr = evaluate_prediction(test=test, prediction_col="prediction")

# Draw the running sum of the per-era correlations under a model-specific title.
cumulative_corr = per_era_corr.cumsum()
cumulative_corr.plot(
    kind="line",
    title="Cumulative Validation CORR on final model",
    legend=False,
    figsize=(8, 4),
)

In [None]:
# Score the benchmark predictions on the same test eras; the call prints the
# metric summary and returns the per-era correlations.
per_era_corr = evaluate_prediction(test=test, prediction_col="benchmark_prediction")

# Draw the running sum of the per-era correlations under a benchmark-specific title.
cumulative_corr = per_era_corr.cumsum()
cumulative_corr.plot(
    kind="line",
    title="Cumulative Validation CORR on benchmark model",
    legend=False,
    figsize=(8, 4),
)

## Final analysis

TODO complete the final analysis here