In [1]:
#!/usr/bin/env python
"""
Example classifier on Numerai data using a xgboost regression.
To get started, install the required packages: pip install pandas numpy sklearn xgboost
"""

'\nExample classifier on Numerai data using a xgboost regression.\nTo get started, install the required packages: pip install pandas numpy sklearn xgboost\n'

In [2]:
import csv
from pathlib import Path

In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from models.burningcrusade.data_preparation import prepare_data
from helpers.utils import generate_features_list, load_model, clean_for_xgboost
import xgboost

In [4]:
import numerapi
NAPI = numerapi.NumerAPI(verbosity="info")

In [5]:
TARGET_NAME = f"target"
PREDICTION_NAME = f"prediction"

In [6]:
pre = "../models/burningcrusade/"
QURTY_MODEL = Path(pre+"burningcrusade_green_mmc_good_corr.pkl")
BOOMKIN_MODEL = Path(pre+"wotlk.pkl")
DISC_MODEL = Path(pre+"burningcrusade_best_mmc_yet.pkl")
BURNING_CRUSADE_MODEL = Path(pre+"burningcrusade_all_green_fnc.pkl")
WOTLK_MODEL = Path(pre+"burningcrusade_another_fnc.pkl")

In [7]:
QURTYN_MODEL = Path(pre+"burningcrusade_green_fnc_sharpe_drawdown.pkl")
BOOMKINN_MODEL = Path(pre+"burningcrusade_4_greenies.pkl")
COMBUSTON_MODEL = Path(pre+"burningcrusade_3_greenies.pkl")

Submissions are scored by spearman correlation

In [8]:
def correlation(predictions, targets):
    ranked_preds = predictions.rank(pct=True, method="first")
    return np.corrcoef(ranked_preds, targets)[0, 1]

convenience method for scoring

In [9]:
def score(df):
    return correlation(df[PREDICTION_NAME], df[TARGET_NAME])

Payout is just the score cliped at +/-25%

In [10]:
def payout(scores):
    return scores.clip(lower=-0.25, upper=0.25)

Read the csv file into a pandas Dataframe as float16 to save space

In [11]:
def read_csv(file_path):
    with open(file_path, 'r') as f:
        column_names = next(csv.reader(f))
    dtypes = {x: np.float16 for x in column_names if x.startswith(('feature', 'target'))}
    df = pd.read_csv(file_path, dtype=dtypes, index_col=0)

    # Memory constrained? Try this instead (slower, but more memory efficient)
    # see https://forum.numer.ai/t/saving-memory-with-uint8-features/254
    # dtypes = {f"target": np.float16}
    # to_uint8 = lambda x: np.uint8(float(x) * 4)
    # converters = {x: to_uint8 for x in column_names if x.startswith('feature')}
    # df = pd.read_csv(file_path, dtype=dtypes, converters=converters)
    return df

In [12]:
def neutralize(df,
               columns,
               extra_neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    # need to do this for lint to be happy bc [] is a "dangerous argument"
    if extra_neutralizers is None:
        extra_neutralizers = []
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        print(u, end="\r")
        df_era = df[df[era_col] == u]
        scores = df_era[columns].values
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (pd.Series(x).rank(method="first").values - .5) / len(x)
                scores2.append(x)
            scores = np.array(scores2).T
            extra = df_era[extra_neutralizers].values
            exposures = np.concatenate([extra], axis=1)
        else:
            exposures = df_era[extra_neutralizers].values
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))
        scores /= scores.std(ddof=0)
        computed.append(scores)
    return pd.DataFrame(np.concatenate(computed),
                        columns=columns,
                        index=df.index)
def neutralize_series(series, by, proportion=1.0):
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)

    # this line makes series neutral to a constant column so that it's centered and for sure gets corr 0 with exposures
    exposures = np.hstack(
        (exposures,
         np.array([np.mean(series)] * len(exposures)).reshape(-1, 1)))
    correction = proportion * (exposures.dot(
        np.linalg.lstsq(exposures, scores, rcond=None)[0]))
    corrected_scores = scores - correction
    neutralized = pd.Series(corrected_scores.ravel(), index=series.index)
    return neutralized
def unif(df):
    x = (df.rank(method="first") - 0.5) / len(df)
    return pd.Series(x, index=df.index)
def get_feature_neutral_mean(df, feature_cols):
    df.loc[:, "neutral_sub"] = neutralize(df, [PREDICTION_NAME],
                                          feature_cols)[PREDICTION_NAME]
    scores = df.groupby("era").apply(
        lambda x: correlation(x["neutral_sub"], x[TARGET_NAME])).mean()
    return np.mean(scores)

In [13]:
print("Loading data...")
# The training data is used to train your model how to predict the targets.
train, tournament = prepare_data(pre+"burningcrusade_features.pkl")
# The tournament data is the data that Numerai uses to evaluate your model.
feature_names = generate_features_list(train)
print(f"Loaded {len(feature_names)} features")

Loading data...
Downloading new data for round: 263!


/Users/brianbroeking/projects/numerai/data/numerai_dataset_263.zip:  99%|█████████▉| 400M/403M [00:09<00:00, 38.6MB/s] 2021-05-08 23:16:37,783 INFO numerapi.base_api: unzipping file...


Loading the data


/Users/brianbroeking/projects/numerai/data/numerai_dataset_263.zip: 403MB [00:25, 15.7MB/s]                           


Loaded 328 features


In [14]:
qurty = load_model(QURTY_MODEL)
boomkin = load_model(BOOMKIN_MODEL)
disc = load_model(DISC_MODEL)
bc = load_model(BURNING_CRUSADE_MODEL)
wotlk = load_model(WOTLK_MODEL)
qurtyn = load_model(QURTYN_MODEL)
boomkinn = load_model(BOOMKINN_MODEL)
combustion = load_model(COMBUSTON_MODEL)
current_round = NAPI.get_current_round()

In [15]:
models = [
    ('qurty', qurty),
    ('boomkin', boomkin),
    ('disc', disc),
    ('bc', bc),
    ('wotlk', wotlk),
    ('boomkinn', boomkinn),
    ('qurtyn', qurtyn),
    ('combustion', combustion),
]

In [16]:
def generate(name, model):
    training_data = train.copy()
    tournament_data = tournament.copy()
    
    X_train, y_train = clean_for_xgboost(train)
    X_tournament, y_tournament = clean_for_xgboost(tournament)
    print("Generating predictions...")
    training_data.loc[:, PREDICTION_NAME] = model.predict(X_train)
    tournament_data.loc[:, PREDICTION_NAME] = model.predict(X_tournament)

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std(ddof=0)}")
    print(f"On training the average per-era payout is {payout(train_correlations).mean()}")
    """Validation Metrics"""
    # Check the per-era correlations on the validation set (out of sample)
    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std(ddof=0)}")
    print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")

    # Check the "sharpe" ratio on the validation set
    validation_sharpe = validation_correlations.mean() / validation_correlations.std(ddof=0)
    print(f"Validation Sharpe: {validation_sharpe}")
    print("checking max drawdown...")
    rolling_max = (validation_correlations + 1).cumprod().rolling(window=100,
                                                                min_periods=1).max()
    daily_value = (validation_correlations + 1).cumprod()
    max_drawdown = -((rolling_max - daily_value) / rolling_max).max()
    print(f"max drawdown: {max_drawdown}")

    # Check the feature exposure of your validation predictions
    feature_exposures = validation_data[feature_names].apply(lambda d: correlation(validation_data[PREDICTION_NAME], d),
                                                            axis=0)
    max_per_era = validation_data.groupby("era").apply(
        lambda d: d[feature_names].corrwith(d[PREDICTION_NAME]).abs().max())
    max_feature_exposure = max_per_era.mean()
    print(f"Max Feature Exposure: {max_feature_exposure}")

    # Check feature neutral mean
    print("Calculating feature neutral mean...")
    feature_neutral_mean = get_feature_neutral_mean(validation_data, X_train.columns)
    print(f"Feature Neutral Mean is {feature_neutral_mean}")

    # Load example preds to get MMC metrics
    example_preds = pd.read_csv(f"../data/numerai_dataset_{current_round}/example_predictions.csv").set_index("id")["prediction"]
    validation_example_preds = example_preds[validation_data.index].values
    validation_data.loc[:, "ExamplePreds"] = validation_example_preds
    print("calculating MMC stats...")
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, x in validation_data.groupby("era"):
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                pd.Series(unif(x["ExamplePreds"])))
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29 ** 2))
        corr_scores.append(correlation(unif(x[PREDICTION_NAME]), x[TARGET_NAME]))
    val_mmc_mean = np.mean(mmc_scores)
    val_mmc_std = np.std(mmc_scores)
    val_mmc_sharpe = val_mmc_mean / val_mmc_std
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_mean = np.mean(corr_plus_mmcs)
    corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe
    print(
        f"MMC Mean: {val_mmc_mean}\n"
        f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n"
        f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}"
    )
    # Check correlation with example predictions
    full_df = pd.concat([pd.DataFrame(validation_example_preds), validation_data[PREDICTION_NAME], validation_data["era"]], axis=1)
    full_df.columns = ["example_preds", "prediction", "era"]
    per_era_corrs = full_df.groupby('era').apply(lambda d: correlation(unif(d["prediction"]), unif(d["example_preds"])))
    corr_with_example_preds = per_era_corrs.mean()
    print(f"Corr with example preds: {corr_with_example_preds}")

    # Save predictions as a CSV and upload to https://numer.ai
    tournament_data.set_index('id', inplace=True)
    tournament_data[PREDICTION_NAME].to_csv(f"../submissions/{name}/submission_{name}_{current_round}.csv", header=True)

In [17]:
for pair in models:
    generate(pair[0], pair[1])

Generating predictions...
On training the correlation has mean 0.07938842721104011 and std 0.020025013213848798
On training the average per-era payout is 0.07938842721104011
On validation the correlation has mean 0.021026576936866777 and std 0.020088435194400896
On validation the average per-era payout is 0.021026576936866777
Validation Sharpe: 1.0467005883428573
checking max drawdown...
max drawdown: -0.012104731584593315
Max Feature Exposure: 0.3044719324670308
Calculating feature neutral mean...
era212

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Feature Neutral Mean is 0.0006159097986371161
calculating MMC stats...
MMC Mean: 0.00479077681890712
Corr Plus MMC Sharpe:0.9253847481111769
Corr Plus MMC Diff:-0.1213158402316804
Corr with example preds: 0.5570093484586577
Generating predictions...
On training the correlation has mean 0.0794471225792282 and std 0.0232163940281972
On training the average per-era payout is 0.0794471225792282
On validation the correlation has mean 0.020529590120606716 and std 0.01990772611081819
On validation the average per-era payout is 0.020529590120606716
Validation Sharpe: 1.0312373199393474
checking max drawdown...
max drawdown: -0.01653109463678609
Max Feature Exposure: 0.24562651389898152
Calculating feature neutral mean...
Feature Neutral Mean is 0.0036643654309945423
calculating MMC stats...
MMC Mean: 0.002770026241170867
Corr Plus MMC Sharpe:0.8951028186438453
Corr Plus MMC Diff:-0.13613450129550209
Corr with example preds: 0.6326342232248564
Generating predictions...
On training the correlati