In [1]:
import pandas as pd
import numpy as np
import random 

from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, mean_absolute_error

import xgboost
import numerapi
NAPI = numerapi.NumerAPI(verbosity="info")
import random as rn
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr
from bayes_opt import BayesianOptimization
import os
import pickle

from helpers.utils import *
from helpers.xgboost_feval import *
from models.burningcrusade.data_preparation import prepare_data

In [2]:
# The data directory can be overridden with the NUMERAI_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original local
# path is kept as the fallback so existing setups behave exactly as before.
DIR = os.environ.get("NUMERAI_DATA_DIR", "/Users/brianbroeking/projects/numerai/data")

# Project helpers (helpers.utils): refresh the data on disk, then load the three
# splits. NOTE(review): judging by the cell output, download_current_data skips
# the download when the current round is already present — confirm in the helper.
download_current_data(DIR)
train_static, val_static, tournament_static = load_data(DIR, reduce_memory=True)

You already have the newest data! Current round is: 261
Loading the data


In [3]:
# Run the same project helper over the train and tournament frames, in that order.
# NOTE(review): presumably adds per-group aggregate columns — confirm in get_group_stats.
train_with_group, tournament_with_group = (
    get_group_stats(static_frame)
    for static_frame in (train_static, tournament_static)
)

In [4]:
# Names of the label column and of the column that will hold model output.
TARGET_NAME = "target"
PREDICTION_NAME = "prediction"

# Feature column names, derived from the training frame by the project helper.
feature_names = generate_features_list(train_static)

In [5]:
# Number of base features drawn for polynomial expansion, and the seed that
# makes the draw repeatable. Without a fixed seed every kernel restart selects
# a different subset, so a saved model could not be matched to its features.
N_POLY_FEATURES = 31
FEATURE_SAMPLE_SEED = 42

# A dedicated Random instance keeps the draw independent of any other use of
# the global `random` state elsewhere in the session.
ft_corr_list = random.Random(FEATURE_SAMPLE_SEED).sample(feature_names, N_POLY_FEATURES)
train, tournament = generate_polynomial_features(ft_corr_list, train_with_group, tournament_with_group)

In [6]:
# Split each frame into model inputs and labels with the project helper.
X_train, y_train = clean_for_xgboost(train)
X_tournament, y_tournament = clean_for_xgboost(tournament)

# Wrap both splits in XGBoost's native DMatrix container (labels attached).
dtrain = xgboost.DMatrix(X_train, label=y_train)
dtournament = xgboost.DMatrix(X_tournament, label=y_tournament)

In [None]:
def ar1(x):
    """Return the lag-1 autocorrelation of the sequence ``x``.

    Computed as the Pearson correlation between ``x`` without its last
    element and ``x`` without its first element.
    """
    earlier, later = x[:-1], x[1:]
    corr_matrix = np.corrcoef(earlier, later)
    return corr_matrix[0, 1]

def autocorr_penalty(x):
    """Return the autocorrelation penalty factor for the sequence ``x``.

    With ``n = len(x)`` and ``p = ar1(x)``, the result is
    ``sqrt(1 + 2 * sum(((n - i) / n) * p**i for i in 1..n-1))``.
    """
    length = len(x)
    rho = ar1(x)
    weighted_terms = []
    for lag in range(1, length):
        weighted_terms.append(((length - lag) / length) * rho**lag)
    return np.sqrt(1 + 2 * np.sum(weighted_terms))

def smart_sharpe(x):
    """Return mean(x) divided by (sample std of x * autocorr_penalty(x)).

    The standard deviation uses ``ddof=1``.
    """
    volatility = np.std(x, ddof=1)
    penalty = autocorr_penalty(x)
    return np.mean(x) / (volatility * penalty)

def era_boost_train(X, y, era_col, proportion=0.5,
                    trees_per_step=10, num_iters=200,
                    one_shot=False, tree_method='hist',
                    test_model=None, note=None):
    """Train an XGBoost regressor with era boosting, or evaluate an existing model.

    The first iteration fits on all rows. Every later iteration continues
    training from the current booster using only the rows of the "worst"
    eras: those whose per-era Spearman correlation between prediction and
    target is at or below the ``proportion`` quantile of all era scores.
    After each iteration the per-era scores are plotted and summary metrics
    are printed. The loop stops early once the lag-1 autocorrelation of the
    era scores (sorted by era label) falls below 0.1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its columns are the features used for every fit.
    y : array-like
        Target values, stored alongside ``X`` under the column ``"target"``.
    era_col : array-like
        Era label of each row, stored under the column ``"era"``.
    proportion : float
        Quantile of the era scores that defines the worst eras.
    trees_per_step : int
        ``n_estimators`` of the initial fit and the increment applied to
        ``n_estimators`` before each later fit.
    num_iters : int
        Maximum number of iterations.
    one_shot : bool
        If true, run a single iteration with ``trees_per_step * num_iters``
        estimators.
    tree_method : str
        Forwarded to ``xgboost.XGBRegressor``.
    test_model : fitted model or None
        If given, nothing is trained; the model is scored once on ``X``.
    note : str or None
        Optional text printed before training starts.

    Returns
    -------
    The fitted ``XGBRegressor``, or ``test_model`` when one was supplied.

    Raises
    ------
    ValueError
        If there would be no iteration to run (``num_iters < 1``).
    """
    print(f"\n#### Era boost train with proportion {proportion:0.3f} ####\n")
    if note is not None:
        print(note)
    if one_shot:
        trees_per_step = trees_per_step * num_iters
        num_iters = 1
    if num_iters < 1:
        # Previously this surfaced as an UnboundLocalError on `return model`.
        raise ValueError("num_iters must be at least 1")

    if test_model is None:
        print(f"Train {num_iters} iterations")
        print(f"Train {trees_per_step} rounds per iteration")
    else:
        print("Testing model performance")

    features = X.columns
    new_df = X.copy()
    new_df["target"] = y
    new_df["era"] = era_col

    model = test_model
    worst_df = None  # rows of the worst eras, set at the end of each iteration
    for i in range(num_iters):
        print(f"\nIteration {i+1}:\n")
        if test_model is None:
            if i == 0:
                model = xgboost.XGBRegressor(max_depth=18,
                                             learning_rate=0.001,
                                             n_estimators=trees_per_step,
                                             n_jobs=-1,
                                             colsample_bytree=0.1,
                                             gamma=0.1,
                                             tree_method=tree_method)
                model.fit(X, y)
            else:
                # NOTE(review): when `xgb_model` is passed, XGBoost appears to
                # train `n_estimators` *additional* rounds on top of the given
                # booster, so growing n_estimators here may add more than
                # `trees_per_step` trees per iteration. Behaviour is left as
                # it was; confirm against the installed xgboost version.
                model.n_estimators += trees_per_step
                booster = model.get_booster()
                print("fitting on worst eras")
                model.fit(worst_df[features], worst_df["target"], xgb_model=booster)

        # score each era
        print("predicting on train")
        new_df["pred"] = model.predict(X)
        print("getting per era scores")
        era_scores = _spearman_by_era(new_df).sort_values()

        worst_eras = era_scores[era_scores <= era_scores.quantile(proportion)].index
        print(list(worst_eras))
        worst_df = new_df[new_df["era"].isin(worst_eras)]

        era_scores = era_scores.sort_index()
        era_scores.plot(kind="bar")
        print("performance over time")
        plt.show()

        era_autocorr = ar1(era_scores)
        print("autocorrelation")
        print(era_autocorr)
        print("mean correlation")
        print(np.mean(era_scores))
        print("sharpe")
        print(np.mean(era_scores)/np.std(era_scores))
        print("smart sharpe")
        print(smart_sharpe(era_scores))

        # Stop once era scores are no longer noticeably autocorrelated. The
        # check runs after the metrics so the returned model's scores are
        # always reported.
        if era_autocorr < 0.1:
            return model
        # A supplied model is never retrained, so further passes would only
        # repeat the identical evaluation.
        if test_model is not None:
            break
    return model


def _spearman_by_era(scored_df):
    """Return a float Series of Spearman(pred, target) per era, indexed by era label."""
    scores = {}
    # One grouped pass instead of a boolean mask per era; sort=False keeps
    # first-appearance order and observed=True skips unused categories.
    for era, era_df in scored_df.groupby("era", sort=False, observed=True):
        score = spearmanr(era_df["pred"], era_df["target"])[0]
        print(score)
        scores[era] = score
    # Explicit float dtype: a Series built without data defaults to object
    # dtype on current pandas, which breaks quantile().
    return pd.Series(scores, dtype=float)

# Era-boosted training on the prepared training frame: up to 20 iterations,
# each refit targeting the eras scoring at or below the median (proportion=0.5).
boost_model = era_boost_train(
    X_train,
    y_train,
    era_col=train["era"],
    proportion=0.5,
    trees_per_step=10,
    num_iters=20,
)


#### Era boost train with proportion 0.500 ####

Train 20 iterations
Train 10 rounds per iteration

Iteration 1:



In [None]:
# Preview only the first rows; enough to sanity-check the feature columns
# without rendering the whole training frame.
X_train.head()

In [None]:
# Persist the trained model through the project's save_model helper.
# NOTE(review): where "burningcrusade.pkl" ends up on disk is decided inside
# the helper and is not visible from this notebook — confirm before relying on it.
save_model(boost_model, "burningcrusade.pkl")

In [None]:
# Score the tournament rows and write this round's submission file.
tournament.loc[:, PREDICTION_NAME] = boost_model.predict(X_tournament)
current_round = NAPI.get_current_round()

# Index a *new* object by id instead of mutating `tournament` in place: the
# in-place set_index removed the 'id' column, so re-running the cell raised
# a KeyError.
submission = tournament.set_index('id')[PREDICTION_NAME]

# Make sure the target folder exists before writing.
submission_path = f"submissions/burningcrusade/submission_{current_round}.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, header=True)