In [1]:
import os

from mlens.ensemble import SuperLearner
from helpers.utils import *

[MLENS] backend: threading


In [2]:
# Directory holding the Numerai data files. Set the NUMERAI_DATA_DIR
# environment variable to run the notebook on another machine; the original
# author's local path is kept as the fallback so existing runs are unchanged.
DIR = os.environ.get("NUMERAI_DATA_DIR", "/Users/brianbroeking/projects/numerai/data")
# load_data is presumably provided by the helpers.utils star import; it
# returns the train, validation and tournament frames in that order.
train_static, val_static, tournament_static = load_data(DIR, reduce_memory=True)
# Candidate feature names derived from the training frame; sampled from
# later when building each model.
features_list = generate_features_list(train_static)

Loading the data


In [3]:
def era_boost_train(X, y, era_col, proportion=0.5,
                    trees_per_step=10, num_iters=200,
                    one_shot=False, tree_method='hist',
                    test_model=None, note=None):
    """Train an XGBoost regressor iteratively, boosting on its worst eras.

    The first iteration fits on all rows (or adopts ``test_model`` as is).
    After every iteration the Spearman correlation between predictions and
    target is computed per era, and the eras scoring at or below the
    ``proportion`` quantile are selected. Each later iteration adds
    ``trees_per_step`` trees, continuing from the current booster and fitting
    only on the rows of those worst eras. Per-era scores and summary
    statistics are printed and plotted after every iteration.

    Parameters
    ----------
    X : DataFrame
        Feature matrix; all of its columns are used as features.
    y : array-like
        Target values, stored alongside ``X`` in a ``"target"`` column.
    era_col : array-like
        Era label per row, stored alongside ``X`` in an ``"era"`` column.
    proportion : float
        Quantile of the per-era scores; eras scoring at or below it are
        used for the next fit.
    trees_per_step : int
        Number of trees trained initially and added on each later iteration.
    num_iters : int
        Number of train/score iterations.
    one_shot : bool
        If true, train ``trees_per_step * num_iters`` trees in one iteration.
    tree_method : str
        Passed through to ``xgboost.XGBRegressor``.
    test_model : model or None
        Existing model to score instead of training a fresh one. It is not
        fitted on the first iteration, but later iterations increase its
        ``n_estimators`` and refit it, so the object is modified in place.
    note : str or None
        Optional message printed before training starts.

    Returns
    -------
    The model after the last iteration.

    Raises
    ------
    ValueError
        If fewer than one iteration is requested.
    """
    print(f"\n#### Era boost train with proportion {proportion:0.3f} ####\n")
    if note is not None:
        print(note)
    if one_shot:
        trees_per_step = trees_per_step * num_iters
        num_iters = 1
    if num_iters < 1:
        # Without at least one iteration there is no model to return.
        raise ValueError(f"num_iters must be at least 1, got {num_iters}")

    if test_model is None:
        print(f"Train {num_iters} iterations")
        print(f"Train {trees_per_step} rounds per iteration")
    else:
        print("Testing model performance")

    features = X.columns
    # Work on a copy so the caller's frame does not gain helper columns.
    new_df = X.copy()
    new_df["target"] = y
    new_df["era"] = era_col
    eras = new_df["era"].unique()

    worst_df = None
    for i in range(num_iters):
        print(f"\nIteration {i+1}:\n")
        if i == 0:
            if test_model is None:
                model = xgboost.XGBRegressor(max_depth=5,
                                             learning_rate=0.001,
                                             n_estimators=trees_per_step,
                                             n_jobs=-1,
                                             colsample_bytree=0.1,
                                             gamma=0.2,
                                             tree_method=tree_method)
                model.fit(X, y)
            else:
                model = test_model
        else:
            # Continue training from the existing booster, but only on the
            # rows of the eras that scored worst in the previous iteration.
            model.n_estimators += trees_per_step
            booster = model.get_booster()
            print("fitting on worst eras")
            model.fit(worst_df[features], worst_df["target"], xgb_model=booster)

        # score each era
        print("predicting on train")
        new_df["pred"] = model.predict(X)
        # Explicit float dtype: an index-only Series defaults to object dtype
        # on current pandas, which cannot be plotted as numeric data.
        era_scores = pd.Series(index=eras, dtype=float)
        print("getting per era scores")
        for era in eras:
            era_df = new_df[new_df["era"] == era]
            score = spearmanr(era_df["pred"], era_df["target"])[0]
            print(score)
            era_scores.loc[era] = score

        # Sorted by score so the printed list runs from worst era upwards.
        era_scores = era_scores.sort_values()
        worst_eras = era_scores[era_scores <= era_scores.quantile(proportion)].index
        print(list(worst_eras))
        worst_df = new_df[new_df["era"].isin(worst_eras)]

        # Back to era order for the plot and the statistics below.
        era_scores = era_scores.sort_index()
        era_scores.plot(kind="bar")
        print("performance over time")
        plt.show()
        print("autocorrelation")
        # ar1 and smart_sharpe are defined outside this cell, presumably in
        # helpers.utils; their exact definitions are not visible here.
        print(ar1(era_scores))
        mean_score = np.mean(era_scores)
        print("mean correlation")
        print(mean_score)
        print("sharpe")
        print(mean_score/np.std(era_scores))
        print("smart sharpe")
        print(smart_sharpe(era_scores))
    return model

In [4]:
import random
def generate_model(train_with_group, tournament_with_group, n_features=63, seed=None):
    """Train one era-boosted model on a random subset of the features.

    Samples ``n_features`` names from the module-level ``features_list``,
    builds polynomial features for them on both frames, and fits a model with
    ``era_boost_train`` using the training frame's ``"era"`` column.

    Parameters
    ----------
    train_with_group : DataFrame
        Training frame passed to ``generate_polynomial_features``.
    tournament_with_group : DataFrame
        Tournament frame passed to ``generate_polynomial_features``.
    n_features : int
        Number of feature names to sample (default 63, the original value).
    seed : int or None
        If given, the sample is drawn from a dedicated ``random.Random(seed)``
        so it is reproducible; if None the shared ``random`` module state is
        used, as before.

    Returns
    -------
    tuple
        ``(ft_corr_list, X_tournament, model)``: the sampled feature names,
        the cleaned tournament features, and the fitted model.

    Raises
    ------
    ValueError
        If ``n_features`` exceeds the number of available features.
    """
    if n_features > len(features_list):
        raise ValueError(
            f"cannot sample {n_features} features from {len(features_list)} available"
        )
    sampler = random if seed is None else random.Random(seed)
    ft_corr_list = sampler.sample(features_list, n_features)
    train, tournament = generate_polynomial_features(ft_corr_list, train_with_group, tournament_with_group)
    X_train, y_train = clean_for_xgboost(train)
    # Only the tournament features are returned; its target is not used here.
    X_tournament, _ = clean_for_xgboost(tournament)

    model = era_boost_train(X_train, y_train,
                            era_col=train["era"], proportion=0.5,
                            trees_per_step=3, num_iters=8)

    return ft_corr_list, X_tournament, model

In [5]:
# Pass both frames through get_group_stats (presumably adds per-group
# statistics; the helper is defined outside this notebook) before modelling.
train_with_group = get_group_stats(train_static)
tournament_with_group = get_group_stats(tournament_static)

# Train five models, each on its own random feature sample, keeping the
# sampled features, tournament features and fitted model of every run.
# A plain loop is used so completed runs survive an interrupted cell.
ft_corrs, xtournaments, models = [], [], []
for i in range(5):
    ft_corr_list, X_tournament, model = generate_model(train_with_group, tournament_with_group)
    ft_corrs += [ft_corr_list]
    xtournaments += [X_tournament]
    models += [model]

KeyboardInterrupt: 

In [None]:
# import set of models 
from sklearn.linear_model import LogisticRegression
seed = 101
ensemble = SuperLearner(scorer=correlation_score, random_state=seed, verbose=2)
# Base layer: five models read back from disk.
# NOTE(review): no visible cell writes these model_<i>.pickle.dat files, and
# load_model presumably unpickles them - only load files you trust.
base_learners = [load_model(f'model_{str(i)}.pickle.dat') for i in range(5)]
ensemble.add(base_learners)
# NOTE(review): LogisticRegression is a classifier, while the models trained
# above are regressors - confirm it is the intended meta learner.
ensemble.add_meta(LogisticRegression())

In [None]:
# Fit the ensemble on the first 75 rows and predict on the remainder.
# NOTE(review): X and y are not defined in any visible cell - confirm where
# they come from before running this on a fresh kernel.
split_at = 75
ensemble.fit(X[:split_at], y[:split_at])

# Predictions for every row after the fitting slice.
preds = ensemble.predict(X[split_at:])