In [1]:
import gc
import json
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb

from pathlib import Path
from numerapi import NumerAPI
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from helper import print_both, categorical_encoded

napi = NumerAPI()
current_round = napi.get_current_round()

In [None]:
# Tournament data changes every week so we specify the round in their name. Training
# and validation data only change periodically, so no need to download them every time.
print('Downloading dataset files...')

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/validation.parquet")
napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
napi.download_dataset("v4/validation_example_preds.parquet")
napi.download_dataset("v4/features.json")

In [2]:
ERA_COL = "era"
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

print('Reading minimal training data')
with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)

features = feature_metadata["feature_sets"]["medium"]
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

Reading minimal training data


In [3]:
training_data = pd.read_parquet('v4/train.parquet', columns=read_columns)
training_data = training_data.replace([np.inf, -np.inf], np.nan)


validation_data = pd.read_parquet('v4/validation.parquet', columns=read_columns)
validation_data = validation_data.replace([np.inf, -np.inf], np.nan)

In [4]:
all_feature_corrs = training_data.groupby(ERA_COL).apply(lambda era: era[features].corrwith(era[TARGET_COL]))

In [5]:
def get_biggest_change_features(corrs, n):
    all_eras = corrs.index.sort_values()
    h1_eras = all_eras[:len(all_eras) // 2]
    h2_eras = all_eras[len(all_eras) // 2:]

    h1_corr_means = corrs.loc[h1_eras, :].mean()
    h2_corr_means = corrs.loc[h2_eras, :].mean()

    corr_diffs = h2_corr_means - h1_corr_means
    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
    return worst_n

In [6]:
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)

In [35]:
data = training_data.copy()
optuna_target = data[targets]

In [None]:
train_x, test_x, train_y, test_y = train_test_split(data, optuna_target, test_size=0.2, random_state=42)

In [32]:
data

Unnamed: 0,id,era,data_type,feature_honoured_observational_balaamite,feature_polaroid_vadose_quinze,feature_untidy_withdrawn_bargeman,feature_genuine_kyphotic_trehala,feature_unenthralled_sportful_schoolhouse,feature_divulsive_explanatory_ideologue,feature_ichthyotic_roofed_yeshiva,...,target_paul_v4_20,target_paul_v4_60,target_george_v4_20,target_george_v4_60,target_william_v4_20,target_william_v4_60,target_arthur_v4_20,target_arthur_v4_60,target_thomas_v4_20,target_thomas_v4_60
0,n003bba8a98662e4,0.0,0.0,1.00,0.50,1.00,1.00,0.00,0.00,1.00,...,0.50,0.25,0.25,0.00,0.333333,0.000000,0.500000,0.500000,0.166667,0.000000
1,n003bee128c2fcfc,0.0,0.0,0.50,1.00,0.25,0.75,0.00,0.75,0.50,...,0.75,1.00,1.00,1.00,0.666667,0.666667,0.833333,0.666667,0.833333,0.666667
2,n0048ac83aff7194,0.0,0.0,0.50,0.25,0.75,0.00,0.75,0.00,0.75,...,0.50,0.25,0.25,0.25,0.500000,0.333333,0.333333,0.500000,0.500000,0.333333
3,n00691bec80d3e02,0.0,0.0,1.00,0.50,0.50,0.75,0.00,1.00,0.25,...,0.50,0.50,0.50,0.50,0.666667,0.500000,0.500000,0.500000,0.666667,0.500000
4,n00b8720a2fdc4f2,0.0,0.0,1.00,0.75,1.00,1.00,0.00,0.00,1.00,...,0.50,0.50,0.50,0.50,0.666667,0.500000,0.666667,0.500000,0.666667,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420516,nffcc1dbdf2212e6,573.0,0.0,0.00,0.25,0.00,0.25,1.00,1.00,0.75,...,0.50,0.75,0.75,0.75,0.666667,0.833333,0.500000,0.833333,0.833333,0.833333
2420517,nffd71b7f6a128df,573.0,0.0,0.00,0.25,0.00,1.00,0.50,1.00,0.00,...,0.25,0.50,0.25,0.50,0.166667,0.500000,0.000000,0.333333,0.166667,0.333333
2420518,nffde3b371d67394,573.0,0.0,0.25,0.25,0.50,0.25,0.75,0.75,0.75,...,0.25,0.50,0.50,0.50,0.333333,0.666667,0.500000,0.666667,0.333333,0.666667
2420519,nfff1a1111b35e84,573.0,0.0,1.00,0.75,0.50,0.50,0.75,0.50,0.25,...,0.75,0.75,0.75,0.75,0.500000,0.666667,0.500000,0.500000,0.500000,0.500000


In [7]:
file = open(f'{current_round}_report.txt', 'w', encoding="utf-8")

def objective(trial):
    param = {
            'tree_method':'hist',  #this parameter means using the GPU when training our model to speedup the training process
            'n_estimators': trial.suggest_categorical('n_estimators', [300, 700, 1500]),
            'objective': trial.suggest_categorical('objective', ['reg:squarederror']),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.0, 1.0),
            'subsample': trial.suggest_float('subsample', 0.0, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
            'max_depth': trial.suggest_int('max_depth', 3, 5),
            'random_state': trial.suggest_categorical('random_state', [2020]),
            'min_child_weight': trial.suggest_int('min_child_weight', 10, 300),
            'eval_metric': trial.suggest_categorical('eval_metric', ['mae'])
        }
    
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation_0-mae')
    model = xgb.XGBRegressor(**param)  
    
    model.fit(training_data.filter(like='feature_', axis='columns'), training_data[TARGET_COL], eval_set=[(validation_data.filter(like='feature_', axis='columns'), validation_data[TARGET_COL])], early_stopping_rounds=20, callbacks=[pruning_callback])
    
    preds = model.predict(validation_data.filter(like='feature_', axis='columns'))
    
    mae = mean_absolute_error(validation_data[TARGET_COL], preds)
    
    return mae

In [8]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True)
print_both(file, 'Number of finished trials:', len(study.trials))
print_both(file, 'Best trial:', study.best_trial.params)

gc.collect()

[32m[I 2022-11-23 14:55:30,105][0m A new study created in memory with name: no-name-e07e8000-9bfa-480e-853f-f6d8fe607372[0m
  self._init_valid()
  0%|          | 0/100 [00:59<?, ?it/s]


[33m[W 2022-11-23 14:56:29,632][0m Trial 0 failed because of the following error: XGBoostError('[14:56:29] C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/data/data.cc:487: Check failed: valid: Label contains NaN, infinity or a value too large.')[0m
Traceback (most recent call last):
  File "c:\Users\PC\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\PC\AppData\Local\Temp\ipykernel_18796\4087210166.py", line 23, in objective
    model.fit(training_data.filter(like='feature_', axis='columns'), training_data[TARGET_COL], eval_set=[(validation_data.filter(like='feature_', axis='columns'), validation_data[TARGET_COL])], early_stopping_rounds=20, callbacks=[pruning_callback])
  File "c:\Users\PC\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 532, in inner_f
    return f(**kwargs)
  File "c:\Users\PC\AppData\Local\Programs\Pyth

XGBoostError: [14:56:29] C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/data/data.cc:487: Check failed: valid: Label contains NaN, infinity or a value too large.

In [None]:
gc.collect()
    
X = df.drop(targets, axis=1)
y = df[target]


if os.path.exists(f'../models/best_trial_{project}_{ticker}.parquet'):
    trial = study.trials_dataframe()
    trial_best_score = trial['value'].min()

    historic_best_studies = pd.read_parquet(f'../models/best_trial_{project}_{ticker}.parquet')
    historic_best_score = historic_best_studies['value'].min()

    if trial_best_score > historic_best_score:
        print_both(file, f'- Current best score: {trial_best_score} is no improvement over historic best score: {historic_best_score}, do nothing')
        
    else:
        print_both(file, f'- Current best score: {trial_best_score} is an improvement over historic best score: {historic_best_score}, update best trial')
        trial.to_parquet(f'../models/best_trial_{project}_{ticker}.parquet')
        print_both(file, f'- Data saved to ../models/best_trial_{project}_{ticker}.parquet')
        
        best_trial = study.best_trial.params
        best_trial['tree_method'] = 'hist'

        model = xgb.XGBRegressor(**best_trial)
        model.fit(X, y)

        preds = model.predict(X)

        print_both(file, f"MAE for set: {mean_absolute_error(y, preds)}")
        print_both(file, f"R2 for set: {r2_score(y, preds)}")
        joblib.dump(model, f'../models/{project}_{ticker}_predictor.joblib')
else:
    trial = study.trials_dataframe()
    trial.to_parquet(f'../models/best_trial_{project}_{ticker}.parquet')
    best_trial = study.best_trial.params
    best_trial['tree_method'] = 'hist'

    model = xgb.XGBRegressor(**best_trial)
    model.fit(X, y)

    preds = model.predict(X)

    print_both(file, f"MAE for set: {mean_absolute_error(y, preds)}")
    print_both(file, f"R2 for set: {r2_score(y, preds)}")
    joblib.dump(model, f'../models/{project}_{ticker}_predictor.joblib')

In [None]:
 
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
training_data = pd.read_parquet('v4/train.parquet',
                                columns=read_columns)
validation_data = pd.read_parquet('v4/validation.parquet',
                                  columns=read_columns)
live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
                                  columns=read_columns)