In [1]:
import gc
import json
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb

from pathlib import Path
from numerapi import NumerAPI
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from helper import print_both, categorical_encoded

napi = NumerAPI()
current_round = napi.get_current_round()

In [None]:
# Tournament data changes every week so we specify the round in their name. Training
# and validation data only change periodically, so no need to download them every time.
print('Downloading dataset files...')

Path("./v4").mkdir(parents=False, exist_ok=True)
napi.download_dataset("v4/train.parquet")
napi.download_dataset("v4/validation.parquet")
napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
napi.download_dataset("v4/validation_example_preds.parquet")
napi.download_dataset("v4/features.json")

In [2]:
ERA_COL = "era"
TARGET_COL = "target_nomi_v4_20"
DATA_TYPE_COL = "data_type"
EXAMPLE_PREDS_COL = "example_preds"

print('Reading minimal training data')
with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)

features = feature_metadata["feature_sets"]["medium"]
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

Reading minimal training data


In [3]:
training_data = pd.read_parquet('v4/train.parquet', columns=read_columns)
#training_data = training_data.replace([np.inf, -np.inf], np.nan)
training_data = categorical_encoded(training_data)


validation_data = pd.read_parquet('v4/validation.parquet', columns=read_columns)
#validation_data = validation_data.replace([np.inf, -np.inf], np.nan)

In [8]:
training_data = categorical_encoded(training_data)

target = TARGET_COL

data = training_data.copy()
optuna_target = data[target]

Names of categorical columns :  ['era', 'data_type']


In [9]:
train_x, test_x, train_y, test_y = train_test_split(data, optuna_target, test_size=0.2, random_state=42)

In [15]:
file = open(f'{current_round}_report.txt', 'w', encoding="utf-8")

def objective(trial):
    param = {
            'tree_method':'hist',  #this parameter means using the GPU when training our model to speedup the training process
            'n_estimators': trial.suggest_categorical('n_estimators', [300, 700]),
            'objective': trial.suggest_categorical('objective', ['reg:squarederror']),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.0, 1.0),
            'subsample': trial.suggest_float('subsample', 0.0, 1.0),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
            'max_depth': trial.suggest_int('max_depth', 3, 5),
            'random_state': trial.suggest_categorical('random_state', [2020]),
            'min_child_weight': trial.suggest_int('min_child_weight', 10, 300),
            'eval_metric': trial.suggest_categorical('eval_metric', ['mae'])
            }

    
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation_0-mae')
    model = xgb.XGBRegressor(**param)  
    
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=20, callbacks=[pruning_callback])
    
    preds = model.predict(test_x)
    
    mae = mean_absolute_error(test_y, preds)
    
    return mae


In [16]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20, show_progress_bar=True)
print_both(file, 'Number of finished trials:', len(study.trials))
print_both(file, 'Best trial:', study.best_trial.params)

gc.collect()

[32m[I 2022-11-28 10:58:59,172][0m A new study created in memory with name: no-name-0289c8f7-6519-473b-adc2-2813a8d6514b[0m
  self._init_valid()
  0%|          | 0/20 [00:00<?, ?it/s]

[0]	validation_0-mae:0.14999
[1]	validation_0-mae:0.14409
[2]	validation_0-mae:0.13842
[3]	validation_0-mae:0.13851
[4]	validation_0-mae:0.13306
[5]	validation_0-mae:0.12783
[6]	validation_0-mae:0.12280
[7]	validation_0-mae:0.11798
[8]	validation_0-mae:0.11799
[9]	validation_0-mae:0.11336
[10]	validation_0-mae:0.10890
[11]	validation_0-mae:0.10462
[12]	validation_0-mae:0.10051
[13]	validation_0-mae:0.10056
[14]	validation_0-mae:0.09661
[15]	validation_0-mae:0.09282
[16]	validation_0-mae:0.08917
[17]	validation_0-mae:0.08922
[18]	validation_0-mae:0.08925
[19]	validation_0-mae:0.08929
[20]	validation_0-mae:0.08579
[21]	validation_0-mae:0.08243
[22]	validation_0-mae:0.07920
[23]	validation_0-mae:0.07610
[24]	validation_0-mae:0.07312
[25]	validation_0-mae:0.07314
[26]	validation_0-mae:0.07028
[27]	validation_0-mae:0.07030
[28]	validation_0-mae:0.07032
[29]	validation_0-mae:0.07035
[30]	validation_0-mae:0.07035
[31]	validation_0-mae:0.07037
[32]	validation_0-mae:0.06762
[33]	validation_0-ma

  5%|▌         | 1/20 [02:19<44:08, 139.37s/it]

[32m[I 2022-11-28 11:01:18,560][0m Trial 0 finished with value: 0.0015295931370928884 and parameters: {'n_estimators': 700, 'objective': 'reg:squarederror', 'reg_lambda': 0.9458982362682626, 'reg_alpha': 0.6142755517627096, 'gamma': 1.5693147926644302, 'colsample_bytree': 0.6393106948732752, 'subsample': 0.6920886101698128, 'learning_rate': 0.03936921685016123, 'max_depth': 3, 'random_state': 2020, 'min_child_weight': 294, 'eval_metric': 'mae'}. Best is trial 0 with value: 0.0015295931370928884.[0m




[0]	validation_0-mae:0.14892
[1]	validation_0-mae:0.14795
[2]	validation_0-mae:0.14698
[3]	validation_0-mae:0.14602
[4]	validation_0-mae:0.14506
[5]	validation_0-mae:0.14411
[6]	validation_0-mae:0.14317
[7]	validation_0-mae:0.14224
[8]	validation_0-mae:0.14130
[9]	validation_0-mae:0.14038
[10]	validation_0-mae:0.13946
[11]	validation_0-mae:0.13855
[12]	validation_0-mae:0.13764
[13]	validation_0-mae:0.13674
[14]	validation_0-mae:0.13585
[15]	validation_0-mae:0.13496
[16]	validation_0-mae:0.13407
[17]	validation_0-mae:0.13320
[18]	validation_0-mae:0.13233
[19]	validation_0-mae:0.13146
[20]	validation_0-mae:0.13060
[21]	validation_0-mae:0.12975
[22]	validation_0-mae:0.12890
[23]	validation_0-mae:0.12805
[24]	validation_0-mae:0.12721
[25]	validation_0-mae:0.12723
[26]	validation_0-mae:0.12640
[27]	validation_0-mae:0.12641
[28]	validation_0-mae:0.12558
[29]	validation_0-mae:0.12476
[30]	validation_0-mae:0.12394
[31]	validation_0-mae:0.12313
[32]	validation_0-mae:0.12233
[33]	validation_0-ma

 10%|█         | 2/20 [04:53<44:26, 148.16s/it]

[32m[I 2022-11-28 11:03:52,879][0m Trial 1 finished with value: 0.030955931171774864 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.951620032771144, 'reg_alpha': 3.8971436296150515, 'gamma': 4.379517251218031, 'colsample_bytree': 0.8306082160342882, 'subsample': 0.3569216411435332, 'learning_rate': 0.0065446432976429434, 'max_depth': 4, 'random_state': 2020, 'min_child_weight': 43, 'eval_metric': 'mae'}. Best is trial 0 with value: 0.0015295931370928884.[0m




[0]	validation_0-mae:0.14994
[1]	validation_0-mae:0.14991
[2]	validation_0-mae:0.14991
[3]	validation_0-mae:0.14992
[4]	validation_0-mae:0.13994
[5]	validation_0-mae:0.13994
[6]	validation_0-mae:0.13997
[7]	validation_0-mae:0.14001
[8]	validation_0-mae:0.14001
[9]	validation_0-mae:0.13998
[10]	validation_0-mae:0.13997
[11]	validation_0-mae:0.13995
[12]	validation_0-mae:0.13998
[13]	validation_0-mae:0.13997
[14]	validation_0-mae:0.13999
[15]	validation_0-mae:0.13998
[16]	validation_0-mae:0.13998
[17]	validation_0-mae:0.13995
[18]	validation_0-mae:0.13992
[19]	validation_0-mae:0.13993
[20]	validation_0-mae:0.13995
[21]	validation_0-mae:0.13996
[22]	validation_0-mae:0.13064
[23]	validation_0-mae:0.13066
[24]	validation_0-mae:0.13064
[25]	validation_0-mae:0.13065
[26]	validation_0-mae:0.13061
[27]	validation_0-mae:0.13063
[28]	validation_0-mae:0.13061
[29]	validation_0-mae:0.13062
[30]	validation_0-mae:0.13063
[31]	validation_0-mae:0.13067
[32]	validation_0-mae:0.13066
[33]	validation_0-ma

 15%|█▌        | 3/20 [05:56<30:54, 109.07s/it]

[32m[I 2022-11-28 11:04:55,435][0m Trial 2 finished with value: 0.08638197183609009 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.0858228523919987, 'reg_alpha': 1.4078405994341536, 'gamma': 4.318764216130164, 'colsample_bytree': 0.05916124820707591, 'subsample': 0.055289031023966984, 'learning_rate': 0.06661414404578787, 'max_depth': 3, 'random_state': 2020, 'min_child_weight': 145, 'eval_metric': 'mae'}. Best is trial 0 with value: 0.0015295931370928884.[0m




[0]	validation_0-mae:0.14108
[1]	validation_0-mae:0.13278
[2]	validation_0-mae:0.12497
[3]	validation_0-mae:0.11762
[4]	validation_0-mae:0.11070
[5]	validation_0-mae:0.10418
[6]	validation_0-mae:0.09805
[7]	validation_0-mae:0.09228
[8]	validation_0-mae:0.08685
[9]	validation_0-mae:0.08174
[10]	validation_0-mae:0.07693
[11]	validation_0-mae:0.07241
[12]	validation_0-mae:0.06815
[13]	validation_0-mae:0.06414
[14]	validation_0-mae:0.06036
[15]	validation_0-mae:0.05681
[16]	validation_0-mae:0.05347
[17]	validation_0-mae:0.05032
[18]	validation_0-mae:0.04736
[19]	validation_0-mae:0.04458
[20]	validation_0-mae:0.04195
[21]	validation_0-mae:0.03948
[22]	validation_0-mae:0.03716
[23]	validation_0-mae:0.03497
[24]	validation_0-mae:0.03292
[25]	validation_0-mae:0.03098
[26]	validation_0-mae:0.02916
[27]	validation_0-mae:0.02744
[28]	validation_0-mae:0.02583
[29]	validation_0-mae:0.02431
[30]	validation_0-mae:0.02288
[31]	validation_0-mae:0.02153
[32]	validation_0-mae:0.02027
[33]	validation_0-ma

 20%|██        | 4/20 [07:11<25:33, 95.83s/it] 

[32m[I 2022-11-28 11:06:10,960][0m Trial 3 finished with value: 0.0014188202330842614 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.06282602957116043, 'reg_alpha': 2.0117153388977527, 'gamma': 4.174687244371697, 'colsample_bytree': 0.9918576118038304, 'subsample': 0.7772513874534244, 'learning_rate': 0.05884316210903026, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 148, 'eval_metric': 'mae'}. Best is trial 3 with value: 0.0014188202330842614.[0m




[0]	validation_0-mae:0.14442
[1]	validation_0-mae:0.13914
[2]	validation_0-mae:0.13405
[3]	validation_0-mae:0.12915
[4]	validation_0-mae:0.12443
[5]	validation_0-mae:0.11988
[6]	validation_0-mae:0.11550
[7]	validation_0-mae:0.11127
[8]	validation_0-mae:0.11127
[9]	validation_0-mae:0.10721
[10]	validation_0-mae:0.10329
[11]	validation_0-mae:0.09951
[12]	validation_0-mae:0.09587
[13]	validation_0-mae:0.09236
[14]	validation_0-mae:0.08899
[15]	validation_0-mae:0.08573
[16]	validation_0-mae:0.08260
[17]	validation_0-mae:0.07958
[18]	validation_0-mae:0.07667
[19]	validation_0-mae:0.07672
[20]	validation_0-mae:0.07391
[21]	validation_0-mae:0.07121
[22]	validation_0-mae:0.06861
[23]	validation_0-mae:0.06610
[24]	validation_0-mae:0.06369
[25]	validation_0-mae:0.06369
[26]	validation_0-mae:0.06136
[27]	validation_0-mae:0.06136
[28]	validation_0-mae:0.05912
[29]	validation_0-mae:0.05696
[30]	validation_0-mae:0.05696
[31]	validation_0-mae:0.05696
[32]	validation_0-mae:0.05488
[33]	validation_0-ma

 25%|██▌       | 5/20 [08:57<24:49, 99.33s/it]

[32m[I 2022-11-28 11:07:56,504][0m Trial 4 finished with value: 0.0023187745828181505 and parameters: {'n_estimators': 700, 'objective': 'reg:squarederror', 'reg_lambda': 0.9498537233096949, 'reg_alpha': 3.1269875686460065, 'gamma': 4.506984577461225, 'colsample_bytree': 0.7375967917177928, 'subsample': 0.31979906496027555, 'learning_rate': 0.03657237746703636, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 180, 'eval_metric': 'mae'}. Best is trial 3 with value: 0.0014188202330842614.[0m




[0]	validation_0-mae:0.14081
[1]	validation_0-mae:0.13227
[2]	validation_0-mae:0.12425
[3]	validation_0-mae:0.11671
[4]	validation_0-mae:0.10963
[5]	validation_0-mae:0.10298
[6]	validation_0-mae:0.09674
[7]	validation_0-mae:0.09087
[8]	validation_0-mae:0.09095
[9]	validation_0-mae:0.08544
[10]	validation_0-mae:0.08026
[11]	validation_0-mae:0.07540
[12]	validation_0-mae:0.07083
[13]	validation_0-mae:0.07087
[14]	validation_0-mae:0.06658
[15]	validation_0-mae:0.06255
[16]	validation_0-mae:0.05876
[17]	validation_0-mae:0.05879
[18]	validation_0-mae:0.05883
[19]	validation_0-mae:0.05886
[20]	validation_0-mae:0.05531
[21]	validation_0-mae:0.05197
[22]	validation_0-mae:0.04883
[23]	validation_0-mae:0.04588
[24]	validation_0-mae:0.04312
[25]	validation_0-mae:0.04314
[26]	validation_0-mae:0.04054
[27]	validation_0-mae:0.04055
[28]	validation_0-mae:0.03811
[29]	validation_0-mae:0.03812
[30]	validation_0-mae:0.03812
[31]	validation_0-mae:0.03812
[32]	validation_0-mae:0.03583
[33]	validation_0-ma

 30%|███       | 6/20 [10:21<22:00, 94.30s/it]

[32m[I 2022-11-28 11:09:21,037][0m Trial 5 finished with value: 0.0009924035985022783 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.9600341204525804, 'reg_alpha': 0.07855811026900472, 'gamma': 1.0036481295357302, 'colsample_bytree': 0.6881113467654387, 'subsample': 0.8818520861043739, 'learning_rate': 0.0606575491282112, 'max_depth': 4, 'random_state': 2020, 'min_child_weight': 179, 'eval_metric': 'mae'}. Best is trial 5 with value: 0.0009924035985022783.[0m


 35%|███▌      | 7/20 [10:45<15:26, 71.26s/it]

[32m[I 2022-11-28 11:09:44,859][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m


 40%|████      | 8/20 [11:26<12:20, 61.67s/it]

[32m[I 2022-11-28 11:10:25,939][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.14996


 45%|████▌     | 9/20 [12:07<10:05, 55.01s/it]

[32m[I 2022-11-28 11:11:06,346][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.15005


 50%|█████     | 10/20 [12:48<08:29, 50.90s/it]

[32m[I 2022-11-28 11:11:48,046][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m


 55%|█████▌    | 11/20 [13:43<07:47, 51.91s/it]

[32m[I 2022-11-28 11:12:39,749][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.14432
[1]	validation_0-mae:0.13894
[2]	validation_0-mae:0.13376
[3]	validation_0-mae:0.12877
[4]	validation_0-mae:0.12397
[5]	validation_0-mae:0.11935
[6]	validation_0-mae:0.11490
[7]	validation_0-mae:0.11062
[8]	validation_0-mae:0.10649
[9]	validation_0-mae:0.10252
[10]	validation_0-mae:0.09870
[11]	validation_0-mae:0.09502
[12]	validation_0-mae:0.09148
[13]	validation_0-mae:0.08807
[14]	validation_0-mae:0.08478
[15]	validation_0-mae:0.08162
[16]	validation_0-mae:0.07858
[17]	validation_0-mae:0.07565
[18]	validation_0-mae:0.07283
[19]	validation_0-mae:0.07012
[20]	validation_0-mae:0.06750
[21]	validation_0-mae:0.06499
[22]	validation_0-mae:0.06256
[23]	validation_0-mae:0.06023
[24]	validation_0-mae:0.05799
[25]	validation_0-mae:0.05582
[26]	validation_0-mae:0.05374
[27]	validation_0-mae:0.05174
[28]	validation_0-mae:0.04981
[29]	validation_0-mae:0.04795
[30]	validation_0-m

 60%|██████    | 12/20 [15:43<09:42, 72.76s/it]

[32m[I 2022-11-28 11:14:42,702][0m Trial 11 finished with value: 0.0011043463600799441 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.32943148728596217, 'reg_alpha': 1.8960928791305047, 'gamma': 3.0289095708698324, 'colsample_bytree': 0.9987166983430296, 'subsample': 0.9336333358113397, 'learning_rate': 0.03728162646031609, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 114, 'eval_metric': 'mae'}. Best is trial 5 with value: 0.0009924035985022783.[0m




[0]	validation_0-mae:0.14590


 65%|██████▌   | 13/20 [16:27<07:29, 64.16s/it]

[32m[I 2022-11-28 11:15:27,066][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.14270
[1]	validation_0-mae:0.13585
[2]	validation_0-mae:0.12932
[3]	validation_0-mae:0.12310
[4]	validation_0-mae:0.11719
[5]	validation_0-mae:0.11156
[6]	validation_0-mae:0.10620
[7]	validation_0-mae:0.10110
[8]	validation_0-mae:0.09624
[9]	validation_0-mae:0.09161
[10]	validation_0-mae:0.08721
[11]	validation_0-mae:0.08302
[12]	validation_0-mae:0.07903
[13]	validation_0-mae:0.07524
[14]	validation_0-mae:0.07162
[15]	validation_0-mae:0.06818
[16]	validation_0-mae:0.06490
[17]	validation_0-mae:0.06179
[18]	validation_0-mae:0.05882
[19]	validation_0-mae:0.05886
[20]	validation_0-mae:0.05604
[21]	validation_0-mae:0.05335
[22]	validation_0-mae:0.05079
[23]	validation_0-mae:0.04835
[24]	validation_0-mae:0.04603
[25]	validation_0-mae:0.04605
[26]	validation_0-mae:0.04384
[27]	validation_0-mae:0.04384
[28]	validation_0-mae:0.04174
[29]	validation_0-mae:0.03974
[30]	validation_0-m

 70%|███████   | 14/20 [18:12<07:37, 76.26s/it]

[32m[I 2022-11-28 11:17:11,274][0m Trial 13 finished with value: 0.0012425982858985662 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.3312408741151736, 'reg_alpha': 1.5770763986604803, 'gamma': 3.0704586920939696, 'colsample_bytree': 0.7994487399554743, 'subsample': 0.8250301182811144, 'learning_rate': 0.048050443038306016, 'max_depth': 4, 'random_state': 2020, 'min_child_weight': 109, 'eval_metric': 'mae'}. Best is trial 5 with value: 0.0009924035985022783.[0m


 75%|███████▌  | 15/20 [18:43<05:13, 62.61s/it]

[32m[I 2022-11-28 11:17:42,266][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.15013


 80%|████████  | 16/20 [19:25<03:45, 56.42s/it]

[32m[I 2022-11-28 11:18:24,301][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.14225
[1]	validation_0-mae:0.13498
[2]	validation_0-mae:0.12809
[3]	validation_0-mae:0.12154
[4]	validation_0-mae:0.11534
[5]	validation_0-mae:0.10944
[6]	validation_0-mae:0.10385
[7]	validation_0-mae:0.09855
[8]	validation_0-mae:0.09863
[9]	validation_0-mae:0.09360
[10]	validation_0-mae:0.08882
[11]	validation_0-mae:0.08429
[12]	validation_0-mae:0.07999
[13]	validation_0-mae:0.08002
[14]	validation_0-mae:0.07594
[15]	validation_0-mae:0.07207
[16]	validation_0-mae:0.06839
[17]	validation_0-mae:0.06843
[18]	validation_0-mae:0.06846
[19]	validation_0-mae:0.06850
[20]	validation_0-mae:0.06502
[21]	validation_0-mae:0.06171
[22]	validation_0-mae:0.05857
[23]	validation_0-mae:0.05559
[24]	validation_0-mae:0.05276
[25]	validation_0-mae:0.05279
[26]	validation_0-mae:0.05011
[27]	validation_0-mae:0.05013
[28]	validation_0-mae:0.05015
[29]	validation_0-mae:0.05016
[30]	validation_0-m

 85%|████████▌ | 17/20 [21:00<03:24, 68.10s/it]

[32m[I 2022-11-28 11:19:59,562][0m Trial 16 pruned. Trial was pruned at iteration 105.[0m
[0]	validation_0-mae:0.14995


 90%|█████████ | 18/20 [21:32<01:54, 57.14s/it]

[32m[I 2022-11-28 11:20:31,212][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m
[0]	validation_0-mae:0.13796
[1]	validation_0-mae:0.12696
[2]	validation_0-mae:0.11685
[3]	validation_0-mae:0.10753
[4]	validation_0-mae:0.09897
[5]	validation_0-mae:0.09108
[6]	validation_0-mae:0.08382
[7]	validation_0-mae:0.07715
[8]	validation_0-mae:0.07100
[9]	validation_0-mae:0.06534
[10]	validation_0-mae:0.06014
[11]	validation_0-mae:0.05535
[12]	validation_0-mae:0.05094
[13]	validation_0-mae:0.04688
[14]	validation_0-mae:0.04315
[15]	validation_0-mae:0.03972
[16]	validation_0-mae:0.03655
[17]	validation_0-mae:0.03364
[18]	validation_0-mae:0.03097
[19]	validation_0-mae:0.02850
[20]	validation_0-mae:0.02624
[21]	validation_0-mae:0.02415
[22]	validation_0-mae:0.02223
[23]	validation_0-mae:0.02046
[24]	validation_0-mae:0.01883
[25]	validation_0-mae:0.01883
[26]	validation_0-mae:0.01734
[27]	validation_0-mae:0.01734
[28]	validation_0-mae:0.01596
[29]	validation_0-mae:0.01469
[30]	validation_0-m

 95%|█████████▌| 19/20 [22:36<00:59, 59.48s/it]

[32m[I 2022-11-28 11:21:36,132][0m Trial 18 finished with value: 0.002035460202023387 and parameters: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.21907157668734323, 'reg_alpha': 4.934030243902859, 'gamma': 1.8343608180378967, 'colsample_bytree': 0.9099046146283366, 'subsample': 0.1663568763646931, 'learning_rate': 0.07972915383407803, 'max_depth': 4, 'random_state': 2020, 'min_child_weight': 82, 'eval_metric': 'mae'}. Best is trial 5 with value: 0.0009924035985022783.[0m




[0]	validation_0-mae:0.14267
[1]	validation_0-mae:0.13578
[2]	validation_0-mae:0.12923
[3]	validation_0-mae:0.12299
[4]	validation_0-mae:0.11706
[5]	validation_0-mae:0.11141
[6]	validation_0-mae:0.10603
[7]	validation_0-mae:0.10092
[8]	validation_0-mae:0.10092
[9]	validation_0-mae:0.09605
[10]	validation_0-mae:0.09141
[11]	validation_0-mae:0.08700
[12]	validation_0-mae:0.08280
[13]	validation_0-mae:0.07881
[14]	validation_0-mae:0.07500
[15]	validation_0-mae:0.07138
[16]	validation_0-mae:0.06794
[17]	validation_0-mae:0.06466
[18]	validation_0-mae:0.06471
[19]	validation_0-mae:0.06476
[20]	validation_0-mae:0.06164
[21]	validation_0-mae:0.05867
[22]	validation_0-mae:0.05585
[23]	validation_0-mae:0.05316
[24]	validation_0-mae:0.05060
[25]	validation_0-mae:0.05060
[26]	validation_0-mae:0.04816
[27]	validation_0-mae:0.04816
[28]	validation_0-mae:0.04584
[29]	validation_0-mae:0.04363
[30]	validation_0-mae:0.04363
[31]	validation_0-mae:0.04363
[32]	validation_0-mae:0.04153
[33]	validation_0-ma

100%|██████████| 20/20 [23:27<00:00, 70.39s/it]

[32m[I 2022-11-28 11:22:26,919][0m Trial 19 pruned. Trial was pruned at iteration 33.[0m





Number of finished trials: 20
Best trial: {'n_estimators': 300, 'objective': 'reg:squarederror', 'reg_lambda': 0.9600341204525804, 'reg_alpha': 0.07855811026900472, 'gamma': 1.0036481295357302, 'colsample_bytree': 0.6881113467654387, 'subsample': 0.8818520861043739, 'learning_rate': 0.0606575491282112, 'max_depth': 4, 'random_state': 2020, 'min_child_weight': 179, 'eval_metric': 'mae'}


6325

In [19]:
y

id
n000101811a8a843    0.50
n001e1318d5072ac    0.00
n002a9c5ab785cbb    0.50
n002ccf6d0e8c5ad    0.25
n0051ab821295c29    0.50
                    ... 
nffcc311c1b4ffb2     NaN
nffcca880507acfc     NaN
nffce3e57217e7a3     NaN
nffd5ec0564586e7     NaN
nffe284f043b586f     NaN
Name: target_nomi_v4_20, Length: 2366074, dtype: float32

In [18]:
gc.collect()

validation_data = categorical_encoded(validation_data)
    
X = validation_data.drop(target, axis=1)
y = validation_data[target]


if os.path.exists(f'best_trial_{current_round}.parquet'):
    trial = study.trials_dataframe()
    trial_best_score = trial['value'].min()

    historic_best_studies = pd.read_parquet(f'best_trial_{current_round}.parquet')
    historic_best_score = historic_best_studies['value'].min()

    if trial_best_score > historic_best_score:
        print_both(file, f'- Current best score: {trial_best_score} is no improvement over historic best score: {historic_best_score}, do nothing')
        
    else:
        print_both(file, f'- Current best score: {trial_best_score} is an improvement over historic best score: {historic_best_score}, update best trial')
        trial.to_parquet(f'best_trial_{current_round}.parquet')
        print_both(file, f'- Data saved to best_trial_{current_round}.parquet')
        
        best_trial = study.best_trial.params
        best_trial['tree_method'] = 'hist'

        model = xgb.XGBRegressor(**best_trial)
        model.fit(X, y)

        preds = model.predict(X)

        print_both(file, f"MAE for set: {mean_absolute_error(y, preds)}")
        print_both(file, f"R2 for set: {r2_score(y, preds)}")
        joblib.dump(model, f'{current_round}_predictor.joblib')
else:
    trial = study.trials_dataframe()
    trial.to_parquet(f'best_trial_{current_round}.parquet')
    best_trial = study.best_trial.params
    best_trial['tree_method'] = 'hist'

    model = xgb.XGBRegressor(**best_trial)
    model.fit(X, y)

    preds = model.predict(X)

    print_both(file, f"MAE for set: {mean_absolute_error(y, preds)}")
    print_both(file, f"R2 for set: {r2_score(y, preds)}")
    joblib.dump(model, f'{current_round}_predictor.joblib')

Names of categorical columns :  ['era', 'data_type']
- Current best score: 0.0009924035985022783 is an improvement over historic best score: 0.0009924035985022783, update best trial
- Data saved to best_trial_364.parquet


XGBoostError: [11:24:50] C:/Users/Administrator/workspace/xgboost-win64_release_1.6.0/src/data/data.cc:487: Check failed: valid: Label contains NaN, infinity or a value too large.

In [None]:
 
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
training_data = pd.read_parquet('v4/train.parquet',
                                columns=read_columns)
validation_data = pd.read_parquet('v4/validation.parquet',
                                  columns=read_columns)
live_data = pd.read_parquet(f'v4/live_{current_round}.parquet',
                                  columns=read_columns)