In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import json
import gc
import matplotlib.pyplot as plt

### Refs

CatBoost:  
https://catboost.ai/en/docs/references/training-parameters/
https://catboost.ai/en/docs/concepts/python-reference_catboostregressor

AdaBoost:  
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html

LightGBM:  
https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html#lightgbm.LGBMRegressor

XGBoost:  
https://xgboost.readthedocs.io/en/latest/parameter.html

Sklearn Resources:  
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [12]:
# Load the feature metadata and pick out the "medium" feature set.
with open("features.json", "r") as fh:
    feature_metadata = json.load(fh)
med_features = feature_metadata["feature_sets"]["medium"]

# Read only the medium features plus the era, data_type and target columns.
read_columns = [*med_features, 'era', 'data_type', 'target']
train = pq.ParquetFile("numerai_training_data.parquet")
train_read = train.read(columns=read_columns)
print(f'Completed Read, data has shape {train_read.shape}')

Completed Read, data has shape (2412105, 423)


In [13]:
# Work on the last `x` rows of the table only, to keep experiments fast.
x = 25000
n_rows = train_read.shape[0]
# Clamp the start at 0: for a table with fewer than `x` rows the start would
# otherwise be negative and produce out-of-bounds indices for `take`.
last_xk = list(range(max(0, n_rows - x), n_rows))
df = train_read.take(last_xk).to_pandas()

# The full table is no longer needed once the pandas subset exists.
del train
del train_read
gc.collect()

47

In [14]:
# A column counts as a feature when its name does not contain 'target'.
all_columns = list(df.columns)
feature_bool = ['target' not in name for name in all_columns]
feature_names = [name for name, keep in zip(all_columns, feature_bool) if keep]

# Split the frame into era bookkeeping, model inputs and target columns.
era_data = df[['era', 'data_type']]
features = df[feature_names].drop(['era', 'data_type'], axis=1)
targets = df.filter(regex='target')

In [5]:
# Look up the "medium" feature set in the feature metadata file.
with open("features.json", "r") as fh:
    feature_metadata = json.load(fh)
feature_set_medium = feature_metadata["feature_sets"]["medium"]

# Stitch era info, the medium features and the targets together on the row index.
index_join = {"left_index": True, "right_index": True}
medium_feature_data = era_data.merge(features[feature_set_medium], **index_join)
total_medium_data = medium_feature_data.merge(targets, **index_join)
# total_medium_data.to_csv('medium_feats_50k.csv')

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import AdaBoostRegressor, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor

In [18]:
def model_scoring(model, X, y, argument_dict=None, cv=5):
    """Score an sklearn-API compatible gradient boosting model.

    Without ``argument_dict`` (``None`` or empty) the model is cross-validated
    as given and the ``cross_validate`` result is returned, including the
    fitted estimators and the train scores.

    With ``argument_dict`` it is used as the single parameter grid of a
    ``GridSearchCV``; the search is fitted on ``X``/``y`` and the fitted
    search object is returned.

    Both paths score with ``'neg_root_mean_squared_error'`` and pass ``cv``
    through unchanged as the cross-validation strategy.
    """
    rmse_scoring = 'neg_root_mean_squared_error'

    if argument_dict:
        # do a grid search
        search = GridSearchCV(
            model,
            param_grid=[argument_dict],
            cv=cv,
            scoring=rmse_scoring,
            return_train_score=True,
        )
        search.fit(X, y)
        return search

    # train vanilla model
    return cross_validate(
        model, X, y,
        cv=cv,
        scoring=rmse_scoring,
        return_estimator=True,
        return_train_score=True,
    )

### Vanilla Testing

In [None]:
def _score_default(model):
    """Cross-validate `model` on the full feature frame with model_scoring's defaults."""
    return model_scoring(model, features, targets['target'])


# Baselines: each library's regressor with (near-)default settings.
xgb_test = _score_default(xgb.XGBRegressor(tree_method='gpu_hist'))
lgbm_test = _score_default(lgb.LGBMRegressor())
ada_test = _score_default(AdaBoostRegressor())
cat_test = _score_default(cb.CatBoostRegressor(verbose=0))

In [3]:
# Hand-picked hyperparameters per library, keyed by a short model name.
params = dict(
    xgb=dict(
        colsample_bytree=0.25,
        learning_rate=0.01,
        max_depth=5,
        max_leaves=32,
    ),
    lgbm=dict(
        colsample_bytree=0.25,
        learning_rate=0.01,
        max_depth=6,
        num_leaves=32,
    ),
    ada=dict(
        learning_rate=0.01,
        n_estimators=50,
    ),
    cat=dict(
        max_leaves=32,
        depth=5,
        rsm=0.1,
        learning_rate=0.01,
    ),
)

# One estimator per library, built from the matching parameter set.
xgb_tuned = xgb.XGBRegressor(**params['xgb'])
lgbm_tuned = lgb.LGBMRegressor(**params['lgbm'])
ada_tuned = AdaBoostRegressor(**params['ada'])
cat_tuned = cb.CatBoostRegressor(**params['cat'])


They define risky features as the ones that correlate least with the target.
Thought: these features could make or break the model. How can we find a split of features that
            interleaves risky features with features known to correlate with the target?
```python
all_feature_corrs = training_data.groupby(ERA_COL).apply(
    lambda era: era[features].corrwith(era[TARGET_COL]))
```

Other notes: They train the model on prior eras to predict the next era, which makes typical (shuffled) cross-validation invalid in a sense. We need to preserve the chronological relationship between eras to get the full breadth of the model. Note: if we train our models on the entire training dataset and validate with our validation set, we are already inherently doing this.

In [64]:
# Project the feature matrix onto 60 principal components.
# NOTE(review): the PCA is fit on every row before any cross-validation split,
# so the rows later used as test folds also shape the projection.
from sklearn.decomposition import PCA
pca = PCA(n_components=60)
pca_feats = pca.fit_transform(features.to_numpy())

In [63]:
# Mean test-fold score stored in `xgb_test` (scored as negative RMSE, so closer to 0 is better).
np.mean(xgb_test['test_score'])

-0.17118387520313263

In [65]:
# Same XGBoost configuration, but trained on the PCA-projected features instead of the raw ones.
xgb_pca = model_scoring(xgb.XGBRegressor(tree_method='gpu_hist'), pca_feats, targets['target'])
# Mean test-fold score (negative RMSE) for the PCA variant.
np.mean(xgb_pca['test_score'])

-0.18208190202713012

#### Functions to implement
 - Need a cv splitter for our data when we input a dataframe with a column associated with era
 - Need to find solid hyperparams to tune (hopefully using constant n_estimators)
 - Feature engineering (might just wait on this, try to use medium sized feature set)

### Functions for train test splitting

In [9]:
def get_time_series_splits(X, cv=3, min_train_length=2000, era_col='era', test_split_pct=0.3):
    """Partition the eras of ``X`` into ``cv`` consecutive train/test groups.

    The unique eras (in order of first appearance in ``X[era_col]``) are cut
    into ``cv`` consecutive chunks of equal size; any leftover eras are added
    to the last chunk. Within every chunk the trailing eras form the test
    split and the preceding eras form the train split.

    Parameters
    ----------
    X : DataFrame-like
        Must support ``X[era_col].unique()``.
    cv : int
        Number of chunks; must be at least 1. If there are fewer eras than
        ``cv``, a single chunk containing every era is produced.
    min_train_length : int
        Currently NOT used by the splitting logic; accepted only so existing
        callers keep working.
    era_col : str
        Name of the era column.
    test_split_pct : float
        Fraction (between 0 and 1) of the per-chunk era count used for the
        test split. At least one era is always placed in the test split, so a
        chunk of a single era ends up with an empty train split.

    Returns
    -------
    zip
        A single-use iterator of ``(train_eras, test_eras)`` list pairs.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1 or ``test_split_pct`` is outside [0, 1].
    """
    # A non-positive cv used to divide by zero (cv == 0) or never shrink the
    # remaining era list (cv < 0), so reject it up front.
    if cv < 1:
        raise ValueError(f"cv must be at least 1, got {cv!r}")
    if not 0 <= test_split_pct <= 1:
        raise ValueError(f"test_split_pct must be within [0, 1], got {test_split_pct!r}")

    remaining_eras = list(X[era_col].unique())
    eras_per_chunk, leftover = divmod(len(remaining_eras), cv)
    test_length = max(1, round(eras_per_chunk * test_split_pct))

    train_eras = []
    test_eras = []
    while remaining_eras:
        chunk = remaining_eras[:eras_per_chunk]
        remaining_eras = remaining_eras[eras_per_chunk:]
        if len(remaining_eras) == leftover:
            # Only the remainder is left: fold it into this (final) chunk.
            chunk += remaining_eras
            remaining_eras = None
        # The trailing `test_length` eras of the chunk are held out for testing.
        train_eras.append(chunk[:-test_length])
        test_eras.append(chunk[-test_length:])
    return zip(train_eras, test_eras)

In [10]:
def get_cross_val_indeces(data, splits, era_col='era'):
    """Turn era-based splits into positional row indices for cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the era column.
    splits : iterable of (train_eras, test_eras)
        Pairs of era collections, e.g. the result of ``get_time_series_splits``.
        The iterable is consumed once.
    era_col : str
        Name of the era column in ``data``.

    Returns
    -------
    (list of list of int, list of list of int)
        Train and test row positions, one inner list per split.

    Positions (0-based row offsets) are returned rather than index labels,
    because cross-validation iterables are applied positionally; for a frame
    with a default ``RangeIndex`` the two are identical.
    """
    eras = data[era_col]
    tr_indeces = []
    te_indeces = []
    for tr, te in splits:
        tr_indeces.append(np.flatnonzero(eras.isin(tr).to_numpy()).tolist())
        te_indeces.append(np.flatnonzero(eras.isin(te).to_numpy()).tolist())
    return tr_indeces, te_indeces

In [15]:
# Make the row labels a clean 0..n-1 range so they line up with row positions.
# drop=True keeps this cell idempotent: without it every run inserts the old
# index as a new column ('index', then 'level_0') and a third run raises.
era_data = era_data.reset_index(drop=True)
# Era-based train/test partitions, then the matching row indices per fold.
splits = get_time_series_splits(era_data)
tr, te = get_cross_val_indeces(era_data, splits)

In [82]:
# Re-score vanilla XGBoost on the era-based folds instead of the default cv.
# Note that this rebinds `xgb_test`, replacing any earlier result stored under that name.
xgb_test = model_scoring(xgb.XGBRegressor(tree_method='gpu_hist'), features, targets['target'], cv=zip(tr, te))

In [83]:
# Mean test-fold score (negative RMSE) across the era-based folds.
np.mean(xgb_test['test_score'])

-0.2243728538354238

### Possible Grid Search Params

In [84]:
# Candidate grids for each library; every key maps to the list of values to try.
xgb_params = dict(
    learning_rate=[0.001, 0.01],
    max_depth=[4, 5, 6],
    colsample_bytree=[0.05, 0.1, 0.25],
    max_leaves=[2**5],
)
ada_params = dict(
    learning_rate=[0.001, 0.01],
    n_estimators=[25, 50, 100],
    # Two base trees of different depth.
    base_estimator=[DecisionTreeRegressor(max_depth=depth) for depth in (3, 5)],
)
lgbm_params = dict(
    learning_rate=[0.001, 0.01],
    max_depth=[4, 5, 6],
    colsample_bytree=[0.05, 0.1, 0.25],
    num_leaves=[2**5],
)
cat_params = dict(
    learning_rate=[0.001, 0.01],
    depth=[4, 5, 6],
    rsm=[0.05, 0.1, 0.25],
    max_leaves=[2**5],
)

In [75]:
# Grid-search the XGBoost grid on the era-based folds.
# NOTE(review): the recorded warnings mention "num_leaves", which is not a key of the
# `xgb_params` shown in this notebook — the output looks stale from an earlier run; confirm.
xgb_param_testing = model_scoring(xgb.XGBRegressor(tree_method='gpu_hist'), features, targets['target'], cv=zip(tr, te), argument_dict=xgb_params)

Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "num_leaves" } might not be used.



In [86]:
# Best mean cross-validated score (negative RMSE) found by the grid search.
xgb_param_testing.best_score_

-0.22247137129306793

In [1]:
# Parameter combination that achieved the best score.
# NOTE(review): the recorded run raised NameError — the cell was executed before
# `xgb_param_testing` existed in the kernel; re-run the grid search first.
xgb_param_testing.best_params_

NameError: name 'xgb_param_testing' is not defined

In [3]:
# Scratch check: turn a dict of numpy arrays into a dict of plain lists.
x = {
    'x': np.array([1, 2, 3]),
    'y': np.array([4, 5, 6]),
}
y = {key: list(values) for key, values in x.items()}

In [4]:
# Display the converted dict of lists.
y

{'x': [1, 2, 3], 'y': [4, 5, 6]}

In [16]:
# Grid of CatBoost settings to try; each key lists its candidate values.
cat_params = dict(
    learning_rate=[0.001, 0.01],
    depth=[4, 5, 6],
    rsm=[0.05, 0.1, 0.25],
    max_leaves=[2**5],
)

# Use CatBoost's own grid search on the era-based folds.
cat_model = cb.CatBoostRegressor(verbose=0)
cat_param_testing = cat_model.grid_search(
    param_grid=cat_params,
    X=features,
    y=targets['target'],
    cv=zip(tr, te),
)


bestTest = 0.2897067363
bestIteration = 999

0:	loss: 0.2897067	best: 0.2897067 (0)	total: 4.13s	remaining: 1m 10s

bestTest = 0.2163547192
bestIteration = 999

1:	loss: 0.2163547	best: 0.2163547 (1)	total: 8.46s	remaining: 1m 7s

bestTest = 0.2896963538
bestIteration = 999

2:	loss: 0.2896964	best: 0.2163547 (1)	total: 13.6s	remaining: 1m 7s

bestTest = 0.2158175338
bestIteration = 999

3:	loss: 0.2158175	best: 0.2158175 (3)	total: 18.5s	remaining: 1m 4s

bestTest = 0.2896974271
bestIteration = 999

4:	loss: 0.2896974	best: 0.2158175 (3)	total: 24.9s	remaining: 1m 4s

bestTest = 0.2157763071
bestIteration = 999

5:	loss: 0.2157763	best: 0.2157763 (5)	total: 32s	remaining: 1m 4s
Estimating final quality...
Training on fold [0/3]

bestTest = 0.2171546662
bestIteration = 999

Training on fold [1/3]

bestTest = 0.2118745485
bestIteration = 999

Training on fold [2/3]

bestTest = 0.2072297563
bestIteration = 999



In [23]:
# Show the grid-search result as a plain dict ('params' and 'cv_results' in the recorded output).
# NOTE(review): this dumps every per-iteration entry of cv_results; consider displaying
# only cat_param_testing['params'] to keep the output readable.
dict(cat_param_testing)

{'params': {'max_leaves': 32, 'depth': 5, 'rsm': 0.25, 'learning_rate': 0.01},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
          

In [19]:
# Grid of XGBoost settings to try; each key lists its candidate values.
xgb_params = dict(
    learning_rate=[0.001, 0.01],
    max_depth=[4, 5, 6],
    colsample_bytree=[0.05, 0.1, 0.25],
    max_leaves=[2**5],
)

# Grid-search on the era-based folds; returns the fitted search object.
xgb_param_testing = model_scoring(
    xgb.XGBRegressor(tree_method='gpu_hist'),
    features,
    targets['target'],
    argument_dict=xgb_params,
    cv=zip(tr, te),
)

In [20]:
# Full per-candidate results of the grid search (fit/score timings and scores).
xgb_param_testing.cv_results_

{'mean_fit_time': array([0.97660645, 0.45523715, 0.65430236, 0.38368885, 0.45419637,
        0.65069477, 0.34402474, 0.47936066, 0.71353857, 0.34674446,
        0.48186239, 0.70142849, 0.36619687, 0.5221076 , 0.80380472,
        0.3688004 , 0.52106237, 0.81584771]),
 'std_fit_time': array([0.87291141, 0.02040589, 0.02515294, 0.06464391, 0.01403206,
        0.02427136, 0.01564427, 0.02220202, 0.00867349, 0.02062235,
        0.01990438, 0.01842341, 0.01494741, 0.01888134, 0.02546269,
        0.0184633 , 0.02136655, 0.03036683]),
 'mean_score_time': array([0.02775677, 0.01362816, 0.01130017, 0.57819549, 0.011988  ,
        0.01129866, 0.01196456, 0.01346533, 0.01296377, 0.01096574,
        0.0114967 , 0.01130064, 0.01162799, 0.01096567, 0.01296274,
        0.01229882, 0.01229604, 0.01429137]),
 'std_score_time': array([2.23517612e-02, 2.48854427e-03, 4.68785994e-04, 8.02515525e-01,
        7.82665516e-04, 4.69853077e-04, 8.16145835e-04, 3.17854547e-03,
        8.14393825e-04, 8.14393437e-