In [1]:
#meta 3/6/2021 Numerai Starter - XGBoost Regression 
#src https://docs.numer.ai/tournament/learn

#history
# 3/6/2021 NUMERAI MODEL XGBOOST 
#     Test run on local compute
#     No submission

# 3/21/2021 NUMERAI MODEL XGBOOST ROUND 256
#     Submitted with model `anyaconda_xgboost`

In [2]:
import os

import numerapi
import pandas as pd
from xgboost import XGBRegressor

# Numerai
Objective:  build a model to predict the future target using live features that correspond to the current stock market.

## 0. Load Data

### Datasets 
*   `training_data` is used to train your model
*   `tournament_data` is used to evaluate your model

### Column descriptions
*   id: a randomized id that corresponds to a stock 
*   era: a period of time
*   data_type: either `train`, `validation`, `test`, or `live` 
*   feature_*: abstract financial features of the stock 
*   target: abstract measure of stock performance

In [3]:
# Fetch the latest training dataset (features and targets) directly from the
# public Numerai bucket. The download takes around 30s.
training_data = pd.read_csv(
    "https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz"
)
# Quick sanity check: row/column count, then a preview of the first rows.
print(training_data.shape)
training_data.head()

(501808, 314)


Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.25
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.25
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75


In [4]:
# Fetch the latest tournament dataset from the public Numerai bucket.
# The download takes around 30s.
tournament_data = pd.read_csv(
    "https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz"
)
# Quick sanity check: row/column count, then a preview of the first rows.
print(tournament_data.shape)
tournament_data.head()

(1676742, 314)


Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n0003aa52cab36c2,era121,validation,0.25,0.75,0.5,0.5,0.0,0.75,0.5,...,0.75,0.75,1.0,0.75,0.5,0.5,1.0,0.0,0.0,0.25
1,n000920ed083903f,era121,validation,0.75,0.5,0.75,1.0,0.5,0.0,0.0,...,0.5,0.5,0.75,1.0,0.75,0.5,0.5,0.5,0.5,0.5
2,n0038e640522c4a6,era121,validation,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.5,0.25,0.0,0.0,0.5,0.5,0.0,1.0
3,n004ac94a87dc54b,era121,validation,0.75,1.0,1.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.25,0.5
4,n0052fe97ea0c05f,era121,validation,0.25,0.5,0.5,0.25,1.0,0.5,0.5,...,0.5,0.75,0.0,0.0,0.75,1.0,0.0,0.25,1.0,0.75


## 1. Data Prep
Split the training data into the feature matrix `X` and the target vector `y`.

In [8]:
# Feature columns: every training column whose name contains "feature"
# (a substring match, not a prefix match).
feature_cols = [col for col in training_data.columns if "feature" in col]

# X holds the selected feature columns, y the `target` column.
X = training_data.loc[:, feature_cols]
y = training_data["target"]

# Display both shapes to confirm the row counts line up.
(X.shape, y.shape)

((501808, 310), (501808,))

## 2. Model

In [9]:
# Fit an XGBoost regressor on the training features and target; the fitted
# model is what later scores the tournament data.
model = XGBRegressor(
    max_depth=5,
    learning_rate=0.01,
    n_estimators=2000,
    colsample_bytree=0.1,
)
model.fit(X, y)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=2000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
# Score every tournament row with the fitted model, using the same feature
# columns the model was trained on. Nothing is uploaded in this cell.
predictions = model.predict(tournament_data.loc[:, feature_cols])
predictions

array([0.4858516 , 0.4858599 , 0.53088856, ..., 0.5074054 , 0.5081322 ,
       0.47973642], dtype=float32)

In [16]:
# Build the submission frame: the tournament `id` column plus a `prediction`
# column holding the model output.
# NOTE(review): older starter code called this column `prediction_kazutsugi`;
# confirm which name the current round expects.
predictions_df = tournament_data[["id"]].assign(prediction=predictions)
predictions_df.head()

Unnamed: 0,id,prediction
0,n0003aa52cab36c2,0.485852
1,n000920ed083903f,0.48586
2,n0038e640522c4a6,0.530889
3,n004ac94a87dc54b,0.498161
4,n0052fe97ea0c05f,0.502542


In [17]:
# Save the submission to CSV without the index column.
# to_csv does not create missing parent directories, so make sure `output/`
# exists first; otherwise this cell fails on a fresh checkout.
os.makedirs("output", exist_ok=True)
predictions_df.to_csv("output/predictions.csv", index=False)

## 3. Submit

In [13]:
# Get your API keys and model_id from https://numer.ai/submit
# Credentials are read from environment variables so real keys never have to
# be typed into (and saved with) the notebook. Any variable that is not set
# falls back to the "REPLACEME" placeholder, which must be replaced before
# the API client can authenticate.
public_id = os.environ.get("NUMERAI_PUBLIC_ID", "REPLACEME")
secret_key = os.environ.get("NUMERAI_SECRET_KEY", "REPLACEME")
model_id = os.environ.get("NUMERAI_MODEL_ID", "REPLACEME")
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)