Random Forest Numerai Model 

Based on the script from Sakib

Andrew Garvey

In [44]:
# used 'pip install --user numerox' (no quotes) in a terminal session
import numpy as np
import os as os
import pandas as pd
import time

In [45]:
# import modelling libraries
from sklearn import linear_model, model_selection
import numerox as nx

In [46]:
# Switch the working directory to the project folder.
# NOTE(review): this is a hard-coded absolute path; every relative path used in
# the following cells is resolved against it.
os.chdir('/global/home/mma_agarvey/823 P1')
#data = nx.download("numerai_dataset.zip")   # Used sftp instead
# Load the dataset archive found in the current working directory through numerox.
data = nx.load_zip("numerai_dataset.zip")

In [47]:
# environment settings
# FOLDER_NAME/MODEL_NAME is the sub-directory the notebook later changes into,
# and MODEL_NAME is also the prefix of every CSV file name that gets saved.
# NOTE(review): the value says "Random_Forest", but the model class defined in
# this notebook wraps linear_model.LogisticRegression -- confirm the intended name.
MODEL_NAME = "Random_Forest"
FOLDER_NAME = "Submission"

In [48]:
class logistic(nx.Model):
    """numerox model that wraps scikit-learn's LogisticRegression.

    Parameters
    ----------
    params : dict
        Hyper-parameters for the estimator. Must contain the keys 'C',
        'solver' and 'multi_class'; 'max_iter' is optional and falls back to
        scikit-learn's default of 100 when it is absent.
    """

    def __init__(self, params):
        # numerox models keep their hyper-parameters in ``self.p``
        self.p = params

    def fit_predict(self, dfit, dpre, tournament):
        """Fit on ``dfit`` and return predictions for ``dpre``.

        Parameters
        ----------
        dfit : object exposing ``x`` (features) and ``y[tournament]`` (targets)
            Data the estimator is trained on.
        dpre : object exposing ``x`` (features) and ``ids``
            Data to predict.
        tournament : key accepted by ``dfit.y``
            Selects which target is used for fitting.

        Returns
        -------
        tuple
            ``(dpre.ids, yhat)`` where ``yhat`` is the second column of
            ``predict_proba`` for every row of ``dpre.x``.
        """
        model = linear_model.LogisticRegression(
            C=self.p['C'],
            solver=self.p['solver'],
            multi_class=self.p['multi_class'],
            # Forward the tuned iteration budget. It used to be dropped, so the
            # fitted model always ran with the default 100 iterations no matter
            # what the parameter search had selected.
            max_iter=self.p.get('max_iter', 100),
        )
        model.fit(dfit.x, dfit.y[tournament])
        # column 1 of predict_proba: probability of the second class in model.classes_
        yhat = model.predict_proba(dpre.x)[:, 1]
        return dpre.ids, yhat


In [49]:
# parameters required for hyper-tuning the model
# Each list holds the candidate values for one LogisticRegression argument.
# With a single value per list the grid contains exactly one combination.
C = [0.01]               # inverse of the regularisation strength
solver = ["sag"]         # optimisation algorithm
multi_class = ["ovr"]    # one-vs-rest handling of the classes
max_iter = [400]         # upper bound on solver iterations

In [50]:
# Parameter grid for the search: estimator argument name -> list of candidate values
parameters = dict(
    C=C,
    solver=solver,
    multi_class=multi_class,
    max_iter=max_iter,
)

In [51]:
# use grid search cv to find the best parameters
# Read the raw training CSV from the "numerai_dataset" folder under the current
# working directory; the first row of the file is used as the column header.
train_data = pd.read_csv(os.path.join(os.getcwd(), "numerai_dataset", "numerai_training_data.csv"), header=0)
# Keep every column from the first one up to and including "feature50"
# (label slices with .loc are inclusive) as a NumPy array.
# NOTE(review): the training loop later drops the first three columns of X --
# presumably non-feature columns such as the row id and era; confirm against the CSV.
X = np.array(train_data.loc[:, :"feature50"])

In [52]:
# list of tournaments
# Each name is prefixed with "target_" in the training loop to pick the target
# column, and is also used in the names of the saved CSV files.
tournaments = ["bernie", "elizabeth", "jordan", "ken", "charles", "frank", "hillary"]

In [53]:
# set the directory to save the submissions
# The target is <current dir>/FOLDER_NAME/MODEL_NAME and must already exist.
# NOTE(review): not idempotent -- the path is built from os.getcwd(), so running
# this cell a second time looks for a nested FOLDER_NAME/MODEL_NAME inside the
# output folder itself.
os.chdir(os.path.join(os.getcwd(), FOLDER_NAME, MODEL_NAME))

In [54]:
# define kfold cross validation split
# Used both as n_splits of the era-grouped GroupKFold and as the cv argument of
# GridSearchCV in the training loop.
kfold_split = 5

In [57]:
# For each tournament: walk over the era-grouped splits of the training data,
# grid-search the logistic-regression parameters on the training part of the
# split, then run the numerox backtest and production passes with the winning
# parameters and store the production result as a CSV file.
for tournament in tournaments:
    print("*********** TOURNAMENT " + tournament + " ***********")

    # name of the target column belonging to this tournament
    target = "target_" + tournament

    # flat array holding the values of that target column
    y = train_data.loc[:, train_data.columns == target].values.ravel()

    # splitter that keeps all rows of one era on the same side of a split
    group_kfold = model_selection.GroupKFold(n_splits=kfold_split)

    print(">> group eras using kfold split\n")
    # counter is the 1-based number of the current split
    for counter, (train_index, test_index) in enumerate(
            group_kfold.split(X, y, groups=train_data['era']), start=1):
        # rows of the training part, without the first three columns of X
        X_train = X[train_index, 3:]
        # targets of the same rows
        y_train = y[train_index]

        print(">> running split #", counter)

        print(">> finding best params")
        base_estimator = linear_model.LogisticRegression(random_state=123)
        clf = model_selection.GridSearchCV(
            base_estimator,
            parameters,
            scoring="neg_log_loss",
            cv=kfold_split,
            n_jobs=-1,
            verbose=1,
        )
        clf.fit(X_train, y_train)
        best_params = clf.best_params_
        print(">> best params: ", best_params)

        # fresh numerox model configured with the parameters that scored best
        model = logistic(best_params)

        print(">> training info:")
        train = nx.backtest(model, data, tournament, verbosity=1)

        print(">> validation info:")
        validation = nx.production(model, data, tournament, verbosity=1)

        print(">> saving validation info: ")
        # file name pattern: <MODEL_NAME>-<tournament>-<split number>.csv
        validation.to_csv("-".join([MODEL_NAME, tournament, str(counter)]) + ".csv")
        print(">> done saving validation info")

        print("\n")

*********** TOURNAMENT bernie ***********
>> group eras using kfold split

>> running split # 1
>> finding best params
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Process ForkPoolWorker-2003:
Process ForkPoolWorker-2005:
Process ForkPoolWorker-2002:
Process ForkPoolWorker-2004:
Process ForkPoolWorker-2010:
Process ForkPoolWorker-2006:
Process ForkPoolWorker-2001:
Process ForkPoolWorker-2008:
Process ForkPoolWorker-2009:
Process ForkPoolWorker-2007:
Process ForkPoolWorker-2011:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py",

KeyboardInterrupt: 

  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/python/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  Fi

[     0      1      2 ... 502729 502730 502731]
[  2425   2426   2427 ... 484902 484903 484904]
[     0      1      2 ... 502729 502730 502731]
[     0      1      2 ... 502729 502730 502731]
[     0      1      2 ... 502729 502730 502731]
