# Authors: Chris Canal, Ben Soper, and Nathan VanBenschoten

To start, setup your conda environment by runing:

> conda env create -f numerai_packages.yml

> source activate numerai

> jupyter notebook numerai_classifier.ipynb

In [1]:
#import libraries
import pandas as pd

import numpy as np

from sklearn import metrics, preprocessing, linear_model
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.metrics import log_loss as ll
from sklearn.grid_search import GridSearchCV as GS
from sklearn.metrics import make_scorer as MS

# Set seed for reproducibility
np.random.seed(0)



In [2]:
# Load the data from the CSV files

print("Loading data...")
training_data = pd.read_csv('numerai_training_data.csv', header=0)
prediction_data = pd.read_csv('numerai_tournament_data.csv', header=0)


Loading data...


In [3]:
# Transform the loaded CSV data into numpy arrays
Y = training_data['target']
X = training_data.drop('target', axis=1)
t_id = prediction_data['t_id']
x_prediction = prediction_data.drop('t_id', axis=1)

#Split Data for Cross Validation
x_train, x_test, y_train, y_test = tts(X,Y, test_size=0.15, random_state=0)

In [4]:
# Set up parameters for grids search with SGD
parameters = {
    "loss": ["log", "modified_huber"],
    "penalty": ["none", "l2", "l1", "elasticnet"],
    "alpha": [0.0001, 0.0003, 0.0009, 0.00003, 0.00001],
    "fit_intercept": [True, False],
    "n_iter": [5, 10],
    "verbose":[1]
}

best_params = {
   'alpha':[0.0002], 'average':[True], 'class_weight':[None, 'balanced'], 'epsilon':[ 0.03],
   'eta0':[0.0], 'fit_intercept':[False], 'l1_ratio':[0.15],
   'learning_rate':['optimal'], 'loss':['log'], 'n_iter':[5], 'n_jobs':[4],
   'penalty':['l1'], 'power_t':[0.5], 'random_state':[None], 'shuffle':[True],
   'verbose':[1], 'warm_start':[False]
}

# Create Scorer for GridSearch
scorer = MS(ll, greater_is_better=False, needs_proba=True, needs_threshold=False)

# Create Grid search object
grid_obj = GS(SGD(), best_params, scoring=scorer)

In [5]:
# Find the best parameters by runing gridsearch
grid_fit = grid_obj.fit(x_train, y_train)
best_model = grid_fit.best_estimator_
print(best_model)

-- Epoch 1
Norm: 221.05, NNZs: 4, Bias: 0.000000, T: 98511, Avg. loss: 0.956740
Total training time: 0.07 seconds.
-- Epoch 2
Norm: 221.92, NNZs: 5, Bias: 0.000000, T: 197022, Avg. loss: 0.826448
Total training time: 0.15 seconds.
-- Epoch 3
Norm: 222.20, NNZs: 6, Bias: 0.000000, T: 295533, Avg. loss: 0.782543
Total training time: 0.22 seconds.
-- Epoch 4
Norm: 222.34, NNZs: 9, Bias: 0.000000, T: 394044, Avg. loss: 0.760576
Total training time: 0.30 seconds.
-- Epoch 5
Norm: 222.42, NNZs: 8, Bias: 0.000000, T: 492555, Avg. loss: 0.747275
Total training time: 0.40 seconds.
-- Epoch 1
Norm: 195.80, NNZs: 4, Bias: 0.000000, T: 98511, Avg. loss: 1.025701
Total training time: 0.10 seconds.
-- Epoch 2
Norm: 196.82, NNZs: 4, Bias: 0.000000, T: 197022, Avg. loss: 0.860053
Total training time: 0.20 seconds.
-- Epoch 3
Norm: 197.16, NNZs: 4, Bias: 0.000000, T: 295533, Avg. loss: 0.804679
Total training time: 0.32 seconds.
-- Epoch 4
Norm: 197.32, NNZs: 6, Bias: 0.000000, T: 394044, Avg. loss: 0.

In [6]:
best_model.fit(x_train, y_train)
y_predict = best_model.predict_log_proba(x_test)
print(y_predict.shape)
print(y_test.shape)
print("Score", ll(y_test, y_predict))

-- Epoch 1
Norm: 215.67, NNZs: 5, Bias: 0.000000, T: 147767, Avg. loss: 0.877994
Total training time: 0.11 seconds.
-- Epoch 2
Norm: 216.26, NNZs: 7, Bias: 0.000000, T: 295534, Avg. loss: 0.786530
Total training time: 0.20 seconds.
-- Epoch 3
Norm: 216.45, NNZs: 8, Bias: 0.000000, T: 443301, Avg. loss: 0.755768
Total training time: 0.31 seconds.
-- Epoch 4
Norm: 216.55, NNZs: 7, Bias: 0.000000, T: 591068, Avg. loss: 0.740287
Total training time: 0.42 seconds.
-- Epoch 5
Norm: 216.60, NNZs: 8, Bias: 0.000000, T: 738835, Avg. loss: 0.730956
Total training time: 0.53 seconds.
(26077, 2)
(26077,)
('Score', 0.69314718055994518)


In [7]:
#Create Predictions and put in csv ready for submission

print("Predicting...")
# Your trained model is now used to make predictions on the numerai_tournament_data
# The model returns two columns: [probability of 0, probability of 1]
# We are just interested in the probability that the target is 1.
y_prediction = best_model.predict_proba(x_prediction)
results = y_prediction[:, 1]
results_df = pd.DataFrame(data={'probability':results})
joined = pd.DataFrame(t_id).join(results_df)

print("Writing predictions to predictions.csv")
# Save the predictions out to a CSV file
joined.to_csv("predictions.csv", index=False)
# Now you can upload these predictions on numer.ai

Predicting...
Writing predictions to predictions.csv
