Model used by Bukosabino in Tournament 70 of Numer.ai

Results:
Logloss: 0.69199 - Consistency: 75% - Originality: Yes - Concordance: Yes

In [14]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model, svm, ensemble
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.decomposition import *
from sklearn.manifold import *
from sklearn.random_projection import *
from sklearn.preprocessing import *
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array

# Fix the seed of NumPy's global random number generator.
np.random.seed(0)

# False: keep only the 'validation' rows of the tournament file and print log loss.
# True: predict on the whole tournament file and write predictions_stacked_gb.csv.
SUBMISSION = False

In [15]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that prepends a wrapped estimator's outputs to the features.

    The wrapped estimator is fitted in ``fit``; ``transform`` returns the
    input matrix with the estimator's prediction (and, for classifiers that
    expose ``predict_proba``, its class probabilities) added as leading
    columns.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``X``/``y`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` with synthetic prediction columns stacked in front.

        Column layout of the result, left to right:
        prediction | class probabilities (classifiers only) | original X.
        """
        X = check_array(X)
        model = self.estimator

        # Class probabilities are only added for classifiers that provide them.
        provides_proba = (
            isinstance(model, ClassifierMixin)
            and hasattr(model, 'predict_proba')
        )
        if provides_proba:
            augmented = np.hstack((model.predict_proba(X), X))
        else:
            augmented = np.copy(X)

        # The class prediction always becomes the first column.
        prediction_column = np.reshape(model.predict(X), (-1, 1))
        return np.hstack((prediction_column, augmented))

In [16]:
# Stacked model: two random-projection features are added to the raw
# features, then a LassoLarsCV -> GradientBoostingRegressor -> LassoLarsCV
# stacking pipeline is fitted on the training data.

# reading data
train = pd.read_csv('../data/input70/numerai_training_data.csv', header=0)
test = pd.read_csv('../data/input70/numerai_tournament_data.csv', header=0)
if not SUBMISSION:
    # .copy() gives an independent frame, so the feature_grp_* columns added
    # below are not written onto a filtered slice (SettingWithCopyWarning).
    test = test[test.data_type == 'validation'].copy()  # only validation

# Raw feature columns: every column whose name contains "feature".
features = [f for f in list(train) if "feature" in f]

n_comp = 2
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train[features])
grp_results_test = grp.transform(test[features])

# Store each projected component as feature_grp_1 .. feature_grp_<n_comp>.
for i in range(1, n_comp + 1):
    train['feature_grp_' + str(i)] = grp_results_train[:, i - 1]
    test['feature_grp_' + str(i)] = grp_results_test[:, i - 1]

# Re-collect the feature list so it now includes the feature_grp_* columns.
features = [f for f in list(train) if "feature" in f]
X = train[features]
Y = train["target"]
x_prediction = test[features]
ids = test["id"]

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=linear_model.LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=ensemble.GradientBoostingRegressor(learning_rate=0.001, random_state=2345, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    linear_model.LassoLarsCV()
)
stacked_pipeline.fit(X, Y)
results_stacked = stacked_pipeline.predict(x_prediction)
if not SUBMISSION:
    # Parenthesised form is valid on both Python 2 and Python 3.
    print(metrics.log_loss(test["target"], results_stacked))

In [17]:
# Gradient boosting model: a GradientBoostingClassifier fitted on the raw
# features plus two random-projection features.

# reading data
train = pd.read_csv('../data/input70/numerai_training_data.csv', header=0)
test = pd.read_csv('../data/input70/numerai_tournament_data.csv', header=0)
if not SUBMISSION:
    # .copy() gives an independent frame, so the feature_grp_* columns added
    # below are not written onto a filtered slice (SettingWithCopyWarning).
    test = test[test.data_type == 'validation'].copy()  # only validation

# Raw feature columns: every column whose name contains "feature".
features = [f for f in list(train) if "feature" in f]
X = train[features]
Y = train["target"]
x_prediction = test[features]
ids = test["id"]

n_comp = 2
# Two projection passes, exactly as the original cell performed them:
# pass 1 projects the raw features and adds feature_grp_*; pass 2 projects the
# augmented feature set (raw + feature_grp_*) and overwrites feature_grp_*.
# NOTE(review): pass 2 looks like a copy-paste duplicate, but removing it
# changes the classifier's inputs -- confirm before dropping it.
for projection_pass in range(2):
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
    grp_results_train = grp.fit_transform(train[features])
    grp_results_test = grp.transform(test[features])

    for i in range(1, n_comp + 1):
        train['feature_grp_' + str(i)] = grp_results_train[:, i - 1]
        test['feature_grp_' + str(i)] = grp_results_test[:, i - 1]

    # Re-collect the feature list so it includes the feature_grp_* columns.
    features = [f for f in list(train) if "feature" in f]

X_gb = train[features]
x_prediction_gb = test[features]

model = ensemble.GradientBoostingClassifier(n_estimators=80, max_depth=2, random_state=17)
model.fit(X_gb, Y)
# Keep the second predict_proba column as the predicted probability.
results_gb = model.predict_proba(x_prediction_gb)[:, 1]
if not SUBMISSION:
    # Parenthesised form is valid on both Python 2 and Python 3.
    print(metrics.log_loss(test["target"], results_gb))

In [18]:
# Blend the two models by averaging their predictions element-wise.
result_mean = np.mean(np.array([results_gb, results_stacked]), axis=0)
if not SUBMISSION:
    # Parenthesised form is valid on both Python 2 and Python 3.
    print(metrics.log_loss(test["target"], result_mean))

In [19]:
# Write the blended predictions, one row per tournament id.
if SUBMISSION:
    # result_mean is positional, while join() aligns on index labels. Reusing
    # the index of ids keeps every probability on its own id row even when
    # that index is not the default 0..n-1 range.
    results_df = pd.DataFrame(data={'probability': result_mean}, index=ids.index)
    joined = pd.DataFrame(ids).join(results_df)
    joined.to_csv("predictions_stacked_gb.csv", index=False)