<a href="https://colab.research.google.com/github/ajenningsfrankston/numerai_tree_regression/blob/master/som_ridge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
som ridge regression model
"""

!pip install numerapi
!pip install susi

import os
import gc

import pandas as pd
from numerapi import NumerAPI
import zipfile
import os

import susi
from susi.SOMPlots import plot_nbh_dist_weight_matrix, plot_umatrix
import matplotlib.pyplot as plt


from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingClassifier

TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = f"target"
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"

data_directory = "../kazutsugi/datasets/"

BENCHMARK = 0.002
BAND = 0.04


# Submissions are scored by Spearman correlation
def score(df):
    return df[[TARGET_NAME, PREDICTION_NAME]].corr(method="spearman")[TARGET_NAME][PREDICTION_NAME]


# The payout function
def payout(scores):
    return ((scores - BENCHMARK)/BAND).clip(lower=-1, upper=1)


def download_data():

    data_archive = NumerAPI().download_current_dataset(dest_path='../tmp', unzip=False)

    with zipfile.ZipFile(data_archive, "r") as zip_ref:
        zip_ref.extractall("../kazutsugi/datasets")



Collecting numerapi
  Downloading numerapi-2.9.0-py3-none-any.whl (26 kB)
Installing collected packages: numerapi
Successfully installed numerapi-2.9.0
Collecting susi
  Downloading susi-1.2.zip (35 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: susi
  Building wheel for susi (PEP 517) ... [?25l[?25hdone
  Created wheel for susi: filename=susi-1.2-py3-none-any.whl size=26487 sha256=8e0d55a529a6fe9ad87a7357757be335c43c45631a8e95a5468e580ddf237be9
  Stored in directory: /root/.cache/pip/wheels/8e/eb/15/13e7fba3326d1fb0798590e0ecfcb0e9bdd05db07cccdfacff
Successfully built susi
Installing collected packages: susi
Successfully installed susi-1.2


In [3]:
def get_data():

    download_data()

    print("# Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = pd.read_csv(data_directory + "numerai_training_data.csv").set_index("id")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = pd.read_csv(data_directory + "numerai_tournament_data.csv").set_index("id")

    feature_names = [ f for f in training_data.columns if f.startswith("feature")]

    print(f"Loaded {len(feature_names)} features")

    return training_data,feature_names,tournament_data


In [34]:
def to_matrix(df):
    return pd.DataFrame.to_numpy(df)


class Som_Ridge:
    def __init__(
        self,
        SOM,
    ):
        self.n_rows = SOM.n_rows
        self.n_columns= SOM.n_columns
        self.ridge_models = [[0] * self.n_rows] * self.n_columns
        
        self.SOM = SOM

    def fit(self,training_data,feature_names):

        X = training_data[feature_names]
        Y = training_data[TARGET_NAME]
       
        tdata = pd.DataFrame()
        bmu_list = self.SOM.get_bmus(to_matrix(X))
        xvals = [x[0] for x in bmu_list]
        yvals = [x[1] for x in bmu_list]
       
        tdata = pd.merge(X,Y,how='inner',on='id',sort=False)

        tdata['grid_x'] = xvals
        tdata['grid_y'] = yvals

        for x in range(self.n_rows):
          for y in range(self.n_columns):
            gdata = tdata[(tdata['grid_x'] == x) & (tdata['grid_y'] == y)]
            x_train = gdata[feature_names]
            y_train = gdata[TARGET_NAME]
            model = Ridge(alpha=0.9)
            model.fit(x_train,y_train)
            self.ridge_models[x][y] = model
        return self


    def predict(self,testing_data):

        tdata = testing_data
        bmu_list = self.SOM.get_bmus(to_matrix(tdata))
        xvals = [x[0] for x in bmu_list]
        yvals = [x[1] for x in bmu_list]

        tdata['grid_x'] = xvals
        tdata['grid_y'] = yvals

        results = pd.DataFrame()

        #  assemble results then merge with tdata on id
        # - the re-ordered prediction vector becomes overall result
        #

        for x in range(self.n_rows):
          for y in range(self.n_columns):
            gdata = tdata[(tdata['grid_x'] == x) & (tdata['grid_y'] == y)]
            model = self.ridge_models[x][y]
            preds = model.predict(gdata[feature_names])
            pred_batch = pd.DataFrame(index=gdata.index)
            pred_batch['predns'] = preds
            print(pred.batch.head())
            results.append(pred_batch)

        print('results')
        print(results.head())

        pdata = pd.merge(testing_data,results,how='left',on='id')
        pdata.drop(pdata.columns.difference(['preds']), 1, inplace=True)
        predictions = pdata

        print(predictions.head())
        return predictions




        

        



In [5]:
training_data,feature_names,tournament_data = get_data()

2021-09-27 06:31:49,897 INFO numerapi.utils: starting download
../tmp/numerai_dataset_283.zip: 423MB [00:23, 17.9MB/s]                           


# Loading data...
Loaded 310 features


In [16]:


X = training_data[feature_names]
Y = training_data[TARGET_NAME]

som = susi.SOMClustering(n_rows=10,n_columns=10)
som.fit(to_matrix(X))









KeyboardInterrupt: ignored

In [31]:
som_ridge = Som_Ridge(som)
som_ridge.fit(training_data,feature_names)

<__main__.Som_Ridge at 0x7eff80710a50>

In [18]:

def make_predictions(som_ridge,training_data,feature_names,tournament_data):

    model = som_ridge 
    
    print("Generating predictions")

    training_data[PREDICTION_NAME] = model.predict(training_data[feature_names])

    tournament_data[PREDICTION_NAME] = model.predict(tournament_data[feature_names])

    # Check the per-era correlations on the training set

    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}")
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}")

    # Check the per-era correlations on the validation set

    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and std {validation_correlations.std()}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}")

    # create destination directory if it does not exist
    #

    destination_dir = "../kazutsugi/submissions/"

    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)

    submission_file = destination_dir + TOURNAMENT_NAME + "_submission.csv"

    tournament_data[PREDICTION_NAME].to_csv(submission_file,header=True)


In [29]:
print(training_data.index.name)

id


In [32]:
make_predictions(som_ridge,training_data,feature_names,tournament_data)


Generating predictions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


results
Empty DataFrame
Columns: []
Index: []


KeyError: ignored

In [40]:
tdata = training_data
X = tdata[feature_names]
bmu_list = som.get_bmus(to_matrix(X))
xvals = [x[0] for x in bmu_list]
yvals = [x[1] for x in bmu_list]

tdata['grid_x'] = xvals
tdata['grid_y'] = yvals

In [49]:
       
results = pd.DataFrame()

        #  assemble results then merge with tdata on id
        # - the re-ordered prediction vector becomes overall result
        #

for x in range(10):
  for y in range(10):
    gdata = tdata[(tdata['grid_x'] == x) & (tdata['grid_y'] == y)]
    model = som_ridge.ridge_models[x][y]
    preds = model.predict(gdata[feature_names])
    pred_batch = pd.DataFrame(index=gdata.index)
    pred_batch['predns'] = preds
    results = pd.concat([results,pred_batch])

print('results')
print(results.head())
print('no rows')
print(len(results))

pdata = pd.merge(tdata,results,how='left',on='id')
pdata.drop(pdata.columns.difference(['predns']), 1, inplace=True)
predictions = pdata

print(predictions.head())

results
                    predns
id                        
n02a3913d0b15efc  0.518131
n051b3b558836642  0.532921
n0af5822ad9a3056  0.455511
n12ef62669a49726  0.346655
n1a60018c6b53e0d  0.368555
no rows
501808
                    predns
id                        
n000315175b67977  0.492685
n0014af834a96cdd  0.539970
n001c93979ac41d4  0.477433
n0034e4143f22a13  0.460841
n00679d1a636062f  0.469679


In [None]:
from numerapi import NumerAPI


n_id = "OML65REYFDPC5O7N22XCRP44BG2M74XH"
key = "YSTL455VERL7WZ4D7OQ6XEYEQN2MRCCICBMILNFP3DUZC4MSAS2WSH2MV7ED6WB3"

api = NumerAPI(public_id=n_id,secret_key=key)

base_path = "../kazutsugi/submissions/"

path = base_path + 'kazutsugi' + "_submission.csv"
#print('uploading')
#api.upload_predictions(path)