# ORIE 5256 Numerai Tournament

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Initialize NumerAPI - the official Python API client for Numerai
from numerapi import NumerAPI
napi = NumerAPI()

# list the datasets and available versions
all_datasets = napi.list_datasets()
dataset_versions = list(set(d.split('/')[0] for d in all_datasets))
print("Available versions:\n", dataset_versions)

# Set data version to one of the latest datasets
DATA_VERSION = "v5.0"

# Print all files available for download for our version
current_version_files = [f for f in all_datasets if f.startswith(DATA_VERSION)]
print("Available", DATA_VERSION, "files:\n", current_version_files)

Available versions:
 ['v5.0']
Available v5.0 files:
 ['v5.0/features.json', 'v5.0/live.parquet', 'v5.0/live_benchmark_models.parquet', 'v5.0/live_example_preds.csv', 'v5.0/live_example_preds.parquet', 'v5.0/meta_model.parquet', 'v5.0/train.parquet', 'v5.0/train_benchmark_models.parquet', 'v5.0/validation.parquet', 'v5.0/validation_benchmark_models.parquet', 'v5.0/validation_example_preds.csv', 'v5.0/validation_example_preds.parquet']


## 1. Feature Engineering

We will use the `medium` feature set offer by Numerai. This feature set contains a total of 705 features. In this section, we will perform some feature engineering methods to ensure the stationarity of the data, and to reduce the dimensionality to avoid curse of dimensionality.

In [3]:
import json

napi = NumerAPI()  # initialize API client
DATA_VERSION = 'v5.0'

# Load metadata
napi.download_dataset(f'{DATA_VERSION}/features.json')
feature_metadata = json.load(open(f'{DATA_VERSION}/features.json'))
feature_sets = feature_metadata['feature_sets']
medium_features = feature_sets['medium']

# Load training data
napi.download_dataset(f'{DATA_VERSION}/train.parquet')
train_set = pd.read_parquet(f'{DATA_VERSION}/train.parquet', columns=['era', 'target'] + medium_features)

# Downsample to every 4th era
train_set = train_set[train_set['era'].isin(train_set['era'].unique()[::4])]

2024-11-27 23:54:55,382 INFO numerapi.utils: target file already exists
2024-11-27 23:54:55,383 INFO numerapi.utils: download complete
2024-11-27 23:54:56,071 INFO numerapi.utils: target file already exists
2024-11-27 23:54:56,073 INFO numerapi.utils: download complete


In [4]:
train_set.head()

Unnamed: 0_level_0,era,target,feature_able_deprived_nona,feature_ablest_inflexional_egeria,feature_absorbable_hyperalgesic_mode,feature_accoutered_revolute_vexillology,feature_acetose_crackerjack_needlecraft,feature_acheulian_conserving_output,feature_acronychal_bilobate_stevenage,feature_acrylic_gallic_wine,...,feature_working_jain_acromegaly,feature_wrapround_chrestomathic_timarau,feature_xanthic_transpadane_saleswoman,feature_xanthochroid_petrified_gutenberg,feature_zincy_cirrhotic_josh,feature_zippy_trine_diffraction,feature_zonal_snuffly_chemism,feature_zygotic_middlebrow_caribbean,feature_zymolytic_intertidal_privet,feature_zymotic_windswept_cooky
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n0007b5abb0c3a25,1,0.25,1,2,3,2,3,2,2,2,...,2,0,3,2,4,3,2,1,0,0
n003bba8a98662e4,1,0.25,3,2,4,1,0,2,3,2,...,2,0,0,2,0,0,2,0,0,0
n003bee128c2fcfc,1,0.75,1,2,0,2,4,2,0,2,...,2,3,3,2,2,3,2,2,2,4
n0048ac83aff7194,1,0.25,1,2,3,4,0,2,3,2,...,2,0,2,1,1,4,2,0,2,1
n0055a2401ba6480,1,0.25,3,2,3,4,1,2,4,2,...,2,1,3,3,2,4,2,4,1,3


### 1.1 Stationarity

In [5]:
pass

### 1.2 Low Mutual Information

In this part, we filter out those features that are highly correlated with each other. 

In [6]:
# Calculate pairwise correlations between features. Drop one from each highly correlated pari (threshold = .8)

# correlation_matrix = train_set[medium_features].corr().abs()
# upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
# train_set.drop(columns=to_drop, inplace=True)

In [7]:
# train_set.to_parquet(f'train_set_low_corr.parquet')

In [8]:
# train_set = pd.read_parquet('train_set_low_corr.parquet')

In [9]:
# Store ne wfeatures
# low_corr_features = list(train_set.columns[2:])

In [10]:
# len(low_corr_features)

### 1.3 Dimension Reduction

We will use Principal Component Analysis (PCA) to reduce the dimensionality of the data. The first 100 principal components will be kept.

In [11]:
# Apply PCA to the features and store the first 100 components

from sklearn.decomposition import PCA
pca = PCA(n_components=.95)
# fit PCA to the features
pca_X = pca.fit_transform(train_set[medium_features])

In [12]:
# Store the PCA features in the training set
num_pca_features = pca_X.shape[1]
pca_features = [f'pca_{i}' for i in range(num_pca_features)]  # name of the pca features
df_pca_features = pd.DataFrame(pca_X, index=train_set.index, columns=pca_features)
train_set = pd.concat([train_set, df_pca_features], axis=1)

In [13]:
# Drop the original features
train_set.drop(columns=medium_features, inplace=True)
train_set.head()

Unnamed: 0_level_0,era,target,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,...,pca_302,pca_303,pca_304,pca_305,pca_306,pca_307,pca_308,pca_309,pca_310,pca_311
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n0007b5abb0c3a25,1,0.25,-9.288973,3.066574,-3.821182,3.503268,3.933269,7.436123,4.061404,-2.971538,...,-0.808702,1.037759,-0.650644,-1.411315,-0.522225,1.117435,0.255024,0.073524,0.838775,-0.226892
n003bba8a98662e4,1,0.25,-8.709131,-8.615216,-1.399786,6.5907,0.165589,1.353093,-0.327783,-3.305413,...,-0.382571,-0.123572,0.725335,1.030723,-1.313305,0.207756,0.01486,-1.004689,-2.195562,-0.734417
n003bee128c2fcfc,1,0.75,-0.232459,10.493623,-6.811598,-1.677351,-1.198478,-0.748976,1.672055,1.551992,...,-0.520323,-0.056187,-0.871155,-0.420036,0.723029,-0.071916,-0.083538,0.067414,-0.597459,0.102226
n0048ac83aff7194,1,0.25,1.826326,-13.853773,-0.894217,-6.760077,0.588427,6.413077,2.290896,6.145121,...,0.215866,-0.467865,-0.701282,0.439762,0.014501,-0.085253,0.726732,0.381191,0.428412,-0.114618
n0055a2401ba6480,1,0.25,-4.024114,-5.980213,-1.30233,-1.849217,-5.212901,0.188909,-0.113856,5.412827,...,0.017906,0.027337,0.639182,0.313109,0.246711,0.251916,0.505785,0.107952,0.278314,0.390047


## Feature Selection

We will use the Mean Decrease Accuracy (MDA) analysis to select the most important features. For this multi-class classification problem, our baseline classifier is Random Forest. We will use Purged K-Fold Cross Validation with AUC-ROC as scoring metric. Features with positive mean score improvement will be kept.

In [14]:
train_set['era'] = train_set['era'].astype(int)

In [15]:
# Construct inputs

t1 = pd.Series((train_set['era'] + 4).values, index=train_set['era'])
X = train_set[pca_features].copy()
X.index = t1.index
y = train_set['target'].copy()
y.index = t1.index
y = y.astype(str)

In [16]:
# Compute sample weights
from sklearn.utils.class_weight import compute_sample_weight
sample_weight = compute_sample_weight(class_weight='balanced', y=train_set['target'])
sample_weight = pd.Series(sample_weight, index=train_set.index)

In [17]:
from sklearn.model_selection._split import _BaseKFold

class PurgedKFold(_BaseKFold):
    """Extend KFold class to work with labels that span intervals.

    The train is purged of observations overlapping test-label intervals.
    Test set is assumed contiguous (shuffle=False), w/o training samples in between.
    """

    def __init__(self, n_splits=3, t1=None, pctEmbargo=0.0):
        """Initialize PurgedKFold object.

        Args:
            n_splits (int): Number of splits. Default is 3.
            t1 (pd.Series): 
                t1.index: time when the observation started
                t1.value: time when the observation ended
            pctEmbargo (float): Percentage of embargo on test set. Embargo step = pctEmbargo * T. Default is 0.
        """
        if not isinstance(t1, pd.Series):
            raise ValueError('Label Through Dates must be a pd.Series')
        super(PurgedKFold, self).__init__(
            n_splits, shufﬂe=False, random_state=None
        )

        self.t1 = t1
        self.pctEmbargo = pctEmbargo

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Args:
            X (pd.DataFrame): Features.
            y (pd.Series): Labels.
            groups: Ignored.
        """
        if (X.index == self.t1.index).sum() != len(self.t1):
            raise ValueError('X and ThruDateValues must have the same index')

        indices = np.arange(X.shape[0])

        mbrg = int(X.shape[0] * self.pctEmbargo)
        test_starts = [
            (i[0], i[-1] + 1)
            for i in np.array_split(np.arange(X.shape[0]), self.n_splits)
        ]
        for test_start, test_end in test_starts:
            t0 = self.t1.index[test_start]   # start of test set
            test_indices = indices[test_start: test_end]

            max_t1 = self.t1.iloc[test_indices].max()
            maxT1Idx = self.t1.index.searchsorted(self.t1.iloc[test_indices].max())
            train_indices = list(t1[t1 <= t0].reset_index(drop=True).index)
            if maxT1Idx < X.shape[0]:   # right train (with embargo)
                train_indices = np.concatenate(
                    (train_indices, indices[maxT1Idx + mbrg :])
                )
            yield train_indices, test_indices

In [18]:
def featImpMDA(
    clf, X, y, cv, sample_weight, t1, pctEmbargo, scoring='auc-roc'
):
    """feat importance based on OOS score reduction"""
    if scoring not in ['auc-roc']:
        raise Exception('wrong scoring method.')
    from sklearn.metrics import roc_auc_score

    cvGen = PurgedKFold(
        n_splits=cv, t1=t1, pctEmbargo=pctEmbargo
    )   # purged cv
    scr0 = pd.Series()
    scr1 = pd.DataFrame(columns=X.columns)

    for i, (train, test) in enumerate(cvGen.split(X=X)):
        X0, y0, w0 = X.iloc[train, :], y.iloc[train], sample_weight.iloc[train]
        X1, y1, w1 = X.iloc[test, :], y.iloc[test], sample_weight.iloc[test]
        fit = clf.fit(X=X0, y=y0, sample_weight=w0.values)
        if scoring == 'auc-roc':
            prob = fit.predict_proba(X1)
            scr0.loc[i] = roc_auc_score(
                y1, prob, sample_weight=w1.values, labels=clf.classes_, multi_class='ovr', average='macro'
            )
        else:
            raise Exception('Only auc-roc scoring is supported')
        for j in X.columns:
            X1_ = X1.copy(deep=True)
            np.random.shuffle(X1_[j].values)   # permutation of a single column
            if scoring == 'auc-roc':
                prob = fit.predict_proba(X1_)
                scr1.loc[i, j] = roc_auc_score(
                    y1, prob, sample_weight=w1.values, labels=clf.classes_, multi_class='ovr', average='macro'
                )
            else:
                raise Exception('Only auc-roc scoring is supported')
    imp = (-scr1).add(scr0, axis=0)
    if scoring == 'auc-roc':
        imp = imp / (1.0 - scr1)
    else:
        raise Exception('Only auc-roc scoring is supported')
    imp = pd.concat(
        {'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** -0.5}, axis=1
    )
    return imp, scr0.mean()

In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_features=int(1))
imp, scr0mean = featImpMDA(
    clf,
    X=X,
    y=y,
    cv=5,
    sample_weight=sample_weight,
    t1=t1,
    pctEmbargo=0.01,
)

KeyboardInterrupt: 

In [20]:
# Find features with import mean > 0
imp_pca_features = list(imp[imp['mean'] > 0].index)

In [24]:
train_set_selected = train_set[['era', 'target'] + imp_pca_features]

In [25]:
train_set_selected.to_parquet('train_set_selected.parquet')

## Modeling

We will use the LightGBM model to train our model



In [None]:
from lightgbm import LGBMClassifier

clf_lgbm = LGBMClassifier(
    n_estimators=1024,
    learning_rate=.01,
    max_depth=10,
    num_leaves=2**5-1,
    colsample_bytree=.1,
    min_data_in_leaf=10000,
)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_set_selected['target'] = le.fit_transform(train_set_selected['target'].astype(str))


fit = clf_lgbm.fit(train_set[pca_features], train_set['target'].astype(str), sample_weight=sample_weight)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_selected['target'] = le.fit_transform(train_set_selected['target'].astype(str))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.124051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79560
[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 312
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


In [None]:
model = fit

from sklearn.metrics import roc_auc_score

# Get predicted probabilities on the training set
train_probs = model.predict_proba(train_set[pca_features])

# Encode string labels to integers if necessary
true_labels = train_set['target'].astype(str)

# Compute the ROC AUC score
train_score = roc_auc_score(true_labels, train_probs, multi_class='ovr', average='macro')
print(f"ROC AUC Score: {train_score}")

ROC AUC Score: 0.6539346691733892


In [42]:
type(true_labels)

pandas.core.series.Series