## First Numerai Submission

This notebook follows the example script listed [here](https://github.com/numerai/example-scripts/blob/master/example_model.py).

In [1]:
# import dependencies
import pandas as pd
from lightgbm import LGBMRegressor
import gc
import json
from pathlib import Path
from numerapi import NumerAPI

## 1. Download Tournament Items and Data

### Setup API and Access Tokens

In [2]:
# Load the Numerai API credentials from a JSON key file kept outside the
# notebook, so the keys themselves never appear in a cell or its output.
# The location is resolved from the current user's home directory instead of
# a hardcoded `/Users/<name>/...` prefix, so the notebook also runs for any
# other account that uses the same `~/.secret/numerai/` layout.
keys_path = Path.home() / '.secret' / 'numerai' / 'numerai-keys.json'
with keys_path.open('r') as f:
    keys = json.load(f)

# pull out public and secret key (a KeyError here means the key file is
# missing one of the two expected entries)
pub_key = keys['public-key']
secret = keys['secret-key']

# setup the authenticated API client used by the API calls that follow
napi = NumerAPI(pub_key, secret)

### Details on Current Round

In [3]:
# Ask the API for the current round and echo it.
# `current_round` is reused further down to build the round-specific file
# name of the live data (`v4.1/live_{current_round}.parquet`).
current_round = napi.get_current_round()
print(f'Current round: {current_round}')

Current round: 411


In [4]:
# Fetch the current leaderboard from the API.
# NOTE(review): `leaderboard` is not read by any other cell visible in this
# part of the notebook — confirm it is still needed.
leaderboard = napi.get_leaderboard()

In [5]:
# Report whether the API considers a new round to have opened recently.
round_status = (
    'New round has started within the last 12 hours!'
    if napi.check_new_round()
    else 'No new round within last 12 hours'
)
print(round_status)

New round has started within the last 12 hours!


### Download Data Files

Tournament (live) data changes every week, so we include the round number in its file name. Training and validation data only change periodically, so there is no need to download them every time.

In [6]:
# Fetch every dataset file into the local `v4.1` subfolder.
print('Downloading dataset files...')
Path("./v4.1").mkdir(exist_ok=True)

# Positional arguments for each `napi.download_dataset` call, in download
# order. A one-element entry keeps the remote file name locally; the
# two-element entry stores the live data under a round-specific name.
dataset_downloads = [
    ("v4.1/train.parquet",),
    ("v4.1/validation.parquet",),
    ("v4.1/live.parquet", f"v4.1/live_{current_round}.parquet"),
    ("v4.1/validation_example_preds.parquet",),
    ("v4.1/features.json",),
    ("v4.1/live_example_preds.parquet",),
    ("v4.1/meta_model.parquet",),
]

for download_args in dataset_downloads:
    napi.download_dataset(*download_args)

Downloading dataset files...


2023-02-01 13:51:06,665 INFO numerapi.utils: target file already exists
2023-02-01 13:51:06,668 INFO numerapi.utils: resuming download
v4.1/train.parquet: 1.45GB [00:53, 26.9MB/s]                            
2023-02-01 13:52:02,015 INFO numerapi.utils: starting download
v4.1/validation.parquet: 1.51GB [02:14, 11.2MB/s]                            
2023-02-01 13:54:18,744 INFO numerapi.utils: starting download
v4.1/live_411.parquet: 4.52MB [00:00, 6.15MB/s]                            
2023-02-01 13:54:20,529 INFO numerapi.utils: starting download
v4.1/validation_example_preds.parquet: 56.9MB [00:02, 21.6MB/s]                            
2023-02-01 13:54:23,986 INFO numerapi.utils: starting download
v4.1/features.json: 703kB [00:00, 1.46MB/s]                           
2023-02-01 13:54:25,399 INFO numerapi.utils: starting download
v4.1/live_example_preds.parquet: 131kB [00:00, 634kB/s]                            
2023-02-01 13:54:26,483 INFO numerapi.utils: starting download
v4.1/meta_mod

## 2. Read Training Data

### Read Feature Metadata

In [7]:
# Parse the feature metadata JSON that was downloaded with the datasets.
metadata_path = Path('v4.1/features.json')
with metadata_path.open('r') as metadata_file:
    feature_metadata = json.load(metadata_file)

In [8]:
# Show the top-level sections of the metadata file; the cells below read
# from the 'targets' and 'feature_sets' sections.
feature_metadata.keys()

dict_keys(['feature_stats', 'feature_sets', 'targets'])

In [13]:
# Display the names of all target columns listed in the metadata; one of
# these ('target_nomi_v4_20') is selected as the target column further down.
feature_metadata['targets']

['target',
 'target_nomi_v4_20',
 'target_nomi_v4_60',
 'target_tyler_v4_20',
 'target_tyler_v4_60',
 'target_victor_v4_20',
 'target_victor_v4_60',
 'target_ralph_v4_20',
 'target_ralph_v4_60',
 'target_waldo_v4_20',
 'target_waldo_v4_60',
 'target_jerome_v4_20',
 'target_jerome_v4_60',
 'target_janet_v4_20',
 'target_janet_v4_60',
 'target_ben_v4_20',
 'target_ben_v4_60',
 'target_alan_v4_20',
 'target_alan_v4_60',
 'target_paul_v4_20',
 'target_paul_v4_60',
 'target_george_v4_20',
 'target_george_v4_60',
 'target_william_v4_20',
 'target_william_v4_60',
 'target_arthur_v4_20',
 'target_arthur_v4_60',
 'target_thomas_v4_20',
 'target_thomas_v4_60']

In [4]:
# Choose which feature columns to load. The 'medium' set is used here;
# alternatives are feature_metadata["feature_sets"]["small"], or every key of
# feature_metadata["feature_stats"] for the complete feature list.
feature_sets = feature_metadata['feature_sets']
features = feature_sets['medium']

# Besides the features, only the era, data type and target columns are read.
extra_columns = ['era', 'data_type', 'target_nomi_v4_20']
read_columns = [*features, *extra_columns]

## 3. Read Data from Downloaded Parquet Files

Sometimes, reading the downloaded data raises an error about invalid parquet magic bytes. If this occurs, delete the affected file and rerun the corresponding `napi.download_dataset(...)` call to replace the corrupted file.

In [7]:
def _read_selected_columns(parquet_path):
    """Load one parquet file, restricted to the columns in `read_columns`."""
    return pd.read_parquet(parquet_path, columns=read_columns)


# Load the training, validation and current-round live data, in that order.
training_data = _read_selected_columns('v4.1/train.parquet')
validation_data = _read_selected_columns('v4.1/validation.parquet')
live_data = _read_selected_columns(f'v4.1/live_{current_round}.parquet')

In [16]:
# List the models available to the authenticated account.
# The printed result is a dict keyed by model name; the values look like
# model ids (UUID strings) — TODO confirm against the numerapi docs.
models = napi.get_models()
print(models)

{'akg923': '112c86c7-3d27-4a8b-bba7-cb6be34c7f4d'}
