In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [3]:
def on_kaggle() -> bool:
    try:
        import gresearch_crypto
        return True
    except ModuleNotFoundError:
        return False

In [4]:
# HYPER PARAMETERS
ON_KAGGLE = on_kaggle()
SAMPLE_LEVEL = 1
FORCE_REWRITE = True

In [5]:
if not ON_KAGGLE and os.path.abspath('.').endswith('notebook'):
    os.chdir('../')

In [6]:
#IMPORT_SCRIPT!
if not ON_KAGGLE and '.' not in sys.path:
    sys.path.append('.')
from src.metrics import *
from src.data import merge_asset_details, infer_dtypes, get_data_for_asset
from src.cv import TimeSeriesSplit, get_date_range
from src.modeling import Evaluator
from src.metrics import compute_metrics, compute_correlation
from src.pipeline.feature_gen import compute_instant_features
from src.data import infer_dtypes
from src.pipeline import EXPECTED_RAW_COLS, inference
from src.pipeline.transforms import FilterFeatures
from src.modeling import Evaluator

In [7]:
def setup_dir(on_kaggle: bool = True, sample_level: int = 0):
    if on_kaggle:
        data_dir = Path('../input/g-research-crypto-forecasting/')
        raw_train_dir = (Path('../input/create-sample-dataset/data/raw/')
                         if sample_level > 0 else data_dir) 
    else:
        data_dir = raw_train_dir = Path('data/raw')
    
    if sample_level > 0:
        raw_train_dir = raw_train_dir.joinpath('sample', str(sample_level))
    
    return data_dir, raw_train_dir

In [8]:
DATA_DIR, RAW_TRAIN_DIR = setup_dir(ON_KAGGLE, sample_level=SAMPLE_LEVEL)
RAW_TRAIN_PATH = RAW_TRAIN_DIR / 'train.csv'
ASSET_DETAILS_PATH = DATA_DIR / 'asset_details.csv'

In [9]:
RAW_TRAIN_DIR

PosixPath('data/raw/sample/1')

In [10]:
DATA_DIR

PosixPath('data/raw')

In [11]:
%%time 
raw_data = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 5.58 s, sys: 668 ms, total: 6.25 s
Wall time: 6.97 s


## create train set

In [12]:
PREPRO_PARAMS = {'window': 60}
MAIN_INDEX = ['timestamp', 'Asset_ID']

In [13]:
# get valid data only, drop where the target is NaN 
data = raw_data.dropna(subset=['Target'])
# format time to human readable 
data['time'] = pd.to_datetime(data['timestamp'], unit='s')
# merge asset names
data = merge_asset_details(data, ASSET_DETAILS_PATH)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,time,Asset_Name
0,1609459260,3,104.0,0.181465,0.181702,0.18127,0.181493,613433.3,0.181527,0.001255,2021-01-01 00:01:00,Cardano
1,1609459260,2,275.0,342.556667,343.5,342.42,342.966667,419.931,342.842188,-0.002884,2021-01-01 00:01:00,Bitcoin Cash
2,1609459260,0,219.0,37.385,37.4221,37.3487,37.389,2749.569,37.387059,-0.001669,2021-01-01 00:01:00,Binance Coin
3,1609459260,1,4039.0,28985.51125,29086.9,28960.0,29032.3875,163.787,29017.66487,0.000396,2021-01-01 00:01:00,Bitcoin
4,1609459260,4,84.0,0.004671,0.004683,0.004668,0.004677,2202482.0,0.004675,-0.008679,2021-01-01 00:01:00,Dogecoin


In [15]:
data.tail()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,time,Asset_Name
5137881,1631491200,9,176.0,183.043333,183.24,182.94,183.118833,1127.703,183.079643,0.003215,2021-09-13,Litecoin
5137882,1631491200,10,15.0,2872.870767,2876.0,2872.3387,2874.284467,0.908415,2872.969592,0.005277,2021-09-13,Maker
5137883,1631491200,13,873.0,0.112415,0.112522,0.11229,0.11239,3465484.0,0.112399,0.007712,2021-09-13,TRON
5137884,1631491200,12,200.0,0.332417,0.3329,0.33197,0.33257,187516.7,0.332414,0.003338,2021-09-13,Stellar
5137885,1631491200,11,7.0,256.3175,256.44,256.2,256.2925,1.234097,256.303736,0.003787,2021-09-13,Monero


In [16]:
data = infer_dtypes(data)

In [17]:
data = compute_instant_features(data)

In [18]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,High,Low,Close,Volume,VWAP,Target,time,Asset_Name,high_low_return,open_close_return,upper_shadow,lower_shadow,dolar_amount,vol_per_trades
0,1609459260,3,104.0,1.001304,0.998924,0.181493,613433.3,0.181527,0.001255,2021-01-01 00:01:00,Cardano,0.694338,0.693223,5.517041,0.181661,111333.9,5898.397437
1,1609459260,2,275.0,1.002754,0.999601,342.966675,419.931,342.842194,-0.002884,2021-01-01 00:01:00,Bitcoin Cash,0.694723,0.693745,0.002924,342.69339,144022.3,1.527022
2,1609459260,0,219.0,1.000992,0.999029,37.389,2749.569,37.387058,-0.001669,2021-01-01 00:01:00,Binance Coin,0.694129,0.693201,0.026772,37.421329,102803.6,12.555111
3,1609459260,1,4039.0,1.003498,0.99912,29032.386719,163.787,29017.664062,0.000396,2021-01-01 00:01:00,Bitcoin,0.695336,0.693955,3.5e-05,29011.046875,4755129.0,0.040551
4,1609459260,4,84.0,1.002655,0.999443,0.004677,2202482.0,0.004675,-0.008679,2021-01-01 00:01:00,Dogecoin,0.694753,0.693843,214.375336,0.004673,10301.23,26220.028831


In [19]:
data.isna().mean()

timestamp            0.0
Asset_ID             0.0
Count                0.0
High                 0.0
Low                  0.0
Close                0.0
Volume               0.0
VWAP                 0.0
Target               0.0
time                 0.0
Asset_Name           0.0
high_low_return      0.0
open_close_return    0.0
upper_shadow         0.0
lower_shadow         0.0
dolar_amount         0.0
vol_per_trades       0.0
dtype: float64

In [20]:
data.fillna({'VWAP': data['VWAP'].mean()}, inplace=True)

In [21]:
data.isna().mean()

timestamp            0.0
Asset_ID             0.0
Count                0.0
High                 0.0
Low                  0.0
Close                0.0
Volume               0.0
VWAP                 0.0
Target               0.0
time                 0.0
Asset_Name           0.0
high_low_return      0.0
open_close_return    0.0
upper_shadow         0.0
lower_shadow         0.0
dolar_amount         0.0
vol_per_trades       0.0
dtype: float64

In [22]:
list(data.columns)

['timestamp',
 'Asset_ID',
 'Count',
 'High',
 'Low',
 'Close',
 'Volume',
 'VWAP',
 'Target',
 'time',
 'Asset_Name',
 'high_low_return',
 'open_close_return',
 'upper_shadow',
 'lower_shadow',
 'dolar_amount',
 'vol_per_trades']

In [23]:
FEATURES = [
'Count',
'High',
'Low',
'Close',
'Volume',
'VWAP',
'high_low_return',
'open_close_return',
'upper_shadow',
'lower_shadow',
'dolar_amount',
'vol_per_trades'
]

In [24]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline

def build_model(params={'alpha': 0.001}):
    model = Pipeline([('filter', FilterFeatures(FEATURES)),
                      ('norm', MinMaxScaler()),
                      ('model', Ridge(params['alpha'], random_state=1))])
    return model

In [25]:
def train_model(config, train_data, valid_data, pipeline=None):
    model  = build_model(config['model'])

    if config['training']['time_decay_alpha'] is not None:
        time_decay_alpha = config['training']['time_decay_alpha']
        print(f'using exponential_time_decay with alpha {time_decay_alpha}')
        timesteps = ((train_data['timestamp'].max() - train_data['timestamp'])//60//60//24)
        weight = time_decay_alpha ** timesteps
    else:
        weight = None

    model.fit(train_data, train_data['Target'], model__sample_weight=weight)
    return model

In [26]:
PULIC_LB_RANGE = ['2021-06-13',
                  '2021-09-13'] # 3 MONTH WORTH OF DATA

In [27]:
EVAL_PERIODS = [PULIC_LB_RANGE]

In [28]:
CV_PARAMS = {'gap_unit': 'min', 'dt_col': 'time'}

CV = TimeSeriesSplit(EVAL_PERIODS, **CV_PARAMS)

In [29]:
train_idx, valid_idx = next(iter(CV.split(data)))

In [30]:
train_data = data.loc[train_idx, :].reset_index(drop=True)

In [31]:
valid_data = data.loc[valid_idx, :].reset_index(drop=True)

In [32]:
MODEL_CONFIG = {'alpha': 0.001}
TRAIN_CONFIG = {'time_decay_alpha': 0.99}
CONFIG = {'model': MODEL_CONFIG, 'training': TRAIN_CONFIG}

In [33]:
asset_ids = sorted(data['Asset_ID'].unique())
MODELS = {}
for asset_id in asset_ids:
    print(f'training asset_id = {asset_id}')
    train_asset_data = get_data_for_asset(train_data, asset_id)
    model = train_model(CONFIG, train_asset_data, valid_data)
    MODELS[asset_id] = model

training asset_id = 0
using exponential_time_decay with alpha 0.99
training asset_id = 1
using exponential_time_decay with alpha 0.99
training asset_id = 2
using exponential_time_decay with alpha 0.99
training asset_id = 3
using exponential_time_decay with alpha 0.99
training asset_id = 4
using exponential_time_decay with alpha 0.99
training asset_id = 5
using exponential_time_decay with alpha 0.99
training asset_id = 6
using exponential_time_decay with alpha 0.99
training asset_id = 7
using exponential_time_decay with alpha 0.99
training asset_id = 8
using exponential_time_decay with alpha 0.99
training asset_id = 9
using exponential_time_decay with alpha 0.99
training asset_id = 10
using exponential_time_decay with alpha 0.99
training asset_id = 11
using exponential_time_decay with alpha 0.99
training asset_id = 12
using exponential_time_decay with alpha 0.99
training asset_id = 13
using exponential_time_decay with alpha 0.99


In [34]:
if not ON_KAGGLE:
    sys.path.append(str(DATA_DIR))

import gresearch_crypto
import traceback
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

In [35]:
for i, (raw_test_df, submission) in enumerate(iter_test):

    submission = inference(test_data=raw_test_df, submission=submission,
                            models=MODELS)
    if i % 1000 == 0 or i < 10:
        display(submission)    
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Unnamed: 0,row_id,Target
0,0,-0.0001408325
1,1,6.09823e-05
2,2,-0.0001334446
3,3,9.565149e-05
4,4,-0.0002011465
5,5,-0.0001085952
6,6,1.522752e-07
7,7,-2.611904e-05
8,8,0.0001802241
9,9,-6.191673e-05


Unnamed: 0,row_id,Target
0,14,-1.3e-05
1,15,8.8e-05
2,16,-8e-06
3,17,9.4e-05
4,18,3e-06
5,19,-0.000163
6,20,9.7e-05
7,21,-2.8e-05
8,22,0.000291
9,23,3.7e-05


Unnamed: 0,row_id,Target
0,28,0.000136
1,29,6.7e-05
2,30,2e-06
3,31,6.9e-05
4,32,0.000102
5,33,-9.7e-05
6,34,0.000309
7,35,-3.2e-05
8,36,0.000245
9,37,5.5e-05


Unnamed: 0,row_id,Target
0,42,0.000332
1,43,-1.5e-05
2,44,5.5e-05
3,45,1.5e-05
4,46,-2.8e-05
5,47,-1.3e-05
6,48,5e-06
7,49,3.2e-05
8,50,0.000237
9,51,5.9e-05
