In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [3]:
def on_kaggle() -> bool:
    """Report whether the `gresearch_crypto` package can be imported.

    Returns:
        True when the import succeeds, False when it raises
        ModuleNotFoundError.
    """
    try:
        import gresearch_crypto  # imported only to probe availability
    except ModuleNotFoundError:
        return False
    else:
        return True

In [4]:
# HYPER PARAMETERS
# True when the `gresearch_crypto` package is importable (see `on_kaggle`).
ON_KAGGLE = on_kaggle()

# Dataset selector: 1 -> sample, 2 -> toy sample, any other value -> full raw data.
SAMPLE_LEVEL = 1
USE_SAMPLE, USE_TOY_SAMPLE = (SAMPLE_LEVEL == 1), (SAMPLE_LEVEL == 2)

FORCE_REWRITE = True  # (ON_KAGGLE and SAMPLE_LEVEL == 0)

In [5]:
# Local runs only: when the working directory path ends with 'notebook',
# move one level up so the relative paths used below resolve.
if not ON_KAGGLE:
    if os.path.abspath('.').endswith('notebook'):
        os.chdir('../')

In [6]:
#IMPORT_SCRIPT!
# Outside Kaggle, add the current working directory to the import path (once)
# so that the `src` package imported below can be resolved.
if not ON_KAGGLE and '.' not in sys.path:
    sys.path.append('.')
# NOTE(review): the wildcard import hides which names come from `src.metrics`;
# `compute_metrics` is also imported explicitly at the end of this cell.
from src.metrics import *
from src.preprocessing import process_train_data, process_test_data, inference
from src.preprocessing.ingest_data import merge_asset_details, fill_gaps_crypto_data, infer_dtypes
from src.cv import TimeSeriesSplit, get_date_range
from src.modeling import Evaluator
from src.metrics import compute_metrics

In [7]:
# Input locations: Kaggle dataset mounts when ON_KAGGLE, otherwise the local
# `data/raw` tree with the sample folders nested inside it.
RAW_DIR = Path('../input/g-research-crypto-forecasting/') if ON_KAGGLE else Path('data/raw')
SAMPLE_DIR = (Path('../input/create-sample-dataset/data/raw/sample/')
              if ON_KAGGLE else RAW_DIR / 'sample')
TOY_SAMPLE_DIR = (Path('../input/create-sample-dataset/data/raw/toy_sample/')
                  if ON_KAGGLE else RAW_DIR / 'toy_sample')

# filename
TRAIN_FILE = 'train.csv'
ASSET_DETAILS_PATH = RAW_DIR.joinpath('asset_details.csv')

In [8]:
# Choose the training CSV that matches the requested sample level; the toy
# sample takes precedence over the regular sample, the full raw data is the
# fallback.
if USE_TOY_SAMPLE:
    _banner, _train_dir = 'USING TOY DATASET', TOY_SAMPLE_DIR
elif USE_SAMPLE:
    _banner, _train_dir = 'USING SAMPLE DATASET', SAMPLE_DIR
else:
    _banner, _train_dir = 'USING RAW DATASET', RAW_DIR

print(_banner)
RAW_TRAIN_PATH = _train_dir / TRAIN_FILE

# Fail early if the selected file is missing.
assert RAW_TRAIN_PATH.exists()

USING SAMPLE DATASET


In [9]:
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [10]:
%%time 
# Load the selected training CSV into memory; `raw_df` is reused below for
# feature building and for the submission history.
raw_df = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 4.55 s, sys: 357 ms, total: 4.9 s
Wall time: 4.9 s


## Create the training set

In [11]:
# Keyword arguments forwarded to `process_train_data` and, as `prepro_kwargs`,
# to `inference`.
# NOTE(review): `window` is presumably expressed in minutes (the feature
# column is named `Close__mean__60min_ft`) — confirm in src.preprocessing.
PREPRO_PARAMS = {'window': 60}

In [12]:
# Build the target frame in one chain:
#   1. keep the key columns and the target,
#   2. drop rows whose target is NaN,
#   3. add a human-readable `time` column derived from the epoch seconds.
data = (
    raw_df[['timestamp', 'Asset_ID', 'Target']]
    .dropna(subset=['Target'])
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
)
# merge asset names
data = merge_asset_details(data, ASSET_DETAILS_PATH)

In [13]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Target,time,Asset_Name
0,1609459260,3,0.001255,2021-01-01 00:01:00,Cardano
1,1609459260,2,-0.002884,2021-01-01 00:01:00,Bitcoin Cash
2,1609459260,0,-0.001669,2021-01-01 00:01:00,Binance Coin
3,1609459260,1,0.000396,2021-01-01 00:01:00,Bitcoin
4,1609459260,4,-0.008679,2021-01-01 00:01:00,Dogecoin


In [14]:
data.tail()

Unnamed: 0,timestamp,Asset_ID,Target,time,Asset_Name
5298791,1632181440,9,-0.001154,2021-09-20 23:44:00,Litecoin
5298792,1632181440,10,0.009831,2021-09-20 23:44:00,Maker
5298793,1632181440,13,0.004163,2021-09-20 23:44:00,TRON
5298794,1632181440,12,0.000449,2021-09-20 23:44:00,Stellar
5298795,1632181440,11,0.002568,2021-09-20 23:44:00,Monero


In [15]:
# Output folder for processed artefacts; sample runs get their own sub-folder
# so they do not share files with the full-data run.
PROCESSED_DIR = Path('data/processed')
if USE_SAMPLE:
    print('USING SAMPLE DATASET')
    PROCESSED_DIR = PROCESSED_DIR.joinpath('sample')
elif USE_TOY_SAMPLE:
    print('USING TOY SAMPLE DATASET')
    PROCESSED_DIR = PROCESSED_DIR.joinpath('toy_sample')

# Location of the cached feature table inside the processed folder.
FEATURES_PATH = PROCESSED_DIR.joinpath('features.parquet')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

USING SAMPLE DATASET


In [16]:
FEATURES_PATH

PosixPath('data/processed/sample/features.parquet')

In [17]:
# Load the cached feature table when it exists, unless FORCE_REWRITE asks for
# a rebuild. The cache path does not encode PREPRO_PARAMS, so FORCE_REWRITE is
# the only way to refresh features after the preprocessing settings change.
if FEATURES_PATH.exists() and not FORCE_REWRITE:
    features_df = pd.read_parquet(FEATURES_PATH)
else:
    features_df = process_train_data(raw_df, **PREPRO_PARAMS)
    # Persist without the index so the parquet holds only feature/key columns.
    features_df.to_parquet(FEATURES_PATH, index=False)

In [18]:
features_df

Unnamed: 0,Close__mean__60min_ft,timestamp,Asset_ID
0,37.389000,1609459260,0
1,37.360149,1609459320,0
2,37.337265,1609459380,0
3,37.325924,1609459440,0
4,37.303341,1609459500,0
...,...,...,...
5302075,0.091925,1632182160,13
5302076,0.091886,1632182220,13
5302077,0.091847,1632182280,13
5302078,0.091817,1632182340,13


In [19]:
# The merge below joins on `timestamp`, so it must never be missing.
assert not features_df['timestamp'].isna().any()

In [20]:
# Attach the engineered features to the target rows (left join on the keys).
# Feature columns left over from an earlier execution of this cell are dropped
# first: otherwise re-running the merge makes pandas suffix the duplicated
# names with `_x`/`_y`, and those names no longer end in `_ft`.
_stale_feature_cols = [col for col in features_df.columns
                       if str(col).endswith('_ft') and col in data.columns]
data = (data
        .drop(columns=_stale_feature_cols)
        .merge(features_df, on=['Asset_ID', 'timestamp'], how='left'))

In [21]:
data.isna().mean()

timestamp                0.0
Asset_ID                 0.0
Target                   0.0
time                     0.0
Asset_Name               0.0
Close__mean__60min_ft    0.0
dtype: float64

In [22]:
# NOTE(review): this list is not referenced by any other visible cell, and its
# names (e.g. 'Target_x', 'Target_y') do not match the `data.columns` output
# shown right after it — presumably a leftover from an earlier version of the
# pipeline; confirm before relying on it.
FEATURES = ['timestamp', 'Asset_ID', 'Target_x', 'time', 'Asset_Name', 'Count',
       'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Target_y',
       'high_low_return', 'open_close_return', 'upper_shadow', 'lower_shador',
       'price_return_1', 'vwap_return_1']

In [23]:
data.columns

Index(['timestamp', 'Asset_ID', 'Target', 'time', 'Asset_Name',
       'Close__mean__60min_ft'],
      dtype='object')

In [24]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


class FilterFeatures(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns whose name matches a regex.

    The matching column names are recorded during ``fit``; ``transform`` then
    selects exactly those columns, in the recorded order, from any frame.

    Args:
        pattern: regular expression handed to ``DataFrame.filter(regex=...)``.
        sort: when True, the recorded column names are sorted.
    """

    def __init__(self, pattern: str = '_ft$', sort: bool = False):
        self.pattern = pattern
        self.sort = sort

    def fit(self, X: pd.DataFrame, y=None):
        """Record the names of the columns of ``X`` that match ``pattern``."""
        matched = X.filter(regex=self.pattern).columns.tolist()
        self.features = sorted(matched) if self.sort else matched
        return self

    def transform(self, X: pd.DataFrame):
        """Return the recorded columns of ``X`` in the order seen at fit time."""
        selected = X.loc[:, self.features]
        return selected

    
def test_filter_features():
    """Check that FilterFeatures restores the fit-time column order.

    The transformer is fitted once on a random frame; the frame's columns are
    then shuffled repeatedly and each transformed result must come back with
    the original column order.
    """
    n_features, n_rows, n_trials = 100, 500, 100
    expected_columns = [f'{idx}_ft' for idx in np.arange(n_features)]

    reference_df = pd.DataFrame(np.random.randn(n_rows, n_features), columns=expected_columns)
    selector = FilterFeatures()
    selector.fit(reference_df)

    for trial in range(n_trials):
        permuted_columns = np.random.permutation(expected_columns)
        restored_columns = list(selector.transform(reference_df.loc[:, permuted_columns]).columns)
        assert (restored_columns == expected_columns), f'cols do not match at iter {trial}'


test_filter_features()

def build_model(params=None):
    """Build the feature-filter -> min-max scaling -> Ridge pipeline.

    Args:
        params: mapping holding the Ridge regularisation strength under the
            key ``'alpha'``. When omitted, ``{'alpha': 0.001}`` is used.

    Returns:
        An unfitted ``Pipeline`` with the steps ``'filter'``, ``'norm'`` and
        ``'model'``.

    Raises:
        KeyError: if ``params`` is given without an ``'alpha'`` entry.
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    if params is None:
        params = {'alpha': 0.001}
    model = Pipeline([('filter', FilterFeatures()),
                      ('norm', MinMaxScaler()),
                      ('model', Ridge(alpha=params['alpha']))])
    return model

In [25]:
def train_model(config, train_data, valid_data, pipeline=None):
    """Fit a fresh pipeline on ``train_data`` with optional time-decay weights.

    Args:
        config: mapping with a ``'model'`` entry (passed to ``build_model``)
            and a ``'training'`` entry holding ``'time_decay_alpha'``.
        train_data: frame with a ``'timestamp'`` column, a ``'Target'`` column
            and the feature columns consumed by the pipeline.
        valid_data: accepted but not used by this function.
        pipeline: accepted but not used by this function.

    Returns:
        The fitted pipeline.
    """
    estimator = build_model(config['model'])

    decay = config['training']['time_decay_alpha']
    if decay is None:
        sample_weight = None
    else:
        print(f'using exponential_time_decay with alpha {decay}')
        # Age of each row relative to the newest one, floor-divided by
        # 60, 60 and 24 in turn; the weight is decay raised to that age.
        newest_timestamp = train_data['timestamp'].max()
        age = (newest_timestamp - train_data['timestamp'])//60//60//24
        sample_weight = decay ** age

    estimator.fit(train_data, train_data['Target'], model__sample_weight=sample_weight)
    return estimator

In [26]:
# Evaluation window given as [start, end] timestamps.
# NOTE(review): "PULIC" is presumably a typo for "PUBLIC" (public leaderboard);
# the name is kept unchanged — confirm nothing else uses it before renaming.
PULIC_LB_RANGE = ['2021-06-13 00:00:00',
                  '2021-09-13 00:00:00'] # 3 MONTH WORTH OF DATA

# List of evaluation periods handed to the CV splitter.
EVAL_PERIODS = [PULIC_LB_RANGE]

In [27]:
# Keyword arguments for the project's `TimeSeriesSplit` (from `src.cv`).
# NOTE(review): `dt_col` presumably names the datetime column used for
# splitting and `gap_unit` the unit of the train/valid gap — confirm in src.cv.
CV_PARAMS = {'gap_unit': 'min', 'dt_col': 'time'}

# Splitter built over the evaluation periods defined in EVAL_PERIODS.
CV = TimeSeriesSplit(EVAL_PERIODS, **CV_PARAMS)

In [28]:
# Take the first (train, valid) index pair yielded by the splitter.
train_idx, valid_idx = next(iter(CV.split(data)))

In [29]:
# Rows of `data` selected by the training indices, re-indexed from 0.
# NOTE(review): the per-asset training loop further down iterates over `data`,
# not over this `train_data` — confirm which one is intended.
train_data = data.loc[train_idx, :].reset_index(drop=True)

In [30]:
get_date_range(train_data['time'])

min   2021-01-01 00:01:00
max   2021-06-12 23:59:00
Name: time, dtype: datetime64[ns]

In [31]:
# Passed to `build_model`, which reads `alpha` for the Ridge estimator.
MODEL_CONFIG = {'alpha': 0.001}
# Read by `train_model`: base of the exponential sample weight; None disables
# the weighting.
TRAIN_CONFIG = {'time_decay_alpha': 0.99}
# Combined configuration in the layout `train_model` expects.
CONFIG = {'model': MODEL_CONFIG, 'training': TRAIN_CONFIG}

In [32]:
# Fit one model per asset and store it under its Asset_ID.
# NOTE(review): the loop groups `data` (every row with a target), not the
# `train_data` split computed earlier — confirm this is intended.
MODELS = {}
for asset_id, train_asset_data in data.groupby("Asset_ID"):
    print(f'training model for asset_ID == {asset_id}')
    train_asset_data = train_asset_data.reset_index(drop=True)
    # The same frame is passed as train and valid data; `train_model` does not
    # use its `valid_data` argument.
    model = train_model(CONFIG, train_asset_data, train_asset_data)
    MODELS[asset_id] = model

training model for asset_ID == 0
using exponential_time_decay with alpha 0.99
training model for asset_ID == 1
using exponential_time_decay with alpha 0.99
training model for asset_ID == 2
using exponential_time_decay with alpha 0.99
training model for asset_ID == 3
using exponential_time_decay with alpha 0.99
training model for asset_ID == 4
using exponential_time_decay with alpha 0.99
training model for asset_ID == 5
using exponential_time_decay with alpha 0.99
training model for asset_ID == 6
using exponential_time_decay with alpha 0.99
training model for asset_ID == 7
using exponential_time_decay with alpha 0.99
training model for asset_ID == 8
using exponential_time_decay with alpha 0.99
training model for asset_ID == 9
using exponential_time_decay with alpha 0.99
training model for asset_ID == 10
using exponential_time_decay with alpha 0.99
training model for asset_ID == 11
using exponential_time_decay with alpha 0.99
training model for asset_ID == 12
using exponential_time_decay

In [33]:
# create submission history

In [34]:
# create submission history

def build_history(t: int):
    """Collect the most recent raw rows per asset strictly before ``t``.

    Reads the notebook-level ``raw_df``. For every Asset_ID the last
    ``24*60`` rows with ``timestamp < t`` are kept, then passed through
    ``fill_gaps_crypto_data`` and ``infer_dtypes``.

    Args:
        t: timestamp cut-off; rows at or after it are excluded.

    Returns:
        dict mapping each Asset_ID to its prepared history frame.
    """
    past_rows = raw_df[raw_df['timestamp'] < t]
    history_by_asset = {}
    for asset_id, asset_rows in past_rows.groupby("Asset_ID"):
        recent_rows = asset_rows.tail(24*60)
        history_by_asset[asset_id] = infer_dtypes(fill_gaps_crypto_data(recent_rows))
    return history_by_asset

In [35]:
# Outside Kaggle the `gresearch_crypto` package is imported from RAW_DIR, so
# that folder must be on the import path. The membership check keeps
# re-executions of this cell from appending the same entry again.
if not ON_KAGGLE and str(RAW_DIR) not in sys.path:
    sys.path.append(str(RAW_DIR))

import gresearch_crypto
import traceback
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

# Set from the first test batch inside the inference loop; None until then.
FIRST_TEST_TIMESTAMP = None

In [36]:
# Walk through the test batches from the competition API, predict, and submit.
for i, (raw_test_df, submission) in enumerate(iter_test):
    # On the first batch only: remember its first timestamp and build the
    # per-asset history from training rows strictly before it.
    if FIRST_TEST_TIMESTAMP is None:
        FIRST_TEST_TIMESTAMP = raw_test_df['timestamp'].iloc[0]
        HISTORY_DFS = build_history(FIRST_TEST_TIMESTAMP)

    # `inference` returns the submission frame together with the history
    # object, which is carried over to the next iteration.
    submission, HISTORY_DFS = inference(test_data=raw_test_df, submission=submission,
                                        history_dfs=HISTORY_DFS, models=MODELS, prepro_kwargs=PREPRO_PARAMS)
    # Show the first ten submissions, then one every 1000 iterations.
    if i % 1000 == 0 or i < 10:
        display(submission)
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Unnamed: 0,row_id,Target
0,0,4.639353e-05
1,1,-1.581949e-05
2,2,3.888669e-05
3,3,3.150972e-05
4,4,-3.91565e-05
5,5,-1.592307e-07
6,6,1.706918e-05
7,7,1.317283e-05
8,8,0.0001031815
9,9,-2.322495e-05


Unnamed: 0,row_id,Target
0,14,4.639589e-05
1,15,-1.582199e-05
2,16,3.890331e-05
3,17,3.152966e-05
4,18,-3.913607e-05
5,19,-1.640128e-07
6,20,1.70819e-05
7,21,1.317617e-05
8,22,0.0001032136
9,23,-2.321943e-05


Unnamed: 0,row_id,Target
0,28,4.639947e-05
1,29,-1.582619e-05
2,30,3.893304e-05
3,31,3.155497e-05
4,32,-3.910459e-05
5,33,-1.705321e-07
6,34,1.710262e-05
7,35,1.318095e-05
8,36,0.0001032546
9,37,-2.321163e-05


Unnamed: 0,row_id,Target
0,42,4.640574e-05
1,43,-1.583202e-05
2,44,3.897425e-05
3,45,3.158879e-05
4,46,-3.906523e-05
5,47,-1.786757e-07
6,48,1.712514e-05
7,49,1.318763e-05
8,50,0.0001033196
9,51,-2.320136e-05
