In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [2]:
def on_kaggle() -> bool:
    """Report whether the `gresearch_crypto` module can be imported.

    The notebook uses the result as its "running on Kaggle" switch.

    Returns:
        True if `import gresearch_crypto` succeeds, False if it raises
        ModuleNotFoundError. Any other import-time error propagates.
    """
    try:
        import gresearch_crypto  # noqa: F401 -- imported only to probe availability
    except ModuleNotFoundError:
        available = False
    else:
        available = True
    return available

In [3]:
# HYPER PARAMETERS
# Probe the runtime once and reuse the flag everywhere below.
ON_KAGGLE = on_kaggle()

# SAMPLE_LEVEL picks the training file: 0 -> raw data, 1 -> sample, 2 -> toy sample.
SAMPLE_LEVEL = 0
USE_TOY_SAMPLE = (SAMPLE_LEVEL == 2)
USE_SAMPLE = (SAMPLE_LEVEL == 1)

# When true, cached model files are ignored and models are retrained and rewritten.
FORCE_REWRITE = ON_KAGGLE and SAMPLE_LEVEL == 0

In [4]:
# Off Kaggle, move one directory up when the kernel was started from a
# directory whose path ends in 'notebook', so relative paths below resolve.
if not ON_KAGGLE:
    if os.getcwd().endswith('notebook'):
        os.chdir('../')

In [5]:
#IMPORT_SCRIPT!
# THIS CELL WILL BE REMOVE WITH SCRIPTS IN SRC
# Make the project's `src` directory importable. The membership test has to use
# exactly the string that is appended: the previous check looked for './src/'
# while appending './src', so it never matched and every re-run of this cell
# added another duplicate entry to sys.path.
if './src' not in sys.path:
    sys.path.append('./src')

# NOTE(review): the star imports hide where names come from; `ingest_data` and
# `compute_features` used further down are presumably provided by these
# modules -- confirm before switching to explicit imports.
from preprocessing import *
from metrics import *

In [6]:
# Data locations: relative project paths off Kaggle, `../input/...` datasets on Kaggle.
if not ON_KAGGLE:
    RAW_DIR = Path('data/raw')
    SAMPLE_DIR = RAW_DIR / 'sample'
    TOY_SAMPLE_DIR = RAW_DIR / 'toy_sample'
else:
    RAW_DIR = Path('../input/g-research-crypto-forecasting/')
    SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/sample/')
    TOY_SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/toy_sample/')

# File names / paths shared by the cells below.
TRAIN_FILE = 'train.csv'
ASSET_DETAILS_PATH = RAW_DIR.joinpath('asset_details.csv')

In [7]:
# Resolve which train.csv to read. Precedence matches the flags:
# toy sample first, then sample, otherwise the full raw dataset.
if not (USE_TOY_SAMPLE or USE_SAMPLE):
    print('USING RAW DATASET')
    RAW_TRAIN_PATH = RAW_DIR / TRAIN_FILE
elif USE_TOY_SAMPLE:
    print('USING TOY DATASET')
    RAW_TRAIN_PATH = TOY_SAMPLE_DIR / TRAIN_FILE
else:
    print('USING SAMPLE DATASET')
    RAW_TRAIN_PATH = SAMPLE_DIR / TRAIN_FILE

# Fail early if the chosen file is not on disk.
assert RAW_TRAIN_PATH.exists()

USING RAW DATASET


In [8]:
# List the raw data directory (IPython shell escape; {RAW_DIR} is interpolated from Python).
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [9]:
%%time 
# Load the selected training CSV into memory (timed by the %%time magic above).
raw_df = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 19.1 s, sys: 1.89 s, total: 20.9 s
Wall time: 22.4 s


In [10]:
# Run the project's ingest step; the frame displayed afterwards carries
# `Asset_Name` and `date` columns.
# NOTE(review): `raw_df` is rebound to the ingested frame, so re-running only
# this cell feeds already-ingested data back into ingest_data -- re-run the
# read_csv cell first unless ingest_data is known to be idempotent.
raw_df = ingest_data(raw_df, asset_details_path=ASSET_DETAILS_PATH)

In [11]:
# Order rows by asset and then by time, and renumber the index from 0.
raw_df = (
    raw_df
    .sort_values(by=['Asset_ID', 'date'])
    .reset_index(drop=True)
)

In [12]:
# Preview the first rows of the ingested, sorted frame.
raw_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date
0,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,Binance Coin,2018-01-01 00:01:00
1,1514764920,0,7.0,8.53,8.53,8.5145,8.5145,71.39,8.520215,-0.015875,Binance Coin,2018-01-01 00:02:00
2,1514764980,0,45.0,8.5065,8.5299,8.4848,8.4848,1546.82,8.501393,-0.01541,Binance Coin,2018-01-01 00:03:00
3,1514765040,0,14.0,8.5009,8.5066,8.4744,8.5009,125.8,8.47981,-0.012524,Binance Coin,2018-01-01 00:04:00
4,1514765100,0,5.0,8.5007,8.5007,8.456,8.456,125.01,8.458435,-0.00594,Binance Coin,2018-01-01 00:05:00


In [13]:
# Name of the label column.
TARGET = 'Target'
# Identifier columns (asset and time keys).
INDEX_COLS = [
    'Asset_ID',
    'Asset_Name',
    'date',
    'timestamp',
]

In [14]:
# Build the feature frame with the project's compute_features helper; the
# preview below shows extra `upper_shadow` / `lower_shador` columns.
feature_data = compute_features(raw_df)

In [15]:
# Keep only rows that have a label, then renumber the index from 0.
feature_data = feature_data.loc[feature_data[TARGET].notna()].reset_index(drop=True)

In [16]:
# Preview the first rows of the labelled feature frame.
feature_data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date,upper_shadow,lower_shador
0,1630454460,0,373.0,462.223511,463.300995,462.200012,463.005493,865.269358,462.689636,0.002667,Binance Coin,2021-09-01 00:01:00,0.295502,0.023499
1,1630454520,0,772.0,462.939514,463.106995,461.0,461.312988,1723.8521,461.899841,0.002237,Binance Coin,2021-09-01 00:02:00,0.16748,0.312988
2,1630454580,0,864.0,461.265503,461.330994,459.799988,460.632507,2172.706734,460.532562,0.001018,Binance Coin,2021-09-01 00:03:00,0.065491,0.83252
3,1630454640,0,419.0,460.7995,463.0,460.600006,462.755005,1050.0094,461.469299,0.001898,Binance Coin,2021-09-01 00:04:00,0.244995,0.199493
4,1630454700,0,1215.0,462.825989,465.399994,462.726013,465.266998,3102.896636,463.948822,0.001,Binance Coin,2021-09-01 00:05:00,0.132996,0.099976


In [17]:
# Preview the last rows of the labelled feature frame.
feature_data.tail()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date,upper_shadow,lower_shador
402816,1632181200,13,986.0,0.090885,0.091054,0.09071,0.090941,5734456.0,0.090831,0.000199,TRON,2021-09-20 23:40:00,0.000113,0.000175
402817,1632181260,13,715.0,0.09101,0.091277,0.09098,0.091203,1989048.0,0.091125,-0.003477,TRON,2021-09-20 23:41:00,7.4e-05,3e-05
402818,1632181320,13,527.0,0.091171,0.091295,0.091092,0.09118,2386078.0,0.091185,-0.002437,TRON,2021-09-20 23:42:00,0.000115,7.9e-05
402819,1632181380,13,463.0,0.091206,0.09129,0.091028,0.091081,1568854.0,0.091131,0.004843,TRON,2021-09-20 23:43:00,8.4e-05,5.3e-05
402820,1632181440,13,512.0,0.090989,0.091133,0.090831,0.090885,1900695.0,0.090943,0.004163,TRON,2021-09-20 23:44:00,0.000144,5.4e-05


In [18]:
# Evaluation window used for the non-toy runs (presumably the public
# leaderboard period -- confirm against the competition description).
PUBLIC_LB_RANGE = ['2021-06-13 00:00:00',
                   '2021-09-13 00:00:00']  # 3 MONTH WORTH OF DATA
# Backward-compatible alias for the original, misspelled constant name.
PULIC_LB_RANGE = PUBLIC_LB_RANGE

# [start, end] of the evaluation window; the toy sample uses a shorter,
# later window. Only the start is used by the train split below.
if USE_TOY_SAMPLE:
    EVAL_PERIODS = ['2021-09-15', '2021-09-22']
else:
    EVAL_PERIODS = PUBLIC_LB_RANGE

In [19]:
# Boolean mask of rows strictly before the start of the evaluation window
# (the `date` column is compared against the date string EVAL_PERIODS[0]).
train_idx = feature_data['date'] < EVAL_PERIODS[0]

In [20]:
# Training split: the masked rows, with the index renumbered from 0.
train_data = feature_data[train_idx].reset_index(drop=True)

In [21]:
# Leakage guard: the latest training timestamp must precede the evaluation start.
# With an empty train_data, max() is NaT and the comparison is False, so this
# assert also fires (with the same message) when the split selected no rows.
assert train_data.date.max() < pd.to_datetime(EVAL_PERIODS[0]), \
       'train set includes future data'

In [22]:
# Number of distinct calendar days covered by the training split.
train_data.date.dt.date.nunique()

14

In [23]:
# Everything that is not the label or an identifier column is a model input.
COLUMNS_TO_DROP = [TARGET, 'Asset_Name', 'date', 'timestamp', 'Asset_ID']
FEATURES = train_data.columns.drop(COLUMNS_TO_DROP)

# Keyword arguments passed to XGBRegressor for every per-asset model.
MODEL_PARAMS = dict(
    n_estimators=500,
    max_depth=11,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.7,
    missing=-999,
    random_state=2020,
)

# Model artifacts directory; sampled runs get their own sub-directory so they
# do not overwrite the models of the full run.
OUTPUT_PATH = Path('artifacts/baseline/')
if SAMPLE_LEVEL > 0:
    OUTPUT_PATH = OUTPUT_PATH / f'sample/{SAMPLE_LEVEL}'
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [24]:
# Show the columns that will be fed to the models.
FEATURES

Index(['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP',
       'upper_shadow', 'lower_shador'],
      dtype='object')

In [25]:
from xgboost import XGBRegressor
import joblib

In [26]:
%%time
# One fitted regressor per asset, keyed by the asset name (a plain string).
CRYPTO_MODELS = {}
# Group by the column label itself, not a one-element list: with a list key,
# pandas >= 2 yields 1-tuples such as ('Bitcoin',) as group names, which would
# end up in the file names and would not match the string keys that
# predict_groupby looks up.
for crypto, train_crypto in train_data.groupby('Asset_Name'):
    print(f'training model for {crypto}')
    model_path = OUTPUT_PATH / f'{crypto}.pkl'
    if model_path.exists() and (not FORCE_REWRITE):
        # Reuse the cached model. It is loaded even if train_data or
        # MODEL_PARAMS changed since it was written; delete the file or set
        # FORCE_REWRITE to retrain.
        # joblib.load unpickles the file: only load artifacts written by this notebook.
        model = joblib.load(model_path)
    else:
        model = XGBRegressor(**MODEL_PARAMS)
        model.fit(train_crypto[FEATURES], train_crypto[TARGET])
        joblib.dump(model, model_path)
    CRYPTO_MODELS[crypto] = model

training model for Binance Coin
training model for Bitcoin
training model for Bitcoin Cash
training model for Cardano
training model for Dogecoin
training model for EOS.IO
training model for Ethereum
training model for Ethereum Classic
training model for IOTA
training model for Litecoin
training model for Maker
training model for Monero
training model for Stellar
training model for TRON
CPU times: user 4.13 s, sys: 105 ms, total: 4.24 s
Wall time: 4.22 s


In [27]:
# Expect one model per asset; the training log above lists 14 asset names.
assert len(CRYPTO_MODELS) == 14, 'missing cryptos'

In [28]:
from typing import Any, Dict, List, Union
import numpy as np

def predict_groupby(test_features: pd.DataFrame,
                    models: Dict[str, Any],
                    on: str = 'Asset_Name',
                    features: Union[List[str], pd.Index, None] = None) -> np.ndarray:
    """Predict each row with the model that belongs to its group.

    Args:
        test_features: frame containing the grouping column `on` and the
            feature columns.
        models: mapping from group value to a fitted object exposing
            `predict(frame)`.
        on: name of the column whose values select the model.
        features: feature columns passed to `predict`. Defaults to the
            notebook-level `FEATURES`, which keeps existing calls unchanged.

    Returns:
        1-D float array with one prediction per row of `test_features`, in
        the row order of `test_features` (independent of its index labels).
        Rows whose `on` value is missing are skipped by groupby and keep the
        initial value 0.0.

    Raises:
        AssertionError: if a group value has no entry in `models`.
    """
    if features is None:
        # Resolved at call time so the notebook-level FEATURES is picked up.
        features = FEATURES
    yhat = np.zeros(len(test_features))
    for crypto, crypto_data in test_features.groupby(on):
        assert crypto in models, f'model not found {crypto}'
        # Positional boolean mask as a plain NumPy array, so the assignment
        # never depends on the frame's index labels.
        row_mask = (test_features[on] == crypto).to_numpy()
        yhat[row_mask] = models[crypto].predict(crypto_data[features])
    return yhat


# INFERENCE
def inference(raw_test_df: pd.DataFrame,
              submission: pd.DataFrame,
              models: Dict[str, Any]) -> pd.DataFrame:
    """Fill the `Target` column of `submission` for one test batch.

    The raw batch goes through `ingest_data` and `compute_features`, is
    scored with `predict_groupby`, and the predictions are written into
    `submission` in place.

    Args:
        raw_test_df: raw test rows for the batch.
        submission: frame whose `Target` column is overwritten.
        models: per-asset models forwarded to `predict_groupby`.

    Returns:
        The same `submission` object, with `Target` filled.

    Raises:
        AssertionError: if any `Target` value is NaN after the assignment.
    """
    # Ingest the new data, then derive the features from it.
    features_df = compute_features(
        ingest_data(raw_test_df, asset_details_path=ASSET_DETAILS_PATH)
    )
    predictions = predict_groupby(features_df, models=models)
    # Assigned by position: assumes the feature rows are in the same order as
    # the submission rows -- TODO confirm ingest/compute keep the row order.
    submission.loc[:, 'Target'] = predictions

    has_nan = submission['Target'].isna().any()
    assert not has_nan, 'submission contains NaN values'
    return submission
   

In [29]:
# Off Kaggle, the competition API package lives inside the raw data directory.
# Guard the append so re-running this cell does not pile up duplicate entries.
if not ON_KAGGLE and str(RAW_DIR) not in sys.path:
    sys.path.append(str(RAW_DIR))

import gresearch_crypto
import traceback
env = gresearch_crypto.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test set and sample submission

# Render only the first few batches: displaying every submission frame would
# flood the cell output when the iterator yields many batches.
MAX_DISPLAYED_BATCHES = 4

for i, (raw_test_df, submission) in enumerate(iter_test):
    submission = inference(raw_test_df, submission, models=CRYPTO_MODELS)
    if i < MAX_DISPLAYED_BATCHES:
        display(submission)

    # Validate the frame before handing it to the API.
    assert list(submission.columns) == ['row_id', "Target"], 'submission do not match expected columns'
    assert len(submission) == len(raw_test_df), 'submission do not match expected lenght'
    assert submission['Target'].isna().sum() == 0, 'target includes NaNs'
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


Unnamed: 0,row_id,Target
0,0,0.007498
1,1,0.009569
2,2,0.012006
3,3,7.3e-05
4,4,0.000169
5,5,-0.000301
6,6,0.000466
7,7,0.005858
8,8,0.01701
9,9,0.000178


Unnamed: 0,row_id,Target
0,14,0.007424
1,15,0.012274
2,16,0.0141
3,17,-0.000115
4,18,-0.000221
5,19,-0.002002
6,20,0.001123
7,21,0.006211
8,22,0.009704
9,23,-0.000107


Unnamed: 0,row_id,Target
0,28,0.006543
1,29,0.007052
2,30,0.013527
3,31,2.1e-05
4,32,-0.00198
5,33,-0.000315
6,34,-0.001207
7,35,0.006131
8,36,0.007828
9,37,0.000524


Unnamed: 0,row_id,Target
0,42,0.00908
1,43,0.0062
2,44,0.014233
3,45,-1.4e-05
4,46,-0.000502
5,47,-0.00058
6,48,-5.4e-05
7,49,0.006174
8,50,0.00459
9,51,0.000827
