In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [2]:
def on_kaggle() -> bool:
    """Report whether the ``gresearch_crypto`` module can be imported.

    Returns:
        True when ``import gresearch_crypto`` succeeds, False when it raises
        ``ModuleNotFoundError``. The notebook uses this as its signal for
        "running on Kaggle".
    """
    try:
        import gresearch_crypto  # noqa: F401 -- imported only to probe availability
    except ModuleNotFoundError:
        return False
    else:
        return True

In [3]:
# HYPER PARAMETERS
# SAMPLE_LEVEL selects the dataset: 1 -> sample, 2 -> toy sample.
# Any other value leaves both flags False (the raw dataset is used).
SAMPLE_LEVEL = 0
USE_TOY_SAMPLE = (SAMPLE_LEVEL == 2)
USE_SAMPLE = (SAMPLE_LEVEL == 1)
ON_KAGGLE = on_kaggle()

# Cached features are only force-recomputed on Kaggle with SAMPLE_LEVEL 0.
FORCE_REWRITE = (SAMPLE_LEVEL == 0 and ON_KAGGLE)

In [4]:
# When run locally from a directory whose path ends in 'notebook', move one
# level up so the relative paths used below (data/..., ./src) resolve.
if not ON_KAGGLE:
    if os.path.abspath('.').endswith('notebook'):
        os.chdir('../')

In [5]:
#IMPORT_SCRIPT!
# THIS CELL WILL BE REMOVED AND REPLACED WITH THE SCRIPTS IN SRC
# Make the project's src directory importable. The membership test and the
# appended value must be the same string: the previous guard looked for
# './src/' but appended './src', so it never matched and every re-run of
# this cell added another duplicate entry to sys.path.
if './src' not in sys.path:
    sys.path.append('./src')

# from preprocessing import *
from metrics import *

In [6]:
# Data locations: a local checkout keeps everything under data/raw, while on
# Kaggle the raw files and the two sample datasets come from separate inputs.
if not ON_KAGGLE:
    RAW_DIR = Path('data/raw')
    SAMPLE_DIR = RAW_DIR / 'sample'
    TOY_SAMPLE_DIR = RAW_DIR / 'toy_sample'
else:
    RAW_DIR = Path('../input/g-research-crypto-forecasting/')
    TOY_SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/toy_sample/')
    SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/sample/')

# File names / paths derived from the directories above.
ASSET_DETAILS_PATH = RAW_DIR.joinpath('asset_details.csv')
TRAIN_FILE = 'train.csv'

In [7]:
# Pick the training file. The toy sample takes precedence over the sample,
# and the raw dataset is used when neither flag is set.
if not (USE_TOY_SAMPLE or USE_SAMPLE):
    print('USING RAW DATASET')
    RAW_TRAIN_PATH = RAW_DIR.joinpath(TRAIN_FILE)

elif USE_TOY_SAMPLE:
    print('USING TOY DATASET')
    RAW_TRAIN_PATH = TOY_SAMPLE_DIR.joinpath(TRAIN_FILE)

else:
    print('USING SAMPLE DATASET')
    RAW_TRAIN_PATH = SAMPLE_DIR.joinpath(TRAIN_FILE)

# Fail early if the selected file is missing.
assert RAW_TRAIN_PATH.exists()

USING RAW DATASET


In [8]:
# List the contents of the raw data directory (IPython shell escape; the
# value of RAW_DIR is interpolated into the command).
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [9]:
%%time 
# Load the selected training CSV into memory; the enclosing %%time magic
# reports how long the read takes.
raw_df = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 19.2 s, sys: 1.98 s, total: 21.2 s
Wall time: 21.2 s


# Feature generation

In [10]:
# Restrict the raw data to a single asset and renumber the rows from 0.
asset = 1
raw_local_data = raw_df[raw_df['Asset_ID'] == asset].reset_index(drop=True)

In [11]:
from preprocessing.ingest_data import fill_gaps_crypto_data, infer_dtypes, merge_asset_details

In [12]:
# compute minute features

In [13]:
from preprocessing.feature_gen import compute_base_features, FEATURE_DICT, compute_features_on_train

In [14]:
# Rolling-window length passed to compute_features_on_train.
# NOTE(review): presumably expressed in minutes (the generated column names
# end in "__60min") -- confirm against preprocessing.feature_gen.
WINDOW = 60

In [15]:
# Directory for processed artefacts; the sample variants get their own
# sub-directory so they do not overwrite the full-data output.
OUTPUT_DIR = Path('data/processed/')
if USE_TOY_SAMPLE:
    OUTPUT_DIR = OUTPUT_DIR.joinpath('toy_sample')
elif USE_SAMPLE:
    OUTPUT_DIR = OUTPUT_DIR.joinpath('sample')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the file name does not encode `asset` or `WINDOW`, so a cached
# file written for one asset/window is reused as-is after either is changed.
OUTPUT_PATH = OUTPUT_DIR.joinpath('features.parquet')

In [16]:
# Show where the feature file is read from / written to.
OUTPUT_PATH

PosixPath('data/processed/features.parquet')

In [17]:
%%time
if not OUTPUT_PATH.exists() or FORCE_REWRITE:
    raw_features = fill_gaps_crypto_data(raw_local_data)
    raw_features = infer_dtypes(raw_features)
    raw_features = compute_base_features(raw_features)
    features = compute_features_on_train(raw_features, WINDOW, FEATURE_DICT)
    features['timestamp'] = raw_features['timestamp'].to_numpy()
    features.to_parquet(OUTPUT_PATH, index=False)
else:
    features = pd.read_parquet(OUTPUT_PATH)

CPU times: user 3.98 s, sys: 759 ms, total: 4.74 s
Wall time: 2.25 s


In [18]:
# Preview the first rows of the feature table.
features.head()

Unnamed: 0,High__amax__60min,Low__amin__60min,Close__mean__60min,price_return_1__sum__60min,price_return_1__realized_volatility__60min,vwap_return_1__sum__60min,vwap_return_1__realized_volatility__60min,Count__sum__60min,Count__amax__60min,Volume__sum__60min,Volume__amax__60min,high_low_return__mean__60min,open_close_return__mean__60min,timestamp
0,14013.799805,13666.110352,13850.175781,0.0,0.0,0.0,0.0,229.0,229.0,31.550062,31.550062,0.025124,0.001082,1514764860
1,14052.299805,13666.110352,13839.138672,-0.001595,0.001595,0.000961,0.000961,464.0,235.0,62.596494,31.550062,0.025987,0.00029,1514764920
2,14052.299805,13601.0,13826.530599,-0.003534,0.002511,-0.001519,0.00266,992.0,528.0,117.658314,55.06182,0.026972,-0.000351,1514764980
3,14052.299805,13576.280273,13811.907959,-0.005948,0.003483,-0.003148,0.003119,1427.0,528.0,156.438843,55.06182,0.027895,-0.000889,1514765040
4,14052.299805,13554.44043,13794.50918,-0.009086,0.004688,-0.006638,0.00468,2169.0,742.0,264.94048,108.501637,0.028153,-0.001309,1514765100


In [19]:
# Every feature row must carry a timestamp; it is the merge key used below.
assert not features['timestamp'].isna().any()

In [20]:
# Keep only the key/target columns, drop rows with a missing Target, then let
# the project helper merge in the asset details file.
data = merge_asset_details(
    raw_local_data.loc[:, ['timestamp', 'Asset_ID', 'Target']].dropna(subset=['Target']),
    ASSET_DETAILS_PATH,
)

In [21]:
# Convert the epoch-seconds timestamp into a datetime column; `time` is the
# column later handed to the CV splitter via CV_PARAMS['dt_col'].
data['time'] = pd.to_datetime(data['timestamp'], unit='s')

In [22]:
# Left-join the features onto the target rows by timestamp.
# NOTE(review): this rebinds `data`, so re-running the cell merges the
# features a second time and pandas adds _x/_y suffixes to the columns.
data = pd.merge(data, features, how='left', on=['timestamp'])

In [23]:
# Display the merged target + feature frame.
data

Unnamed: 0,timestamp,Asset_ID,Target,Asset_Name,time,High__amax__60min,Low__amin__60min,Close__mean__60min,price_return_1__sum__60min,price_return_1__realized_volatility__60min,vwap_return_1__sum__60min,vwap_return_1__realized_volatility__60min,Count__sum__60min,Count__amax__60min,Volume__sum__60min,Volume__amax__60min,high_low_return__mean__60min,open_close_return__mean__60min
0,1514764860,1,-0.014643,Bitcoin,2018-01-01 00:01:00,14013.799805,13666.110352,13850.175781,0.000000,0.000000,0.000000,0.000000,229.0,229.0,31.550062,31.550062,0.025124,0.001082
1,1514764920,1,-0.015037,Bitcoin,2018-01-01 00:02:00,14052.299805,13666.110352,13839.138672,-0.001595,0.001595,0.000961,0.000961,464.0,235.0,62.596494,31.550062,0.025987,0.000290
2,1514764980,1,-0.010309,Bitcoin,2018-01-01 00:03:00,14052.299805,13601.000000,13826.530599,-0.003534,0.002511,-0.001519,0.002660,992.0,528.0,117.658314,55.061820,0.026972,-0.000351
3,1514765040,1,-0.008999,Bitcoin,2018-01-01 00:04:00,14052.299805,13576.280273,13811.907959,-0.005948,0.003483,-0.003148,0.003119,1427.0,528.0,156.438843,55.061820,0.027895,-0.000889
4,1514765100,1,-0.008079,Bitcoin,2018-01-01 00:05:00,14052.299805,13554.440430,13794.509180,-0.009086,0.004688,-0.006638,0.004680,2169.0,742.0,264.940480,108.501637,0.028153,-0.001309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1955973,1632181200,1,0.002084,Bitcoin,2021-09-20 23:40:00,43906.730469,42515.089844,43139.958008,-0.027511,0.015431,-0.028177,0.012507,218616.0,11138.0,8558.595222,562.284391,0.003382,-0.000451
1955974,1632181260,1,0.003246,Bitcoin,2021-09-20 23:41:00,43874.980469,42515.089844,43121.878581,-0.025040,0.015526,-0.025928,0.012735,219971.0,11138.0,8612.258859,562.284391,0.003410,-0.000416
1955975,1632181320,1,0.003108,Bitcoin,2021-09-20 23:42:00,43851.398438,42515.089844,43103.707682,-0.025180,0.015531,-0.025602,0.012727,221336.0,11138.0,8697.184311,562.284391,0.003447,-0.000415
1955976,1632181380,1,0.002770,Bitcoin,2021-09-20 23:43:00,43843.871094,42515.089844,43085.273307,-0.025563,0.015549,-0.024973,0.012711,222425.0,11138.0,8743.669115,562.284391,0.003478,-0.000425


In [24]:
from cv import TimeSeriesSplit

In [25]:
# Evaluation window shared by the sample and full-data configurations.
# NOTE(review): "LB" presumably stands for the public leaderboard -- confirm.
PUBLIC_LB_RANGE = ['2021-06-13 00:00:00',
                   '2021-09-13 00:00:00'] # 3 MONTH WORTH OF DATA
# Backward-compatible alias for the original, misspelled name.
PULIC_LB_RANGE = PUBLIC_LB_RANGE

# Each evaluation period is a [start, end] pair; the list depends on which
# dataset was selected at the top of the notebook.
if USE_TOY_SAMPLE:
    EVAL_PERIODS = [['2021-09-15', '2021-09-22']]

elif USE_SAMPLE:
    EVAL_PERIODS = [PUBLIC_LB_RANGE]

else:
    EVAL_PERIODS = [['2019-01-01', '2019-04-01'],
                    ['2021-03-13', '2021-06-12 00:00:00'],
                    PUBLIC_LB_RANGE]

# Keyword arguments forwarded to the project's TimeSeriesSplit.
CV_PARAMS = {'gap_unit': 'min', 'dt_col': 'time', 'train_days': 30}

In [26]:
# Build the cross-validation splitter from the evaluation periods and shared
# settings. TimeSeriesSplit is the project's own class (imported from `cv`),
# not scikit-learn's.
CV = TimeSeriesSplit(EVAL_PERIODS, **CV_PARAMS)

In [27]:
# Model inputs: every column of `data` except the keys, the target, the
# datetime column and the asset name. Index.drop raises KeyError if any of
# the listed labels is missing.
FEATURES = data.columns.drop(['timestamp', 'Asset_ID', 'Target', 'time', 'Asset_Name'])

In [28]:
# Show the selected feature columns.
FEATURES

Index(['High__amax__60min', 'Low__amin__60min', 'Close__mean__60min',
       'price_return_1__sum__60min',
       'price_return_1__realized_volatility__60min',
       'vwap_return_1__sum__60min',
       'vwap_return_1__realized_volatility__60min', 'Count__sum__60min',
       'Count__amax__60min', 'Volume__sum__60min', 'Volume__amax__60min',
       'high_low_return__mean__60min', 'open_close_return__mean__60min'],
      dtype='object')

In [29]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

In [30]:
def train_model(config, train_data, valid_data, pipeline=None):
    """Fit a min-max-scaled Ridge regression on the raw ``Target`` column.

    Parameters
    ----------
    config
        Unused; kept so the signature matches what the evaluator passes
        (the notebook calls ``evaluator.run(train_model, config=None, ...)``).
    train_data : pandas.DataFrame
        Must contain the columns listed in the module-level ``FEATURES`` and
        a ``Target`` column.
    valid_data
        Unused.
    pipeline
        Unused; kept for backward compatibility.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted pipeline with the steps ``filter`` -> ``norm`` -> ``model``.
    """
    # The previous version also computed a per-row time-decay weight
    # (0.99 ** age-in-days derived from ``timestamp``) and a std-normalised
    # target, but passed neither to ``fit``. That dead work is removed; the
    # fitted model is unchanged. To experiment with the weights, pass them as
    # ``model.fit(train_data, y, model__sample_weight=weight)``.
    model = Pipeline([
        # Select the feature columns from the full frame handed to fit/predict.
        ('filter', FunctionTransformer(lambda d: d[FEATURES])),
        ('norm', MinMaxScaler()),
        ('model', Ridge(alpha=0.001)),
    ])
    model.fit(train_data, train_data['Target'])

    return model

In [31]:
from modeling import Evaluator
from metrics import compute_metrics

In [32]:
def get_fi(config, model, valid_data):
    """Build a feature-importance table from the fitted linear model.

    ``config`` and ``valid_data`` are accepted but not used. The importance
    of a feature is its flattened coefficient taken from ``model['model']``,
    paired positionally with the module-level ``FEATURES``.

    Returns a DataFrame with the columns ``feature`` and ``importance``.
    """
    estimator = model['model']
    coefficients = estimator.coef_.ravel()
    importance_table = pd.DataFrame({'feature': FEATURES, 'importance': coefficients})
    return importance_table

In [33]:
# Wire the CV splitter, the scoring function and the feature-importance
# function into the project's Evaluator (imported from `modeling`).
evaluator = Evaluator(CV, score_fn=compute_metrics, fi_fn=get_fi)

In [34]:
# Run train_model through the evaluator. The result is consumed below as an
# iterable of per-fold mappings with 'scores' and 'fi' entries.
output = evaluator.run(train_model, config=None, data=data)

fold=0
theor_corr          0.008188
weighted_corr       0.001358
sharpe              0.383458
corr_period_mean    0.008752
corr_period_std     0.022824
corr_min            0.008188
corr_max            0.008188
corr_std                 NaN
dtype: float64
fold=1


  return compute_weighted_cov(y, yhat, w) / np.sqrt(var_y * var_yhat)


AssertionError: period corrs contains NaN values

fold=0
theor_corr       0.009791
sharpe           0.312054
weighted_corr    0.001624
corr_min         0.009791
corr_max         0.009791
corr_std              NaN
dtype: float64
fold=1
theor_corr       0.018405
sharpe           0.137166
weighted_corr    0.003053
corr_min         0.018405
corr_max         0.018405
corr_std              NaN
dtype: float64
fold=2
theor_corr       0.044109
sharpe           0.232303
weighted_corr    0.007316
corr_min         0.044109
corr_max         0.044109
corr_std              NaN

fold=0
theor_corr       0.019243
weighted_corr    0.003191
corr_min         0.019243
corr_max         0.019243
corr_std              NaN
dtype: float64
fold=1
theor_corr      -0.008768
weighted_corr   -0.001454
corr_min        -0.008768
corr_max        -0.008768
corr_std              NaN
dtype: float64
fold=2
theor_corr       0.043924
weighted_corr    0.007285
corr_min         0.043924
corr_max         0.043924
corr_std              NaN

In [None]:
# One row of metrics per fold, labelled with its evaluation period.
# NOTE(review): assumes `output` has exactly one entry per EVAL_PERIODS
# element, in the same order -- confirm against Evaluator.run.
scores = pd.DataFrame([fold['scores'] for fold in output])
scores['eval_period'] = EVAL_PERIODS

In [None]:
# Per-fold scores.
scores

In [None]:
# Average each metric across folds. `scores` also holds the non-numeric
# `eval_period` column (lists of date strings); numeric_only=True skips it
# explicitly instead of relying on older pandas silently dropping it
# (pandas >= 2.0 raises TypeError otherwise).
scores.mean(numeric_only=True)

normal
theor_corr          0.018127
weighted_corr       0.003006
sharpe              0.351094
corr_period_mean    0.013315
corr_period_std     0.047743
corr_min            0.018127
corr_max            0.018127
corr_std                 NaN

In [None]:
# Stack the per-fold feature-importance tables into one long frame.
fi = pd.concat([fold['fi'] for fold in output])

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the importance of
# each feature across folds.
fi_stats = fi.groupby(['feature'])['importance'].describe()

In [None]:
# Rank features by the magnitude of their mean importance, largest first.
fi_stats = (
    fi_stats
    .assign(**{'abs': fi_stats['mean'].abs()})
    .sort_values(by='abs', ascending=False)
)

In [None]:
# Distribution of each feature's importance across folds, ordered by the
# ranking computed in fi_stats.
plt.figure(figsize=(7, 7))
sns.boxenplot(
    data=fi,
    x='importance',
    y='feature',
    order=fi_stats.index,
)

In [None]:
# NOTE(review): stray scratch calculation (presumably hours in a week); its
# result is not used anywhere visible -- consider removing this cell.
7 * 24