In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [3]:
def on_kaggle() -> bool:
    """Report whether the `gresearch_crypto` module can be imported.

    The notebook uses a successful import as its signal that it is running
    on Kaggle.

    Returns:
        True when `import gresearch_crypto` succeeds, False when the import
        raises ModuleNotFoundError.
    """
    try:
        import gresearch_crypto  # noqa: F401 -- imported only to probe availability
    except ModuleNotFoundError:
        return False
    else:
        return True

In [4]:
# HYPER PARAMETERS
ON_KAGGLE = on_kaggle()

# SAMPLE_LEVEL selects the dataset: 1 -> sample, 2 -> toy sample,
# any other value -> the full raw training file.
SAMPLE_LEVEL = 0
USE_SAMPLE, USE_TOY_SAMPLE = (SAMPLE_LEVEL == 1), (SAMPLE_LEVEL == 2)

# Currently hard-coded on; the earlier rule was (ON_KAGGLE and SAMPLE_LEVEL == 0).
FORCE_REWRITE = True

In [5]:
# Local runs started from a directory whose absolute path ends in 'notebook'
# move one level up (presumably to the project root -- confirm), so the
# relative paths used in later cells resolve from there.
if not ON_KAGGLE:
    if os.path.abspath('.').endswith('notebook'):
        os.chdir('../')

In [6]:
#IMPORT_SCRIPT!
# THIS CELL WILL BE REMOVED WITH SCRIPTS IN SRC
# NOTE(review): the marker above looks like a hook for an export step that
# swaps this cell for the contents of ./src/ -- confirm against the tooling.

# Make the local ./src/ modules importable; the membership check keeps
# re-running this cell from appending the same entry again.
if './src/' not in sys.path:
    sys.path.append('./src/')

# NOTE(review): names used later without an explicit import (ingest_data,
# compute_features, TimeSeriesSplit, Evaluator, compute_metrics, Pipeline)
# presumably come from these star imports -- verify against the src modules.
from preprocessing import *
from metrics import *
from cv import *
from modeling import *
from tracking import track_experiment
import util

In [7]:
# Dataset locations: local runs read from data/raw (with the sample folders
# nested inside it), Kaggle runs read from the ../input/ datasets.
if not ON_KAGGLE:
    RAW_DIR = Path('data/raw')
    SAMPLE_DIR = RAW_DIR / 'sample'
    TOY_SAMPLE_DIR = RAW_DIR / 'toy_sample'
else:
    RAW_DIR = Path('../input/g-research-crypto-forecasting/')
    SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/sample/')
    TOY_SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/toy_sample/')

# File name shared by all dataset variants, and the asset-details lookup file.
TRAIN_FILE = 'train.csv'
ASSET_DETAILS_PATH = RAW_DIR.joinpath('asset_details.csv')

In [8]:
# Choose the training CSV for the configured sample level: the toy sample is
# checked first, then the regular sample, and the full raw file is the fallback.
if USE_TOY_SAMPLE:
    dataset_banner, dataset_dir = 'USING TOY DATASET', TOY_SAMPLE_DIR
elif USE_SAMPLE:
    dataset_banner, dataset_dir = 'USING SAMPLE DATASET', SAMPLE_DIR
else:
    dataset_banner, dataset_dir = 'USING RAW DATASET', RAW_DIR

print(dataset_banner)
RAW_TRAIN_PATH = dataset_dir / TRAIN_FILE

# Stop here if the selected file is missing, before the load is attempted.
assert RAW_TRAIN_PATH.exists()

USING RAW DATASET


In [9]:
# List the contents of RAW_DIR through the shell to see which raw files are present.
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [10]:
%%time 
# Read the selected training CSV into memory (the cell is timed by %%time).
raw_df = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 19.4 s, sys: 1.93 s, total: 21.3 s
Wall time: 23.9 s


In [11]:
# Run the project's ingest step on the loaded frame, passing the asset-details CSV path.
# NOTE(review): judging by the preview further down, this adds the `Asset_Name`
# and `date` columns -- confirm in the preprocessing module.
raw_df = ingest_data(raw_df, asset_details_path=ASSET_DETAILS_PATH)

In [12]:
# Order the rows by asset and then by date within each asset, renumbering the
# index from 0 as part of the same in-place call.
raw_df.sort_values(by=['Asset_ID', 'date'], inplace=True, ignore_index=True)

In [13]:
# Preview the first rows after ingestion and sorting.
raw_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date
0,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,Binance Coin,2018-01-01 00:01:00
1,1514764920,0,7.0,8.53,8.53,8.5145,8.5145,71.39,8.520215,-0.015875,Binance Coin,2018-01-01 00:02:00
2,1514764980,0,45.0,8.5065,8.5299,8.4848,8.4848,1546.82,8.501393,-0.01541,Binance Coin,2018-01-01 00:03:00
3,1514765040,0,14.0,8.5009,8.5066,8.4744,8.5009,125.8,8.47981,-0.012524,Binance Coin,2018-01-01 00:04:00
4,1514765100,0,5.0,8.5007,8.5007,8.456,8.456,125.01,8.458435,-0.00594,Binance Coin,2018-01-01 00:05:00


In [14]:
# Column the model is fitted against (passed as `y` in train_model).
TARGET = 'Target'
# Identifier and time columns.
# NOTE(review): not referenced by any other visible cell of this notebook.
INDEX_COLS = ['Asset_ID', 'Asset_Name', 'date', 'timestamp']

In [15]:
# Raw market columns that are quantile-normalised per asset before the
# feature computation step.
RAW_FEATURES = ['Count', 'Open', 'High', 'Low', 'Close',
       'Volume', 'VWAP']

In [16]:
from sklearn.preprocessing import QuantileTransformer

# Per-asset quantile normalisation of the raw market columns: every asset gets
# its own transformer (normal output distribution, fixed seed), and the results
# are collected in one buffer aligned row-for-row with raw_df.
# NOTE(review): each transformer is fitted on all rows of its asset, including
# rows inside the evaluation periods used later -- confirm this is intended.
# NOTE(review): raw_df is overwritten below, so re-running this cell would
# normalise values that are already normalised.
normalized_features = np.empty(shape=(len(raw_df), len(RAW_FEATURES)))
asset_names = raw_df['Asset_Name']

for asset_name in asset_names.unique():
    asset_mask = asset_names == asset_name
    transformer = QuantileTransformer(output_distribution='normal', random_state=1)
    asset_values = raw_df.loc[asset_mask, RAW_FEATURES].to_numpy()
    normalized_features[asset_mask] = transformer.fit_transform(asset_values)

# Write the normalised values back over the raw columns as float32.
raw_df[RAW_FEATURES] = normalized_features.astype(np.float32)

In [17]:
# Derive the model features from the normalised frame.
# NOTE(review): compared with the raw preview, the output below shows two extra
# columns (`upper_shadow`, `lower_shador`) -- the second name looks like a typo
# inside compute_features; confirm in the src module.
feature_data = compute_features(raw_df)

In [18]:
# Discard rows that have no target value and renumber the index from 0.
feature_data = feature_data.dropna(subset=[TARGET]).reset_index(drop=True)

In [19]:
# Preview the first rows of the feature frame.
feature_data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date,upper_shadow,lower_shador
0,1514764860,0,-1.644369,-1.473639,-1.474215,-1.477134,-1.465109,-1.571186,-1.470163,-0.014399,Binance Coin,2018-01-01 00:01:00,-0.009106,0.003496
1,1514764920,0,-1.384628,-1.473639,-1.474215,-1.478583,-1.466534,-1.610889,-1.471004,-0.015875,Binance Coin,2018-01-01 00:02:00,-0.007681,0.004945
2,1514764980,0,-0.001255,-1.475582,-1.474226,-1.481368,-1.469799,0.354653,-1.472624,-0.01541,Binance Coin,2018-01-01 00:03:00,-0.004426,0.005786
3,1514765040,0,-0.846278,-1.476045,-1.476538,-1.482346,-1.467787,-1.370906,-1.474487,-0.012524,Binance Coin,2018-01-01 00:04:00,-0.008751,0.006301
4,1514765100,0,-1.644369,-1.476061,-1.477093,-1.48427,-1.473464,-1.373546,-1.476324,-0.00594,Binance Coin,2018-01-01 00:05:00,-0.003629,0.008209


In [20]:
# Preview the last rows of the feature frame.
feature_data.tail()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date,upper_shadow,lower_shador
23486463,1632181200,13,1.955779,1.625927,1.626286,1.637673,1.620733,1.602699,1.625785,0.000199,TRON,2021-09-20 23:40:00,0.000359,-0.016939
23486464,1632181260,13,1.731196,1.627776,1.629381,1.64236,1.625668,0.760857,1.630635,-0.003477,TRON,2021-09-20 23:41:00,0.001606,-0.016691
23486465,1632181320,13,1.531872,1.630145,1.629632,1.644315,1.625294,0.900464,1.631643,-0.002437,TRON,2021-09-20 23:42:00,-0.000513,-0.019021
23486466,1632181380,13,1.447533,1.630671,1.629562,1.643197,1.623404,0.579362,1.630741,0.004843,TRON,2021-09-20 23:43:00,-0.001108,-0.019793
23486467,1632181440,13,1.512917,1.627457,1.627381,1.639769,1.619671,0.726923,1.627627,0.004163,TRON,2021-09-20 23:44:00,-7.6e-05,-0.020098


In [21]:
# Date range named after the public leaderboard (presumably the window it is
# scored on -- confirm); 3 months' worth of data.
PUBLIC_LB_RANGE = ['2021-06-13 00:00:00',
                   '2021-09-13 00:00:00']
# Backward-compatible alias for the original, misspelled constant name.
PULIC_LB_RANGE = PUBLIC_LB_RANGE

# Evaluation windows as [start, end] pairs, one per CV fold, chosen by the
# configured sample level.
if USE_TOY_SAMPLE:
    EVAL_PERIODS = [['2021-09-15', '2021-09-22']]
elif USE_SAMPLE:
    EVAL_PERIODS = [PUBLIC_LB_RANGE]
else:
    EVAL_PERIODS = [['2019-01-01', '2019-04-01'],
                    ['2021-03-13', '2021-06-12 00:00:00'],
                    PUBLIC_LB_RANGE]

# Extra keyword arguments passed to the CV splitter when it is constructed.
CV_PARAMS = {'gap_unit': 'min'}

In [22]:
# Show the evaluation windows selected for this run.
EVAL_PERIODS

[['2019-01-01', '2019-04-01'],
 ['2021-03-13', '2021-06-12 00:00:00'],
 ['2021-06-13 00:00:00', '2021-09-13 00:00:00']]

In [23]:
# FEATURE TO TRAIN
# Model inputs: every column of the feature frame except the target and the
# asset-name, date and timestamp columns.
COLUMNS_TO_DROP = [TARGET, 'Asset_Name', 'date', 'timestamp']
FEATURES = feature_data.columns.drop(COLUMNS_TO_DROP)

# Artifact directory; runs on a sampled dataset go into a per-level sub-folder.
OUTPUT_PATH = Path('artifacts/baseline/')
if SAMPLE_LEVEL > 0:
    OUTPUT_PATH = OUTPUT_PATH.joinpath(f'sample/{SAMPLE_LEVEL}')
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_MODEL_PATH = OUTPUT_PATH.joinpath('xgb.pkl')


In [24]:
# Load the base XGBoost parameters from the YAML config and bundle them with
# the model output path; config['model'] is unpacked into XGBRegressor by
# train_model.
model_params = util.load_config('conf/model/xgboost/base.yml')
config = {'model': model_params, 'output_dir': str(OUTPUT_MODEL_PATH)}

In [25]:
### make transformer

In [26]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib

In [27]:
from sklearn.preprocessing import FunctionTransformer

In [28]:
def _select_features(data):
    """Return only the columns listed in the module-level FEATURES from `data`."""
    return data[FEATURES]


# Pipeline step that narrows the incoming frame to the training columns.
# A named module-level function is used instead of a lambda because a
# FunctionTransformer wrapping a lambda cannot be pickled, which would prevent
# saving any pipeline that contains this step.
filter_pipeline = FunctionTransformer(_select_features)

In [29]:
def train_model(config, train_data: pd.DataFrame, valid_data: pd.DataFrame=None, pipeline=None):
    """Fit a feature-filter + XGBoost pipeline on the training frame.

    Args:
        config: mapping whose 'model' entry holds the keyword arguments
            passed to XGBRegressor.
        train_data: training frame; it must contain the TARGET column, and the
            filter step selects the FEATURES columns from it.
        valid_data: accepted but not used by this implementation.
        pipeline: accepted but not used by this implementation.

    Returns:
        The fitted Pipeline.
    """
    steps = [('filter', filter_pipeline),
             ('model', XGBRegressor(**config['model']))]
    fitted_pipeline = Pipeline(steps)
    # The whole frame is passed as X; the 'filter' step keeps only FEATURES.
    fitted_pipeline.fit(train_data, train_data[TARGET])
    return fitted_pipeline

In [30]:
def filter_by_index(df: pd.DataFrame, index) -> pd.DataFrame:
    """Select rows of `df` by label-based indexer and renumber the result.

    Args:
        df: frame to select from; it is not modified.
        index: row indexer handed to `df.loc` (e.g. a list of labels or a
            boolean mask).

    Returns:
        The selected rows with a fresh 0..n-1 index.
    """
    selected_rows = df.loc[index, :]
    return selected_rows.reset_index(drop=True)

In [None]:
# Build the CV splitter from the evaluation windows and run the evaluation:
# train_model is handed to the Evaluator together with the config and the
# feature frame, and compute_metrics is used as the scoring function.
# NOTE(review): eval_output is consumed below as one item per fold, each
# exposing 'scores' and 'corrs' -- confirm against Evaluator.run.
CV = TimeSeriesSplit(EVAL_PERIODS, **CV_PARAMS)
evaluator = Evaluator(cv=CV, score_fn=compute_metrics)
eval_output = evaluator.run(train_model, config, feature_data)

fold=0
theor_corr       0.010384
weighted_corr    0.009833
corr_min        -0.001408
corr_max         0.027467
corr_std         0.009773
dtype: float64
fold=1


In [None]:
# One row of summary metrics per fold, labelled with the evaluation window
# that fold covers.
per_fold_scores = [fold_result['scores'] for fold_result in eval_output]
scores = pd.DataFrame(per_fold_scores)
scores['eval_period'] = EVAL_PERIODS

In [None]:
# Show the per-fold summary scores.
scores

In [None]:
# Stack the 'corrs' frames of all folds, tagging every row with its fold
# number and with the evaluation window that fold covers.
fold_frames = []
for fold_number, fold_result in enumerate(eval_output):
    fold_frames.append(fold_result['corrs'].assign(fold=fold_number))
valid_corrs = pd.concat(fold_frames)
valid_corrs['eval_period'] = valid_corrs['fold'].map(lambda fold_id: EVAL_PERIODS[fold_id])

In [None]:
# Show the stacked per-fold correlation rows.
valid_corrs

In [None]:
# Bar chart of the validation correlation per asset, with the asset names
# rotated so they stay readable along the x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=valid_corrs, x='Asset_Name', y='corr', ax=ax)
plt.xticks(rotation=90);

In [None]:
# Show the per-fold summary scores again, next to the plot.
scores

In [None]:
# Persist the CV results and log the experiment only for full-data runs.
if SAMPLE_LEVEL == 0:
    # `scores` also carries the list-valued 'eval_period' column; restrict the
    # mean to numeric columns so pandas >= 2.0 does not raise a TypeError on it
    # (older versions silently dropped such columns, giving the same result).
    scores_mean = scores.mean(numeric_only=True).to_dict()

    # Write the per-asset correlations and the per-fold scores next to the
    # other artifacts of this run.
    valid_corrs.to_csv(OUTPUT_PATH / 'corrs.csv', index=False)
    scores.to_csv(OUTPUT_PATH / 'cv_scores.csv', index=False)

    # Log the run with its config, the fold-averaged scores and the artifact directory.
    track_experiment('quantile-normalization-single-model', mlflow_experiment='leaderboard',
                     config=config,
                     scores=scores_mean, artifacts_dir=str(OUTPUT_PATH))