In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from pathlib import Path
import os
import sys
from IPython.display import display

In [3]:
def on_kaggle() -> bool:
    try:
        import gresearch_crypto
        return True
    except ModuleNotFoundError:
        return False

In [4]:
# HYPER PARAMETERS
ON_KAGGLE = on_kaggle()
SAMPLE_LEVEL = 1
USE_SAMPLE = SAMPLE_LEVEL == 1
USE_TOY_SAMPLE = SAMPLE_LEVEL == 2

FORCE_REWRITE = (ON_KAGGLE and SAMPLE_LEVEL == 0)

In [5]:
if not ON_KAGGLE and os.path.abspath('.').endswith('notebook'):
    os.chdir('../')

In [6]:
#IMPORT_SCRIPT!
# THIS CELL WILL BE REMOVE WITH SCRIPTS IN SRC
if './src/' not in sys.path:
    sys.path.append('./src/')

from preprocessing import *
from metrics import *

In [7]:
if ON_KAGGLE:
    RAW_DIR = Path('../input/g-research-crypto-forecasting/')
    SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/sample/')
    TOY_SAMPLE_DIR = Path('../input/create-sample-dataset/data/raw/toy_sample/')
else:
    RAW_DIR =  Path('data/raw')
    TOY_SAMPLE_DIR = RAW_DIR.joinpath('toy_sample')
    SAMPLE_DIR = RAW_DIR.joinpath('sample')

# filename
TRAIN_FILE = 'train.csv'
ASSET_DETAILS_PATH = RAW_DIR / 'asset_details.csv'

In [8]:
if USE_TOY_SAMPLE:
    print('USING TOY DATASET')
    RAW_TRAIN_PATH = TOY_SAMPLE_DIR / TRAIN_FILE

elif USE_SAMPLE:
    print('USING SAMPLE DATASET')
    RAW_TRAIN_PATH = SAMPLE_DIR / TRAIN_FILE

else:
    print('USING RAW DATASET')
    RAW_TRAIN_PATH = RAW_DIR / TRAIN_FILE

assert RAW_TRAIN_PATH.exists()

USING SAMPLE DATASET


In [9]:
!ls {RAW_DIR}

asset_details.csv		   sample
example_sample_submission.csv	   supplemental_train.csv
example_test.csv		   toy_sample
g-research-crypto-forecasting.zip  train.csv
gresearch_crypto


In [10]:
%%time 
raw_df = pd.read_csv(RAW_TRAIN_PATH)

CPU times: user 3.92 s, sys: 457 ms, total: 4.38 s
Wall time: 4.38 s


In [11]:
raw_df = ingest_data(raw_df, asset_details_path=ASSET_DETAILS_PATH)

In [12]:
raw_df.sort_values(by=['Asset_ID', 'date'], inplace=True)
raw_df.reset_index(drop=True, inplace=True)

In [13]:
raw_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name,date
0,1609459260,0,219.0,37.384998,37.4221,37.348701,37.389,2749.5692,37.387058,-0.001669,Binance Coin,2021-01-01 00:01:00
1,1609459320,0,133.0,37.390499,37.402,37.3298,37.331299,778.868,37.351677,-0.001542,Binance Coin,2021-01-01 00:02:00
2,1609459380,0,151.0,37.317051,37.3367,37.2729,37.2915,890.921,37.301258,-0.001028,Binance Coin,2021-01-01 00:03:00
3,1609459440,0,123.0,37.299149,37.323002,37.250198,37.291901,489.9361,37.297272,0.000174,Binance Coin,2021-01-01 00:04:00
4,1609459500,0,623.0,37.212799,37.285,37.104801,37.213001,15144.3836,37.184082,0.00095,Binance Coin,2021-01-01 00:05:00


In [14]:
TARGET = 'Target'
INDEX_COLS = ['Asset_ID', 'Asset_Name', 'date', 'timestamp']

### recreating target 

In [286]:
def ResidualizeMarket(df, mktColumn, window):
    if mktColumn not in df.columns:
        return df

    mkt = df[mktColumn]

    num = df.multiply(mkt.values, axis=0).rolling(window).mean().values  #numerator of linear regression coefficient
    denom = mkt.multiply(mkt.values, axis=0).rolling(window).mean().values  #denominator of linear regression coefficient
    beta = np.nan_to_num( num.T / denom, nan=0., posinf=0., neginf=0.)  #if regression fell over, use beta of 0
    resultRet = df - (beta * mkt.values).T  #perform residualization
    resultBeta = 0.*df + beta.T  #shape beta

    return resultRet.drop(columns=[mktColumn]), resultBeta.drop(columns=[mktColumn])

In [287]:
from metrics import ASSET_WEIGHT, TOTAL_WEIGHT_SUM

In [288]:
df_time = pd.pivot(raw_df, 'timestamp', 'Asset_Name', 'Close')
df_time = df_time#.shift(-1).iloc[:-1]

In [289]:
df_time = np.log(df_time)

In [290]:
s = 16
df_time = df_time.diff(s).iloc[s:]

In [291]:
mkt = (df_time * pd.Series(ASSET_WEIGHT)).to_numpy()
mkt = np.sum(mkt, axis=1) / TOTAL_WEIGHT_SUM

In [292]:
df_time['mkt'] = mkt

In [293]:
actual_target_table, beta = ResidualizeMarket(df_time, mktColumn='mkt', window=3750)

In [294]:
actual_target = pd.melt(actual_target_table.reset_index(), id_vars='timestamp', value_name='Target')

In [295]:
actual_target

Unnamed: 0,timestamp,Asset_Name,Target
0,1609460220,Binance Coin,-0.008008
1,1609460280,Binance Coin,-0.006671
2,1609460340,Binance Coin,-0.004580
3,1609460400,Binance Coin,-0.005990
4,1609460460,Binance Coin,-0.004437
...,...,...,...
5301739,1632182160,TRON,0.003076
5301740,1632182220,TRON,-0.003740
5301741,1632182280,TRON,-0.003524
5301742,1632182340,TRON,0.002682


In [296]:
actual_target = actual_target.merge(raw_df[['timestamp', 'Asset_Name', 'Target']], on=['timestamp', 'Asset_Name'], how='left',
                    suffixes=('_actual', '_expected'))

In [297]:
actual_target = actual_target.dropna(subset=['Target_expected'])

In [298]:
actual_target.tail()

Unnamed: 0,timestamp,Asset_Name,Target_actual,Target_expected
5301723,1632181200,TRON,-0.008113,0.000199
5301724,1632181260,TRON,-0.004042,-0.003477
5301725,1632181320,TRON,-0.00364,-0.002437
5301726,1632181380,TRON,-0.004887,0.004843
5301727,1632181440,TRON,-0.00458,0.004163


In [299]:
actual_target[['Target_actual', 'Target_expected']].corr()

Unnamed: 0,Target_actual,Target_expected
Target_actual,1.0,-0.019083
Target_expected,-0.019083,1.0
