In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../../data/g-research-crypto-forecasting/train.csv')
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,2018-01-01 00:01:00,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,2018-01-01 00:01:00,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,2018-01-01 00:01:00,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,2018-01-01 00:01:00,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,2018-01-01 00:01:00,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [3]:
'''
timestamp - A timestamp for the minute covered by the row.
Asset_ID - An ID code for the cryptoasset.
Count - The number of trades that took place this minute.
Open - The USD price at the beginning of the minute.
High - The highest USD price during the minute.
Low - The lowest USD price during the minute.
Close - The USD price at the end of the minute.
Volume - The number of cryptoasset units traded during the minute.
VWAP - The volume weighted average price for the minute.
Target - 15 minute residualized returns. See the 'Prediction and Evaluation' section of this notebook for details of how the target is calculated.
'''

"\ntimestamp - A timestamp for the minute covered by the row.\nAsset_ID - An ID code for the cryptoasset.\nCount - The number of trades that took place this minute.\nOpen - The USD price at the beginning of the minute.\nHigh - The highest USD price during the minute.\nLow - The lowest USD price during the minute.\nClose - The USD price at the end of the minute.\nVolume - The number of cryptoasset units traded during the minute.\nVWAP - The volume weighted average price for the minute.\nTarget - 15 minute residualized returns. See the 'Prediction and Evaluation' section of this notebook for details of how the target is calculated.\n"

In [4]:
sorted(data['Asset_ID'].unique())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [5]:
data.describe()

Unnamed: 0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
count,24236810.0,24236810.0,24236810.0,24236810.0,24236810.0,24236810.0,24236810.0,24236800.0,23486470.0
mean,6.292544,286.4593,1432.64,1436.35,1429.568,1432.64,286853.0,,7.121752e-06
std,4.091861,867.3982,6029.605,6039.482,6020.261,6029.611,2433935.0,,0.005679042
min,0.0,1.0,0.0011704,0.001195,0.0002,0.0011714,-0.3662812,-inf,-0.5093509
25%,3.0,19.0,0.26765,0.26816,0.2669,0.2676483,141.0725,0.2676368,-0.001694354
50%,6.0,64.0,14.2886,14.3125,14.263,14.2892,1295.415,14.28769,-4.289844e-05
75%,9.0,221.0,228.8743,229.3,228.42,228.8729,27297.64,228.8728,0.00160152
max,13.0,165016.0,64805.94,64900.0,64670.53,64808.54,759755400.0,inf,0.9641699


In [6]:
asset_details = pd.read_csv('../../data/g-research-crypto-forecasting/asset_details.csv')
id_to_names = dict(zip(asset_details['Asset_ID'], asset_details['Asset_Name']))

In [7]:
data['Asset_Name'] = [id_to_names[a] for a in data['Asset_ID']]

In [8]:
data.sort_values(['timestamp', 'Asset_ID'], inplace=True)
data.set_index(['timestamp', 'Asset_ID'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01 00:01:00,0,5.0,8.530000,8.530000,8.530000,8.530000,7.838000e+01,8.530000,-0.014399,Binance Coin
2018-01-01 00:01:00,1,229.0,13835.194000,14013.800000,13666.110000,13850.176000,3.155006e+01,13827.062093,-0.014643,Bitcoin
2018-01-01 00:01:00,2,40.0,2376.580000,2399.500000,2357.140000,2374.590000,1.923301e+01,2373.116392,-0.004218,Bitcoin Cash
2018-01-01 00:01:00,5,32.0,7.659600,7.659600,7.656700,7.657600,6.626713e+03,7.657713,-0.013922,EOS.IO
2018-01-01 00:01:00,6,173.0,738.302500,746.000000,732.510000,738.507500,3.359879e+02,738.839291,-0.004809,Ethereum
...,...,...,...,...,...,...,...,...,...,...
2021-09-21 00:00:00,9,775.0,157.181571,157.250000,156.700000,156.943857,4.663725e+03,156.994319,,Litecoin
2021-09-21 00:00:00,10,34.0,2437.065067,2438.000000,2430.226900,2432.907467,3.975460e+00,2434.818747,,Maker
2021-09-21 00:00:00,11,48.0,232.695000,232.800000,232.240000,232.275000,1.035123e+02,232.569697,,Monero
2021-09-21 00:00:00,12,177.0,0.282168,0.282438,0.281842,0.282051,1.828508e+05,0.282134,,Stellar


In [9]:
# test_idx = 
data.set_index(['timestamp', 'Asset_ID']).loc[data['timestamp'].min()].values

array([[5.0, 8.53, 8.53, 8.53, 8.53, 78.38, 8.53, -0.0143989664689647,
        'Binance Coin'],
       [229.0, 13835.194, 14013.8, 13666.11, 13850.176, 31.55006152,
        13827.062092689885, -0.0146432243557361, 'Bitcoin'],
       [40.0, 2376.58, 2399.5, 2357.14, 2374.59, 19.23300519,
        2373.1163915061647, -0.0042181523874292, 'Bitcoin Cash'],
       [32.0, 7.6596, 7.6596, 7.6567, 7.6576, 6626.713369870001,
        7.657712894055892, -0.0139224470071963, 'EOS.IO'],
       [173.0, 738.3025, 746.0, 732.51, 738.5075, 335.98785619,
        738.839291493523, -0.0048086040148456, 'Ethereum'],
       [5.0, 25.92, 25.92, 25.874, 25.877, 121.08731, 25.89136300520674,
        -0.0082635054253386, 'Ethereum Classic'],
       [167.0, 225.33, 227.78, 222.98, 225.20666666666668, 411.89664234,
        225.1979435524601, -0.0097914226840802, 'Litecoin'],
       [7.0, 329.09, 329.88, 329.09, 329.46, 6.63571014,
        329.4541176122712, nan, 'Monero']], dtype=object)

In [10]:
data.set_index(['timestamp', 'Asset_ID'])['Target']

timestamp            Asset_ID
2018-01-01 00:01:00  0          -0.014399
                     1          -0.014643
                     2          -0.004218
                     5          -0.013922
                     6          -0.004809
                                   ...   
2021-09-21 00:00:00  9                NaN
                     10               NaN
                     11               NaN
                     12               NaN
                     13               NaN
Name: Target, Length: 24236806, dtype: float64

In [11]:
data.isnull().sum() / len(data)

timestamp     0.000000e+00
Asset_ID      0.000000e+00
Count         0.000000e+00
Open          0.000000e+00
High          0.000000e+00
Low           0.000000e+00
Close         0.000000e+00
Volume        0.000000e+00
VWAP          3.713361e-07
Target        3.095862e-02
Asset_Name    0.000000e+00
dtype: float64

In [12]:
data.fillna(method='ffill', inplace=True)
data.fillna(value=0, inplace=True)

In [13]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Asset_Name
1,2018-01-01 00:01:00,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399,Binance Coin
2,2018-01-01 00:01:00,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643,Bitcoin
0,2018-01-01 00:01:00,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218,Bitcoin Cash
3,2018-01-01 00:01:00,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922,EOS.IO
5,2018-01-01 00:01:00,6,173.0,738.3025,746.0,732.51,738.5075,335.987856,738.839291,-0.004809,Ethereum


In [14]:
import torch
from torch.utils.data import Dataset, IterableDataset, DataLoader

In [15]:
class CryptoFeed(Dataset):
    def __init__(self, df, technicals=None):
        """
        Args:
            df: pandas Dataframe, contains price data on crypto assets. Assumes 
            technicals: dict, string (key) mapped to function (value) that calculates technical indicator from df
        """
        df.sort_values(['timestamp', 'Asset_ID'], inplace=True)
        
        self.features = df.copy()
        if technicals is not None:
            for k, v in technicals.items():
                self.features[k] = v(df)
        self.features.set_index(['timestamp', 'Asset_ID'], inplace=True)
        for col in ['timestamp', 'Asset_ID', 'Asset_Name', 'Target']:
            if col in self.features.columns:
                self.features.drop(col, axis=1, inplace=True)
                
        self.targets = df.set_index(['timestamp', 'Asset_ID'])['Target']
        self.index_to_date = dict(zip(range(len(df['timestamp'].unique)), sorted(df['timestamp'].unique())))
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        date = self.index_to_date[idx]
        return self.features.loc[date].values, self.targets.loc[date].values

In [16]:
class CryptoFeed(IterableDataset):
    def __init__(self, df, seq_len=5, technicals=None):
        """
        Creates an iterable feed of crypto market states increasing in time given an input df

        Args:
            df: pandas Dataframe, contains price data on crypto assets. Assumes 
            technicals: dict, string (key) mapped to function (value) that calculates technical indicator from df
        """
        df.sort_values(['timestamp', 'Asset_ID'], inplace=True)
        
        self.id_to_name = dict(zip(df['Asset_ID'], df['Asset_Name']))
        self.data = [df[df['Asset_ID'] == i].copy() for i in sorted(df['Asset_ID'].unique())]
        self.targets = pd.concat([tdf.set_index('timestamp')['Target'] for tdf in self.data], axis=1)

        if technicals is not None:
            for tdf in self.data:
                for k, v in technicals.items():
                    tdf[k] = v(tdf)
        for tdf in self.data:
            tdf.set_index('timestamp', inplace=True)
        for col in ['timestamp', 'Asset_ID', 'Asset_Name', 'Target']:
            for tdf in self.data:
                if col in tdf.columns:
                    tdf.drop(col, axis=1, inplace=True)

        self.features = pd.concat(self.data, axis=1)
                        
        self.seq_len = seq_len
        self.valid_dates = list(self.features.index)
        # self.num_valid_starts = self.features.shape[0] - self.seq_len
    
    def __len__(self):
        return self.features.shape[0]
    
    def __iter__(self):
        for i in range(self.seq_len, len(self.valid_dates)):
            dates_idx = self.valid_dates[i-self.seq_len:i]
            features = self.features.loc[dates_idx].values
            target = self.targets.loc[dates_idx[-1]].values # target is target of end of window
            adj = self.targets.loc[dates_idx].corr().values # correlation matrix between previous seq_len target values
            print(self.targets.loc[dates_idx])
            print(self.targets.loc[dates_idx].corr())
            print(adj)
            yield features, target, adj

In [17]:
dataset = CryptoFeed(data)

In [18]:
dataloader = DataLoader(dataset)

In [19]:
i = 0
for x, y, a in dataloader:
    if i > 2:
        break
    print(x.shape, y.shape, a.shape)
    i += 1

                       Target    Target    Target  Target  Target    Target  \
timestamp                                                                     
2018-01-01 00:01:00 -0.014399 -0.014643 -0.004218     NaN     NaN -0.013922   
2018-01-01 00:02:00 -0.015875 -0.015037 -0.004079     NaN     NaN -0.014534   
2018-01-01 00:03:00 -0.015410 -0.010309 -0.002892     NaN     NaN -0.012546   
2018-01-01 00:04:00 -0.012524 -0.008999 -0.003718     NaN     NaN -0.011170   
2018-01-01 00:05:00 -0.005940 -0.008079 -0.002171     NaN     NaN -0.006154   

                       Target    Target  Target    Target  Target    Target  \
timestamp                                                                     
2018-01-01 00:01:00 -0.004809 -0.008264     NaN -0.009791     NaN -0.009791   
2018-01-01 00:02:00 -0.004441 -0.029902     NaN -0.012991     NaN -0.009690   
2018-01-01 00:03:00 -0.004206 -0.030832     NaN -0.003572     NaN  0.006567   
2018-01-01 00:04:00 -0.002205 -0.028899     NaN -0.

In [None]:
a.squeeze()

In [76]:
len(dataset)

1956782

In [81]:
pd.DataFrame(np.random.random((4, 3))).values

array([[0.21048091, 0.07814175, 0.21098803],
       [0.88533118, 0.63138025, 0.91717235],
       [0.03144147, 0.22048096, 0.1377318 ],
       [0.46196295, 0.21895655, 0.89900687]])