In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

In [30]:
def get_data(currency_lst,
             n_classes,
             frequency, 
             window_size,
             neutral_quantile = 0.25,
             beg_date = pd.Timestamp(2013,1,1),
             end_date = pd.Timestamp.now(),
             log_price = True,
             remove_trend = True,
             include_indicators = True,
             include_imfs = True):
    """Load daily Binance CSVs and build aligned feature / label arrays.

    For every currency the file ``../data/0_raw/Binance/<cur>_usdt_1d.csv`` is
    read (index parsed as Unix seconds), a ``change_dir`` label is derived from
    the change of the close column, optional features are appended, and all
    currencies are then restricted to a common date range.

    Parameters
    ----------
    currency_lst : sequence of str
        Tickers to load; the lower-cased ticker is used in the file name.
    n_classes : int
        ``3`` labels each step as 0 (down), 1 (up) or 2 (neutral, i.e. the
        absolute percentage change is not above the ``neutral_quantile``
        quantile). Any other value gives binary labels: 1 if the close
        difference is positive, else 0.
    frequency : str
        Pandas frequency string used to build the common date range.
    window_size : int
        Accepted for interface compatibility; not used by this function.
    neutral_quantile : float
        Quantile of the absolute percentage change that bounds the neutral
        class (only used when ``n_classes == 3``).
    beg_date, end_date : pd.Timestamp
        Outer bounds of the date range. NOTE(review): the ``end_date`` default
        is evaluated once, when the function is defined, not on every call.
    log_price : bool
        Replace prices with their natural logarithm before labelling.
    remove_trend : bool
        Subtract the additive ``seasonal_decompose`` trend of the close column.
    include_indicators : bool
        Append the indicators produced by ``ta.add_all_ta_features``. The
        open/high/low columns are kept in this case because that call reads
        them; otherwise they (and volume) are dropped.
    include_imfs : bool
        Append the EEMD intrinsic mode functions of the close column.

    Returns
    -------
    X : np.ndarray
        Per-currency feature matrices stacked along axis 0.
    y : np.ndarray
        Per-currency ``change_dir`` labels stacked along axis 0.
    features : list of str
        Names of the feature columns, in the column order of ``X`` (taken from
        the last currency in ``currency_lst``).
    dfs : dict
        Maps each ticker to its fully processed DataFrame.

    Raises
    ------
    ValueError
        If ``currency_lst`` is empty.
    """
    if len(currency_lst) == 0:
        raise ValueError("currency_lst must contain at least one currency")

    diff_col = 'pct_diff' if n_classes == 3 else 'diff'
    label_cols = ["change_dir", diff_col]

    # The indicator library reads open/high/low, so those columns may only be
    # discarded up front when no indicators are requested.
    if include_indicators:
        unused_cols = ["Date"]
        price_cols = ["open", "high", "low", "close"]
    else:
        unused_cols = ["Date", "Open", "High", "Low"]
        price_cols = ["close"]

    dfs = {}

    for cur in currency_lst:
        df = pd.read_csv(f"../data/0_raw/Binance/{cur.lower()}_usdt_1d.csv", index_col=0)
        df.index = pd.to_datetime(df.index, unit='s')
        df = df.sort_index()
        df = df.drop(unused_cols, axis=1).rename(str.lower, axis=1)

        if log_price:
            # Transform every retained price column so they stay on one scale.
            for col in price_cols:
                df[col] = np.log(df[col])

        if n_classes == 3:
            # NOTE(review): with log_price=True this is the percentage change of
            # the *log* price, not of the price itself -- confirm this is intended.
            df['pct_diff'] = df['close'].pct_change()
            neutral_threshold = df['pct_diff'].abs().quantile(neutral_quantile)
            outside_neutral = df['pct_diff'].abs() > neutral_threshold

            conditions = [(df['pct_diff'] < 0) & outside_neutral,
                          (df['pct_diff'] > 0) & outside_neutral]
            # 2 is the default class when neither condition holds (neutral range).
            change_dir = np.select(conditions, [0, 1], default=2)
        else:
            df['diff'] = df['close'].diff()
            # Rows whose diff is NaN get a placeholder label here and are
            # removed by the dropna() below.
            change_dir = (df['diff'] > 0).astype(int)

        if remove_trend:
            from statsmodels.tsa.seasonal import seasonal_decompose
            # Compute the trend once, before any column is modified.
            trend = seasonal_decompose(df["close"], model="additive").trend
            for col in price_cols:
                df[col] = df[col] - trend

        df.insert(loc=0, column="change_dir", value=change_dir)
        # Drops the first row (no previous close) and any rows left undefined
        # by the trend estimate.
        df = df.dropna()

        if include_indicators:
            from ta import add_all_ta_features
            df = add_all_ta_features(df, open="open", high="high", low="low", close="close", volume="volume", fillna=True)
        else:
            df = df.drop("volume", axis=1)

        if include_imfs:
            from PyEMD import EEMD
            eemd = EEMD()
            imfs = eemd(df["close"].values)
            imf_features = ["imf_"+str(i) for i in range(imfs.shape[0])]
            df = pd.concat((df, pd.DataFrame(imfs.T, columns=imf_features, index=df.index)), axis=1)

        dfs[cur] = df

    # Restrict every currency to the dates they all share, clipped to the
    # requested [beg_date, end_date] interval.
    latest_start = max(frame.index.min() for frame in dfs.values())
    earliest_end = min(frame.index.max() for frame in dfs.values())
    beg_date = max([latest_start, beg_date])
    end_date = min([earliest_end, end_date])
    common_range = pd.date_range(beg_date, end_date, freq=frequency)

    X = np.array([dfs[cur].loc[common_range].drop(label_cols, axis=1).values for cur in currency_lst])
    y = np.array([dfs[cur].loc[common_range, "change_dir"].values for cur in currency_lst])
    # Report only the columns that actually end up in X (labels excluded).
    features = [col for col in dfs[currency_lst[-1]].columns if col not in label_cols]

    return X, y, features, dfs

In [41]:
# Tickers to load; get_data lower-cases each one to build the CSV file name.
CURRENCY_LST = ['BTC', 'ETH', 'LTC']
# 3 selects get_data's down (0) / up (1) / neutral (2) labelling; other values give binary labels.
N_CLASSES = 3
# Pandas frequency string for the common date range ("D" = calendar day).
FREQUENCY = "D"
# Passed to get_data as window_size.
WINDOW_SIZE = 50
# Quantile of the absolute percentage change that bounds the neutral class.
NEUTRAL_QUANTILE = 0.33

In [42]:
# Log close prices only: no detrending, no technical indicators, no IMFs.
X, y, features, dfs = get_data(
    currency_lst=CURRENCY_LST,
    n_classes=N_CLASSES,
    frequency=FREQUENCY,
    window_size=WINDOW_SIZE,
    neutral_quantile=NEUTRAL_QUANTILE,
    log_price=True,
    remove_trend=False,
    include_indicators=False,
    include_imfs=False,
)

In [43]:
# X stacks one feature matrix per currency; y stacks one label vector per currency.
(X.shape, y.shape)

((3, 1241, 1), (3, 1241))

In [44]:
# Feature matrix of the first currency in CURRENCY_LST (rows are time steps).
X[0]

array([[ 9.7010641 ],
       [ 9.77222957],
       [ 9.85758287],
       ...,
       [10.95842848],
       [10.94011237],
       [10.93067562]])

In [45]:
# Share of each change_dir class (0 = down, 1 = up, 2 = neutral) among BTC rows.
# NOTE(review): an earlier comment here said the first WINDOW_SIZE labels are NaN;
# get_data as written never uses window_size, so that no longer appears to hold -- confirm.

dfs['BTC']['change_dir'].value_counts() / len(dfs['BTC'])

1    0.366446
2    0.330390
0    0.303164
Name: change_dir, dtype: float64

In [8]:
# Number of currencies handed to MultiTimeSeriesDataset.
# NOTE(review): hard-coded; presumably meant to equal len(CURRENCY_LST) -- confirm.
N_CURRENCIES = 3
# NOTE(review): presumably the size of X's last axis; not referenced in the cells visible here.
INPUT_FEATURE_SIZE = 1
# Default seq_len of the dataset classes below (same value as the earlier WINDOW_SIZE definition).
WINDOW_SIZE = 50
# Chronological split fractions. The dataset classes size val/test from these and
# give the remainder to train, so TRAIN_PERCENTAGE itself is not used in the size computation.
TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE = 0.70, 0.15, 0.15

In [9]:
class TimeSeriesDataset(Dataset):
    """Sliding-window view of one series, split chronologically into train/val/test.

    Each sample is ``(window, label)`` where ``window`` holds ``seq_len``
    consecutive rows of ``x`` and ``label`` is the entry of ``y`` for the step
    immediately after the window. Validation and test sizes are
    ``int(len(x) * percentage)``; the training split receives the remaining
    steps. Validation/test windows reach back into the preceding split so that
    every step of those splits is used as a label.

    Parameters
    ----------
    x : np.ndarray
        Features, indexed by time step along axis 0.
    y : np.ndarray
        Labels, indexed by time step along axis 0.
    data_use_type : str
        One of ``"train"``, ``"val"`` or ``"test"``.
    train_percentage : float
        Accepted for interface compatibility; the training size is derived
        from the other two fractions.
    val_percentage, test_percentage : float
        Fractions of the time steps assigned to validation and test.
    seq_len : int
        Number of time steps in every window.

    Raises
    ------
    ValueError
        If ``data_use_type`` is not a known split name.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, 
                 x: np.ndarray,
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE, 
                 ):
        # An unknown split used to get a test-sized length but train-offset
        # items; reject it instead of returning inconsistent data.
        if data_use_type not in self._SPLITS:
            raise ValueError(
                f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}"
            )

        self.x = torch.tensor(x).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len
        self.data_use_type = data_use_type

        n_steps = len(self.x)
        self.val_size = int(n_steps * val_percentage)
        self.test_size = int(n_steps * test_percentage)
        # Train takes whatever is left so the three sizes always sum to n_steps.
        self.train_size = n_steps - self.val_size - self.test_size

    def _first_label_position(self):
        """Return the position in ``y`` of the label belonging to sample 0."""
        if self.data_use_type == "train":
            return self.seq_len
        if self.data_use_type == "val":
            return self.train_size
        return self.train_size + self.val_size

    def __len__(self):
        if self.data_use_type == "train":
            # The first seq_len steps only ever serve as history; never negative.
            return max(self.train_size - self.seq_len, 0)
        if self.data_use_type == "val":
            return self.val_size
        return self.test_size

    def __getitem__(self, index):
        n_samples = len(self)
        if index < 0:
            index += n_samples
        # Without this check an out-of-range index silently returned a sample
        # from the next split, and plain iteration never stopped at the split end.
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for {self.data_use_type} split of size {n_samples}"
            )

        label_position = self._first_label_position() + index
        window_start = label_position - self.seq_len
        if window_start < 0:
            raise IndexError("not enough preceding time steps to build a full window")

        window = self.x[window_start:label_position]
        price_change = self.y[label_position]

        return (window, price_change)

In [10]:
# Train / validation / test datasets for the currency stored at index 0 of X.
train, val, test = (
    TimeSeriesDataset(X[0], y[0], split_name)
    for split_name in ("train", "val", "test")
)

In [11]:
# The three split lengths plus the window consumed as initial history.
sum(map(len, (train, val, test))) + WINDOW_SIZE

1235

In [12]:
# First training sample: (window of seq_len rows, label of the step right after the window).
train[0]

(tensor([[-9.7841],
         [-9.7766],
         [-9.7373],
         [-9.6856],
         [-9.6378],
         [-9.5922],
         [-9.5783],
         [-9.5691],
         [-9.5564],
         [-9.5673],
         [-9.5577],
         [-9.5600],
         [-9.5566],
         [-9.5471],
         [-9.5420],
         [-9.5506],
         [-9.5742],
         [-9.6194],
         [-9.6427],
         [-9.6581],
         [-9.6554],
         [-9.6553],
         [-9.6369],
         [-9.6068],
         [-9.5806],
         [-9.5547],
         [-9.5410],
         [-9.5013],
         [-9.4577],
         [-9.4307],
         [-9.4050],
         [-9.3901],
         [-9.3678],
         [-9.3350],
         [-9.3336],
         [-9.3383],
         [-9.3410],
         [-9.3361],
         [-9.3207],
         [-9.3250],
         [-9.3314],
         [-9.3237],
         [-9.3097],
         [-9.2822],
         [-9.2504],
         [-9.2186],
         [-9.1654],
         [-9.0964],
         [-9.0548],
         [-9.0116]])

In [13]:
class MultiTimeSeriesDataset(Dataset):
    """Sliding-window dataset over several aligned series, split chronologically.

    Only the first ``n_currencies`` entries of ``x`` and ``y`` are kept. Every
    sample is a dict with, for each kept currency ``i``, the keys
    ``"currency_<i>_window"`` (``seq_len`` consecutive rows of that currency's
    features) and ``"currency_<i>_label"`` (its label for the step right after
    the window). Validation and test sizes are ``int(n_steps * percentage)``;
    the training split receives the remaining steps.

    Parameters
    ----------
    n_currencies : int
        How many leading currencies of ``x`` / ``y`` to use.
    x : np.ndarray
        Features indexed as ``x[currency][time step]``.
    y : np.ndarray
        Labels indexed as ``y[currency][time step]``.
    data_use_type : str
        One of ``"train"``, ``"val"`` or ``"test"``.
    train_percentage : float
        Accepted for interface compatibility; the training size is derived
        from the other two fractions.
    val_percentage, test_percentage : float
        Fractions of the time steps assigned to validation and test.
    seq_len : int
        Number of time steps in every window.

    Raises
    ------
    ValueError
        If ``data_use_type`` is not a known split name.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, 
                 n_currencies,
                 x: np.ndarray, 
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE, 
                 ):
        # An unknown split used to get a test-sized length but train-offset
        # items; reject it instead of returning inconsistent data.
        if data_use_type not in self._SPLITS:
            raise ValueError(
                f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}"
            )

        self.x = torch.tensor(x[:n_currencies]).float()
        self.y = torch.tensor(y[:n_currencies]).float()
        # Number of series actually held; __getitem__ iterates over this
        # instead of a module-level constant.
        self.n_currencies = len(self.x)
        self.seq_len = seq_len
        self.data_use_type = data_use_type

        n_steps = len(self.x[0])
        self.val_size = int(n_steps * val_percentage)
        self.test_size = int(n_steps * test_percentage)
        # Train takes whatever is left so the three sizes always sum to n_steps.
        self.train_size = n_steps - self.val_size - self.test_size

    def _first_label_position(self):
        """Return the time-step position of the label belonging to sample 0."""
        if self.data_use_type == "train":
            return self.seq_len
        if self.data_use_type == "val":
            return self.train_size
        return self.train_size + self.val_size

    def __len__(self):
        if self.data_use_type == "train":
            # The first seq_len steps only ever serve as history; never negative.
            return max(self.train_size - self.seq_len, 0)
        if self.data_use_type == "val":
            return self.val_size
        return self.test_size

    def __getitem__(self, index):
        n_samples = len(self)
        if index < 0:
            index += n_samples
        # Without this check an out-of-range index silently returned a sample
        # from the next split, and plain iteration never stopped at the split end.
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for {self.data_use_type} split of size {n_samples}"
            )

        label_position = self._first_label_position() + index
        window_start = label_position - self.seq_len
        if window_start < 0:
            raise IndexError("not enough preceding time steps to build a full window")

        item = dict()
        for i in range(self.n_currencies):
            item["currency_" + str(i) + "_window"] = self.x[i][window_start:label_position]
            item["currency_" + str(i) + "_label"] = self.y[i][label_position]

        return item

In [14]:
# Train / validation / test datasets covering the first N_CURRENCIES series of X and y.
train, val, test = (
    MultiTimeSeriesDataset(N_CURRENCIES, X, y, split_name)
    for split_name in ("train", "val", "test")
)

In [15]:
# First training sample for multi-task learning: one window and one label per currency,
# stored under the keys "currency_<i>_window" and "currency_<i>_label".
train[0]

{'currency_0_window': tensor([[-9.7841],
         [-9.7766],
         [-9.7373],
         [-9.6856],
         [-9.6378],
         [-9.5922],
         [-9.5783],
         [-9.5691],
         [-9.5564],
         [-9.5673],
         [-9.5577],
         [-9.5600],
         [-9.5566],
         [-9.5471],
         [-9.5420],
         [-9.5506],
         [-9.5742],
         [-9.6194],
         [-9.6427],
         [-9.6581],
         [-9.6554],
         [-9.6553],
         [-9.6369],
         [-9.6068],
         [-9.5806],
         [-9.5547],
         [-9.5410],
         [-9.5013],
         [-9.4577],
         [-9.4307],
         [-9.4050],
         [-9.3901],
         [-9.3678],
         [-9.3350],
         [-9.3336],
         [-9.3383],
         [-9.3410],
         [-9.3361],
         [-9.3207],
         [-9.3250],
         [-9.3314],
         [-9.3237],
         [-9.3097],
         [-9.2822],
         [-9.2504],
         [-9.2186],
         [-9.1654],
         [-9.0964],
         [-9.0548],