In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

In [2]:
def get_data(currency_lst,
             n_classes,
             frequency, 
             window_size,
             neutral_quantile = 0.25,
             beg_date = pd.Timestamp(2013,1,1),
             end_date = None,
             log_price = True,
             remove_trend = True,
             include_indicators = True,
             include_imfs = True):
    """Load daily Binance OHLCV files and build per-currency features and labels.

    For every currency in ``currency_lst`` the file
    ``../data/0_raw/Binance/<currency>_usdt_1d.csv`` is read, its index is
    parsed as Unix seconds, a ``change_dir`` label column is derived from the
    close price, and optional extra features are appended. The per-currency
    frames are then restricted to a shared date range and stacked.

    Parameters
    ----------
    currency_lst : sequence of str
        Currency tickers; lower-cased to build the file name.
    n_classes : int
        ``3`` labels each row 0 (down), 1 (up) or 2 (neutral) from the
        percentage change of ``close``; any other value labels each row
        0 (``diff <= 0``) or 1 from the plain difference of ``close``.
    frequency : str
        Frequency passed to ``pd.date_range`` when building the shared range.
    window_size : int
        Not used by this function; kept so existing calls keep working.
    neutral_quantile : float
        Quantile of ``|pct_diff|``; moves not larger than it are labelled
        neutral (only used when ``n_classes == 3``).
    beg_date, end_date : pd.Timestamp
        Bounds of the shared range, further clipped to the dates available
        for every currency. ``end_date=None`` means "now", resolved on each
        call rather than once when the function is defined.
    log_price : bool
        Replace ``close``/``open``/``high``/``low`` by their natural log.
    remove_trend : bool
        Subtract the trend of an additive ``seasonal_decompose`` from ``close``.
    include_indicators : bool
        Add the ``ta`` indicator columns; otherwise ``volume``, ``open``,
        ``high`` and ``low`` are dropped.
    include_imfs : bool
        Append the EEMD intrinsic mode functions of ``close`` as ``imf_<i>``.

    Returns
    -------
    X : np.ndarray
        One entry per currency holding the feature values over the shared range.
    y : np.ndarray
        One entry per currency holding ``change_dir`` over the shared range.
    features : list of str
        Names of the feature columns of ``X`` (taken from the last currency).
    dfs : dict
        Currency -> full processed DataFrame, label and diff columns included.

    Raises
    ------
    ValueError
        If ``currency_lst`` is empty.

    NOTE(review): with ``log_price=True`` the labels come from changes of the
    log price, not of the raw price - confirm this is intended.
    NOTE(review): stacking assumes every currency ends up with the same
    columns; the number of IMFs may differ between series - confirm.
    """
    if len(currency_lst) == 0:
        raise ValueError("currency_lst must contain at least one currency")

    # Resolve "now" per call; a default of pd.Timestamp.now() in the signature
    # would be frozen at the moment the cell defining the function was run.
    if end_date is None:
        end_date = pd.Timestamp.now()

    price_cols = ["close", "open", "high", "low"]
    diff_col = 'pct_diff' if n_classes == 3 else 'diff'
    dfs = {}

    for cur in currency_lst:
        df = pd.read_csv(f"../data/0_raw/Binance/{str.lower(cur)}_usdt_1d.csv", index_col=0)
        df.index = pd.to_datetime(df.index, unit='s')
        df.sort_index(inplace=True)
        df.drop(["Date"], axis=1, inplace=True)
        df.rename(str.lower, axis=1, inplace=True)

        if log_price:
            # Vectorised log; same values as applying np.log row by row.
            df[price_cols] = np.log(df[price_cols])

        if n_classes == 3:
            df['pct_diff'] = df['close'].pct_change()
            neutral_threshold = df['pct_diff'].abs().quantile(neutral_quantile)
            is_large_move = df['pct_diff'].abs() > neutral_threshold

            conditions = [(df['pct_diff'] < 0) & is_large_move,
                          (df['pct_diff'] > 0) & is_large_move]
            # 2 is the default class when neither condition holds,
            # i.e. the price change lies in the neutral range.
            change_dir = np.select(conditions, [0, 1], default=2)
        else:
            df['diff'] = df['close'].diff()
            # The first row has a NaN diff and maps to 1 here; it is removed
            # by the dropna below.
            change_dir = np.where(df['diff'] <= 0, 0, 1)

        if remove_trend:
            from statsmodels.tsa.seasonal import seasonal_decompose
            components = seasonal_decompose(df["close"], model="additive")
            df["close"] -= components.trend

        df.insert(loc=0, column="change_dir", value=change_dir)
        # Drops the first row (NaN diff) and any rows left NaN by detrending.
        df.dropna(inplace=True)

        if include_indicators:
            from ta import add_all_ta_features
            indicators_df = add_all_ta_features(df, open="open", high="high", low="low", close="close", volume="volume", fillna=True)
            df[indicators_df.columns] = indicators_df
        else:
            df.drop(["volume", "open", "high", "low"], axis=1, inplace=True)

        if include_imfs:
            from PyEMD import EEMD
            eemd = EEMD()
            imfs = eemd(df["close"].values)
            imf_features = ["imf_"+str(i) for i in range(imfs.shape[0])]
            df = pd.concat((df, pd.DataFrame(imfs.T, columns=imf_features, index=df.index)), axis=1)

        dfs[cur] = df

    # Restrict to the dates available for every currency and requested by the caller.
    min_dates = [frame.index.min() for frame in dfs.values()]
    max_dates = [frame.index.max() for frame in dfs.values()]
    beg_date = max([max(min_dates), beg_date])
    end_date = min([min(max_dates), end_date])
    common_range = pd.date_range(beg_date, end_date, freq=frequency)

    feature_frames = [dfs[cur].loc[common_range].drop(["change_dir", diff_col], axis=1)
                      for cur in currency_lst]
    X = np.array([frame.values for frame in feature_frames])
    y = np.array([dfs[cur].loc[common_range, "change_dir"].values for cur in currency_lst])
    # Feature names must match the columns of X, so the label and diff
    # columns are excluded here as well.
    features = feature_frames[-1].columns.tolist()

    return X, y, features, dfs

In [3]:
# Settings for the data-preparation call to get_data.
CURRENCY_LST = ["BTC", "ETH", "LTC"]  # tickers; lower-cased to build the CSV file names
N_CLASSES = 3                         # 3 -> labels 0 (down), 1 (up), 2 (neutral)
FREQUENCY = "D"                       # frequency of the shared date range
WINDOW_SIZE = 50                      # length of one input window
NEUTRAL_QUANTILE = 0.33               # quantile of |pct change| treated as neutral

In [None]:
# Build the stacked feature/label arrays for every configured currency.
X, y, features, dfs = get_data(
    currency_lst=CURRENCY_LST,
    n_classes=N_CLASSES,
    frequency=FREQUENCY,
    window_size=WINDOW_SIZE,
    neutral_quantile=NEUTRAL_QUANTILE,
    log_price=True,
    remove_trend=False,
    include_indicators=True,
    include_imfs=True,
)

  dip[i] = 100 * (self._dip[i] / self._trs[i])
  din[i] = 100 * (self._din[i] / self._trs[i])


In [37]:
# Shapes of the stacked arrays returned by get_data: X holds one block of
# feature rows per currency, y one label vector per currency.
(X.shape, y.shape)

((3, 1241, 88), (3, 1241))

In [38]:
# Processed BTC frame as returned in `dfs`; it still contains the label
# column (change_dir) and the diff column that are excluded from X.
dfs['BTC']

Unnamed: 0_level_0,change_dir,open,high,low,close,volume,pct_diff,volume_adi,volume_obv,volume_cmf,...,momentum_wr,momentum_ao,momentum_kama,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,others_dr,others_dlr,others_cr
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-18,0,8.362895,8.382866,8.278624,8.320782,1199.888264,-0.005036,-2.293660e+02,1.199888e+03,-0.191156,...,-59.557808,0.000000,8.320782,0.000000,0.000000,0.000000,0.000000,-8.931409,0.000000,0.000000
2017-08-19,2,8.320782,8.339188,8.255828,8.328446,381.309763,0.000921,5.367288e+01,1.581198e+03,0.033944,...,-42.837559,0.000000,8.324054,0.000000,-5.731801,-1.146360,-4.585441,0.092114,0.092072,0.092114
2017-08-20,0,8.323846,8.345474,8.302172,8.315393,467.083022,-0.001567,-1.281916e+02,1.114115e+03,-0.062585,...,-53.112852,0.000000,8.320160,0.000000,-9.993367,-2.915762,-7.077605,-0.156734,-0.156857,-0.064764
2017-08-21,0,8.311184,8.323516,8.271750,8.298042,691.743060,-0.002087,-1.172772e+02,4.223719e+02,-0.042802,...,-66.771078,0.000000,8.310203,0.000000,-11.684041,-4.669418,-7.014624,-0.208662,-0.208880,-0.273291
2017-08-22,2,8.298042,8.319917,8.131531,8.304000,966.684858,0.000718,6.860528e+02,1.389057e+03,0.185084,...,-31.378827,0.000000,8.307474,0.000000,-10.658271,-5.867188,-4.791083,0.071804,0.071778,-0.201684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-03,2,10.943379,10.984978,10.940845,10.953774,57649.931286,0.000950,8.652741e+06,4.203085e+06,0.074489,...,-13.652182,-0.009853,10.878215,0.559774,-7.078421,-1.360356,-5.718066,0.094986,0.094941,31.643569
2021-05-04,0,10.953774,10.954309,10.878928,10.881814,85324.625903,-0.006569,8.573950e+06,4.117760e+06,0.022443,...,-45.135486,-0.003277,10.878322,0.265139,-3.381598,-1.764604,-1.616994,-0.656943,-0.659110,30.778746
2021-05-05,1,10.881909,10.969401,10.876159,10.958428,77263.923439,0.007041,8.633029e+06,4.195024e+06,0.052343,...,-11.615750,-0.001614,10.885671,1.073570,-1.524961,-1.716675,0.191715,0.704061,0.701594,31.699508
2021-05-06,0,10.958428,10.974386,10.918718,10.940112,70181.671908,-0.001671,8.616792e+06,4.124843e+06,0.056400,...,-19.629247,-0.003621,10.886832,1.103231,-0.923564,-1.558053,0.634489,-0.167142,-0.167282,31.479383


In [45]:
# Share of each change_dir label in the BTC frame
# (with N_CLASSES == 3: 0 = down, 1 = up, 2 = neutral).
# NOTE(review): this cell used to state that the first WINDOW_SIZE values of
# change direction are seen as NaN; nothing in get_data produces such NaNs -
# confirm whether that remark is stale.

dfs['BTC']['change_dir'].value_counts() / len(dfs['BTC'])

1    0.366446
2    0.330390
0    0.303164
Name: change_dir, dtype: float64

In [8]:
# Settings for the windowed datasets defined below.
N_CURRENCIES = 3        # number of currencies served by the multi-currency dataset
INPUT_FEATURE_SIZE = 1  # not referenced in the cells visible here
WINDOW_SIZE = 50        # default seq_len of the dataset classes

# Chronological split fractions; the dataset classes derive the train size
# from the remainder after the validation and test parts.
TRAIN_PERCENTAGE = 0.70
VAL_PERCENTAGE = 0.15
TEST_PERCENTAGE = 0.15

In [9]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over one series, split chronologically.

    The rows of ``x`` are divided into consecutive train / validation / test
    ranges. Each sample is a window of ``seq_len`` consecutive rows of ``x``
    together with the entry of ``y`` that immediately follows the window.
    Validation and test windows reach back into the preceding range, so every
    row of those ranges is used as a label exactly once.

    Parameters
    ----------
    x : np.ndarray
        Feature rows, indexed by time step along the first axis.
    y : np.ndarray
        One label per time step.
    data_use_type : str
        Which split this instance serves: ``"train"``, ``"val"`` or ``"test"``.
    train_percentage : float
        Accepted for symmetry but not used: the train range is whatever
        remains after the validation and test ranges.
    val_percentage, test_percentage : float
        Fractions of the time steps assigned to validation and test.
    seq_len : int
        Number of rows per window.

    Raises
    ------
    ValueError
        If ``data_use_type`` is not one of the three supported splits.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, 
                 x: np.ndarray,
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE, 
                 ):
        # An unknown split name used to be sized like "test" but indexed like
        # "train"; reject it up front instead.
        if data_use_type not in self._SPLITS:
            raise ValueError(
                f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}"
            )

        self.x = torch.tensor(x).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len
        self.data_use_type = data_use_type

        n_steps = len(self.x)
        self.val_size = int(n_steps * val_percentage)
        self.test_size = int(n_steps * test_percentage)
        # Train takes the remainder so the three ranges always cover every row.
        self.train_size = n_steps - self.val_size - self.test_size

    def __len__(self):
        """Return the number of samples available in this split."""
        if self.data_use_type == "train":
            # The first seq_len rows can only serve as window content.
            return max(self.train_size - self.seq_len, 0)
        if self.data_use_type == "val":
            return self.val_size
        return self.test_size

    def __getitem__(self, index):
        """Return ``(window, label)`` for sample ``index`` of this split.

        Negative indices count from the end of the split. Raises
        ``IndexError`` outside the split so that iteration stops at its end
        and never reads labels belonging to a later range.
        """
        n_samples = len(self)
        if index < 0:
            index = index + n_samples
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for {self.data_use_type!r} split of size {n_samples}"
            )

        # Shift to the absolute start row of the window.
        if self.data_use_type == "val":
            index = self.train_size + index - self.seq_len
        elif self.data_use_type == "test":
            index = self.train_size + self.val_size + index - self.seq_len

        window = self.x[index:index+self.seq_len]
        price_change = self.y[index+self.seq_len]

        return (window, price_change)

In [10]:
# Train / validation / test datasets for the currency at index 0 of X.
train, val, test = (
    TimeSeriesDataset(X[0], y[0], split_name)
    for split_name in ("train", "val", "test")
)

In [11]:
# Sanity check: the train split gives up WINDOW_SIZE rows to its first window,
# so adding them back should give the total number of time steps.
sum(len(split) for split in (train, val, test)) + WINDOW_SIZE

1235

In [12]:
# First training sample: a (window, label) tuple.
train[0]

(tensor([[-9.7841],
         [-9.7766],
         [-9.7373],
         [-9.6856],
         [-9.6378],
         [-9.5922],
         [-9.5783],
         [-9.5691],
         [-9.5564],
         [-9.5673],
         [-9.5577],
         [-9.5600],
         [-9.5566],
         [-9.5471],
         [-9.5420],
         [-9.5506],
         [-9.5742],
         [-9.6194],
         [-9.6427],
         [-9.6581],
         [-9.6554],
         [-9.6553],
         [-9.6369],
         [-9.6068],
         [-9.5806],
         [-9.5547],
         [-9.5410],
         [-9.5013],
         [-9.4577],
         [-9.4307],
         [-9.4050],
         [-9.3901],
         [-9.3678],
         [-9.3350],
         [-9.3336],
         [-9.3383],
         [-9.3410],
         [-9.3361],
         [-9.3207],
         [-9.3250],
         [-9.3314],
         [-9.3237],
         [-9.3097],
         [-9.2822],
         [-9.2504],
         [-9.2186],
         [-9.1654],
         [-9.0964],
         [-9.0548],
         [-9.0116]])

In [13]:
class MultiTimeSeriesDataset(Dataset):
    """Sliding-window dataset over several aligned series, split chronologically.

    ``x`` and ``y`` hold one series per currency along their first axis; only
    the first ``n_currencies`` are kept. The time steps are divided into
    consecutive train / validation / test ranges shared by all currencies.
    Each sample is a dict with, for every kept currency ``i``, a window of
    ``seq_len`` rows under ``"currency_<i>_window"`` and the label that
    immediately follows it under ``"currency_<i>_label"``.

    Parameters
    ----------
    n_currencies : int
        Number of leading currencies of ``x`` / ``y`` to use.
    x : np.ndarray
        Features, indexed by currency and then by time step.
    y : np.ndarray
        Labels, indexed by currency and then by time step.
    data_use_type : str
        Which split this instance serves: ``"train"``, ``"val"`` or ``"test"``.
    train_percentage : float
        Accepted for symmetry but not used: the train range is whatever
        remains after the validation and test ranges.
    val_percentage, test_percentage : float
        Fractions of the time steps assigned to validation and test.
    seq_len : int
        Number of rows per window.

    Raises
    ------
    ValueError
        If ``data_use_type`` is not one of the three supported splits.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, 
                 n_currencies,
                 x: np.ndarray, 
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE, 
                 ):
        # An unknown split name used to be sized like "test" but indexed like
        # "train"; reject it up front instead.
        if data_use_type not in self._SPLITS:
            raise ValueError(
                f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}"
            )

        self.x = torch.tensor(x[:n_currencies]).float()
        self.y = torch.tensor(y[:n_currencies]).float()
        # Number of series actually kept; items are built from this rather
        # than from a module-level constant, so the argument is honoured.
        self.n_currencies = len(self.x)
        self.seq_len = seq_len
        self.data_use_type = data_use_type

        n_steps = len(self.x[0])
        self.val_size = int(n_steps * val_percentage)
        self.test_size = int(n_steps * test_percentage)
        # Train takes the remainder so the three ranges always cover every row.
        self.train_size = n_steps - self.val_size - self.test_size

    def __len__(self):
        """Return the number of samples available in this split."""
        if self.data_use_type == "train":
            # The first seq_len rows can only serve as window content.
            return max(self.train_size - self.seq_len, 0)
        if self.data_use_type == "val":
            return self.val_size
        return self.test_size

    def __getitem__(self, index):
        """Return the dict of per-currency windows and labels for ``index``.

        Negative indices count from the end of the split. Raises
        ``IndexError`` outside the split so that iteration stops at its end
        and never reads labels belonging to a later range.
        """
        n_samples = len(self)
        if index < 0:
            index = index + n_samples
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for {self.data_use_type!r} split of size {n_samples}"
            )

        # Shift to the absolute start row of the window.
        if self.data_use_type == "val":
            index = self.train_size + index - self.seq_len
        elif self.data_use_type == "test":
            index = self.train_size + self.val_size + index - self.seq_len

        item = dict()
        for i in range(self.n_currencies):
            item["currency_" + str(i) + "_window"] = self.x[i][index:index+self.seq_len]
            item["currency_" + str(i) + "_label"]  = self.y[i][index+self.seq_len]

        return item

In [14]:
# Train / validation / test datasets covering the first N_CURRENCIES series.
train, val, test = (
    MultiTimeSeriesDataset(N_CURRENCIES, X, y, split_name)
    for split_name in ("train", "val", "test")
)

In [15]:
# First training sample for multi-task learning: a dict holding, for each
# currency, one window of rows and the label that follows that window.
train[0]

{'currency_0_window': tensor([[-9.7841],
         [-9.7766],
         [-9.7373],
         [-9.6856],
         [-9.6378],
         [-9.5922],
         [-9.5783],
         [-9.5691],
         [-9.5564],
         [-9.5673],
         [-9.5577],
         [-9.5600],
         [-9.5566],
         [-9.5471],
         [-9.5420],
         [-9.5506],
         [-9.5742],
         [-9.6194],
         [-9.6427],
         [-9.6581],
         [-9.6554],
         [-9.6553],
         [-9.6369],
         [-9.6068],
         [-9.5806],
         [-9.5547],
         [-9.5410],
         [-9.5013],
         [-9.4577],
         [-9.4307],
         [-9.4050],
         [-9.3901],
         [-9.3678],
         [-9.3350],
         [-9.3336],
         [-9.3383],
         [-9.3410],
         [-9.3361],
         [-9.3207],
         [-9.3250],
         [-9.3314],
         [-9.3237],
         [-9.3097],
         [-9.2822],
         [-9.2504],
         [-9.2186],
         [-9.1654],
         [-9.0964],
         [-9.0548],