In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
try:
    from PyEMD import EEMD
except:
    !pip install EMD-signal
    from PyEMD import EEMD
    
from ta import add_all_ta_features

In [140]:
def _direction_labels(close, window_size, neutral_quantile):
    """Label each close-to-close move as neutral (0), decrease (1) or increase (2).

    A move counts as a decrease/increase only when its absolute size exceeds
    the rolling `neutral_quantile` quantile of absolute moves over the last
    `window_size` moves; otherwise it is neutral.

    Returns a Series indexed by the dates that have a full rolling window,
    i.e. the first `window_size` rows of `close` receive no label.
    """
    price_diff = close.diff().dropna()
    rolling_quantiles = (price_diff.abs()
                         .rolling(window_size)
                         .quantile(neutral_quantile)
                         .dropna())
    # Positional slice: keep only the moves for which a rolling quantile exists.
    labelled_diff = price_diff.iloc[window_size - 1:]
    is_significant = labelled_diff.abs() > rolling_quantiles

    conditions = [(labelled_diff < 0) & is_significant,
                  (labelled_diff > 0) & is_significant]
    classes = [1, 2]  # 1 is decrease, 2 is increase, 0 is neutral (no condition met)

    return pd.Series(np.select(conditions, classes, default=0), index=labelled_diff.index)


def get_data(currency_lst,
             frequency,
             window_size,
             neutral_quantile = 0.25,
             beg_date = pd.Timestamp(2013,1,1),
             end_date = None,
             log_price = True,
             include_indicators = True,
             include_imfs = True):
    """Load per-currency price data and build aligned feature / label arrays.

    For every symbol in `currency_lst` the file
    ``../data/0_raw/Binance/<symbol lower-cased>_usdt_1d.csv`` is read (the
    file name is always the ``_1d`` one; `frequency` only drives the common
    date range). Rows are sorted by date, optional technical indicators and
    EEMD intrinsic mode functions are added, and a ``change_dir`` label
    column is computed from the (optionally log-transformed) close price.

    Parameters
    ----------
    currency_lst : sequence of str
        Currency symbols, e.g. ``['BTC', 'ETH']``. Must not be empty.
    frequency : str
        Frequency string passed to ``pd.date_range`` for the common range.
    window_size : int
        Rolling window length used for the neutral-move threshold.
    neutral_quantile : float
        Quantile of absolute moves below which a move is labelled neutral.
    beg_date, end_date : pd.Timestamp or None
        Requested date bounds; they are clipped to the dates available for
        every currency. ``end_date=None`` means "now", evaluated at call time
        (a ``pd.Timestamp.now()`` default would be frozen at definition time).
    log_price : bool
        Replace the close price by its natural logarithm.
    include_indicators : bool
        Add all ``ta`` indicators; otherwise the volume column is dropped.
    include_imfs : bool
        Add EEMD intrinsic mode functions of the ``PRICE_TYPE`` column as
        ``imf_<i>`` columns. ``PRICE_TYPE`` is a notebook-level constant.
        NOTE(review): if EEMD yields a different number of IMFs per currency
        the per-currency matrices will not stack into a regular array - confirm.

    Returns
    -------
    X : np.ndarray
        Features stacked per currency over the common date range
        (currency, date, feature).
    y : np.ndarray
        ``change_dir`` labels stacked per currency (currency, date); NaN where
        no label could be computed.
    features : list of str
        Names of the feature columns in ``X`` (the label column is excluded),
        taken from the last currency processed.
    dfs : dict
        Full processed DataFrame per currency, including ``change_dir``.

    Raises
    ------
    ValueError
        If `currency_lst` is empty.
    """
    if len(currency_lst) == 0:
        raise ValueError("currency_lst must contain at least one currency symbol")
    if end_date is None:
        end_date = pd.Timestamp.now()

    dfs = {}

    for cur in currency_lst:
        df = pd.read_csv(f"../data/0_raw/Binance/{str.lower(cur)}_usdt_1d.csv", index_col=0).reset_index()

        # Put rows in chronological order *before* anything order-dependent
        # (indicators, diffs, rolling windows) is computed.
        df["Date"] = df["Date"].apply(pd.Timestamp)
        df = df.sort_values("Date", ascending=True).reset_index(drop=True)

        if include_indicators:
            df = add_all_ta_features(df, open="Open", high="High", low="Low", close="Close", volume="Volume",
                                     fillna=True)
        else:
            df = df.drop("Volume", axis=1)

        df = (df.set_index("Date")
                .drop(["Timestamp", "Open", "High", "Low"], axis=1)
                .rename(str.lower, axis=1))

        if log_price:
            df["close"] = df["close"].apply(np.log)

        if include_imfs:
            eemd = EEMD()
            imfs = eemd(df[PRICE_TYPE].values)
            imf_features = ["imf_"+str(i) for i in range(imfs.shape[0])]
            df = pd.concat((df, pd.DataFrame(imfs.T, columns=imf_features, index=df.index)), axis=1)

        # Insertion aligns on the date index, so rows without a label get NaN.
        df.insert(loc=0, column="change_dir",
                  value=_direction_labels(df["close"], window_size, neutral_quantile))
        dfs[cur] = df

    # Restrict to the dates every currency has, within the requested bounds.
    min_dates = [frame.index.min() for frame in dfs.values()]
    max_dates = [frame.index.max() for frame in dfs.values()]
    beg_date = max([max(min_dates), beg_date])
    end_date = min([min(max_dates), end_date])
    common_range = pd.date_range(beg_date, end_date, freq=frequency)

    X = np.array([dfs[cur].drop("change_dir", axis=1).loc[common_range].values for cur in currency_lst])
    y = np.array([dfs[cur].loc[common_range, "change_dir"].values for cur in currency_lst])
    # Feature names must line up with the last axis of X, so the label column is excluded.
    features = df.drop("change_dir", axis=1).columns.tolist()

    return X, y, features, dfs

In [141]:
# --- Data-loading configuration (arguments for get_data) ---
CURRENCY_LST = ['BTC', 'ETH', 'LTC']
PRICE_TYPE = 'close'          # column decomposed by EEMD inside get_data
FREQUENCY = "D"
WINDOW_SIZE = 50              # rolling window for labels; also the default seq_len of the datasets
NEUTRAL_QUANTILE = 0.25

# --- Dataset configuration ---
# The dataset classes read these names as default argument values, which
# Python evaluates when the class is defined, so they have to exist before
# those class cells run on a fresh kernel.
N_CURRENCIES = len(CURRENCY_LST)
INPUT_FEATURE_SIZE = 1
TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE = 0.70, 0.15, 0.15

In [142]:
# Load close-price-only features (no indicators, no IMFs) for every configured currency.
X, y, features, dfs = get_data(
    CURRENCY_LST,
    FREQUENCY,
    WINDOW_SIZE,
    neutral_quantile=NEUTRAL_QUANTILE,
    log_price=True,
    include_indicators=False,
    include_imfs=False,
)

In [145]:
# X stacks one feature matrix per currency over the common date range;
# y stacks the matching change_dir labels, one row per currency.
(X.shape, y.shape)

((3, 1242, 1), (3, 1242))

In [144]:
# The first WINDOW_SIZE rows of each frame have no change_dir label (NaN):
# the rolling quantile needs WINDOW_SIZE price differences before it is defined.
# Show the rows around that boundary.

dfs['BTC'].iloc[WINDOW_SIZE-2:WINDOW_SIZE+2]

Unnamed: 0_level_0,change_dir,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-10-04,,8.344883
2017-10-05,,8.364608
2017-10-06,2.0,8.382289
2017-10-07,2.0,8.394573


In [150]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a single series with a chronological split.

    The series is split by position into train / val / test segments. Every
    sample is a ``(window, label)`` pair: ``seq_len`` consecutive rows of `x`
    and the entry of `y` at the position right after the window. Validation
    and test windows reach back into the preceding segment so that the first
    label of each segment is the first position of that segment.

    Parameters
    ----------
    x : np.ndarray
        Feature rows of one series; the first axis is the time position.
    y : np.ndarray
        One label per time position.
    data_use_type : str
        Which split this instance serves: "train", "val" or "test".
    train_percentage : float
        Not used directly: the training segment is whatever remains after
        the validation and test segments are taken from the end.
    val_percentage, test_percentage : float
        Fractions of the series assigned to validation and test.
    seq_len : int
        Number of rows per window.

    Raises
    ------
    ValueError
        If `data_use_type` is not one of "train", "val", "test".
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self,
                 x: np.ndarray,
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE,
                 ):

        # An unknown split name used to be sized like "test" but indexed like
        # "train"; reject it up front instead.
        if data_use_type not in self._SPLITS:
            raise ValueError(f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}")

        self.x = torch.tensor(x).float()
        self.y = torch.tensor(y).float()
        self.seq_len = seq_len

        self.data_use_type = data_use_type

        self.val_size = int(len(self.x) * val_percentage)
        self.test_size = int(len(self.x) * test_percentage)
        self.train_size = len(self.x) - self.val_size - self.test_size

    def __len__(self):
        """Number of samples in this split."""
        if self.data_use_type == "train":
            # The first seq_len positions can only serve as window content.
            return self.train_size - self.seq_len

        elif self.data_use_type == "val":
            return self.val_size

        else:
            return self.test_size

    def __getitem__(self, index):
        """Return ``(window, label)`` for sample `index` of this split.

        Negative indices count from the end. An index outside the split
        raises IndexError, so plain iteration stops at the end of the split
        instead of running on into the following segments.
        """
        n_samples = self.__len__()
        if index < 0:
            index += n_samples
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for the {self.data_use_type!r} split of length {n_samples}")

        # Translate the split-local index into the window's start position.
        if self.data_use_type == "val":
            start = self.train_size + index - self.seq_len
        elif self.data_use_type == "test":
            start = self.train_size + self.val_size + index - self.seq_len
        else:
            start = index

        window = self.x[start:start + self.seq_len]
        price_change = self.y[start + self.seq_len]

        return (window, price_change)

In [166]:
# Dataset / model configuration.
# NOTE(review): TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE and WINDOW_SIZE
# are read as default argument values by the dataset classes, i.e. at the time
# those classes are defined - these names must already exist when the class
# cells run, and changing them here does not affect already-defined classes.
N_CURRENCIES = 3
INPUT_FEATURE_SIZE = 1
WINDOW_SIZE = 50
TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE = 0.70, 0.15, 0.15

In [153]:
# One dataset per split, all built from the first currency's series.
train, val, test = (
    TimeSeriesDataset(X[0], y[0], split)
    for split in ("train", "val", "test")
)

In [154]:
# Sanity check: the three splits plus the WINDOW_SIZE positions that only feed
# the first training window should add up to the length of the series.
len(train) + len(val) + len(test) + WINDOW_SIZE

1242

In [155]:
# First training sample: a (window, label) pair.
train[0]

(array([[9.68242246],
        [9.7010641 ],
        [9.77222957],
        [9.85758287],
        [9.84479962],
        [9.8445997 ],
        [9.75818429],
        [9.71044756],
        [9.64812035],
        [9.49751807],
        [9.49551931],
        [9.51044496],
        [9.52510294],
        [9.66071575],
        [9.64601068],
        [9.55973659],
        [9.57351713],
        [9.42867317],
        [9.52634456],
        [9.50151633],
        [9.59390814],
        [9.61042503],
        [9.61976696],
        [9.7386359 ],
        [9.74506551],
        [9.68967719],
        [9.60928695],
        [9.57498349],
        [9.60959222],
        [9.49090568],
        [9.52806729],
        [9.56170122],
        [9.50859065],
        [9.51339838],
        [9.29651807],
        [9.30463094],
        [9.30218729],
        [9.34792429],
        [9.45719576],
        [9.35270761],
        [9.28359548],
        [9.28722548],
        [9.33697214],
        [9.32145858],
        [9.3137089 ],
        [9

In [165]:
class MultiTimeSeriesDataset(Dataset):
    """Sliding-window dataset over several aligned series with a chronological split.

    Works like a per-series window dataset, but every sample bundles the
    window and label of each currency at the same time position into one
    dict with keys ``currency_<i>_window`` and ``currency_<i>_label``.

    Parameters
    ----------
    n_currencies : int
        Number of leading series of `x` / `y` to use.
    x : np.ndarray
        Features; first axis is the currency, second axis the time position.
    y : np.ndarray
        Labels; first axis is the currency, second axis the time position.
    data_use_type : str
        Which split this instance serves: "train", "val" or "test".
    train_percentage : float
        Not used directly: the training segment is whatever remains after
        the validation and test segments are taken from the end.
    val_percentage, test_percentage : float
        Fractions of the time axis assigned to validation and test.
    seq_len : int
        Number of time positions per window.

    Raises
    ------
    ValueError
        If `data_use_type` is not one of "train", "val", "test".
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self,
                 n_currencies,
                 x: np.ndarray,
                 y: np.ndarray,
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE,
                 ):

        # An unknown split name used to be sized like "test" but indexed like
        # "train"; reject it up front instead.
        if data_use_type not in self._SPLITS:
            raise ValueError(f"data_use_type must be one of {self._SPLITS}, got {data_use_type!r}")

        self.x = torch.tensor(x[:n_currencies]).float()
        self.y = torch.tensor(y[:n_currencies]).float()
        # Number of series actually held after slicing; __getitem__ iterates
        # over this rather than over a notebook-level constant, so the
        # constructor argument is what decides how many currencies are served.
        self.n_currencies = len(self.x)
        self.seq_len = seq_len
        self.data_use_type = data_use_type

        self.val_size = int(len(self.x[0]) * val_percentage)
        self.test_size = int(len(self.x[0]) * test_percentage)
        self.train_size = len(self.x[0]) - self.val_size - self.test_size

    def __len__(self):
        """Number of samples in this split."""
        if self.data_use_type == "train":
            # The first seq_len positions can only serve as window content.
            return self.train_size - self.seq_len

        elif self.data_use_type == "val":
            return self.val_size

        else:
            return self.test_size

    def __getitem__(self, index):
        """Return a dict with one window and one label per currency.

        Negative indices count from the end. An index outside the split
        raises IndexError, so plain iteration stops at the end of the split
        instead of running on into the following segments.
        """
        n_samples = self.__len__()
        if index < 0:
            index += n_samples
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for the {self.data_use_type!r} split of length {n_samples}")

        # Translate the split-local index into the window's start position.
        if self.data_use_type == "val":
            start = self.train_size + index - self.seq_len
        elif self.data_use_type == "test":
            start = self.train_size + self.val_size + index - self.seq_len
        else:
            start = index

        item = dict()
        for i in range(self.n_currencies):
            item["currency_" + str(i) + "_window"] = self.x[i][start:start + self.seq_len]
            item["currency_" + str(i) + "_label"] = self.y[i][start + self.seq_len]

        return item

In [167]:
# One multi-currency dataset per split, all built from the same X / y arrays.
train, val, test = (
    MultiTimeSeriesDataset(N_CURRENCIES, X, y, split)
    for split in ("train", "val", "test")
)

In [168]:
# First training sample: a dict with one window and one label per currency.
train[0]

{'currency_0_window': array([[9.68242246],
        [9.7010641 ],
        [9.77222957],
        [9.85758287],
        [9.84479962],
        [9.8445997 ],
        [9.75818429],
        [9.71044756],
        [9.64812035],
        [9.49751807],
        [9.49551931],
        [9.51044496],
        [9.52510294],
        [9.66071575],
        [9.64601068],
        [9.55973659],
        [9.57351713],
        [9.42867317],
        [9.52634456],
        [9.50151633],
        [9.59390814],
        [9.61042503],
        [9.61976696],
        [9.7386359 ],
        [9.74506551],
        [9.68967719],
        [9.60928695],
        [9.57498349],
        [9.60959222],
        [9.49090568],
        [9.52806729],
        [9.56170122],
        [9.50859065],
        [9.51339838],
        [9.29651807],
        [9.30463094],
        [9.30218729],
        [9.34792429],
        [9.45719576],
        [9.35270761],
        [9.28359548],
        [9.28722548],
        [9.33697214],
        [9.32145858],
        [9.