In [1]:
import pandas as pd
import numpy as np
import torch
try:
    from PyEMD import EEMD
except:
    !pip install EMD-signal
    from PyEMD import EEMD

In [2]:
def _direction_labels(close, window_size, neutral_quantile):
    """Label each close-to-close move as 0 (neutral), 1 (decrease) or 2 (increase).

    A move counts as a decrease/increase only when its absolute size exceeds the
    ``neutral_quantile`` quantile of the absolute moves in the trailing
    ``window_size``-row window (the window includes the move being labelled).
    Returns a one-column DataFrame (column ``0``) indexed by the rows that have a
    full window behind them.
    """
    price_diff = close.diff().dropna()
    thresholds = price_diff.abs().rolling(window_size).quantile(neutral_quantile).dropna()

    # Rows that have a complete rolling window; index-aligned with `thresholds`.
    labelled_moves = price_diff.iloc[window_size - 1:]
    is_significant = labelled_moves.abs() > thresholds

    conditions = [((labelled_moves < 0) & is_significant).values,
                  ((labelled_moves > 0) & is_significant).values]
    # 1 is decrease, 2 is increase; 0 is the default class if neither condition is met
    classes = [1, 2]

    return pd.DataFrame(np.select(conditions, classes, default=0), index=labelled_moves.index)


def get_data(currency_lst,
             frequency,
             window_size,
             neutral_quantile = 0.25,
             beg_date = pd.Timestamp(2013,1,1),
             end_date = None,
             log_price = True,
             include_indicators = True,
             include_imfs = True):
    """Load per-currency price CSVs, attach direction labels and stack them into one array.

    For every currency in ``currency_lst`` the file
    ``../data/0_raw/Binance/<currency>_usdt_1d_indicators.csv`` is read (the ``1d``
    in the file name is fixed and does not depend on ``frequency``), sorted by
    ``Date``, stripped of the ``Timestamp``/``Open``/``High``/``Low`` columns and
    given lower-case column names. A ``change_dir`` label column is inserted as the
    first column.

    Args:
        currency_lst: Currency symbols; each is lower-cased to build its file name.
        frequency: ``freq`` passed to ``pd.date_range`` for the common date range.
        window_size: Length of the rolling window used for the neutral threshold.
        neutral_quantile: Quantile of absolute moves at or below which a move is
            labelled neutral (class 0).
        beg_date: Earliest date to include; clipped to the latest first date
            across currencies.
        end_date: Latest date to include; clipped to the earliest last date across
            currencies. ``None`` (the default) means "now", resolved at call time
            rather than when the function was defined.
        log_price: If true, replace ``close`` with its natural logarithm before
            labelling.
        include_indicators: If false, keep only ``change_dir`` and ``close``.
        include_imfs: If true, append the EEMD intrinsic mode functions of
            ``close`` as ``imf_<i>`` columns.

    Returns:
        A tuple ``(arr, y, features, currency_dfs)``:
        ``arr`` is the array built from each currency's rows over the common date
        range, in ``currency_lst`` order; ``y`` is the label DataFrame of the last
        processed currency only (kept for backward compatibility, not restricted to
        the common range); ``features`` is the column list of the last processed
        currency; ``currency_dfs`` maps each currency to its full processed frame.

    Raises:
        ValueError: If ``currency_lst`` is empty.
    """
    if len(currency_lst) == 0:
        raise ValueError("currency_lst must contain at least one currency")
    if end_date is None:
        end_date = pd.Timestamp.now()

    currency_dfs = {}
    for cur in currency_lst:
        df = pd.read_csv(f"../data/0_raw/Binance/{cur.lower()}_usdt_1d_indicators.csv", index_col=0)
        df["Date"] = pd.to_datetime(df["Date"])
        df = (df.sort_values("Date", ascending=True)
                .set_index("Date")
                .drop(["Timestamp", "Open", "High", "Low"], axis=1)
                .rename(str.lower, axis=1))

        if log_price:
            df["close"] = np.log(df["close"])

        y = _direction_labels(df["close"], window_size, neutral_quantile)
        # Inserting the Series aligns on the date index, so rows without a full
        # rolling window behind them get NaN labels.
        df.insert(loc=0, column="change_dir", value=y[0])

        if not include_indicators:
            df = df[["change_dir", "close"]]

        if include_imfs:
            eemd = EEMD()
            # Decompose the same "close" column used for the labels instead of
            # relying on a notebook-level global defined in a later cell.
            imfs = eemd(df["close"].values)
            imf_features = ["imf_" + str(i) for i in range(imfs.shape[0])]
            df = pd.concat((df, pd.DataFrame(imfs.T, columns=imf_features, index=df.index)), axis=1)

        currency_dfs[cur] = df

    # Restrict to the dates every currency covers, further clipped by the caller's bounds.
    first_dates = [frame.index.min() for frame in currency_dfs.values()]
    last_dates = [frame.index.max() for frame in currency_dfs.values()]
    beg_date = max([max(first_dates), beg_date])
    end_date = min([min(last_dates), end_date])
    common_range = pd.date_range(beg_date, end_date, freq=frequency)

    arr = np.array([currency_dfs[cur].loc[common_range].values for cur in currency_lst])
    # `df` and `y` still refer to the last currency processed in the loop above.
    features = df.columns.tolist()

    return arr, y, features, currency_dfs

In [18]:
# Configuration for the data-loading step.
CURRENCY_LST = ['BTC', 'ETH']  # currency symbols to load
PRICE_TYPE = 'close'  # name of the price column
FREQUENCY = "D"  # pandas frequency string for the common date range
WINDOW_SIZE = 50  # rolling-window length used when labelling price moves
NEUTRAL_QUANTILE = 0.25  # quantile of absolute moves treated as "neutral"

In [19]:
# Load the configured currencies with only the direction label and the (log) close
# price: technical indicators and EEMD components are both switched off here.
arr, y, features, dfs = get_data(CURRENCY_LST,
                                 FREQUENCY, 
                                 WINDOW_SIZE,
                                 neutral_quantile = NEUTRAL_QUANTILE,
                                 log_price=True,
                                 include_indicators = False,
                                 include_imfs = False
                                )

In [60]:
# Axes: (currency, date in the common range, feature column)
arr.shape

(2, 1360, 2)

In [67]:
# Inspect the two rows around position WINDOW_SIZE, where the change_dir labels
# switch from missing (no full rolling window yet) to populated.
dfs['BTC'].iloc[WINDOW_SIZE-1:WINDOW_SIZE+1]

Unnamed: 0_level_0,change_dir,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-10-05,,8.364608
2017-10-06,2.0,8.382289


In [21]:
# Configuration for the dataset / split step.
# NOTE(review): N_CURRENCIES and INPUT_FEATURE_SIZE are not used by the cells
# visible here; presumably consumed by model code further down — confirm.
N_CURRENCIES = 1
INPUT_FEATURE_SIZE = 1
WINDOW_SIZE = 50  # same value as the data-loading config; default seq_len of TimeSeriesDataset
# Chronological split fractions; TimeSeriesDataset gives train whatever rows remain
# after the val and test sizes are rounded down.
TRAIN_PERCENTAGE, VAL_PERCENTAGE, TEST_PERCENTAGE = 0.70, 0.15, 0.15

In [54]:
class TimeSeriesDataset():
    """Sliding-window dataset over one ``(time, feature)`` array, split chronologically.

    The rows of ``x`` are divided, in order, into train, val and test segments.
    Sample ``i`` of a split is ``(window, price_change)``: ``window`` holds
    ``seq_len`` consecutive values of column 1 and ``price_change`` is column 0 of
    the row immediately after the window. Val and test windows reach back into the
    preceding rows for their history, so every row of those segments is a target;
    the train split loses its first ``seq_len`` rows as warm-up.

    Args:
        x: 2-D array indexed as ``[row, column]``; column 0 is read as the target
            and column 1 as the input series.
        data_use_type: One of ``"train"``, ``"val"`` or ``"test"``.
        train_percentage: Accepted for interface compatibility; the train segment
            is whatever remains after val and test are sized.
        val_percentage: Fraction of rows assigned to the val segment (rounded down).
        test_percentage: Fraction of rows assigned to the test segment (rounded down).
        seq_len: Number of rows in each input window.

    Raises:
        ValueError: If ``data_use_type`` is not a known split, or if the selected
            val/test split has samples but fewer than ``seq_len`` rows precede it.
    """

    def __init__(self, 
                 x: np.ndarray, 
                 data_use_type,
                 train_percentage = TRAIN_PERCENTAGE,
                 val_percentage = VAL_PERCENTAGE,
                 test_percentage = TEST_PERCENTAGE,
                 seq_len = WINDOW_SIZE, 
                 ):
        # Reject unknown splits up front: previously they were sized like "test"
        # but indexed like "train".
        if data_use_type not in ("train", "val", "test"):
            raise ValueError(
                f"data_use_type must be 'train', 'val' or 'test', got {data_use_type!r}")

        self.x = torch.tensor(x).float()
        self.seq_len = seq_len

        self.data_use_type = data_use_type

        # Val and test sizes are rounded down; train absorbs the remainder so the
        # three segments always cover every row.
        self.val_size = int(len(self.x) * val_percentage)
        self.test_size = int(len(self.x) * test_percentage)
        self.train_size = len(self.x) - self.val_size - self.test_size

        if len(self) > 0 and self._first_window_start() < 0:
            raise ValueError(
                f"not enough rows before the {data_use_type!r} split to build "
                f"windows of length {seq_len}")

    def _first_window_start(self):
        """Row at which the window of sample 0 of the current split begins."""
        if self.data_use_type == "val":
            return self.train_size - self.seq_len
        if self.data_use_type == "test":
            return self.train_size + self.val_size - self.seq_len
        return 0

    def __len__(self):
        """Number of samples in the current split (never negative)."""
        if self.data_use_type == "train":
            # The first seq_len train rows only ever serve as history.
            return max(self.train_size - self.seq_len, 0)

        if self.data_use_type == "val":
            return self.val_size
        else:
            return self.test_size

    def __getitem__(self, index):
        """Return ``(window, price_change)`` for sample ``index`` of the current split.

        Negative indices count from the end of the split. An index outside the
        split raises ``IndexError`` instead of silently reading another split's rows.
        """
        n_samples = len(self)
        if index < 0:
            index += n_samples
        if not 0 <= index < n_samples:
            raise IndexError(
                f"index out of range for {self.data_use_type!r} split of size {n_samples}")

        start = self._first_window_start() + index

        window = self.x[start:start+self.seq_len, 1]
        price_change = self.x[start+self.seq_len, 0]

        return (window, price_change)

In [55]:
# Re-check the array shape before slicing out a single currency below.
arr.shape

(2, 1360, 2)

In [56]:
# One dataset per split, all built from the first currency's rows (arr[0]):
# a = train, b = val, c = test.
a = TimeSeriesDataset(arr[0], 'train')
b = TimeSeriesDataset(arr[0], 'val')
c = TimeSeriesDataset(arr[0], 'test')

In [68]:
# Sanity check: the three split lengths plus the WINDOW_SIZE warm-up rows dropped
# from the train split should add up to the number of rows in arr[0].
len(a) + len(b) + len(c) + WINDOW_SIZE

1360

In [59]:
# First training sample: (window of seq_len values from column 1,
# value of column 0 in the row right after that window).
a[0]

(array([8.3628945 , 8.32078164, 8.32844624, 8.31539275, 8.29804166,
        8.30399997, 8.3221535 , 8.37008664, 8.36186715, 8.37503959,
        8.3686955 , 8.38633024, 8.43108613, 8.42401154, 8.46059956,
        8.48361779, 8.40562232, 8.41384842, 8.31876908, 8.38171018,
        8.4381002 , 8.45353109, 8.36236228, 8.35674506, 8.32612227,
        8.34485444, 8.33416418, 8.28012565, 8.06746894, 8.2160881 ,
        8.2201205 , 8.2160854 , 8.30276406, 8.27130288, 8.26873183,
        8.19146028, 8.18754124, 8.23747929, 8.20522389, 8.27403824,
        8.26419592, 8.34117175, 8.33674987, 8.33679538, 8.38446376,
        8.38445691, 8.384804  , 8.36869318, 8.34488295, 8.36460829]),
 2.0)