In [106]:
import pandas as pd
import numpy as np
import random
# Fix the seeds of Python's `random` module and NumPy's legacy global RNG.
random.seed(42)
np.random.seed(42)
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from datetime import datetime, timedelta
from PyEMD import EEMD
from scipy import stats

In [116]:
def get_data(CURRENCY_LST,
             FREQUENCY,
             WINDOW_SIZE,
             neutral_quantile = 0.25,
             beg_date = pd.Timestamp(2013,1,1),
             end_date = None,
             log_price = True,
             include_indicators = True,
             include_imfs = True):
    """Load per-currency indicator CSVs and build an aligned feature array.

    For every currency in ``CURRENCY_LST`` the file
    ``../data/0_raw/Indicators/<cur>_usdt_1d_indicators.csv`` is read, indexed by
    its ``Date`` column, stripped of the ``Timestamp``/``Open``/``High``/``Low``
    columns and given lower-cased column names. A ``change_dir`` label column is
    inserted at position 0 (see ``_direction_labels``); rows that have no label
    yet (the first ``WINDOW_SIZE`` rows) hold NaN there.

    Parameters
    ----------
    CURRENCY_LST : sequence of str
        Currency tickers; must not be empty.
    FREQUENCY : str
        Frequency passed to ``pd.date_range`` when building the common date range.
    WINDOW_SIZE : int
        Length of the rolling window used for the neutral-move threshold.
    neutral_quantile : float
        Quantile of the rolling absolute price change at or below which a move
        is labelled neutral (class 0).
    beg_date, end_date : pd.Timestamp
        Requested date bounds; they are intersected with the dates available for
        all currencies. ``end_date=None`` means "now", resolved on every call
        (a ``pd.Timestamp.now()`` default would be frozen at definition time).
    log_price : bool
        Replace ``close`` with its natural logarithm before labelling.
    include_indicators : bool
        When False keep only the ``change_dir`` and ``close`` columns.
    include_imfs : bool
        When True append the EEMD intrinsic mode functions of ``close`` as
        ``imf_0``, ``imf_1``, ... columns.

    Returns
    -------
    arr : np.ndarray
        One block of values per entry of ``CURRENCY_LST`` over the common range.
    y : pd.DataFrame
        Labels of the LAST currency processed (single column named ``0``).
    features : list
        Column names of the LAST currency processed.
    currency_dfs : dict
        Processed frame for each currency.

    Raises
    ------
    ValueError
        If ``CURRENCY_LST`` is empty.
    KeyError
        If a date of the common range is missing from one of the frames
        (raised by the ``.loc`` lookup).
    """
    CURRENCY_LST = list(CURRENCY_LST)
    if not CURRENCY_LST:
        raise ValueError("CURRENCY_LST must contain at least one currency")
    if end_date is None:
        end_date = pd.Timestamp.now()

    currency_dfs = {}
    for cur in CURRENCY_LST:
        if cur in currency_dfs:
            # A repeated ticker would produce the same frame again; skip the rework.
            continue

        raw = pd.read_csv(f"../data/0_raw/Indicators/{str.lower(cur)}_usdt_1d_indicators.csv", index_col=0)
        # Build a fresh frame instead of mutating the loaded one in place.
        df = (raw.assign(Date=pd.to_datetime(raw["Date"]))
                 .sort_values("Date", ascending=True)
                 .set_index("Date")
                 .drop(columns=["Timestamp", "Open", "High", "Low"])
                 .rename(columns=str.lower))

        if log_price:
            df["close"] = np.log(df["close"])

        y = _direction_labels(df["close"], WINDOW_SIZE, neutral_quantile)
        # Index-aligned insert: rows without a label become NaN.
        df.insert(loc=0, column="change_dir", value=y.iloc[:, 0])

        if not include_indicators:
            df = df[["change_dir", "close"]]

        if include_imfs:
            # Decompose the same "close" series used everywhere else in this
            # function rather than relying on a notebook-level global.
            imfs = EEMD()(df["close"].values)
            imf_features = ["imf_" + str(i) for i in range(imfs.shape[0])]
            imf_frame = pd.DataFrame(imfs.T, columns=imf_features, index=df.index)
            df = pd.concat((df, imf_frame), axis=1)

        currency_dfs[cur] = df

    # Restrict to the dates every currency covers, within the requested bounds.
    first_dates = [frame.index.min() for frame in currency_dfs.values()]
    last_dates = [frame.index.max() for frame in currency_dfs.values()]
    beg_date = max([max(first_dates), beg_date])
    end_date = min([min(last_dates), end_date])
    common_range = pd.date_range(beg_date, end_date, freq=FREQUENCY)

    arr = np.array([currency_dfs[cur].loc[common_range].values for cur in CURRENCY_LST])
    # NOTE: `df` and `y` are those of the last currency processed in the loop.
    features = df.columns.tolist()

    return arr, y, features, currency_dfs


def _direction_labels(close, window_size, neutral_quantile):
    """Classify each price change as 0 (neutral), 1 (down) or 2 (up).

    A change counts as down/up only when its absolute value is strictly greater
    than the ``neutral_quantile`` quantile of the absolute changes in the rolling
    window of ``window_size`` observations ending at (and including) that change.
    Returns a one-column DataFrame indexed by the dates that have a full window.
    """
    price_diff = close.diff().dropna()
    thresholds = price_diff.abs().rolling(window_size).quantile(neutral_quantile).dropna()
    # Positional slice: drop the changes that do not yet have a full window.
    recent = price_diff.iloc[window_size - 1:]
    is_large = recent.abs() > thresholds
    conditions = [((recent < 0) & is_large).to_numpy(),
                  ((recent > 0) & is_large).to_numpy()]
    classes = [1, 2]  # 0 is the default class if none of the conditions is met
    return pd.DataFrame(np.select(conditions, classes, default=0), index=recent.index)

In [117]:
# Experiment configuration (tune here; the cells below read these names).
CURRENCY_LST = ["BTC", "ETH", "LTC"]  # tickers whose indicator files are loaded
PRICE_TYPE = "close"                  # price column of interest
FREQUENCY = "D"                       # date-range frequency string
WINDOW_SIZE = 50                      # rolling-window length
neutral_quantile = 0.25               # quantile threshold for the neutral class

In [None]:
# Build the dataset from log prices only: indicator columns and IMF features are switched off.
arr, y, features, dfs = get_data(
    CURRENCY_LST,
    FREQUENCY,
    WINDOW_SIZE,
    neutral_quantile=neutral_quantile,
    log_price=True,
    include_indicators=False,
    include_imfs=False,
)