In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import talib
import re

# Notebook-wide settings. Every warning category is silenced, so deprecation
# and alignment warnings raised by later cells will not appear in the outputs.
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
# Shorthand for label-based MultiIndex slicing (used by the minimum-observation filter).
idx = pd.IndexSlice

# Path of a parquet export; not referenced by the cells shown in this notebook.
datapath = '../DATA/data.parquet'

In [2]:
# HDF5 store holding the raw price data ('crypto/raw_data') loaded below.
DATASTORE = '../DATA/crypto.h5'

In [3]:
# Load the raw frame once and derive the wide close-price table (one column
# per asset) from it, instead of reading 'crypto/raw_data' from disk twice.
# mode='r' keeps the load read-only, so a wrong path raises immediately
# rather than creating an empty store.
with pd.HDFStore(DATASTORE, mode='r') as store:
    #metadata = store['crypto/processed_metadata']
    rdata = store['crypto/raw_data']

close = rdata.Close.unstack('asset')

In [None]:
# Column dtypes and non-null counts of the raw frame.
rdata.info()

#### TA-Lib indicators

In [4]:
def apply_ta_indicators(group, trend_window=364):
    """Build price-position and TA-Lib features for one asset.

    Meant to be passed to ``groupby(level='asset').apply``: ``group`` holds the
    rows of a single asset and must provide ``Open``, ``High``, ``Low``,
    ``Close`` and ``Volume`` columns.

    Parameters
    ----------
    group : pandas.DataFrame
        Price/volume rows of one asset. The frame is copied first, so the
        caller's data is never modified.
    trend_window : int, default 364
        Look-back, in rows, of the linear-regression slope stored in
        ``Trendline``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``group``. The five price/volume columns are removed and
        the feature columns are appended. The rolling-window percentages are
        NaN until a full window of rows is available.
    """
    # Work on a private copy: pandas does not support mutating the object a
    # groupby UDF receives, and adding columns to it alters the caller's data.
    group = group.copy()
    close = group['Close']

    # Look-back windows in rows (one row per day is assumed — TODO confirm).
    highs_and_lows = {
        'fiftyTwoWeek': 7 * 52,
        'Quarterly': 364//4,
        'Monthly': 364//12,
        'Weekly': 7
    }

    for key, value in highs_and_lows.items():
        # The rolling extremes are only needed to derive the two percentages,
        # so they stay local instead of becoming temporary columns.
        rolling_close = close.rolling(window=value)
        low = rolling_close.min()
        high = rolling_close.max()

        group[f'percent_above_{key}_low'] = ((close - low) / low * 100).round(2)
        group[f'percent_below_{key}_high'] = ((high - close) / high * 100).round(2)

    group['DX'] = talib.DX(group['High'], group['Low'], close, timeperiod=14)  # Directional Movement Index
    group['MFI'] = talib.MFI(group['High'], group['Low'], close, group['Volume'], timeperiod=14)  # Money Flow Index
    group['PPO'] = talib.PPO(close, fastperiod=12, slowperiod=26, matype=0)  # 12/26 Percentage Price Oscillator
    # Stochastic oscillator with the 5/3/3 settings spelled out explicitly.
    group['slowk'], group['slowd'] = talib.STOCH(
        group['High'], group['Low'], close,
        fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0,
    )
    group['Trendline'] = talib.LINEARREG_SLOPE(close, timeperiod=trend_window)  # slope over the trend window
    group['EMA'] = talib.EMA(close, timeperiod=14)
    # Bollinger bands with TA-Lib's default parameters.
    group['BB_up'], group['BB_mid'], group['BB_low'] = talib.BBANDS(close)
    group['distanceToEMA'] = close - group['EMA']
    #group['candle_type'] = (group['Close'] > group['Open']).astype(int)

    # Only derived features are returned; the raw price/volume inputs are dropped.
    return group.drop(['Close', 'Volume', 'Open', 'High', 'Low'], axis=1)

In [None]:
# Scratch check: length of the 'Monthly' window used in apply_ta_indicators.
364//12

In [None]:
# Clears the name; ta_data is (re)built by the groupby/apply cell below.
ta_data = None

In [None]:
# Display the current value of ta_data.
ta_data

In [5]:
# Compute the indicator columns asset by asset.
# NOTE(review): the following cell strips index level 1, which suggests the
# group key is prepended to the index here — confirm for the pandas version in use.
ta_data = rdata.groupby(level='asset').apply(apply_ta_indicators)

In [6]:
# After groupby/apply the index is expected to carry three levels, with the
# 'asset' label repeated at position 1. Drop that duplicate level directly;
# the guard turns a re-run of this cell into a no-op instead of an error.
if ta_data.index.nlevels > 2:
    ta_data = ta_data.droplevel(1)

In [None]:
# Lift the column display limit so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# First 25 rows of the indicator table.
ta_data.head(25)

In [None]:
# Value frequencies of the distance-to-EMA feature. apply_ta_indicators
# produces 'distanceToEMA'; no 'scaled_distanceToEMA' column is created in the
# cells shown, so the previous attribute lookup raised AttributeError.
ta_data['distanceToEMA'].value_counts()

#### Returns

In [7]:
outlier_cutoff = 0.01
lags = [1, 3, 7, 14, 30]


def _winsorized_daily_return(prices, lag, cutoff):
    """Return the per-row compounded return over `lag` rows, clipped at both tails.

    `prices` is a wide frame (one column per asset). The `lag`-row percentage
    change is stacked into a long Series, clipped to the pooled `cutoff` and
    `1 - cutoff` quantiles, and converted to a geometric per-row average.
    """
    raw = prices.pct_change(lag).stack()
    floor = raw.quantile(cutoff)
    cap = raw.quantile(1 - cutoff)
    clipped = raw.clip(lower=floor, upper=cap)
    return (1 + clipped) ** (1 / lag) - 1


# The first assigned column fixes the index of `data`; later columns are
# aligned to it.
data = pd.DataFrame()
for lag in lags:
    data[f'returns_{lag}D'] = _winsorized_daily_return(close, lag, outlier_cutoff)

In [8]:
# Put the index in (asset, Date) order. Naming the levels makes the cell
# idempotent: a second run of swaplevel() would flip the order back and the
# indicator columns joined later would no longer align.
data = data.reorder_levels(['asset', 'Date'])

In [None]:
# Inspect the returns table before the indicator columns are attached.
data.info()

In [10]:
# Attach every indicator column to the returns table; each Series is aligned
# to the index of `data` on assignment.
for name, column in ta_data.items():
    data[name] = column

In [None]:
# Preview the table with the indicator columns attached.
data.head()

In [None]:
# Non-null counts after the indicator columns were attached.
data.info()

In [11]:
# momentum lags
# Momentum features: spread between each multi-day return and the 1-day return.
lags = [3, 7, 14, 30]
daily_return = data['returns_1D']
for lag in lags:
    data[f'momentum{lag}'] = data[f'returns_{lag}D'] - daily_return

# Spread between the 30-day and the 7-day return.
data['momentum_wk_month'] = data['returns_30D'] - data['returns_7D']

In [None]:
# Non-null counts after the momentum columns were added.
data.info()

In [12]:
# Lagged 1-day returns (t-1 ... t-7). The shift runs inside each asset group,
# so values never leak from one asset into another.
daily_by_asset = data.groupby(level='asset')['returns_1D']
for k in range(1, 8):
    data[f'returns_1D_t-{k}'] = daily_by_asset.shift(k)

data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 358864 entries, ('BTCUSDT', datetime.date(2017, 8, 18)) to ('ZRXUSDT', datetime.date(2023, 11, 22))
Data columns (total 36 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   returns_1D                       358864 non-null  float64
 1   returns_3D                       358152 non-null  float64
 2   returns_7D                       356728 non-null  float64
 3   returns_14D                      354243 non-null  float64
 4   returns_30D                      348594 non-null  float64
 5   percent_above_fiftyTwoWeek_low   227740 non-null  float64
 6   percent_below_fiftyTwoWeek_high  227740 non-null  float64
 7   percent_above_Quarterly_low      316942 non-null  float64
 8   percent_below_Quarterly_high     316942 non-null  float64
 9   percent_above_Monthly_low        338241 non-null  float64
 10  percent_below_Monthly_high       338241 non-null  float6

### Target: Holding Periods

In [13]:
# Forward-looking targets: the h-row return observed h rows later, shifted
# back within each asset so the row at time t holds the return realised at t+h.
for horizon in (1, 3, 7, 14, 30):
    realised = data.groupby(level='asset')[f'returns_{horizon}D']
    data[f'target_{horizon}D'] = realised.shift(-horizon)

In [None]:
# Non-null counts after the target columns were added.
data.info()

In [None]:
# Columns for a side-by-side look at targets, current returns and lagged returns.
cols = (
    [f'target_{h}D' for h in (1, 3, 7)]
    + [f'returns_{h}D' for h in (1, 3)]
    + [f'returns_1D_t-{k}' for k in (1, 3, 7)]
)

In [None]:
# First ten complete rows of the selected columns, in index order, for a
# visual check of how targets and lagged returns line up.
data[cols].dropna().sort_index().head(10)

#### Date Indicators

In [14]:
# Parse the 'Date' index level into pandas datetimes and store them as a column.
date_level = data.index.get_level_values('Date')
dates = pd.to_datetime(date_level)
data['date'] = dates

In [15]:
# Calendar features read from the parsed dates; the quarter indicator is disabled.
for part in ('month', 'weekday', 'day'):
    data[part] = getattr(dates, part)

In [16]:
# Move the 'asset' and 'Date' index levels into ordinary columns.
data = data.reset_index()
data.head()

Unnamed: 0,asset,Date,returns_1D,returns_3D,returns_7D,returns_14D,returns_30D,percent_above_fiftyTwoWeek_low,percent_below_fiftyTwoWeek_high,percent_above_Quarterly_low,...,returns_1D_t-7,target_1D,target_3D,target_7D,target_14D,target_30D,date,month,weekday,day
0,BTCUSDT,2017-08-18,-0.041238,,,,,,,,...,,0.007694,-0.007551,0.005887,0.011699,-0.003484,2017-08-18,8,4,18
1,ETHUSDT,2017-08-18,-0.026623,,,,,,,,...,,-0.010376,0.03221,0.015439,0.02011,-0.004398,2017-08-18,8,4,18
2,BTCUSDT,2017-08-19,0.007694,,,,,,,,...,,-0.012969,-0.008116,0.006678,0.005528,-0.000856,2017-08-19,8,5,19
3,ETHUSDT,2017-08-19,-0.010376,,,,,,,,...,,0.028153,0.021192,0.018522,0.011864,0.000417,2017-08-19,8,5,19
4,BTCUSDT,2017-08-20,-0.012969,-0.015709,,,,,,,...,,-0.017201,0.002256,0.007644,0.007057,-0.001469,2017-08-20,8,6,20


In [17]:
# Discard the original 'Date' column; the parsed 'date' column takes its place.
data = data.drop('Date', axis=1)

In [18]:
# Give the parsed datetime column the name of the column it replaces.
data = data.rename(columns={'date': 'Date'})

In [None]:
# List the column names after the rename.
data.columns

In [19]:
# Rebuild the (asset, Date) index, now with the parsed datetime values.
data = data.set_index(['asset', 'Date'])

In [None]:
# Confirm the index and dtypes after the index was rebuilt.
data.info()

In [None]:
# Integer-encode each calendar feature into a scratch frame with a default
# integer index.
cats = ['month', 'weekday', 'day']
test = pd.DataFrame({cat: pd.factorize(data[cat])[0] for cat in cats})

test.info()

In [None]:
# Print the factorized codes next to the source values for the first rows.
# Access is positional via .iloc: `data` is indexed by (asset, Date), so an
# integer key such as data.month[i] only worked through pandas' deprecated
# positional fallback. The range is capped so short frames do not raise.
for i in range(min(30, len(test))):
    print((test.month.iloc[i], test.weekday.iloc[i], test.day.iloc[i]),
          (data.month.iloc[i], data.weekday.iloc[i], data.day.iloc[i]))

In [None]:
# Prints an empty line; scratch cell with no effect on the data.
print()

In [20]:
# Keep only rows in which every column (features and targets) is non-null.
data = data.dropna()
#data.head()

In [21]:
# Remove rows whose (asset, Date) index entry repeats, keeping the first one.
# The row counts are interpolated with f-strings; the previous messages
# printed a literal '{}' because the placeholder was never formatted.
print(f'data before duplicated: {len(data)}')
data = data[~data.index.duplicated()]
print(f'data after removing duplicate: {len(data)}')

data before duplicated: {} 219062
data after removing duplicate: {} 219062


In [22]:
# Drop cryptos with less than 1 year of observations.
# The comparison is inclusive: an asset with exactly `min_obs` rows has a full
# year of data and is kept (the strict '>' used before discarded it).
min_obs = 365
nobs = data.groupby(level='asset').size()
keep = nobs[nobs >= min_obs].index
print(f"number of assets before removing minimum obs: {data.index.get_level_values('asset').nunique()}")
data = data.loc[idx[keep, :], :]
print(f"number of assets after removing minimum obs: {data.index.get_level_values('asset').nunique()}")
data.info()

number of assets before removing minimum obs: 304
number of assets after removing minimum obs: 239
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 203583 entries, ('1INCHUSDT', Timestamp('2021-12-23 00:00:00')) to ('ZRXUSDT', Timestamp('2023-10-23 00:00:00'))
Data columns (total 44 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   returns_1D                       203583 non-null  float64
 1   returns_3D                       203583 non-null  float64
 2   returns_7D                       203583 non-null  float64
 3   returns_14D                      203583 non-null  float64
 4   returns_30D                      203583 non-null  float64
 5   percent_above_fiftyTwoWeek_low   203583 non-null  float64
 6   percent_below_fiftyTwoWeek_high  203583 non-null  float64
 7   percent_above_Quarterly_low      203583 non-null  float64
 8   percent_below_Quarterly_high     203583 non-null  float64
 9   percent

In [24]:
# Persist the final feature/target table under 'crypto/data', then list the
# contents of the output store.
with pd.HDFStore('../DATA/crypto_.h5') as store:
    store['crypto/data'] = data
    print(store.info())

<class 'pandas.io.pytables.HDFStore'>
File path: ../DATA/crypto_.h5
/crypto/data            frame        (shape->[203583,44])


In [None]:
# List what the source store contains. mode='r' opens it read-only, so this
# inspection can neither modify the file nor create it when the path is wrong.
with pd.HDFStore(DATASTORE, mode='r') as store:
    print(store.info())