In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import pickle as pkl

In [4]:
def get_ohlc_feats(data):
    o = data.OPEN.values.astype(float)
    h = data.HIGH.values.astype(float)
    l = data.LOW.values.astype(float)
    c = data.CLOSE.values.astype(float)
    
    ohlc_feats = pd.DataFrame({'OPEN_TIME': data.OPEN_TIME.values})
    
    ohlc_feats['OHLC'] = h - o + h - l + c - l
    ohlc_feats['OLHC'] = o - l + h - l + h - c
    
    ohlc_feats['O_EQUAL_C'] = (o == c).astype(int)
    ohlc_feats['O_EQUAL_L'] = (o == l).astype(int)
    ohlc_feats['O_EQUAL_H'] = (o == h).astype(int)
    ohlc_feats['C_EQUAL_H'] = (c == h).astype(int)
    ohlc_feats['C_EQUAL_L'] = (c == l).astype(int)
    ohlc_feats['L_EQUAL_H'] = (l == h).astype(int)
    
    ohlc_feats['O_GREATER_C'] = (o > c).astype(int)
    
    ohlc_feats['O_C_MEAN'] = (o + c)/2
    ohlc_feats['L_H_MEAN'] = (l + h)/2
    
    ohlc_feats['O_OC_MEAN_FRAC'] = o / ohlc_feats['O_C_MEAN']
    ohlc_feats['L_OC_MEAN_FRAC'] = l / ohlc_feats['O_C_MEAN']
    ohlc_feats['H_OC_MEAN_FRAC'] = h / ohlc_feats['O_C_MEAN']
    ohlc_feats['C_OC_MEAN_FRAC'] = c / ohlc_feats['O_C_MEAN']
    
    ohlc_feats['O_LH_MEAN_FRAC'] = o / ohlc_feats['L_H_MEAN']
    ohlc_feats['L_LH_MEAN_FRAC'] = l / ohlc_feats['L_H_MEAN']
    ohlc_feats['H_LH_MEAN_FRAC'] = h / ohlc_feats['L_H_MEAN']
    ohlc_feats['C_LH_MEAN_FRAC'] = c / ohlc_feats['L_H_MEAN']
    
    ohlc_feats['O_GREATER_LH_MEAN'] = (o > ohlc_feats['L_H_MEAN']).astype(int)
    ohlc_feats['C_GREATER_LH_MEAN'] = (c > ohlc_feats['L_H_MEAN']).astype(int)
    
    ohlc_feats['O_C_MEAN__L_H_MEAN__DIFF']     = ohlc_feats['O_C_MEAN'] - ohlc_feats['L_H_MEAN']
    ohlc_feats['O_C_MEAN__L_H_MEAN__DIFF_ABS'] = np.abs(ohlc_feats['O_C_MEAN'] - ohlc_feats['L_H_MEAN'])
    ohlc_feats['O_C_MEAN__L_H_MEAN__GREATER']  = (ohlc_feats['O_C_MEAN'] > ohlc_feats['L_H_MEAN']).astype(int)
    
    ohlc_feats['O_L_DIFF'] = o - l
    ohlc_feats['O_H_DIFF'] = o - h
    ohlc_feats['C_L_DIFF'] = c - l
    ohlc_feats['C_H_DIFF'] = c - h
    ohlc_feats['O_C_DIFF'] = o - c
    ohlc_feats['L_H_DIFF'] = l - h
    
    ohlc_feats['O_L_DIFF_ABS'] = np.abs(o - l)
    ohlc_feats['O_H_DIFF_ABS'] = np.abs(o - h)
    ohlc_feats['C_L_DIFF_ABS'] = np.abs(c - l)
    ohlc_feats['C_H_DIFF_ABS'] = np.abs(c - h)
    ohlc_feats['O_C_DIFF_ABS'] = np.abs(o - c)
    ohlc_feats['L_H_DIFF_ABS'] = np.abs(l - h)
    
    ohlc_feats['O_C_MEAN_PERCENTILE']     = ((ohlc_feats['O_C_MEAN'] - l)/(h - l)).fillna(value=1)
    ohlc_feats['O_PERCENTILE']            = ((pd.Series(o) - l)/(h - l)).fillna(value=1)
    ohlc_feats['C_PERCENTILE']            = ((pd.Series(c) - l)/(h - l)).fillna(value=1)
    ohlc_feats['O_C_PERCENTILE_DIFF']     = ohlc_feats['O_PERCENTILE'] - ohlc_feats['C_PERCENTILE']
    ohlc_feats['O_C_PERCENTILE_DIFF_ABS'] = np.abs(ohlc_feats['O_PERCENTILE'] - ohlc_feats['C_PERCENTILE'])
    
    return ohlc_feats

In [5]:
def get_ohlc_cross_feats(data):
    o = data.OPEN.values.astype(float)
    h = data.HIGH.values.astype(float)
    l = data.LOW.values.astype(float)
    c = data.CLOSE.values.astype(float)
    
    ohlc_feats = pd.DataFrame({'OPEN_TIME': data.OPEN_TIME.values[1:]})
    
    ohlc_feats['O_O1_EQUAL'] = (o[1:] == o[:-1]).astype(int)
    ohlc_feats['H_H1_EQUAL'] = (h[1:] == h[:-1]).astype(int)
    ohlc_feats['L_L1_EQUAL'] = (l[1:] == l[:-1]).astype(int)
    ohlc_feats['C_C1_EQUAL'] = (c[1:] == c[:-1]).astype(int)
    
    ohlc_feats['O_O1_GREATER'] = (o[1:] > o[:-1]).astype(int)
    ohlc_feats['O_H1_GREATER'] = (o[1:] > h[:-1]).astype(int)
    ohlc_feats['O_L1_GREATER'] = (o[1:] > l[:-1]).astype(int)
    ohlc_feats['O_C1_GREATER'] = (o[1:] > c[:-1]).astype(int)
    
    ohlc_feats['H_O1_GREATER'] = (h[1:] > o[:-1]).astype(int)
    ohlc_feats['H_H1_GREATER'] = (h[1:] > h[:-1]).astype(int)
    ohlc_feats['H_L1_GREATER'] = (h[1:] > l[:-1]).astype(int)
    ohlc_feats['H_C1_GREATER'] = (h[1:] > c[:-1]).astype(int)
    
    ohlc_feats['L_O1_GREATER'] = (l[1:] > o[:-1]).astype(int)
    ohlc_feats['L_H1_GREATER'] = (l[1:] > h[:-1]).astype(int)
    ohlc_feats['L_L1_GREATER'] = (l[1:] > l[:-1]).astype(int)
    ohlc_feats['L_C1_GREATER'] = (l[1:] > c[:-1]).astype(int)
    
    ohlc_feats['C_O1_GREATER'] = (c[1:] > o[:-1]).astype(int)
    ohlc_feats['C_H1_GREATER'] = (c[1:] > h[:-1]).astype(int)
    ohlc_feats['C_L1_GREATER'] = (c[1:] > l[:-1]).astype(int)
    ohlc_feats['C_C1_GREATER'] = (c[1:] > c[:-1]).astype(int)
    
    ohlc_feats['O_O1_ABS_PERC_DIFF'] = (o[1:] - o[:-1]) / o[:-1]
    ohlc_feats['O_H1_ABS_PERC_DIFF'] = (o[1:] - h[:-1]) / h[:-1]
    ohlc_feats['O_L1_ABS_PERC_DIFF'] = (o[1:] - l[:-1]) / l[:-1]
    ohlc_feats['O_C1_ABS_PERC_DIFF'] = (o[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['H_O1_ABS_PERC_DIFF'] = (h[1:] - o[:-1]) / o[:-1]
    ohlc_feats['H_H1_ABS_PERC_DIFF'] = (h[1:] - h[:-1]) / h[:-1]
    ohlc_feats['H_L1_ABS_PERC_DIFF'] = (h[1:] - l[:-1]) / l[:-1]
    ohlc_feats['H_C1_ABS_PERC_DIFF'] = (h[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['L_O1_ABS_PERC_DIFF'] = (l[1:] - o[:-1]) / o[:-1]
    ohlc_feats['L_H1_ABS_PERC_DIFF'] = (l[1:] - h[:-1]) / h[:-1]
    ohlc_feats['L_L1_ABS_PERC_DIFF'] = (l[1:] - l[:-1]) / l[:-1]
    ohlc_feats['L_C1_ABS_PERC_DIFF'] = (l[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['C_O1_ABS_PERC_DIFF'] = (c[1:] - o[:-1]) / o[:-1]
    ohlc_feats['C_H1_ABS_PERC_DIFF'] = (c[1:] - h[:-1]) / h[:-1]
    ohlc_feats['C_L1_ABS_PERC_DIFF'] = (c[1:] - l[:-1]) / l[:-1]
    ohlc_feats['C_C1_ABS_PERC_DIFF'] = (c[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['OC_ABS_PERC_GREATER'] = (np.abs(o[1:] - c[1:]) > np.abs(o[:-1] - c[:-1])).astype(int)
    ohlc_feats['LH_ABS_PERC_GREATER'] = (np.abs(l[1:] - h[1:]) > np.abs(l[:-1] - h[:-1])).astype(int)
    
    ohlc_feats['LH_ENGULF']   = ((l[1:] < l[:-1]) & (h[1:] > h[:-1])).astype(int)
    ohlc_feats['LH_ENGULFED'] = ((l[1:] > l[:-1]) & (h[1:] < h[:-1])).astype(int)
    
    return ohlc_feats

In [6]:
def get_vol_feats(data):
    bv = data.VOLUME.values.astype(float)
    qv = data.QUOTE_VOL.values.astype(float)
    tbv = data.TAKE_BASE_VOL.values.astype(float)
    tqv = data.TAKE_QUOTE_VOL.values.astype(float)
    nt = data.NUM_TRADES.values.astype(float)
    
    vol_feats = pd.DataFrame({'OPEN_TIME': data.OPEN_TIME.values})
    
    vol_feats['BV_GREATER_TBV'] = (bv  > tbv).astype(int)
    vol_feats['BV_EQUAL_TBV'  ] = (bv == tbv).astype(int)
    vol_feats['BV_LESSER_TBV' ] = (bv  < tbv).astype(int)
    vol_feats['QV_GREATER_TQV'] = (qv  > tqv).astype(int)
    vol_feats['QV_EQUAL_TQV'  ] = (qv == tqv).astype(int)
    vol_feats['QV_LESSER_TQV' ] = (qv  < tqv).astype(int)
    
    # vol_feats['BV_QV_FRAC'    ] = bv / qv
    # vol_feats['QV_BV_FRAC'    ] = qv / bv
    # vol_feats['BV_QV_FRAC_INV'] = 1 - bv / qv
    # vol_feats['QV_BV_FRAC_INV'] = 1 - qv / bv
    # 
    # vol_feats['TBV_TQV_FRAC'    ] = tbv / tqv
    # vol_feats['TQV_TBV_FRAC'    ] = tqv / tbv
    # vol_feats['TBV_TQV_FRAC_INV'] = 1 - tbv / tqv
    # vol_feats['TQV_TBV_FRAC_INV'] = 1 - tqv / tbv
    # 
    # vol_feats['BV_TBV_FRAC'    ] = bv / tbv
    # vol_feats['TBV_BV_FRAC'    ] = tbv / bv
    # vol_feats['BV_TBV_FRAC_INV'] = 1 - bv / tbv
    # vol_feats['TBV_BV_FRAC_INV'] = 1 - tbv / bv
    # 
    # vol_feats['QV_TQV_FRAC'    ] = qv / tqv
    # vol_feats['TQV_QV_FRAC'    ] = tqv / qv
    # vol_feats['QV_TQV_FRAC_INV'] = 1 - qv / tqv
    # vol_feats['TQV_QV_FRAC_INV'] = 1 - tqv / qv
    
    vol_feats['BV_TBV_DIFF'    ] = bv - tbv
    vol_feats['QV_TQV_DIFF'    ] = qv - tqv
    
    vol_feats['BV_TBV_DIFF_ABS'] = np.abs(bv - tbv)
    vol_feats['QV_TQV_DIFF_ABS'] = np.abs(qv - tqv)
    
    return vol_feats

In [7]:
def get_ohlc_vol_feats(data):
    o = data.OPEN.values.astype(float)
    h = data.HIGH.values.astype(float)
    l = data.LOW.values.astype(float)
    c = data.CLOSE.values.astype(float)
    bv = data.VOLUME.values.astype(float)
    qv = data.QUOTE_VOL.values.astype(float)
    tbv = data.TAKE_BASE_VOL.values.astype(float)
    tqv = data.TAKE_QUOTE_VOL.values.astype(float)
    
    ohlc_vol_feats = pd.DataFrame({'OPEN_TIME': data.OPEN_TIME.values})
    
    ohlc_vol_feats['HL_BASE_VOL'] = h - l / bv
    ohlc_vol_feats['OHLC_BASE_VOL'] = h - o + h - l + c - l / bv
    ohlc_vol_feats['OLHC_BASE_VOL'] = o - l + h - l + h - c / bv
    
    ohlc_vol_feats['HL_QUOTE_VOL'] = h - l / qv
    ohlc_vol_feats['OHLC_QUOTE_VOL'] = h - o + h - l + c - l / qv
    ohlc_vol_feats['OLHC_QUOTE_VOL'] = o - l + h - l + h - c / qv
    
    ohlc_vol_feats['HL_TAKE_BASE_VOL'] = h - l / tbv
    ohlc_vol_feats['OHLC_TAKE_BASE_VOL'] = h - o + h - l + c - l / tbv
    ohlc_vol_feats['OLHC_TAKE_BASE_VOL'] = o - l + h - l + h - c / tbv
    
    ohlc_vol_feats['HL_TAKE_QUOTE_VOL'] = h - l / tqv
    ohlc_vol_feats['OHLC_TAKE_QUOTE_VOL'] = h - o + h - l + c - l / tqv
    ohlc_vol_feats['OLHC_TAKE_QUOTE_VOL'] = o - l + h - l + h - c / tqv
    
    ohlc_vol_feats['HL_BASE_VOL_INV']   = 1 / ohlc_vol_feats['HL_BASE_VOL']
    ohlc_vol_feats['OHLC_BASE_VOL_INV'] = 1 / ohlc_vol_feats['OHLC_BASE_VOL']
    ohlc_vol_feats['OLHC_BASE_VOL_INV'] = 1 / ohlc_vol_feats['OLHC_BASE_VOL']
    
    ohlc_vol_feats['HL_QUOTE_VOL_INV']   = 1 / ohlc_vol_feats['HL_QUOTE_VOL']
    ohlc_vol_feats['OHLC_QUOTE_VOL_INV'] = 1 / ohlc_vol_feats['OHLC_QUOTE_VOL']
    ohlc_vol_feats['OLHC_QUOTE_VOL_INV'] = 1 / ohlc_vol_feats['OLHC_QUOTE_VOL']
    
    ohlc_vol_feats['HL_TAKE_BASE_VOL_INV']   = 1 / ohlc_vol_feats['HL_TAKE_BASE_VOL']
    ohlc_vol_feats['OHLC_TAKE_BASE_VOL_INV'] = 1 / ohlc_vol_feats['OHLC_TAKE_BASE_VOL']
    ohlc_vol_feats['OLHC_TAKE_BASE_VOL_INV'] = 1 / ohlc_vol_feats['OLHC_TAKE_BASE_VOL']
    
    ohlc_vol_feats['HL_TAKE_QUOTE_VOL_INV']   = 1 / ohlc_vol_feats['HL_TAKE_QUOTE_VOL']
    ohlc_vol_feats['OHLC_TAKE_QUOTE_VOL_INV'] = 1 / ohlc_vol_feats['OHLC_TAKE_QUOTE_VOL']
    ohlc_vol_feats['OLHC_TAKE_QUOTE_VOL_INV'] = 1 / ohlc_vol_feats['OLHC_TAKE_QUOTE_VOL']
    
    return ohlc_vol_feats

In [8]:
def get_num_trades_feats(data):
    o = data.OPEN.values.astype(float)
    h = data.HIGH.values.astype(float)
    l = data.LOW.values.astype(float)
    c = data.CLOSE.values.astype(float)
    bv = data.VOLUME.values.astype(float)
    qv = data.QUOTE_VOL.values.astype(float)
    tbv = data.TAKE_BASE_VOL.values.astype(float)
    tqv = data.TAKE_QUOTE_VOL.values.astype(float)
    nt = data.NUM_TRADES.values.astype(float)
    
    num_feats = pd.DataFrame({'OPEN_TIME': data.OPEN_TIME.values})
    
    num_feats['num_trades_inv']     = 1/nt
    
    # Vol feats
    # num_feats['NUM_TRADES_PER_BV' ] = nt/bv
    # num_feats['NUM_TRADES_PER_QV' ] = nt/qv
    # num_feats['NUM_TRADES_PER_TBV'] = nt/tbv
    # num_feats['NUM_TRADES_PER_TQV'] = nt/tqv
    # 
    # num_feats['BV_PER_NUM_TRADES' ] = bv/nt
    # num_feats['QV_PER_NUM_TRADES' ] = qv/nt
    # num_feats['TBV_PER_NUM_TRADES'] = tbv/nt
    # num_feats['TQV_PER_NUM_TRADES'] = tqv/nt
    
    # OHLC feats
    num_feats['NUM_TRADES_HL_SPREAD'    ] = h - l / nt
    num_feats['NUM_TRADES_HL_SPREAD_INV'] = nt / h - l
    num_feats['NUM_TRADES_HL_AVG'       ] = (h + l)/2 / nt
    num_feats['NUM_TRADES_OC_AVG'       ] = (o + c)/2 / nt
    
    return num_feats

In [9]:
file = 'ETHBTC_1m_1519496760000_1549568220000.csv'
data = pd.read_csv('data/'+file)

ohlc_feat        = get_ohlc_feats(data)
ohlc_cross_feat  = get_ohlc_cross_feats(data)
vol_feat         = get_vol_feats(data)
ohlc_vol_feat    = get_ohlc_vol_feats(data)
num_trades_feat  = get_num_trades_feats(data)

In [10]:
all_feat = ohlc_feat.merge(ohlc_cross_feat, on='OPEN_TIME', how='outer')
all_feat = all_feat.merge(vol_feat, on='OPEN_TIME', how='outer')
all_feat = all_feat.merge(ohlc_vol_feat, on='OPEN_TIME', how='outer')
all_feat = all_feat.merge(num_trades_feat, on='OPEN_TIME', how='outer')

In [11]:
all_feat.shape
all_feat.head()

(500000, 121)

Unnamed: 0,OPEN_TIME,OHLC,OLHC,O_EQUAL_C,O_EQUAL_L,O_EQUAL_H,C_EQUAL_H,C_EQUAL_L,L_EQUAL_H,O_GREATER_C,...,OHLC_TAKE_BASE_VOL_INV,OLHC_TAKE_BASE_VOL_INV,HL_TAKE_QUOTE_VOL_INV,OHLC_TAKE_QUOTE_VOL_INV,OLHC_TAKE_QUOTE_VOL_INV,num_trades_inv,NUM_TRADES_HL_SPREAD,NUM_TRADES_HL_SPREAD_INV,NUM_TRADES_HL_AVG,NUM_TRADES_OC_AVG
0,1519496760000,0.000258,0.000394,0,0,0,0,0,0,1,...,12.486614,12.455867,49.114696,48.886597,48.526934,0.005208,0.085125,2243.691618,0.000445,0.000445
1,1519496820000,0.00022,0.000268,0,0,0,0,0,0,1,...,13.758942,13.749701,-15.083869,-15.106199,-15.116983,0.008197,0.08487,1425.64787,0.000701,0.0007
2,1519496880000,0.000428,0.000248,0,0,0,0,0,0,0,...,12.28021,12.28762,29.280037,29.059662,29.165677,0.004785,0.08516,2442.387926,0.000409,0.000409
3,1519496940000,0.00029,0.000286,0,0,0,0,0,0,0,...,12.542618,12.523084,63.288578,62.709137,62.625491,0.009524,0.084733,1227.324413,0.000814,0.000815
4,1519497000000,0.000101,0.000251,0,0,0,0,1,0,1,...,13.270008,13.243646,-29.607974,-29.619374,-29.751558,0.008547,0.084822,1367.503838,0.000731,0.000731


In [12]:
f = open('data/OHLCV_'+file+'_.pkl', 'wb')
pkl.dump(all_feat, f, -1)
f.close()