In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
import collections as col

In [3]:
def smape_cal(y_true, y_pred):
    smape = (np.abs(y_true-y_pred)*200)/(np.abs(y_true)+np.abs(y_pred))
    return smape

def mase_cal(y_true, y_pred, freq, len_series, timeseries):
    forecastsNaiveSD = [np.nan]*freq
    for j in range(freq, len_series):
        forecastsNaiveSD.append(timeseries[j - freq])
        
    masep = np.nanmean(np.abs(timeseries - forecastsNaiveSD))
  
    mase = (np.abs(y_true-y_pred)) / masep
    return(mase)

In [4]:
df = pd.read_csv('ETHBTC_1529613479999.csv')

In [5]:
df.head()

Unnamed: 0,OPEN_TIME,OPEN,HIGH,LOW,CLOSE,VOLUME,CLOSE_TIME,QUOTE_VOL,NUM_TRADES,TAKE_BASE_VOL,TAKE_QUOTE_VOL,EXTRA
0,1530638880000,0.071101,0.071145,0.071035,0.071094,107.686,1530638939999,7.657848,142,72.256,5.139032,0.0
1,1530638940000,0.071094,0.071094,0.071054,0.071093,20.544,1530638999999,1.460186,84,12.594,0.895262,0.0
2,1530639000000,0.071093,0.071125,0.071054,0.071104,32.477,1530639059999,2.30884,145,20.775,1.477142,0.0
3,1530639060000,0.071104,0.071109,0.071055,0.071099,52.265,1530639119999,3.714634,115,20.643,1.467686,0.0
4,1530639120000,0.071099,0.071144,0.071062,0.071071,125.828,1530639179999,8.948411,154,117.658,8.367659,0.0


In [6]:
def ohlc_feats(data):
    o = data.OPEN.values
    h = data.HIGH.values
    l = data.LOW.values
    c = data.CLOSE.values
    
    ohlc_feats = pd.DataFrame({})
    
    ohlc_feats['OHLC'] = h - o + h - l + c - l
    ohlc_feats['OLHC'] = o - l + h - l + h - c
    
    ohlc_feats['O_EQUAL_C'] = o == c
    ohlc_feats['O_EQUAL_L'] = o == l
    ohlc_feats['O_EQUAL_H'] = o == h
    ohlc_feats['C_EQUAL_H'] = c == h
    ohlc_feats['C_EQUAL_L'] = c == l
    ohlc_feats['L_EQUAL_H'] = l == h
    
    ohlc_feats['O_GREATER_C'] = o > c
    
    ohlc_feats['O_C_MEAN'] = (o + c)/2
    ohlc_feats['L_H_MEAN'] = (l + h)/2
    
    ohlc_feats['O_OC_MEAN_FRAC'] = o / ohlc_feats['O_C_MEAN']
    ohlc_feats['L_OC_MEAN_FRAC'] = l / ohlc_feats['O_C_MEAN']
    ohlc_feats['H_OC_MEAN_FRAC'] = h / ohlc_feats['O_C_MEAN']
    ohlc_feats['C_OC_MEAN_FRAC'] = c / ohlc_feats['O_C_MEAN']
    
    ohlc_feats['O_LH_MEAN_FRAC'] = o / ohlc_feats['L_H_MEAN']
    ohlc_feats['L_LH_MEAN_FRAC'] = l / ohlc_feats['L_H_MEAN']
    ohlc_feats['H_LH_MEAN_FRAC'] = h / ohlc_feats['L_H_MEAN']
    ohlc_feats['C_LH_MEAN_FRAC'] = c / ohlc_feats['L_H_MEAN']
    
    ohlc_feats['O_GREATER_LH_MEAN'] = o > ohlc_feats['L_H_MEAN']
    ohlc_feats['C_GREATER_LH_MEAN'] = c > ohlc_feats['L_H_MEAN']
    
    ohlc_feats['O_C_MEAN__L_H_MEAN__DIFF']     = ohlc_feats['O_C_MEAN'] - ohlc_feats['L_H_MEAN']
    ohlc_feats['O_C_MEAN__L_H_MEAN__DIFF_ABS'] = np.abs(ohlc_feats['O_C_MEAN'] - ohlc_feats['L_H_MEAN'])
    ohlc_feats['O_C_MEAN__L_H_MEAN__GREATER']  = ohlc_feats['O_C_MEAN'] > ohlc_feats['L_H_MEAN']
    
    ohlc_feats['O_L_DIFF'] = o - l
    ohlc_feats['O_H_DIFF'] = o - h
    ohlc_feats['C_L_DIFF'] = c - l
    ohlc_feats['C_H_DIFF'] = c - h
    ohlc_feats['O_C_DIFF'] = o - c
    ohlc_feats['L_H_DIFF'] = l - h
    
    ohlc_feats['O_L_DIFF_ABS'] = np.abs(o - l)
    ohlc_feats['O_H_DIFF_ABS'] = np.abs(o - h)
    ohlc_feats['C_L_DIFF_ABS'] = np.abs(c - l)
    ohlc_feats['C_H_DIFF_ABS'] = np.abs(c - h)
    ohlc_feats['O_C_DIFF_ABS'] = np.abs(o - c)
    ohlc_feats['L_H_DIFF_ABS'] = np.abs(l - h)
    
    ohlc_feats['O_C_MEAN_PERCENTILE']     = (ohlc_feats['O_C_MEAN'] - l)/(h - l)
    ohlc_feats['O_PERCENTILE']            = (o - l)/(h - l)
    ohlc_feats['C_PERCENTILE']            = (c - l)/(h - l)
    ohlc_feats['O_C_PERCENTILE_DIFF']     = ohlc_feats['O_PERCENTILE'] - ohlc_feats['C_PERCENTILE']
    ohlc_feats['O_C_PERCENTILE_DIFF_ABS'] = np.abs(ohlc_feats['O_PERCENTILE'] - ohlc_feats['C_PERCENTILE'])
    
    return ohlc_feats

In [7]:
def ohlc_cross_feats(data):
    o = data.OPEN.values
    h = data.HIGH.values
    l = data.LOW.values
    c = data.CLOSE.values
    
    ohlc_feats = pd.DataFrame({})
    
    ohlc_feats['O_O1_EQUAL'] = o[1:] == o[:-1]
    ohlc_feats['H_H1_EQUAL'] = h[1:] == h[:-1]
    ohlc_feats['L_L1_EQUAL'] = l[1:] == l[:-1]
    ohlc_feats['C_C1_EQUAL'] = c[1:] == c[:-1]
    
    ohlc_feats['O_O1_GREATER'] = o[1:] > o[:-1]
    ohlc_feats['O_H1_GREATER'] = o[1:] > h[:-1]
    ohlc_feats['O_L1_GREATER'] = o[1:] > l[:-1]
    ohlc_feats['O_C1_GREATER'] = o[1:] > c[:-1]
    
    ohlc_feats['H_O1_GREATER'] = h[1:] > o[:-1]
    ohlc_feats['H_H1_GREATER'] = h[1:] > h[:-1]
    ohlc_feats['H_L1_GREATER'] = h[1:] > l[:-1]
    ohlc_feats['H_C1_GREATER'] = h[1:] > c[:-1]
    
    ohlc_feats['L_O1_GREATER'] = l[1:] > o[:-1]
    ohlc_feats['L_H1_GREATER'] = l[1:] > h[:-1]
    ohlc_feats['L_L1_GREATER'] = l[1:] > l[:-1]
    ohlc_feats['L_C1_GREATER'] = l[1:] > c[:-1]
    
    ohlc_feats['C_O1_GREATER'] = c[1:] > o[:-1]
    ohlc_feats['C_H1_GREATER'] = c[1:] > h[:-1]
    ohlc_feats['C_L1_GREATER'] = c[1:] > l[:-1]
    ohlc_feats['C_C1_GREATER'] = c[1:] > c[:-1]
    
    ohlc_feats['O_O1_ABS_PERC_DIFF'] = (o[1:] - o[:-1]) / o[:-1]
    ohlc_feats['O_H1_ABS_PERC_DIFF'] = (o[1:] - h[:-1]) / h[:-1]
    ohlc_feats['O_L1_ABS_PERC_DIFF'] = (o[1:] - l[:-1]) / l[:-1]
    ohlc_feats['O_C1_ABS_PERC_DIFF'] = (o[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['H_O1_ABS_PERC_DIFF'] = (h[1:] - o[:-1]) / o[:-1]
    ohlc_feats['H_H1_ABS_PERC_DIFF'] = (h[1:] - h[:-1]) / h[:-1]
    ohlc_feats['H_L1_ABS_PERC_DIFF'] = (h[1:] - l[:-1]) / l[:-1]
    ohlc_feats['H_C1_ABS_PERC_DIFF'] = (h[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['L_O1_ABS_PERC_DIFF'] = (l[1:] - o[:-1]) / o[:-1]
    ohlc_feats['L_H1_ABS_PERC_DIFF'] = (l[1:] - h[:-1]) / h[:-1]
    ohlc_feats['L_L1_ABS_PERC_DIFF'] = (l[1:] - l[:-1]) / l[:-1]
    ohlc_feats['L_C1_ABS_PERC_DIFF'] = (l[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['C_O1_ABS_PERC_DIFF'] = (c[1:] - o[:-1]) / o[:-1]
    ohlc_feats['C_H1_ABS_PERC_DIFF'] = (c[1:] - h[:-1]) / h[:-1]
    ohlc_feats['C_L1_ABS_PERC_DIFF'] = (c[1:] - l[:-1]) / l[:-1]
    ohlc_feats['C_C1_ABS_PERC_DIFF'] = (c[1:] - c[:-1]) / c[:-1]
    
    ohlc_feats['OC_ABS_PERC_GREATER'] = np.abs(o[1:] - c[1:]) > np.abs(o[:-1] - c[:-1])
    ohlc_feats['LH_ABS_PERC_GREATER'] = np.abs(l[1:] - h[1:]) > np.abs(l[:-1] - h[:-1])
    
    ohlc_feats['LH_ENGULF'] = (l[1:] < l[:-1]) & (h[1:] > h[:-1])
    ohlc_feats['LH_ENGULFED'] = (l[1:] > l[:-1]) & (h[1:] < h[:-1])
    
    dummy_feats = ohlc_feats.iloc[0:0]
    
    dummy_feats = ohlc_feats.iloc[0:0]
    dummy_feats.loc[0, :] = [np.nan for i in range(len(dummy_feats.columns))]
    ohlc_feats = dummy_feats.append(ohlc_feats, ignore_index=True)
    
    return ohlc_feats

In [8]:
ohlc_feat = ohlc_feats(df)
ohlc_cross_feat = ohlc_cross_feats(df)

ohlc_feat.head()
ohlc_cross_feat.head()

all_feats = pd.concat([ohlc_feat, ohlc_cross_feat], axis=1)
all_feats.shape
all_feats.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,OHLC,OLHC,O_EQUAL_C,O_EQUAL_L,O_EQUAL_H,C_EQUAL_H,C_EQUAL_L,L_EQUAL_H,O_GREATER_C,O_C_MEAN,...,O_H_DIFF_ABS,C_L_DIFF_ABS,C_H_DIFF_ABS,O_C_DIFF_ABS,L_H_DIFF_ABS,O_C_MEAN_PERCENTILE,O_PERCENTILE,C_PERCENTILE,O_C_PERCENTILE_DIFF,O_C_PERCENTILE_DIFF_ABS
0,0.000213,0.000227,False,False,False,False,False,False,True,0.071098,...,4.4e-05,5.9e-05,5.1e-05,7e-06,0.00011,0.568182,0.6,0.536364,0.063636,0.063636
1,7.9e-05,8.1e-05,False,False,True,False,False,False,True,0.071094,...,0.0,3.9e-05,1e-06,1e-06,4e-05,0.9875,1.0,0.975,0.025,0.025
2,0.000153,0.000131,False,False,False,False,False,False,False,0.071099,...,3.2e-05,5e-05,2.1e-05,1.1e-05,7.1e-05,0.626761,0.549296,0.704225,-0.15493,0.15493
3,0.000103,0.000113,False,False,False,False,False,False,True,0.071101,...,5e-06,4.4e-05,1e-05,5e-06,5.4e-05,0.861111,0.907407,0.814815,0.092593,0.092593
4,0.000136,0.000192,False,False,False,False,False,False,True,0.071085,...,4.5e-05,9e-06,7.3e-05,2.8e-05,8.2e-05,0.280488,0.45122,0.109756,0.341463,0.341463


Unnamed: 0,O_O1_EQUAL,H_H1_EQUAL,L_L1_EQUAL,C_C1_EQUAL,O_O1_GREATER,O_H1_GREATER,O_L1_GREATER,O_C1_GREATER,H_O1_GREATER,H_H1_GREATER,...,L_L1_ABS_PERC_DIFF,L_C1_ABS_PERC_DIFF,C_O1_ABS_PERC_DIFF,C_H1_ABS_PERC_DIFF,C_L1_ABS_PERC_DIFF,C_C1_ABS_PERC_DIFF,OC_ABS_PERC_GREATER,LH_ABS_PERC_GREATER,LH_ENGULF,LH_ENGULFED
0,,,,,,,,,,,...,,,,,,,,,,
1,False,False,False,False,False,False,True,False,False,False,...,0.000267,-0.000563,-0.000113,-0.000731,0.000816,-1.4e-05,False,False,False,True
2,False,False,True,False,False,False,True,False,True,True,...,0.0,-0.000549,0.000141,0.000141,0.000704,0.000155,True,True,False,False
3,False,False,False,False,True,False,True,False,True,False,...,1.4e-05,-0.000689,8.4e-05,-0.000366,0.000633,-7e-05,False,False,False,True
4,False,False,False,False,False,False,True,False,True,True,...,9.9e-05,-0.00052,-0.000464,-0.000534,0.000225,-0.000394,True,True,False,False


(450000, 81)

Unnamed: 0,OHLC,OLHC,O_EQUAL_C,O_EQUAL_L,O_EQUAL_H,C_EQUAL_H,C_EQUAL_L,L_EQUAL_H,O_GREATER_C,O_C_MEAN,...,L_L1_ABS_PERC_DIFF,L_C1_ABS_PERC_DIFF,C_O1_ABS_PERC_DIFF,C_H1_ABS_PERC_DIFF,C_L1_ABS_PERC_DIFF,C_C1_ABS_PERC_DIFF,OC_ABS_PERC_GREATER,LH_ABS_PERC_GREATER,LH_ENGULF,LH_ENGULFED
0,0.000213,0.000227,False,False,False,False,False,False,True,0.071098,...,,,,,,,,,,
1,7.9e-05,8.1e-05,False,False,True,False,False,False,True,0.071094,...,0.000267,-0.000563,-0.000113,-0.000731,0.000816,-1.4e-05,False,False,False,True
2,0.000153,0.000131,False,False,False,False,False,False,False,0.071099,...,0.0,-0.000549,0.000141,0.000141,0.000704,0.000155,True,True,False,False
3,0.000103,0.000113,False,False,False,False,False,False,True,0.071101,...,1.4e-05,-0.000689,8.4e-05,-0.000366,0.000633,-7e-05,False,False,False,True
4,0.000136,0.000192,False,False,False,False,False,False,True,0.071085,...,9.9e-05,-0.00052,-0.000464,-0.000534,0.000225,-0.000394,True,True,False,False


In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import Imputer

Y = all_feats.loc[:, 'O_GREATER_C'].values[1:]
X = all_feats.values[:-1]

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X = imp.fit_transform(X)

model = RandomForestClassifier(n_estimators=1, criterion='gini', n_jobs=-1, max_features="auto", random_state=0, verbose=1)
model = model.fit(X[:300000], Y[:300000])

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.3s finished


In [15]:
window_size = 5

X_ = np.zeros((X.shape[0] - window_size + 1, X.shape[1]*window_size))
Y_ = np.zeros((X.shape[0] - window_size + 1, ))

for i in range(window_size):
    X_[:, i*X.shape[1]:(i+1)*X.shape[1]] = X[i:X.shape[0]-window_size+i+1]

Y_ = Y[window_size-1:]

model_ = RandomForestClassifier(n_estimators=1, criterion='gini', n_jobs=-1, max_features=50, random_state=0, verbose=1)
model_ = model_.fit(X_[:300000], Y_[:300000])

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   17.0s finished


In [None]:
for i in range(len(model.feature_importances_)):
    print(all_feats.columns[i], model.feature_importances_[i])

In [None]:
model.feature_importances_

In [11]:
Y_pred = model.predict(X[300000:])
(Y[300000:] == Y_pred).sum()

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


77348

In [16]:
Y_pred = model_.predict(X_[300000:])
(Y_[300000:] == Y_pred).sum()

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


77215

In [None]:
Y_pred.shape

In [None]:
X.shape
Y.shape

In [None]:
(Y[-5000:] == Y_pred[-5000:]).sum()

In [None]:
Y_pred[-50:]