In [1]:
# Import statements (standard)
import math
import time
import numpy as np
import pandas as pd
import datetime as dt
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

# Import statements (custom)
import helper_functions as hf

# Import statements (RF)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load data (closing prices).
cp_df = pd.read_csv('../data/cleaned_crypto_closing_prices.csv', index_col='time')

In [3]:
# Load data (all features).
all_feat_df = pd.read_csv('../data/cleaned_crypto_all_features.csv', header=[0,1], skiprows=[2], index_col=0)
all_feat_df.index.name = 'time'
all_feat_df.index = pd.to_datetime(all_feat_df.index)
all_feat_df = all_feat_df[:-1].astype(float) # drop last row (missing closing prices)

In [4]:
# Collapse hierarchical column labels (for sake of ease).
first_level_labels = list(all_feat_df.columns.labels[0])
second_level_labels = list(all_feat_df.columns.labels[1])

new_labels = []

for i in range(len(first_level_labels)):
    f_str = all_feat_df.columns.levels[0][first_level_labels[i]]
    s_str = all_feat_df.columns.levels[1][second_level_labels[i]]
    
    new_str = s_str + '_' + f_str
    new_labels.append(new_str)

all_feat_df.columns = new_labels

In [5]:
# Construct DataFrame with lag-1 closing prices. 
lag_one_df = cp_df.shift(1)
lag_one_df.columns = [s + '_lag1' for s in cp_df.columns]

# Construct DataFrame with lag-2 closing prices. 
lag_two_df = cp_df.shift(2)
lag_two_df.columns = [s + '_lag2' for s in cp_df.columns]

# Construct DataFrame with lag-3 closing prices. 
lag_three_df = cp_df.shift(3)
lag_three_df.columns = [s + '_lag3' for s in cp_df.columns]

# Construct DataFrame with lag-4 closing prices. 
lag_four_df = cp_df.shift(4)
lag_four_df.columns = [s + '_lag4' for s in cp_df.columns]

# Construct DataFrame with lag-5 closing prices. 
lag_five_df = cp_df.shift(5)
lag_five_df.columns = [s + '_lag5' for s in cp_df.columns]

# Construct DataFrame with lag-6 closing prices. 
lag_six_df = cp_df.shift(6)
lag_six_df.columns = [s + '_lag6' for s in cp_df.columns]

# Construct DataFrame with lag-7 closing prices. 
lag_seven_df = cp_df.shift(7)
lag_seven_df.columns = [s + '_lag7' for s in cp_df.columns]

# Construct DataFrame with lag-8 closing prices. 
lag_eight_df = cp_df.shift(8)
lag_eight_df.columns = [s + '_lag8' for s in cp_df.columns]

# Merge all lags into one df. 
lag_df = lag_one_df.merge(lag_two_df, left_index=True, right_index=True) \
                   .merge(lag_three_df, left_index=True, right_index=True) \
                   .merge(lag_four_df, left_index=True, right_index=True) \
                   .merge(lag_five_df, left_index=True, right_index=True) \
                   .merge(lag_six_df, left_index=True, right_index=True) \
                   .merge(lag_seven_df, left_index=True, right_index=True) \
                   .merge(lag_eight_df, left_index=True, right_index=True) 

In [6]:
# Merge lag DataFrame with all_feat_df. 
merge_df = all_feat_df.merge(lag_df, left_index=True, right_index=True)
merge_df.dropna(axis=0, inplace=True) # drop rows missing lag values

# Drop all columns containing data for cryptocurrencies other than Ethereum and Monero. 
eth_cols = [s for s in list(merge_df.columns) if 'ETH' in s]
xmr_cols = [s for s in list(merge_df.columns) if 'XMR' in s]
keep_cols = eth_cols + xmr_cols
merge_df.drop([s for s in merge_df.columns if s not in keep_cols], axis=1, inplace=True)

In [7]:
merge_df.head()

Unnamed: 0_level_0,ETH_USD_close,XMR_USD_close,ETH_USD_volume,XMR_USD_volume,ETH_USD_fluctuation,XMR_USD_fluctuation,ETH_USD_relative_hl_close,XMR_USD_relative_hl_close,ETH_USD_lag1,XMR_USD_lag1,...,ETH_USD_lag4,XMR_USD_lag4,ETH_USD_lag5,XMR_USD_lag5,ETH_USD_lag6,XMR_USD_lag6,ETH_USD_lag7,XMR_USD_lag7,ETH_USD_lag8,XMR_USD_lag8
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-09-17 15:00:00,262.96,96.163333,563732.4,6521.94,0.012616,0.013505,0.606836,0.408165,262.025,96.01,...,255.56,95.6,253.1875,95.44,253.555,94.123333,246.485,92.756667,246.1225,92.036667
2017-09-17 16:00:00,260.055,95.823333,1301293.0,64634.153333,0.020059,0.015401,0.161163,0.117506,262.96,96.163333,...,258.595,95.743333,255.56,95.6,253.1875,95.44,253.555,94.123333,246.485,92.756667
2017-09-17 17:00:00,258.72,95.193333,1059513.0,36948.256667,0.01454,0.014654,0.363335,0.071225,260.055,95.823333,...,263.55,96.76,258.595,95.743333,255.56,95.6,253.1875,95.44,253.555,94.123333
2017-09-17 18:00:00,259.875,95.266667,905734.7,9920.033333,0.015788,0.011584,0.346623,0.334716,258.72,95.193333,...,262.025,96.01,263.55,96.76,258.595,95.743333,255.56,95.6,253.1875,95.44
2017-09-17 19:00:00,258.34,94.203333,1060469.0,59256.0,0.013284,0.013641,0.534556,0.062381,259.875,95.266667,...,262.96,96.163333,262.025,96.01,263.55,96.76,258.595,95.743333,255.56,95.6


In [8]:
# Prepare training/test DataFrames. 
train_end = pd.to_datetime('2017/11/30 12:00:00')
test_start = pd.to_datetime('2017/11/30 13:00:00')
target = 'ETH_USD_close'

train_df = merge_df.ix[:train_end]
test_df = merge_df.ix[test_start:] 

X_train = train_df.copy().drop(target, axis=1).values
X_test = test_df.copy().drop(target, axis=1).values
y_train = train_df[target].values
y_test = test_df[target].values

In [9]:
# Perform grid search for hyperparameters. 
def Grid_Search_CV_RFR(X_train, y_train):
    reg = RandomForestRegressor()
    param_grid = { 
            "n_estimators"      : [10,50,100,500],
            "max_features"      : ["auto", "sqrt", "log2"],
            "min_samples_leaf" : [1,5,10,20]
            }

    tss_splits = TimeSeriesSplit(n_splits=10).split(X_train)
    grid = GridSearchCV(reg, param_grid, cv=tss_splits, verbose=0)
    #grid = GridSearchCV(reg, param_grid, cv=3, verbose=0)

    grid.fit(X_train, y_train)

    return grid.best_score_ , grid.best_params_

best_score, best_params = Grid_Search_CV_RFR(X_train, y_train)

In [10]:
mf = best_params['max_features']
msl = best_params['min_samples_leaf']
ne = best_params['n_estimators']

In [11]:
# Fit RFR with best parameters from grid search.
rfr = RandomForestRegressor(n_estimators=ne, max_features=mf, min_samples_leaf=msl, random_state=10)
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=10,
           verbose=0, warm_start=False)

In [12]:
# Examine feature importances.
features = train_df.copy().drop(target, axis=1).columns
importances = rfr.feature_importances_
indices = np.argsort(importances)

trace = go.Bar(
    x=features[indices],
    y=importances[indices],
    marker = dict(color='green')
)

data=[trace]

# Edit the layout, then plot!
layout = dict(title = 'Feature Importance (RF)',
              yaxis = dict(title = 'Relative Importance'),
              xaxis = dict(autorange='reversed',
                           tickfont=dict(size=10))
              )

fig = dict(data=data, layout=layout)
iplot(fig, filename='importance')

In [13]:
# Get predictions for train/test sets.
train_pred = rfr.predict(X_train)
test_pred = rfr.predict(X_test)

In [14]:
# Create traces.
def create_trace(df, color, label):
    dates = df.index 
    prices = df[df.columns[0]].values

    trace = go.Scatter(
        x = dates,
        y = prices,
        name = label,
        line = dict(color=color)
    )
    return trace

train_act_df = pd.DataFrame(y_train).set_index(train_df.index)
train_act_df.columns = ['act_close']
train_pred_df = pd.DataFrame(train_pred).set_index(train_df.index)
train_pred_df.columns = ['pred_close']

pred_trace = create_trace(train_pred_df, 'red', 'Predicted')
act_trace = create_trace(train_act_df, 'blue', 'Actual')
data = [pred_trace, act_trace]

In [15]:
# Edit the layout, then plot!
layout = dict(title = 'Ethereum Closing Price (Training)',
              xaxis = dict(title = 'Date'),
              yaxis = dict(title = 'Closing Price'),
              )

fig = dict(data=data, layout=layout)
iplot(fig, filename='training-prices')

In [16]:
test_act_df = pd.DataFrame(y_test).set_index(test_df.index)
test_act_df.columns = ['act_close']
test_pred_df = pd.DataFrame(test_pred).set_index(test_df.index)
test_pred_df.columns = ['pred_close']

pred_trace = create_trace(test_pred_df, 'red', 'Predicted')
act_trace = create_trace(test_act_df, 'blue', 'Actual')
data = [pred_trace, act_trace]

In [17]:
# Edit the layout, then plot!
layout = dict(title = 'Ethereum Closing Price (Test)',
              xaxis = dict(title = 'Date'),
              yaxis = dict(title = 'Closing Price'),
              )

fig = dict(data=data, layout=layout)
iplot(fig, filename='test-prices')