In [1]:
import pandas as pd
import numpy as np
import datetime
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from keras import backend as K
import matplotlib.pyplot as plt
from datetime import timedelta
import sys
sys.path.insert(1, '/Users/yuesongyang/Desktop/BT4222_repo')
from backtest import *
import tensorflow as tf
tf.random.set_seed(0)

In [2]:
features = pd.read_csv("../../data/cooked_data/cooked_complete_dataset.csv")

In [3]:
features.dropna(inplace = True)

In [4]:
features['date'] = features['date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
features["date"] = pd.to_datetime(features["date"], format='%d/%m/%Y', infer_datetime_format=True)

In [34]:
features["class_y"] = features["Adj_Close_BTC-USD"].shift(1).dropna()
features["class_y"] = features.apply(lambda x : 1 if x["class_y"] < x["Adj_Close_BTC-USD"] else 0 , axis = 1)

In [35]:
features.head()

Unnamed: 0,date,Adj_Close_BTC-USD,Open_BTC-USD,High_BTC-USD,Low_BTC-USD,Volume_BTC-USD,Adj_Close_SPY,Adj_Close_GLD,Adj_Close_CHFUSD=X,Adj_Close_CNYUSD=X,Adj_Close_EURUSD=X,Adj_Close_GBPUSD=X,Adj_Close_JPYUSD=X,coindesk_sentiment,num_of_coindesk_posts,reddit_comments_sentiments,top_50_reddit_posts_sentiments,blockchain_transactions_per_block,blockchain_hash_rates,class_y
0,2020-12-14,19246.6445,19144.4922,19305.0996,19012.709,22500000000.0,361.926788,171.539993,1.125442,0.152772,1.21334,1.331824,0.009621,0.249489,12,0.15806,0.677618,2167.93103,134533588,0
1,2020-12-15,19417.0762,19246.9199,19525.0078,19079.8418,26700000000.0,366.819824,173.940002,1.12793,0.152679,1.21489,1.333084,0.009614,0.173773,18,0.10193,0.447277,2288.85714,133351912,1
2,2020-12-16,21310.5977,19418.8184,21458.9082,19298.3164,44400000000.0,367.395508,174.899994,1.129382,0.152945,1.21543,1.344447,0.009649,0.341491,11,0.127344,0.480809,2204.31469,132323572,1
3,2020-12-17,22805.1621,21308.3516,23642.6602,21234.6758,71400000000.0,369.449982,176.740006,1.129446,0.153109,1.219959,1.350293,0.009664,0.197572,10,0.135945,0.539729,2399.07752,132373209,1
4,2020-12-18,23137.9609,22806.7969,23238.6016,22399.8125,40400000000.0,367.974793,176.440002,1.130301,0.15309,1.226272,1.357018,0.009696,0.315601,2,0.135441,0.449503,2392.03185,131791042,1


**Helper functions to create lags for features and calculate RMSE**

In [36]:
def lag(data, dic):
    cols = []
    for key, value in dic.items():
        for i in range(1, value+1):
            cols.append(data[key].shift(i).rename('{}_lag{}'.format(data[key].name, i)))
    return pd.concat([data["date"],data["class_y"]] + cols, axis = 1)

## Without Sentiments

#### 1. Create feature lags

In [37]:
feature_lags = {"Adj_Close_BTC-USD" : 1,
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1}

data = lag(features, feature_lags)

#### 2. Handle train-test split

In [38]:
# data['date'] = data['date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
# data["date"] = pd.to_datetime(data["date"], format='%d/%m/%Y', infer_datetime_format=True)
data = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-04-12")]

train = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-14")]
validation = data[(data["date"] >= "2021-03-15") & (data["date"] <= "2021-03-29")]
test = data[(data["date"] >= "2021-03-30") & (data["date"] <= "2021-04-12")]

refit = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-29")]
full = data.copy(deep = True)


x_train = train.drop(["date", "class_y"], axis = 1)
y_train = train["class_y"]

x_val = validation.drop(["date", "class_y"], axis = 1)
y_val = validation["class_y"]

x_test = test.drop(["date", "class_y"], axis = 1)
y_test = test["class_y"]

x_refit = refit.drop(["date", "class_y"], axis = 1)
y_refit = refit["class_y"]

x_full = full.drop(["date", "class_y"], axis = 1)
y_full = full["class_y"]

#### 3. MinMax scale the features

In [39]:
sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_val_trans = sc.transform(x_val)
x_test_trans = sc.transform(x_test)

sc2 = MinMaxScaler(feature_range = (0, 1))
sc2.fit(x_refit)
x_refit_trans = sc2.transform(x_refit)

sc3 = MinMaxScaler(feature_range = (0, 1))
sc3.fit(x_full)
x_full_trans = sc3.transform(x_full)

#### 4. Transform the feature shapes to fit into LSTM

In [40]:
x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_val_trans = x_val_trans.reshape(x_val_trans.shape[0], x_val_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)
x_full_trans = x_full_trans.reshape(x_full_trans.shape[0], x_full_trans.shape[1], 1)

#### 5. Define a Grid Search function

In [41]:
def LSTM_GS(layer_spe, time_step, e):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1, activation = "sigmoid"))
    regressor.compile(optimizer = 'Adam', loss = "binary_crossentropy", metrics=['accuracy'])
    regressor.fit(x_train_trans, y_train, epochs = e, verbose = 0)
    
    yhat = regressor.predict(x_val_trans)
    yhat = [1 if x[0] > 0.5 else 0 for x in yhat]
    return accuracy_score(y_train, [1 if x[0] > 0.5 else 0 for x in regressor.predict(x_train_trans)]), accuracy_score(y_val, yhat)

#### 6. Grid Search on a few hyperparameters combinations

In [42]:
res = []
# 1 layer
for unit in (4,8,16):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit],time_step, e), LSTM_GS([unit], time_step, e)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit, unit//2],time_step, e),LSTM_GS([unit, unit//2], time_step, e)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit, unit//2, unit//4],time_step, e),LSTM_GS([unit, unit//2, unit//4], time_step, e)))













In [43]:
# best combi
sorted(res, key = lambda x : x[1][1], reverse = True)[0]

(([16], 1, 500), (0.6438356164383562, 0.6))

In [44]:
layer_config = sorted(res, key = lambda x : x[1][1], reverse = True)[0][0][0]
time_config = sorted(res, key = lambda x : x[1][1], reverse = True)[0][0][1]
e_config = sorted(res, key = lambda x : x[1][1], reverse = True)[0][0][2]

In [45]:
print(layer_config, time_config, e_config)

[16] 1 500


#### 7. Define a function to retrain the model with selected hyperparameters and all data avilable

In [46]:
def LSTM_pipe(layer_spe, time_step, e, x, y):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1, activation = "sigmoid"))
    regressor.compile(optimizer = 'Adam', loss = "binary_crossentropy", metrics=['accuracy'])
    regressor.fit(x, y, epochs = e, verbose = 0)
    return regressor

In [47]:
model = LSTM_pipe(layer_config, time_config, e_config, x_refit_trans, y_refit)



In [48]:
accuracy_score(y_test, [1 if x[0] > 0.5 else 0 for x in model.predict(x_test_trans)])



0.5714285714285714

#### 8. Backtesting

In [49]:
# predicted value
model = LSTM_pipe(layer_config, time_config, e_config, x_full_trans, y_full)
yhat = model.predict(x_full_trans, verbose=0)
yhat = [1 if x[0] > 0.5 else 0 for x in yhat]



In [50]:
for b in yhat:
    print(b)

1
1
1
1
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
0
0
0
0
0
0


## With Sentiments

In [51]:
feature_lags = {"Adj_Close_BTC-USD" : 1, 
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1,
                "coindesk_sentiment" : 1,
                "reddit_comments_sentiments" : 1,
                "top_50_reddit_posts_sentiments" : 1}

data = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-04-12")]

train = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-14")]
validation = data[(data["date"] >= "2021-03-15") & (data["date"] <= "2021-03-29")]
test = data[(data["date"] >= "2021-03-30") & (data["date"] <= "2021-04-12")]

refit = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-29")]
full = data.copy(deep = True)


x_train = train.drop(["date", "class_y"], axis = 1)
y_train = train["class_y"]

x_val = validation.drop(["date", "class_y"], axis = 1)
y_val = validation["class_y"]

x_test = test.drop(["date", "class_y"], axis = 1)
y_test = test["class_y"]

x_refit = refit.drop(["date", "class_y"], axis = 1)
y_refit = refit["class_y"]

x_full = full.drop(["date", "class_y"], axis = 1)
y_full = full["class_y"]

sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_val_trans = sc.transform(x_val)
x_test_trans = sc.transform(x_test)

sc2 = MinMaxScaler(feature_range = (0, 1))
sc2.fit(x_refit)
x_refit_trans = sc2.transform(x_refit)

sc3 = MinMaxScaler(feature_range = (0, 1))
sc3.fit(x_full)
x_full_trans = sc3.transform(x_full)

x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_val_trans = x_val_trans.reshape(x_val_trans.shape[0], x_val_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)
x_full_trans = x_full_trans.reshape(x_full_trans.shape[0], x_full_trans.shape[1], 1)

In [52]:
res2 = []
# 1 layer
for unit in (4,8,16):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit],time_step, e), LSTM_GS([unit], time_step, e)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit, unit//2],time_step, e),LSTM_GS([unit, unit//2], time_step, e)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit, unit//2, unit//4],time_step, e),LSTM_GS([unit, unit//2, unit//4], time_step, e)))















In [53]:
# best combi
sorted(res2, key = lambda x : x[1][1], reverse = True)[0]

(([16, 8], 2, 500), (0.6301369863013698, 0.6666666666666666))

In [54]:
layer_config2 = sorted(res2, key = lambda x : x[1][1], reverse = True)[0][0][0]
time_config2 = sorted(res2, key = lambda x : x[1][1], reverse = True)[0][0][1]
e_config2 = sorted(res2, key = lambda x : x[1][1], reverse = True)[0][0][2]
print(layer_config2, time_config2, e_config2)

[16, 8] 2 500


In [55]:
model2 = LSTM_pipe(layer_config2, time_config2, e_config2, x_refit_trans, y_refit)



In [56]:
# predicted value
yhat2 = model2.predict(x_test_trans, verbose=0)
accuracy_score(y_test, [1 if x[0] > 0.5 else 0 for x in yhat2])



0.6428571428571429

In [57]:
model2 = LSTM_pipe(layer_config2, time_config2, e_config2, x_full_trans, y_full)
yhat2 = model2.predict(x_full_trans, verbose=0)

yhat2 = [1 if x[0] > 0.5 else 0 for x in yhat2]



In [58]:
for b in yhat2:
    print(b)

1
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
1
1
1
1
1
0
0
1
1
0
0
0
0
0
0
