In [61]:
import pandas as pd
import numpy as np
import datetime
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras import backend as K
import matplotlib.pyplot as plt
import sys
sys.path.insert(1, '/Users/yuesongyang/Desktop/BT4222_repo')
from backtest import *
import tensorflow as tf
tf.random.set_seed(0)

In [62]:
features = pd.read_csv("../../data/cooked_data/cooked_complete_dataset.csv")

In [63]:
features.dropna(inplace = True)

In [64]:
features['date'] = features['date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
features["date"] = pd.to_datetime(features["date"], format='%d/%m/%Y', infer_datetime_format=True)

**Helper functions to create lags for features and calculate RMSE**

In [66]:
def lag(data, dic):
    cols = []
    for key, value in dic.items():
        for i in range(1, value+1):
            cols.append(data[key].shift(i).rename('{}_lag{}'.format(data[key].name, i)))
    return pd.concat([data["date"],data["Adj_Close_BTC-USD"]] + cols, axis = 1)

In [67]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

## Without Sentiments

#### 1. Create feature lags

In [68]:
feature_lags = {"Adj_Close_BTC-USD" : 1,
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1}

data = lag(features, feature_lags)

#### 2. Handle train-test split

In [69]:
data = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-04-12")]

train = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-14")]
validation = data[(data["date"] >= "2021-03-15") & (data["date"] <= "2021-03-29")]
test = data[(data["date"] >= "2021-03-30") & (data["date"] <= "2021-04-12")]

refit = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-29")]
full = data.copy(deep = True)


x_train = train.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_train = train["Adj_Close_BTC-USD"]

x_val = validation.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_val = validation["Adj_Close_BTC-USD"]

x_test = test.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_test = test["Adj_Close_BTC-USD"]

x_refit = refit.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_refit = refit["Adj_Close_BTC-USD"]

x_full = full.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_full = full["Adj_Close_BTC-USD"]

#### 3. MinMax scale the features

In [71]:
sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_val_trans = sc.transform(x_val)
x_test_trans = sc.transform(x_test)

sc2 = MinMaxScaler(feature_range = (0, 1))
sc2.fit(x_refit)
x_refit_trans = sc2.transform(x_refit)

sc3 = MinMaxScaler(feature_range = (0, 1))
sc3.fit(x_full)
x_full_trans = sc3.transform(x_full)

#### 4. Transform the feature shapes to fit into LSTM

In [72]:
x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_val_trans = x_val_trans.reshape(x_val_trans.shape[0], x_val_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)
x_full_trans = x_full_trans.reshape(x_full_trans.shape[0], x_full_trans.shape[1], 1)

#### 5. Define a Grid Search function

In [73]:
def LSTM_GS(layer_spe, time_step, epochs_):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1))
    regressor.compile(optimizer = 'Adam', loss = root_mean_squared_error)
    regressor.fit(x_train_trans, y_train, epochs = epochs_, verbose = 0)
    
    train_error = mean_squared_error(y_train, regressor.predict(x_train_trans), squared = False)
    val_error = mean_squared_error(y_val, regressor.predict(x_val_trans), squared = False)
    
    return train_error, val_error

#### 6. Grid Search on a few hyperparameters combinations

In [74]:
res = []
# 1 layer
for unit in (4,8,16):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit],time_step, e), LSTM_GS([unit], time_step, e)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit, unit//2],time_step, e),LSTM_GS([unit, unit//2], time_step, e)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit, unit//2, unit//4],time_step, e),LSTM_GS([unit, unit//2, unit//4], time_step, e)))













In [75]:
# best combi
sorted(res, key = lambda x : x[1][1])[0]

(([16], 2, 500), (3249.8426634525367, 1855.3524141398607))

In [76]:
layer_config = sorted(res, key = lambda x : x[1][1])[0][0][0]
time_config = sorted(res, key = lambda x : x[1][1])[0][0][1]
e_config = sorted(res, key = lambda x : x[1][1])[0][0][2]

In [77]:
print(layer_config, time_config, e_config)

[16] 2 500


#### 7. Define a function to retrain the model with selected hyperparameters and generate test rmse

In [78]:
def LSTM_pipe(layer_spe, time_step, e, x, y):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1))
    regressor.compile(optimizer = 'Adam', loss = root_mean_squared_error)
    regressor.fit(x, y, epochs = e, verbose = 0)
    return regressor

In [79]:
model = LSTM_pipe(layer_config, time_config, e_config, x_refit_trans, y_refit)



In [80]:
mean_squared_error(y_test, model.predict(x_test_trans), squared = False)



3527.259456741796

#### 8. Backtesting

In [81]:
# predicted value
model = LSTM_pipe(layer_config, time_config, e_config, x_full_trans, y_full)
yhat = model.predict(x_full_trans, verbose=0)



In [82]:
for b in [x[0] for x in yhat]:
    print(b)

31882.857
31817.668
33739.656
34207.027
33240.855
35401.48
37687.254
41023.254
41624.766
41196.37
39717.83
36803.535
35735.508
38515.004
39550.17
36692.62
36209.613
35922.613
36430.48
37009.598
38465.43
35037.496
36165.344
35481.23
35641.355
36097.426
35973.688
32005.516
33600.61
33640.57
33607.215
32782.527
34807.363
37174.8
38662.26
38794.285
40256.95
41134.402
40848.676
47934.516
48201.906
47084.543
49595.645
49708.96
49366.13
50751.008
50058.598
50709.684
52941.883
51939.168
55703.24
55895.883
57216.15
53725.727
49116.324
50890.387
45530.902
43732.957
43607.168
42743.438
48949.254
47021.4
47179.387
43851.348
45885.445
45871.773
47839.547
47903.863
52135.266
54181.496
56941.27
56769.98
60431.293
58601.566
56150.78
56868.83
59404.082
56716.184
56945.777
56913.996
56182.55
54240.715
53491.3
51017.74
50556.375
55436.062
56199.668
56177.707
57481.59
57821.902
58607.53
60325.64
60655.99
58982.324
60063.234
62019.098
61468.355
59615.496
62619.215
63320.99
64798.05
65193.027


In [83]:
mean_squared_error(y_full, model.predict(x_full_trans), squared = False)

2789.3791803173813

In [84]:
# weighted sentiments
sentiments = features[features["date"].isin(data.date)]
sentiments = (sentiments["coindesk_sentiment"] + sentiments["reddit_comments_sentiments"] + sentiments["top_50_reddit_posts_sentiments"]) / 3
sentiments = ["positive" if x > 0.2 else "negative" for x in sentiments]

In [85]:
for s in sentiments:
    print(s)

positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
negative
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
positive
negative
negative
positive
negative
positive
positive
negative
negative
positive
positive
negative
positive


## With Sentiments

In [86]:
feature_lags = {"Adj_Close_BTC-USD" : 1, 
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1,
                "coindesk_sentiment" : 1,
                "reddit_comments_sentiments" : 1,
                "top_50_reddit_posts_sentiments" : 1}

data = lag(features, feature_lags)

data = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-04-12")]

train = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-14")]
validation = data[(data["date"] >= "2021-03-15") & (data["date"] <= "2021-03-29")]
test = data[(data["date"] >= "2021-03-30") & (data["date"] <= "2021-04-12")]

refit = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-29")]
full = data.copy(deep = True)


x_train = train.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_train = train["Adj_Close_BTC-USD"]

x_val = validation.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_val = validation["Adj_Close_BTC-USD"]

x_test = test.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_test = test["Adj_Close_BTC-USD"]

x_refit = refit.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_refit = refit["Adj_Close_BTC-USD"]

x_full = full.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_full = full["Adj_Close_BTC-USD"]

sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_val_trans = sc.transform(x_val)
x_test_trans = sc.transform(x_test)

sc2 = MinMaxScaler(feature_range = (0, 1))
sc2.fit(x_refit)
x_refit_trans = sc2.transform(x_refit)

sc3 = MinMaxScaler(feature_range = (0, 1))
sc3.fit(x_full)
x_full_trans = sc3.transform(x_full)

x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_val_trans = x_val_trans.reshape(x_val_trans.shape[0], x_val_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)
x_full_trans = x_full_trans.reshape(x_full_trans.shape[0], x_full_trans.shape[1], 1)

In [87]:
res2 = []
# 1 layer
for unit in (4,8,16):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit],time_step, e), LSTM_GS([unit], time_step, e)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit, unit//2],time_step, e),LSTM_GS([unit, unit//2], time_step, e)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit, unit//2, unit//4],time_step, e),LSTM_GS([unit, unit//2, unit//4], time_step, e)))













In [88]:
sorted(res2, key = lambda x : x[1][1])[0]

(([32, 16], 3, 500), (3428.0410363587, 2023.7685232796198))

In [89]:
layer_config2 = sorted(res2, key = lambda x : x[1][1])[0][0][0]
time_config2 = sorted(res2, key = lambda x : x[1][1])[0][0][1]
e_config2 = sorted(res2, key = lambda x : x[1][1])[0][0][2]

In [90]:
print(layer_config2, time_config2, e_config2 )

[32, 16] 3 500


In [99]:
model2 = LSTM_pipe(layer_config2, time_config2, e_config2, x_refit_trans, y_refit)



In [100]:
mean_squared_error(y_test, model2.predict(x_test_trans), squared = False)



7453.998523926727

In [101]:
# predicted value
# predicted value
model2 = LSTM_pipe(layer_config2, time_config2, e_config2, x_full_trans, y_full)
yhat2 = model2.predict(x_full_trans, verbose=0)



In [102]:
for b in [x[0] for x in yhat2]:
    print(b)

30772.92
31171.412
32062.006
32406.158
31999.209
33372.973
34583.043
36410.723
37426.477
37064.574
35808.16
34012.28
33052.402
35030.05
36259.254
34440.395
34024.387
33769.723
34583.016
34461.395
34474.582
32247.99
32596.904
32106.24
32206.9
32551.438
32608.326
30725.312
31563.92
32406.885
32379.334
31649.18
32582.81
34359.63
35742.496
35904.902
37153.09
37868.875
37634.832
42674.375
42808.934
41402.668
43459.113
43557.918
43278.76
44365.25
43833.598
44548.156
46829.625
46392.445
49255.113
49419.56
50537.633
47552.21
43412.824
44818.31
41818.727
41232.637
41113.902
40353.32
44712.984
43751.324
44811.73
43004.27
44266.844
44248.555
46029.66
46784.84
49667.22
50566.98
52392.098
51846.707
54914.0
53357.13
51476.043
52106.973
53954.35
51850.21
52470.973
52431.223
51777.69
50142.723
49509.99
48261.62
47792.63
51798.59
52421.824
52395.66
53726.418
54480.87
54976.367
56029.066
56073.45
54680.746
55560.547
57086.45
56043.98
54510.53
56721.93
57385.81
58497.2
58791.715


In [103]:
len(x_full_trans)

102

In [104]:
mean_squared_error(y_full, model2.predict(x_full_trans), squared = False)

4336.992847278056