In [177]:
import pandas as pd
import numpy as np
import datetime
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras import backend as K
import matplotlib.pyplot as plt
import sys
sys.path.insert(1, '/Users/yuesongyang/Desktop/BT4222_repo')
from backtest import *
import tensorflow as tf
tf.random.set_seed(0)

In [178]:
features = pd.read_csv("../../data/cooked_data/cooked_complete_dataset.csv")

In [179]:
features.dropna(inplace = True)

In [180]:
features['date'] = features['date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
features["date"] = pd.to_datetime(features["date"], format='%d/%m/%Y', infer_datetime_format=True)

In [181]:
features.head()

Unnamed: 0,date,Adj_Close_BTC-USD,Open_BTC-USD,High_BTC-USD,Low_BTC-USD,Volume_BTC-USD,Adj_Close_SPY,Adj_Close_GLD,Adj_Close_CHFUSD=X,Adj_Close_CNYUSD=X,Adj_Close_EURUSD=X,Adj_Close_GBPUSD=X,Adj_Close_JPYUSD=X,coindesk_sentiment,num_of_coindesk_posts,reddit_comments_sentiments,top_50_reddit_posts_sentiments,blockchain_transactions_per_block,blockchain_hash_rates
0,2020-12-14,19246.64453,19144.49219,19305.09961,19012.70898,22474000000.0,361.926788,171.539993,1.125442,0.152772,1.21334,1.331824,0.009621,0.249489,12,0.15806,0.677618,2167.931034,134574371.4
1,2020-12-15,19417.07617,19246.91992,19525.00781,19079.8418,26741980000.0,366.819824,173.940002,1.12793,0.152679,1.21489,1.333084,0.009614,0.173773,18,0.10193,0.447277,2288.857143,129933875.8
2,2020-12-16,21310.59766,19418.81836,21458.9082,19298.31641,44409010000.0,367.395508,174.899994,1.129382,0.152945,1.21543,1.344447,0.009649,0.341491,11,0.127344,0.480809,2204.314685,132718173.2
3,2020-12-17,22805.16211,21308.35156,23642.66016,21234.67578,71378610000.0,369.449982,176.740005,1.129446,0.153109,1.219959,1.350293,0.009664,0.197572,10,0.135945,0.539729,2399.077519,119724785.6
4,2020-12-18,23137.96094,22806.79688,23238.60156,22399.8125,40387900000.0,367.974792,176.440002,1.130301,0.15309,1.226272,1.357018,0.009696,0.315601,2,0.135441,0.449503,2392.031847,145711560.8


**Helper functions to create lags for features and calculate RMSE**

In [182]:
def lag(data, dic):
    cols = []
    for key, value in dic.items():
        for i in range(1, value+1):
            cols.append(data[key].shift(i).rename('{}_lag{}'.format(data[key].name, i)))
    return pd.concat([data["date"],data["Adj_Close_BTC-USD"]] + cols, axis = 1)

In [183]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

## Without Sentiments

#### 1. Create feature lags

In [184]:
feature_lags = {"Adj_Close_BTC-USD" : 1,
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1}

data = lag(features, feature_lags)

#### 2. Handle train-test split

In [185]:
# data['date'] = data['date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
# data["date"] = pd.to_datetime(data["date"], format='%d/%m/%Y', infer_datetime_format=True)
data = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-04-12")]

train = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-14")]
validation = data[(data["date"] >= "2021-03-15") & (data["date"] <= "2021-03-29")]
test = data[(data["date"] >= "2021-03-30") & (data["date"] <= "2021-04-12")]

refit = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-29")]
full = data.copy(deep = True)


x_train = train.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_train = train["Adj_Close_BTC-USD"]

x_val = validation.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_val = validation["Adj_Close_BTC-USD"]

x_test = test.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_test = test["Adj_Close_BTC-USD"]

x_refit = refit.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_refit = refit["Adj_Close_BTC-USD"]

x_full = full.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_full = full["Adj_Close_BTC-USD"]

#### 3. MinMax scale the features

In [186]:
sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_val_trans = sc.transform(x_val)
x_test_trans = sc.transform(x_test)

sc2 = MinMaxScaler(feature_range = (0, 1))
sc2.fit(x_refit)
x_refit_trans = sc2.transform(x_refit)

sc3 = MinMaxScaler(feature_range = (0, 1))
sc3.fit(x_full)
x_full_trans = sc3.transform(x_full)

#### 4. Transform the feature shapes to fit into LSTM

In [187]:
x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_val_trans = x_val_trans.reshape(x_val_trans.shape[0], x_val_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)
x_full_trans = x_full_trans.reshape(x_full_trans.shape[0], x_full_trans.shape[1], 1)

#### 5. Define a Grid Search function

In [188]:
def LSTM_GS(layer_spe, time_step, epochs_):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1))
    regressor.compile(optimizer = 'Adam', loss = root_mean_squared_error)
    regressor.fit(x_train_trans, y_train, epochs = epochs_, verbose = 0)
    
    train_error = mean_squared_error(y_train, regressor.predict(x_train_trans), squared = False)
    val_error = mean_squared_error(y_val, regressor.predict(x_val_trans), squared = False)
    
    return train_error, val_error

#### 6. Grid Search on a few hyperparameters combinations

In [189]:
res = []
# 1 layer
for unit in (4,8,16):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit],time_step, e), LSTM_GS([unit], time_step, e)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit, unit//2],time_step, e),LSTM_GS([unit, unit//2], time_step, e)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res.append((([unit, unit//2, unit//4],time_step, e),LSTM_GS([unit, unit//2, unit//4], time_step, e)))















In [234]:
# best combi
sorted(res, key = lambda x : x[1][1])[0]

(([16], 2, 500), (3267.5697919222184, 1928.437725773618))

In [235]:
layer_config = sorted(res, key = lambda x : x[1][1])[0][0][0]
time_config = sorted(res, key = lambda x : x[1][1])[0][0][1]
e_config = sorted(res, key = lambda x : x[1][1])[0][0][2]

In [236]:
print(layer_config, time_config, e_config)

[16] 2 500


#### 7. Define a function to retrain the model with selected hyperparameters and generate test rmse

In [193]:
def LSTM_pipe(layer_spe, time_step, e, x, y):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1))
    regressor.compile(optimizer = 'Adam', loss = root_mean_squared_error)
    regressor.fit(x, y, epochs = e, verbose = 0)
    return regressor

In [245]:
model = LSTM_pipe(layer_config, time_config, e_config, x_refit_trans, y_refit)



In [246]:
mean_squared_error(y_test, model.predict(x_test_trans), squared = False)



3107.08919370734

#### 8. Backtesting

In [243]:
# predicted value
model = LSTM_pipe(layer_config2, time_config2, e_config2, x_full_trans, y_full)
yhat = model.predict(x_full_trans, verbose=0)



In [244]:
for b in [x[0] for x in yhat]:
    print(b)

31885.945
31312.172
34539.594
35564.453
35160.723
36569.047
39244.598
42926.21
43442.71
41569.64
40996.203
40289.375
36646.78
39258.55
40067.664
37452.746
36438.445
35865.23
36140.457
37008.188
38815.03
35746.05
37097.867
34961.535
35127.54
36072.383
35951.33
32545.418
34729.465
37163.89
34449.105
32997.914
35163.92
37288.867
38660.45
38906.855
39718.074
41302.95
40686.227
50097.703
49887.703
48541.945
50840.344
50539.457
49800.414
51327.176
50930.883
51669.355
54237.87
51489.21
56379.953
56878.38
57296.406
56259.25
52063.04
51089.656
45497.047
61740.977
43039.652
42544.31
48464.094
46184.25
47073.375
43731.754
45079.676
44234.03
46836.89
47335.6
51547.53
54067.96
56738.945
56511.242
60854.918
57767.918
56238.133
56639.996
59334.77
56675.93
56487.01
56526.523
55817.457
53684.832
53192.188
51378.414
50518.402
54589.984
54827.117
54822.53
56917.145
57184.8
58555.812
59814.008
60035.59
58257.875
58867.348
61149.1
60994.277
59571.07
61217.66
61380.562
63734.016
63379.38


In [241]:
mean_squared_error(y_full, model.predict(x_full_trans), squared = False)

3215.81206583314

In [206]:
# weighted sentiments
sentiments = features[features["date"].isin(data.date)]
sentiments = (sentiments["coindesk_sentiment"] + sentiments["reddit_comments_sentiments"] + sentiments["top_50_reddit_posts_sentiments"]) / 3
sentiments = ["positive" if x > 0.2 else "negative" for x in sentiments]

In [207]:
for s in sentiments:
    print(s)

positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
negative
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
positive
negative
negative
positive
negative
positive
positive
negative
negative
positive
positive
negative
positive


## With Sentiments

In [208]:
feature_lags = {"Adj_Close_BTC-USD" : 1, 
                "Volume_BTC-USD" : 1, 
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1,
                "coindesk_sentiment" : 1,
                "reddit_comments_sentiments" : 1,
                "top_50_reddit_posts_sentiments" : 1}

data = lag(features, feature_lags)

data = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-04-12")]

train = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-14")]
validation = data[(data["date"] >= "2021-03-15") & (data["date"] <= "2021-03-29")]
test = data[(data["date"] >= "2021-03-30") & (data["date"] <= "2021-04-12")]

refit = data[(data["date"] >= "2021-01-01") & (data["date"] <= "2021-03-29")]
full = data.copy(deep = True)


x_train = train.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_train = train["Adj_Close_BTC-USD"]

x_val = validation.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_val = validation["Adj_Close_BTC-USD"]

x_test = test.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_test = test["Adj_Close_BTC-USD"]

x_refit = refit.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_refit = refit["Adj_Close_BTC-USD"]

x_full = full.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_full = full["Adj_Close_BTC-USD"]

sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_val_trans = sc.transform(x_val)
x_test_trans = sc.transform(x_test)

sc2 = MinMaxScaler(feature_range = (0, 1))
sc2.fit(x_refit)
x_refit_trans = sc2.transform(x_refit)

sc3 = MinMaxScaler(feature_range = (0, 1))
sc3.fit(x_full)
x_full_trans = sc3.transform(x_full)

x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_val_trans = x_val_trans.reshape(x_val_trans.shape[0], x_val_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)
x_full_trans = x_full_trans.reshape(x_full_trans.shape[0], x_full_trans.shape[1], 1)

In [209]:
res2 = []
# 1 layer
for unit in (4,8,16):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit],time_step, e), LSTM_GS([unit], time_step, e)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit, unit//2],time_step, e),LSTM_GS([unit, unit//2], time_step, e)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3):
        for e in (200, 500):
            res2.append((([unit, unit//2, unit//4],time_step, e),LSTM_GS([unit, unit//2, unit//4], time_step, e)))













In [226]:
sorted(res2, key = lambda x : x[1][1])[0]

(([32, 16], 3, 500), (3532.507662584802, 1740.1979570315914))

In [227]:
layer_config2 = sorted(res2, key = lambda x : x[1][1])[0][0][0]
time_config2 = sorted(res2, key = lambda x : x[1][1])[0][0][1]
e_config2 = sorted(res2, key = lambda x : x[1][1])[0][0][2]

In [228]:
print(layer_config2, time_config2, e_config2 )

[32, 16] 3 500


In [229]:
model2 = LSTM_pipe(layer_config2, time_config2, e_config2, x_refit_trans, y_refit)



In [230]:
mean_squared_error(y_test, model2.predict(x_test_trans), squared = False)



5868.028609479828

In [231]:
# predicted value
yhat2 = model2.predict(x_full_trans, verbose=0)

In [232]:
for b in [x[0] for x in yhat2]:
    print(b)

32959.65
33303.02
33965.36
34447.953
34174.16
35432.906
36841.438
38534.695
39694.793
39133.883
37855.516
36648.125
34921.69
37059.203
38479.445
36855.223
36330.043
36016.258
36844.21
36422.727
35928.812
33500.09
34111.06
33329.805
33463.44
33615.82
33825.617
33106.07
33597.5
35650.125
35019.207
34139.92
34516.72
36122.164
37596.945
37390.34
38356.484
39267.457
38962.93
44502.805
44631.25
43044.445
45415.594
45238.03
44903.203
46162.152
45586.137
46562.098
49319.83
48975.28
52530.07
52743.438
53997.93
50934.72
46266.21
46955.68
44859.03
55763.83
44358.727
43536.0
47512.617
46606.625
48541.184
47072.688
47713.32
47617.85
49708.793
50909.99
53369.617
54149.7
55802.777
55073.438
58959.035
56917.68
54256.04
54989.996
56932.816
55473.004
56148.363
56121.19
55373.17
52833.08
52793.492
51544.46
50704.688
54326.348
55043.805
55024.523
56758.96
57917.15
58172.676
58767.883
58853.39
57208.977
58247.32
59525.984
58300.883
56399.695
58620.34
59025.113
60419.137
60791.773


In [233]:
mean_squared_error(y_full, model2.predict(x_full_trans), squared = False)

2663.2824483514014