In [2]:
import pandas as pd
import numpy as np
import datetime
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras import backend as K
import matplotlib.pyplot as plt
import sys
sys.path.insert(1, '/Users/yuesongyang/Desktop/BT4222_repo')
from backtest import *
import tensorflow as tf
tf.random.set_seed(0)

In [11]:
features = pd.read_csv("../../data/cooked_data/cooked_complete_dataset.csv")

In [12]:
features.dropna(inplace = True)

In [13]:
features['date'] = features['date'].apply(lambda x: datetime.datetime.strptime(x, "%d/%m/%y"))
features["date"] = pd.to_datetime(features["date"], format='%d/%m/%Y', infer_datetime_format=True)

In [14]:
features.head()

Unnamed: 0,date,Adj_Close_BTC-USD,Open_BTC-USD,High_BTC-USD,Low_BTC-USD,Volume_BTC-USD,Adj_Close_SPY,Adj_Close_GLD,Adj_Close_CHFUSD=X,Adj_Close_CNYUSD=X,Adj_Close_EURUSD=X,Adj_Close_GBPUSD=X,Adj_Close_JPYUSD=X,coindesk_sentiment,num_of_coindesk_posts,reddit_comments_sentiments,top_50_reddit_posts_sentiments,blockchain_transactions_per_block,blockchain_hash_rates
0,2020-12-14,19246.64453,19144.49219,19305.09961,19012.70898,22474000000.0,361.926788,171.539993,1.125442,0.152772,1.21334,1.331824,0.009621,0.249489,12,0.188275,0.297238,2167.931034,134533587.6
1,2020-12-15,19417.07617,19246.91992,19525.00781,19079.8418,26741980000.0,366.819824,173.940002,1.12793,0.152679,1.21489,1.333084,0.009614,0.173773,18,0.144389,0.399427,2288.857143,133351912.2
2,2020-12-16,21310.59766,19418.81836,21458.9082,19298.31641,44409010000.0,367.395508,174.899994,1.129382,0.152945,1.21543,1.344447,0.009649,0.341491,11,0.137256,0.489673,2204.314685,132323572.3
3,2020-12-17,22805.16211,21308.35156,23642.66016,21234.67578,71378610000.0,369.449982,176.740005,1.129446,0.153109,1.219959,1.350293,0.009664,0.197572,10,0.156723,0.63603,2399.077519,132373208.7
4,2020-12-18,23137.96094,22806.79688,23238.60156,22399.8125,40387900000.0,367.974792,176.440002,1.130301,0.15309,1.226272,1.357018,0.009696,0.315601,2,0.166419,0.107093,2392.031847,131791042.0


**Helper functions to create lags for features and calculate RMSE**

In [16]:
def lag(data, dic):
    cols = []
    for key, value in dic.items():
        for i in range(1, value+1):
            cols.append(data[key].shift(i).rename('{}_lag{}'.format(data[key].name, i)))
    return pd.concat([data["date"],data["Adj_Close_BTC-USD"]] + cols, axis = 1)

In [17]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

## Without Sentiments

#### 1. Create feature lags

In [18]:
feature_lags = {"Adj_Close_BTC-USD" : 2, 
                "Volume_BTC-USD" : 1, 
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_CNYUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1}

data = lag(features, feature_lags)

#### 2. Handle train-test split

In [19]:
# data['date'] = data['date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
# data["date"] = pd.to_datetime(data["date"], format='%d/%m/%Y', infer_datetime_format=True)
data = data[data["date"] >= "1/1/2021"]

train = data[data["date"] <= "2021-03-10"]
test = data[(data["date"] > "2021-03-10") & (data["date"] <= "2021-04-5")]

x_train = train.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_train = train["Adj_Close_BTC-USD"]

x_test = test.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_test = test["Adj_Close_BTC-USD"]

#### 3. MinMax scale the features

In [20]:
sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_test_trans = sc.transform(x_test)

#### 4. Transform the feature shapes to fit into LSTM

In [21]:
x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)

#### 5. Define a Grid Search function

In [22]:
def LSTM_GS(layer_spe, time_step):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1))
    regressor.compile(optimizer = 'Adam', loss = root_mean_squared_error)
    regressor.fit(x_train_trans, y_train, epochs = 200, verbose = 0)
    
    yhat = regressor.predict(x_test_trans)
    return mean_squared_error(y_train, regressor.predict(x_train_trans), squared = False), mean_squared_error(y_test, yhat, squared = False)

#### 6. Grid Search on a few hyperparameters combinations

In [23]:
res = []
# 1 layer
for unit in (4,8,16,32):
    for time_step in (1,2,3,4,5):
        res.append((([unit],time_step),LSTM_GS([unit], time_step)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3,4,5):
        res.append((([unit, unit//2],time_step),LSTM_GS([unit, unit//2], time_step)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3,4,5):
        res.append((([unit, unit//2, unit//4],time_step),LSTM_GS([unit, unit//2, unit//4], time_step)))













In [25]:
# best combi
sorted(res, key = lambda x : x[1][1])[0]

(([32, 16], 5), (2752.206904668088, 1924.0251686109223))

In [26]:
layer_config = sorted(res, key = lambda x : x[1][1])[0][0][0]
time_config = sorted(res, key = lambda x : x[1][1])[0][0][1]

#### 7. Define a function to retrain the model with selected hyperparameters and all data avilable

In [27]:
x_refit = data.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_refit = data["Adj_Close_BTC-USD"]
x_refit_trans = sc.transform(x_refit)
x_refit_trans = x_refit_trans.reshape(x_refit_trans.shape[0], x_refit_trans.shape[1], 1)

In [28]:
def LSTM_pipe(layer_spe, time_step):
    regressor = Sequential()
    count = 0
    for unit in layer_spe:
        count += 1 
        if len(layer_spe) == 1:
            regressor.add(LSTM(unit, activation = "relu", input_shape = (time_step, 1))) # batch size, time steps, data dimension
            regressor.add(Dropout(0.3))
        else:
            if count == 1:
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True, input_shape = (time_step, 1))) # batch size, time steps, data dimension
                regressor.add(Dropout(0.3))
            elif count < len(layer_spe): 
                regressor.add(LSTM(unit, activation = "relu", return_sequences=True))
                regressor.add(Dropout(0.3))
            else:
                regressor.add(LSTM(unit, activation = "relu"))
                regressor.add(Dropout(0.3))
            
    regressor.add(Dense(units = 1))
    regressor.compile(optimizer = 'Adam', loss = root_mean_squared_error)
    regressor.fit(x_refit_trans, y_refit, epochs = 200, verbose = 0)
    return regressor

In [29]:
model = LSTM_pipe(layer_config, time_config)



#### 8. Backtesting

In [30]:
# predicted value
yhat = model.predict(x_refit_trans, verbose=0)



In [31]:
for b in [x[0] for x in yhat]:
    print(b)

32554.205
32288.773
34063.812
35417.457
35124.566
36251.734
38578.797
41899.61
43224.72
42898.797
42136.19
40463.785
37771.152
38948.29
40629.21
39518.25
38190.434
37682.96
37757.46
38534.805
39547.9
37422.656
36650.49
36369.465
36168.35
36675.938
36618.062
34302.06
34855.027
36107.54
35922.37
35155.52
36286.38
37777.465
39377.53
40348.977
40770.56
42040.582
42140.047
47449.668
50424.07
49774.707
50613.42
51840.504
51282.027
52074.195
52416.66
52555.758
54698.84
54501.113
57077.75
59267.438
59855.53
59091.45
54775.406
52548.03
49014.51
54143.875
46211.406
45757.836
49406.93
49788.133
49456.105
48036.055
48470.656
48266.82
49786.145
50812.57
54070.29
56969.395
59463.203
60247.71
62949.477
62882.5
60768.32
59504.758
61751.41
60671.938
60114.566
60358.91
59865.1
58270.273
56376.92
54974.438
53728.793
56221.727
58070.957
58454.72
59661.035
60566.332
61972.95
63185.54
63382.21
62339.438
62004.992


In [39]:
# weighted sentiments
sentiments = features[features["date"].isin(data.date)]
sentiments = (sentiments["coindesk_sentiment"] + sentiments["reddit_comments_sentiments"] + sentiments["top_50_reddit_posts_sentiments"]) / 3
sentiments = ["positive" if x > 0.2 else "negative" for x in sentiments]

In [40]:
for s in sentiments:
    print(s)

positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
negative
positive
positive
negative
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
negative
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
positive
negative
negative
negative
negative
positive
negative
negative
positive
negative
positive


## With Sentiments

In [41]:
feature_lags = {"Adj_Close_BTC-USD" : 2, 
                "Volume_BTC-USD" : 1, 
                "Adj_Close_SPY" : 1,
                "Adj_Close_GLD" : 1,
                "Adj_Close_CHFUSD=X" : 1,
                "Adj_Close_CNYUSD=X" : 1,
                "Adj_Close_EURUSD=X" : 1,
                "Adj_Close_GBPUSD=X" : 1,
                "Adj_Close_JPYUSD=X" : 1,
                "blockchain_transactions_per_block" : 1,
                "blockchain_hash_rates" : 1,
                "coindesk_sentiment" : 1,
                "num_of_coindesk_posts" : 1,
                "reddit_comments_sentiments" : 1,
                "top_50_reddit_posts_sentiments" : 1}

data = lag(features, feature_lags)
# data['date'] = data['date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d"))
# data["date"] = pd.to_datetime(data["date"], format='%d/%m/%Y', infer_datetime_format=True)

data = data[data["date"] >= "1/1/2021"]

train = data[data["date"] <= "2021-03-10"]
test = data[(data["date"] > "2021-03-10") & (data["date"] <= "2021-04-5")]

x_train = train.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_train = train["Adj_Close_BTC-USD"]

x_test = test.drop(["date", "Adj_Close_BTC-USD"], axis = 1)
y_test = test["Adj_Close_BTC-USD"]

sc = MinMaxScaler(feature_range = (0, 1))
sc.fit(x_train)
x_train_trans = sc.transform(x_train)
x_test_trans = sc.transform(x_test)

x_train_trans = x_train_trans.reshape(x_train_trans.shape[0], x_train_trans.shape[1], 1)
x_test_trans = x_test_trans.reshape(x_test_trans.shape[0], x_test_trans.shape[1], 1)

In [42]:
res2 = []
# 1 layer
for unit in (4,8,16,32):
    for time_step in (1,2,3,4,5):
        res2.append((([unit],time_step),LSTM_GS([unit], time_step)))

# 2 layers
for unit in (8,16,32):
    for time_step in (1,2,3,4,5):
        res2.append((([unit, unit//2],time_step),LSTM_GS([unit, unit//2], time_step)))

# 3 layers
for unit in (8,16,32):
    for time_step in (1,2,3,4,5):
        res2.append((([unit, unit//2, unit//4],time_step),LSTM_GS([unit, unit//2, unit//4], time_step)))













In [43]:
sorted(res2, key = lambda x : x[1][1])[0]

(([8], 2), (3502.3132255331902, 2202.7428963125462))

In [44]:
layer_config2 = sorted(res2, key = lambda x : x[1])[0][0][0]
time_config2 = sorted(res2, key = lambda x : x[1])[0][0][1]

In [45]:
model2 = LSTM_pipe(layer_config2, time_config2)



In [46]:
# predicted value
yhat2 = model2.predict(x_refit_trans, verbose=0)



In [47]:
for b in [x[0] for x in yhat2]:
    print(b)

30533.139
30303.48
32069.129
33439.316
33075.75
34099.547
36381.066
39681.36
41044.81
40708.984
40002.27
38424.484
35741.457
36829.32
38461.535
37394.34
36071.777
35566.51
35646.758
36470.523
37505.29
35478.76
34642.75
34354.74
34150.66
34698.69
34639.145
32257.254
32859.223
34009.01
33769.984
33008.363
34183.8
35744.94
37299.082
38324.99
38753.895
40004.305
40103.355
45229.57
48095.875
47437.863
48234.7
49439.86
48897.7
49645.984
50000.613
50123.83
52195.7
52009.645
54476.97
56597.152
57138.207
56443.91
52316.176
50155.008
46706.62
51944.344
43950.797
43524.754
47139.223
47521.746
47158.105
45783.87
46278.113
46066.043
47538.08
48575.176
51798.258
54562.402
56971.004
57699.934
60371.03
60216.734
58248.97
57005.04
59137.484
58095.6
57566.49
57804.34
57336.582
55842.484
53982.83
52702.473
51514.887
53950.02
55728.22
56100.742
57262.074
58175.55
59537.05
60665.61
60855.008
59873.26
59532.777
