# ML to predict Bitcoin prices

+ Data from: https://coinmarketcap.com/api/
+ Adapted from: https://dashee87.github.io/data%20science/deep%20learning/python/another-keras-tutorial-for-neural-network-beginners/
+ Adapted from: https://dashee87.github.io/deep%20learning/python/predicting-cryptocurrency-prices-with-deep-learning/
+ Adapted from: https://github.com/dashee87/blogScripts/blob/master/Jupyter/2017-11-20-predicting-cryptocurrency-prices-with-deep-learning.ipynb

In [1]:
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import numpy as np

# get market info for bitcoin from the start of 2016 to the current day
bitcoin_market_info = pd.read_html("https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end="+time.strftime("%Y%m%d"))[0]

# convert the date string to the correct date format
bitcoin_market_info = bitcoin_market_info.assign(Date=pd.to_datetime(bitcoin_market_info['Date']))
# when Volume is equal to '-' convert it to 0
bitcoin_market_info.loc[bitcoin_market_info['Volume']=="-",'Volume']=0
# convert to int
bitcoin_market_info['Volume'] = bitcoin_market_info['Volume'].astype('int64')
# look at the first few rows
bitcoin_market_info.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap
0,2018-04-15,7999.33,8338.42,7999.33,8329.11,5244480000,135812000000
1,2018-04-14,7874.67,8140.71,7846.0,7986.24,5191430000,133682000000
2,2018-04-13,7901.09,8183.96,7758.93,7895.96,7764460000,134114000000
3,2018-04-12,6955.38,7899.23,6806.51,7889.25,8906250000,118048000000
4,2018-04-11,6843.47,6968.32,6817.59,6968.32,4641890000,116126000000


In [2]:
bitcoin_market_info.columns =[bitcoin_market_info.columns[0]]+['bt_'+i for i in bitcoin_market_info.columns[1:]]

In [3]:
# market_info = pd.merge(bitcoin_market_info,eth_market_info, on=['Date'])
# market_info = market_info[market_info['Date']>='2016-01-01']
market_info = bitcoin_market_info
market_info = market_info[market_info['Date']>='2016-01-01']
for coins in ['bt_']: 
    kwargs = { coins+'day_diff': lambda x: (x[coins+'Close']-x[coins+'Open'])/x[coins+'Open']}
    market_info = market_info.assign(**kwargs)
market_info.head()

Unnamed: 0,Date,bt_Open,bt_High,bt_Low,bt_Close,bt_Volume,bt_Market Cap,bt_day_diff
0,2018-04-15,7999.33,8338.42,7999.33,8329.11,5244480000,135812000000,0.041226
1,2018-04-14,7874.67,8140.71,7846.0,7986.24,5191430000,133682000000,0.014168
2,2018-04-13,7901.09,8183.96,7758.93,7895.96,7764460000,134114000000,-0.000649
3,2018-04-12,6955.38,7899.23,6806.51,7889.25,8906250000,118048000000,0.134266
4,2018-04-11,6843.47,6968.32,6817.59,6968.32,4641890000,116126000000,0.018244


In [4]:
for coins in ['bt_']: 
    kwargs = { coins+'close_off_high': lambda x: 2*(x[coins+'High']- x[coins+'Close'])/(x[coins+'High']-x[coins+'Low'])-1,
            coins+'volatility': lambda x: (x[coins+'High']- x[coins+'Low'])/(x[coins+'Open'])}
    market_info = market_info.assign(**kwargs)

In [5]:
model_data = market_info[['Date']+[coin+metric for coin in ['bt_'] 
                                   for metric in ['Close','Volume','close_off_high','volatility']]]
# need to reverse the data frame so that subsequent rows represent later timepoints
model_data = model_data.sort_values(by='Date')
model_data.head()

Unnamed: 0,Date,bt_Close,bt_Volume,bt_close_off_high,bt_volatility
835,2016-01-01,434.33,36278900,-0.560641,0.020292
834,2016-01-02,433.44,30096600,0.250597,0.009641
833,2016-01-03,430.01,39633800,-0.173865,0.020827
832,2016-01-04,433.09,38477500,-0.474265,0.012649
831,2016-01-05,431.96,34522600,-0.013333,0.010391


In [6]:
split_date = '2017-06-01'
# we don't need the date columns anymore
training_set, test_set = model_data[model_data['Date']<split_date], model_data[model_data['Date']>=split_date]
training_set = training_set.drop('Date', 1)
test_set = test_set.drop('Date', 1)
test_set

Unnamed: 0,bt_Close,bt_Volume,bt_close_off_high,bt_volatility
318,2407.88,1653180000,-0.493815,0.069946
317,2488.55,1317030000,-1.000000,0.047932
316,2515.35,1514950000,-0.159278,0.063496
315,2511.81,1355120000,0.111061,0.052339
314,2686.81,1369310000,-1.000000,0.070287
313,2863.20,2089610000,-0.115346,0.114860
312,2732.16,1517710000,0.625637,0.058835
311,2805.62,1281170000,-0.865882,0.053060
310,2823.81,1348950000,0.468564,0.037789
309,2947.71,2018890000,-0.967912,0.072288


In [7]:
window_len = 1
norm_cols = [coin+metric for coin in ['bt_'] for metric in ['Close','Volume']]

In [8]:
print(type(test_set))
print(print(test_set.shape))
print(test_set)
LSTM_training_inputs = []
for i in range(len(training_set)-window_len):
    temp_set = training_set[i:(i+window_len)].copy()
    for col in norm_cols:
        temp_set.loc[:, col] = temp_set[col]/temp_set[col].iloc[0] - 1
    LSTM_training_inputs.append(temp_set)
LSTM_training_outputs = (training_set['bt_Close'][window_len:].values/training_set['bt_Close'][:-window_len].values)-1
print(LSTM_training_inputs)

<class 'pandas.core.frame.DataFrame'>
(319, 4)
None
     bt_Close   bt_Volume  bt_close_off_high  bt_volatility
318   2407.88  1653180000          -0.493815       0.069946
317   2488.55  1317030000          -1.000000       0.047932
316   2515.35  1514950000          -0.159278       0.063496
315   2511.81  1355120000           0.111061       0.052339
314   2686.81  1369310000          -1.000000       0.070287
313   2863.20  2089610000          -0.115346       0.114860
312   2732.16  1517710000           0.625637       0.058835
311   2805.62  1281170000          -0.865882       0.053060
310   2823.81  1348950000           0.468564       0.037789
309   2947.71  2018890000          -0.967912       0.072288
308   2958.11  1752400000          -0.506760       0.053042
307   2659.63  2569530000           0.410612       0.162094
306   2717.02  1781200000           0.038800       0.051721
305   2506.37  1696560000           0.500227       0.137617
304   2464.58  2026260000          -0.564071    

In [9]:
LSTM_training_inputs[0]

Unnamed: 0,bt_Close,bt_Volume,bt_close_off_high,bt_volatility
835,0.0,0.0,-0.560641,0.020292


In [10]:
LSTM_test_inputs = []
for i in range(len(test_set)-window_len):
    temp_set = test_set[i:(i+window_len)].copy()
    for col in norm_cols:
        temp_set.loc[:, col] = temp_set[col]/temp_set[col].iloc[0] - 1
    LSTM_test_inputs.append(temp_set)
LSTM_test_outputs = (test_set['bt_Close'][window_len:].values/test_set['bt_Close'][:-window_len].values)-1

In [11]:
LSTM_training_inputs = [np.array(LSTM_training_input) for LSTM_training_input in LSTM_training_inputs]
LSTM_training_inputs = np.array(LSTM_training_inputs)

LSTM_test_inputs = [np.array(LSTM_test_inputs) for LSTM_test_inputs in LSTM_test_inputs]
LSTM_test_inputs = np.array(LSTM_test_inputs)
LSTM_test_inputs.shape

(318, 1, 4)

In [12]:
# import the relevant Keras modules
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras.layers import LSTM
from keras.layers import Dropout

def build_model(inputs, output_size, neurons, activ_func="linear",
                dropout=0.25, loss="mae", optimizer="adam"):
    model = Sequential()

    model.add(LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2])))
    model.add(Dropout(dropout))
    model.add(Dense(units=output_size))
    model.add(Activation(activ_func))

    model.compile(loss=loss, optimizer=optimizer)
    return model

Using TensorFlow backend.


In [13]:
# random seed for reproducibility
np.random.seed(202)
# we'll try to predict the closing price for the next 5 days 
# change this value if you want to make longer/shorter prediction
pred_range = 1
# initialise model architecture
bt_model = build_model(LSTM_training_inputs, output_size=pred_range, neurons = 20)
# model output is next 5 prices normalised to 10th previous closing price
LSTM_training_outputs = []
for i in range(window_len, len(training_set['bt_Close'])-pred_range):
    LSTM_training_outputs.append((training_set['bt_Close'][i:i+pred_range].values/
                                  training_set['bt_Close'].values[i-window_len])-1)
LSTM_training_outputs = np.array(LSTM_training_outputs)
# train model on data
# note:bt_history contains information on the training error per epoch
bt_history = bt_model.fit(LSTM_training_inputs[:-pred_range], LSTM_training_outputs, 
                            epochs=50, batch_size=1, verbose=2, shuffle=True)

Epoch 1/50
 - 2s - loss: 0.0237
Epoch 2/50
 - 1s - loss: 0.0221
Epoch 3/50
 - 1s - loss: 0.0212
Epoch 4/50
 - 1s - loss: 0.0206
Epoch 5/50
 - 1s - loss: 0.0197
Epoch 6/50
 - 1s - loss: 0.0193
Epoch 7/50
 - 1s - loss: 0.0189
Epoch 8/50
 - 1s - loss: 0.0190
Epoch 9/50
 - 1s - loss: 0.0191
Epoch 10/50
 - 1s - loss: 0.0190
Epoch 11/50
 - 1s - loss: 0.0189
Epoch 12/50
 - 1s - loss: 0.0187
Epoch 13/50
 - 1s - loss: 0.0188
Epoch 14/50
 - 1s - loss: 0.0188
Epoch 15/50
 - 1s - loss: 0.0187
Epoch 16/50
 - 1s - loss: 0.0186
Epoch 17/50
 - 1s - loss: 0.0187
Epoch 18/50
 - 1s - loss: 0.0188
Epoch 19/50
 - 1s - loss: 0.0186
Epoch 20/50
 - 1s - loss: 0.0185
Epoch 21/50
 - 1s - loss: 0.0187
Epoch 22/50
 - 1s - loss: 0.0186
Epoch 23/50
 - 1s - loss: 0.0186
Epoch 24/50
 - 1s - loss: 0.0187
Epoch 25/50
 - 1s - loss: 0.0187
Epoch 26/50
 - 1s - loss: 0.0186
Epoch 27/50
 - 1s - loss: 0.0186
Epoch 28/50
 - 1s - loss: 0.0185
Epoch 29/50
 - 1s - loss: 0.0186
Epoch 30/50
 - 1s - loss: 0.0185
Epoch 31/50
 - 1s -

In [14]:
bt_pred_prices = ((bt_model.predict(LSTM_test_inputs)[:-pred_range][::pred_range]+1)*\
                   test_set['bt_Close'].values[:-(window_len + pred_range)][::pred_range].reshape(int(np.ceil((len(LSTM_test_inputs)-pred_range)/float(pred_range))),1))

In [15]:
bt_pred_prices = ((bt_model.predict(LSTM_test_inputs)[:-pred_range][::pred_range]+1)*\
                   test_set['bt_Close'].values[:-(window_len + pred_range)][::pred_range].reshape(int(np.ceil((len(LSTM_test_inputs)-pred_range)/float(pred_range))),1))

In [16]:
bt_pred_prices

array([[  2420.59766797],
       [  2498.13918216],
       [  2530.21011926],
       [  2527.29388087],
       [  2698.4901143 ],
       [  2887.78323441],
       [  2756.04545105],
       [  2817.14501817],
       [  2842.03989351],
       [  2960.79346825],
       [  2971.83945961],
       [  2699.52472901],
       [  2733.05340197],
       [  2541.42317311],
       [  2482.20129243],
       [  2530.07974543],
       [  2669.46016263],
       [  2568.98378651],
       [  2605.87180767],
       [  2735.71042117],
       [  2707.30075878],
       [  2715.62743944],
       [  2755.11366294],
       [  2633.72930576],
       [  2607.96084571],
       [  2499.87338866],
       [  2564.58208275],
       [  2586.14428329],
       [  2552.78305096],
       [  2500.57290064],
       [  2452.46708217],
       [  2516.49903766],
       [  2575.49564901],
       [  2612.58761972],
       [  2612.12362566],
       [  2616.92525828],
       [  2568.29724492],
       [  2580.31267633],
       [  25

In [17]:
test = np.array([[[0.,0.,-0.49381482,0.06994621]]])

In [18]:
bt_pred_price = bt_model.predict(test)

In [19]:
bt_pred_price

array([[ 0.00528163]], dtype=float32)

In [20]:
LSTM_test_inputs[0][0]

array([ 0.        ,  0.        , -0.49381482,  0.06994621])

In [21]:
test_price = (bt_model.predict(LSTM_test_inputs))

Querying like above, without the full calculation, returns wrong results. Calculation:

```
bt_pred_prices = ((bt_model.predict(LSTM_test_inputs)[:-pred_range][::pred_range]+1)*\
                   test_set['bt_Close'].values[:-(window_len + pred_range)[::pred_range].reshape(int(np.ceil((len(LSTM_test_inputs)-pred_range)/float(pred_range))),1))
```
Think the solution into read in and prepare data (test_set, LSTM_test_inputs) from scratch using only one days worth of data.

In [22]:
test_price

array([[ 0.00528163],
       [ 0.00385338],
       [ 0.00590773],
       [ 0.0061644 ],
       [ 0.00434725],
       [ 0.00858599],
       [ 0.00874233],
       [ 0.00410784],
       [ 0.00645575],
       [ 0.00443849],
       [ 0.00464124],
       [ 0.01500006],
       [ 0.00590114],
       [ 0.01398559],
       [ 0.00714981],
       [ 0.00457395],
       [ 0.00511321],
       [ 0.00812066],
       [ 0.00628347],
       [ 0.00511449],
       [ 0.00676837],
       [ 0.00377666],
       [ 0.00371735],
       [ 0.00958686],
       [ 0.00716416],
       [ 0.00864385],
       [ 0.0047531 ],
       [ 0.00440977],
       [ 0.00530181],
       [ 0.00795414],
       [ 0.00735945],
       [ 0.0040013 ],
       [ 0.00445994],
       [ 0.00420795],
       [ 0.00389463],
       [ 0.00320684],
       [ 0.01970779],
       [ 0.00348945],
       [ 0.00953147],
       [ 0.01025208],
       [ 0.00659625],
       [ 0.00470833],
       [ 0.00673468],
       [ 0.0091184 ],
       [ 0.01507564],
       [ 0

In [42]:
# get market info for bitcoin from the start of 2016 to the current day
bitcoin_market_new = pd.read_html("https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20180415&end=20180416")[0]
print(bitcoin_market_new)
# convert the date string to the correct date format
bitcoin_market_new = bitcoin_market_new.assign(Date=pd.to_datetime(bitcoin_market_info['Date']))
print(bitcoin_market_new)

# convert to int
bitcoin_market_new['Volume'] = bitcoin_market_new['Volume'].astype('int64')

# look at the first few rows
bitcoin_market_new.head()

bitcoin_market_new.columns =[bitcoin_market_new.columns[0]]+['bt_'+i for i in bitcoin_market_new.columns[1:]]

market_new = bitcoin_market_new
market_new = market_new[market_new['Date']>='2016-01-01']

for coins in ['bt_']: 
    kwargs = { coins+'day_diff': lambda x: (x[coins+'Close']-x[coins+'Open'])/x[coins+'Open']}
    market_new = market_new.assign(**kwargs)
market_new.head()

for coins in ['bt_']: 
    kwargs = { coins+'close_off_high': lambda x: 2*(x[coins+'High']- x[coins+'Close'])/(x[coins+'High']-x[coins+'Low'])-1,
            coins+'volatility': lambda x: (x[coins+'High']- x[coins+'Low'])/(x[coins+'Open'])}
    market_new = market_new.assign(**kwargs)
    
m_data = market_new[['Date']+[coin+metric for coin in ['bt_'] 
                                   for metric in ['Close','Volume','close_off_high','volatility']]]

# need to reverse the data frame so that subsequent rows represent later timepoints
m_data = m_data.sort_values(by='Date')
m_data.head()

predict_set = m_data.drop('Date', 1)

print(type(predict_set))
print(predict_set)
print(predict_set.shape)

window_len = 1
norm_cols = [coin+metric for coin in ['bt_'] for metric in ['Close','Volume']]

LSTM_predict_inputs = []

for i in range(len(predict_set)):#-window_len):
    temp_set = predict_set[i:(i+window_len)].copy()
    for col in norm_cols:
        temp_set.loc[:, col] = temp_set[col]/temp_set[col].iloc[0] - 1
    LSTM_predict_inputs.append(temp_set)
LSTM_test_outputs = (test_set['bt_Close'][window_len:].values/test_set['bt_Close'][:-window_len].values)-1

LSTM_predict_inputs = [np.array(LSTM_predict_inputs) for LSTM_predict_inputs in LSTM_predict_inputs]
print(LSTM_predict_inputs)
LSTM_predict_inputs = np.array(LSTM_predict_inputs)
print(LSTM_predict_inputs)
LSTM_predict_inputs.shape

           Date     Open     High      Low    Close      Volume    Market Cap
0  Apr 15, 2018  7999.33  8338.42  7999.33  8329.11  5244480000  135812000000
        Date     Open     High      Low    Close      Volume    Market Cap
0 2018-04-15  7999.33  8338.42  7999.33  8329.11  5244480000  135812000000
<class 'pandas.core.frame.DataFrame'>
   bt_Close   bt_Volume  bt_close_off_high  bt_volatility
0   8329.11  5244480000          -0.945088        0.04239
(1, 4)
[array([[ 0.        ,  0.        , -0.94508832,  0.0423898 ]])]
[[[ 0.          0.         -0.94508832  0.0423898 ]]]


(1, 1, 4)

In [63]:
((bt_model.predict(LSTM_predict_inputs)+1)*\
                   predict_set['bt_Close'].values.reshape(1,1))

array([[ 8360.51863617]])