In [603]:
# Adi Pradhan
# ZTDL Bootcamp Sep 2018
# Predicting the Price of BTC using an LSTM 
%reset -f
#libraries

import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import warnings
from keras.utils import to_categorical
warnings.filterwarnings('ignore')

In [604]:
# read the csvs
df1 = pd.read_csv('data/data.part1.csv',index_col=0)
df2 = pd.read_csv('data/data.part2.csv',index_col=0)
df = pd.concat([df1,df2])

In [605]:
# create a Pandas Series with index as the date and data as the mean price in that day
df['timestamp_hour'] = pd.to_datetime(df['Timestamp'],unit='s')
df['timestamp_hour'] = pd.to_datetime(df['timestamp_hour'])
df = df.set_index(['timestamp_hour'])
df_price = df.resample('H').mean()
df_price = df_price[['Weighted_Price']]
df_price = df_price.dropna()

In [606]:
df_price['Next_Weighted_Price'] = df_price['Weighted_Price'].shift(-1)
df_price = df_price[1:]
df_price = df_price[:-1]

In [607]:
df_price = df_price['2017':'2018']

In [608]:
df_price.head()

Unnamed: 0_level_0,Weighted_Price,Next_Weighted_Price
timestamp_hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-01 00:00:00,971.395033,970.7578
2017-01-01 01:00:00,970.7578,970.164174
2017-01-01 02:00:00,970.164174,968.458101
2017-01-01 03:00:00,968.458101,967.766736
2017-01-01 04:00:00,967.766736,966.05339


In [609]:
df_price.tail()

Unnamed: 0_level_0,Weighted_Price,Next_Weighted_Price
timestamp_hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-06-26 19:00:00,6172.690783,6183.871078
2018-06-26 20:00:00,6183.871078,6187.186309
2018-06-26 21:00:00,6187.186309,6155.090729
2018-06-26 22:00:00,6155.090729,6113.676832
2018-06-26 23:00:00,6113.676832,6073.96996


In [610]:
df_price['up_or_down'] = df_price.apply(lambda row: 1 if row['Next_Weighted_Price'] > row['Weighted_Price'] else 0,axis=1)

In [611]:
# most interesting man (coin flip prediction)
df_price['coin_flip'] = np.random.randint(0,2,size=df_price.shape[0])

# yoda (always HODL - optimist)
df_price['optimistic_prediction'] = 1

df_price.head()

Unnamed: 0_level_0,Weighted_Price,Next_Weighted_Price,up_or_down,coin_flip,optimistic_prediction
timestamp_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01 00:00:00,971.395033,970.7578,0,1,1
2017-01-01 01:00:00,970.7578,970.164174,0,0,1
2017-01-01 02:00:00,970.164174,968.458101,0,1,1
2017-01-01 03:00:00,968.458101,967.766736,0,0,1
2017-01-01 04:00:00,967.766736,966.05339,0,1,1


In [612]:
pd.DataFrame.corr(df_price)

Unnamed: 0,Weighted_Price,Next_Weighted_Price,up_or_down,coin_flip,optimistic_prediction
Weighted_Price,1.0,0.999815,-0.030474,-0.006178,
Next_Weighted_Price,0.999815,1.0,-0.021538,-0.006063,
up_or_down,-0.030474,-0.021538,1.0,0.018242,
coin_flip,-0.006178,-0.006063,0.018242,1.0,
optimistic_prediction,,,,,


In [613]:
df_price['coin_flip'].value_counts()/len(df_price)

0    0.503383
1    0.496617
Name: coin_flip, dtype: float64

Therefore the coin flip has 50.4% accuracy

In [614]:
df_price['up_or_down'].value_counts()/len(df_price)

1    0.537592
0    0.462408
Name: up_or_down, dtype: float64

Therefore the optimistic view has 51% accuracy

In [615]:
# Get the right series for prediction 
X = df_price[['Weighted_Price']]
y = df_price['up_or_down']

In [616]:
# for train test split we will take Mar, Apr and May 2018 as the test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,shuffle=False,test_size=0.2)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10406, 1)
(2602, 1)
(10406,)
(2602,)


In [617]:
from sklearn.preprocessing import MinMaxScaler
mx = MinMaxScaler()
X_train_rs = X_train.values.reshape(-1,1)
X_test_rs = X_test.values.reshape(-1,1)
X_train_sc = mx.fit_transform(X_train_rs)
X_test_sc = mx.transform(X_test_rs)

In [618]:
X_train_sc_df = pd.DataFrame(X_train_sc,columns=['Price'])
X_test_sc_df = pd.DataFrame(X_test_sc,columns=['Price'])
X_train_sc_df.describe()
print(X_train_sc_df.shape[1:])

(1,)


In [619]:
window_size = 24*7*4
for s in np.arange(1,window_size+1):
    X_train_sc_df['shift_{}'.format(s)] = X_train_sc_df['Price'].shift(s)
    X_test_sc_df['shift_{}'.format(s)] = X_test_sc_df['Price'].shift(s)

In [620]:
X_train_sc_df.head(n=30)

Unnamed: 0,Price,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,...,shift_663,shift_664,shift_665,shift_666,shift_667,shift_668,shift_669,shift_670,shift_671,shift_672
0,0.01107,,,,,,,,,,...,,,,,,,,,,
1,0.011037,0.01107,,,,,,,,,...,,,,,,,,,,
2,0.011006,0.011037,0.01107,,,,,,,,...,,,,,,,,,,
3,0.010917,0.011006,0.011037,0.01107,,,,,,,...,,,,,,,,,,
4,0.01088,0.010917,0.011006,0.011037,0.01107,,,,,,...,,,,,,,,,,
5,0.010791,0.01088,0.010917,0.011006,0.011037,0.01107,,,,,...,,,,,,,,,,
6,0.010856,0.010791,0.01088,0.010917,0.011006,0.011037,0.01107,,,,...,,,,,,,,,,
7,0.010976,0.010856,0.010791,0.01088,0.010917,0.011006,0.011037,0.01107,,,...,,,,,,,,,,
8,0.010945,0.010976,0.010856,0.010791,0.01088,0.010917,0.011006,0.011037,0.01107,,...,,,,,,,,,,
9,0.010986,0.010945,0.010976,0.010856,0.010791,0.01088,0.010917,0.011006,0.011037,0.01107,...,,,,,,,,,,


In [621]:
X_train_sc_df = X_train_sc_df[window_size:]
y_train = y_train[window_size:]

X_test_sc_df = X_test_sc_df[window_size:]
y_test = y_test[window_size:]

X_train_final = X_train_sc_df.values.reshape(-1, window_size+1, 1)
y_train_cat = to_categorical(y_train.values)


X_test_final = X_test_sc_df.values.reshape(-1,window_size+1,1)
y_test_cat = to_categorical(y_test.values)



In [622]:
X_train.shape

(10406, 1)

In [623]:
from keras.layers import Dense, CuDNNLSTM, Dropout
from keras import Sequential
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(CuDNNLSTM(32,input_shape=(window_size+1,1),return_sequences=True))
model.add(Dropout(0.2))
model.add(CuDNNLSTM(32))
model.add(Dropout(0.2))
model.add(Dense(32,activation='softmax'))
model.add(Dropout(0.2))
model.add(Dense(2,activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_27 (CuDNNLSTM)    (None, 673, 32)           4480      
_________________________________________________________________
dropout_32 (Dropout)         (None, 673, 32)           0         
_________________________________________________________________
cu_dnnlstm_28 (CuDNNLSTM)    (None, 32)                8448      
_________________________________________________________________
dropout_33 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 32)                1056      
_________________________________________________________________
dropout_34 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 2)                 66        
Total para

In [624]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
early_stop = EarlyStopping(monitor='loss',patience=3,verbose=1)

In [625]:
h = model.fit(X_train_final,y_train_cat,epochs=200,batch_size=1024,callbacks=[early_stop],verbose=1,validation_split=0.1)

Train on 8760 samples, validate on 974 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 00010: early stopping


In [626]:
model.evaluate(X_train_final,y_train_cat)



[0.6889877725068808, 0.5427367988555172]

In [627]:
model.evaluate(X_test_final,y_test_cat)



[0.6925013338963603, 0.5202072538860104]

In [628]:
y_pred = model.predict(X_test_final)

In [629]:
y_pred_classes = np.argmax(y_pred,axis=1)

In [630]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_classes)

0.5202072538860104