Loading dependency packages.



In [None]:
import datetime
import os
import os.path
import random
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
try:
  # scikit-learn >= 0.23 no longer bundles joblib; use the standalone package.
  import joblib
except ImportError:
  from sklearn.externals import joblib
from sklearn.metrics import r2_score
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.models import model_from_json
from tensorflow.keras.metrics import *

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


Hyperparameters for tuning.

In [None]:
# --- Data selection -----------------------------------------------------------
# Date range (ISO strings) used to slice the daily series in daily_preprocess.
processing_from = '2016-01-01'
processing_to = '2020-12-31'
# Column(s) the network learns to predict.
label_columns = ['Weighted_Price']
# Number of feature columns fed to the network; daily_preprocess returns two
# columns (daily weighted price and daily volume).
input_features = 2
output_features = len(label_columns)

# --- Windowing ----------------------------------------------------------------
Tx = 7   # number of input time steps per sample
Ty = 3   # number of predicted time steps per sample
# Offset from the end of the input window to the end of the label window.
# It must be at least Ty, otherwise the labels overlap the inputs: with the
# previous value of 1, two of the three "predicted" days were already part of
# the 7-day input, which leaks the answer into the features.
shift = Ty

# --- Optimisation -------------------------------------------------------------
cost_function = "mse"
optimizer = "adam"        # the model builders recognise 'adam' and 'SGD'
BATCH_SIZE = 128
learning_rate = 0.0001
num_epochs = 50
# The four values below are not referenced by the cells visible in this notebook.
L2_lambd = 0.0
beta1 = 0.0
beta2 = 0.0
epsilon = 0.0

# --- Network size -------------------------------------------------------------
LSTM_units = 512
# Passed to the model builders, which currently do not apply any dropout.
LSTM_dropout = 0.6
dropout_rate = 0.6

# --- Paths --------------------------------------------------------------------
#INPUT_DATA
# NOTE(review): machine-specific absolute path; not referenced by the visible cells.
IN_PATH = os.path.abspath('F:/Work/DATA/BTC/')
trained_model_path = "BTC_regressor_daily_v20210129.h5"
trained_model_figure_path = "BTC_regressor_daily_v20210129.png"
# NOTE(review): these file names say 2018 while processing_from is 2016 — confirm.
dataset_scaler_path = "20180101_20201231_daily.scl"
preprocessed_dataset_path = "20180101_20201231_daily.npy"

Define CSV reading function.

In [None]:
# data reading
def csv_reading(url, options, file_name='bitstampUSD_1-min_data_2012-01-01_to_2020-12-31.csv'):
  """Load a CSV file into a DataFrame.

  Args:
    url: For ``options == 0`` anything ``pd.read_csv`` accepts (path or URL).
      For ``options == 1`` a Google Drive share link whose second-to-last
      ``/``-separated component is the file id
      (``.../file/d/<file_id>/view?...``).
    options: ``0`` reads ``url`` directly; ``1`` authenticates the Colab user,
      downloads the Drive file to ``file_name`` and reads that local copy.
    file_name: Local path the Drive download is written to (``options == 1``
      only).

  Returns:
    The parsed ``pandas.DataFrame``.

  Raises:
    ValueError: If ``options`` is neither 0 nor 1 (previously this returned
      ``None`` silently).
  """
  if options == 0:
    return pd.read_csv(url)

  if options == 1:
    file_id = url.split('/')[-2]
    # Colab-only flow: interactive user auth, then reuse those credentials for PyDrive.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    drive_file = drive.CreateFile({'id': file_id})
    drive_file.GetContentFile(file_name)
    return pd.read_csv(file_name)

  raise ValueError(
      'Unsupported options value {!r}: use 0 (direct read) or 1 (Google Drive)'.format(options))

Define preprocessing function to get daily trading data.

In [None]:
def _plot_daily_overview(price, close_price, volume_usd, title, volume_label):
  """Draw weighted price, close price and volume as three stacked panels."""
  fig, axs = plt.subplots(3, 1, figsize=(20,10))
  fig.suptitle(title, fontsize= 18, color='b')
  panels = [
      (price, 'g', 'BTC daily weighted price ($)'),
      (close_price, 'b', 'BTC daily close price ($)'),
      (volume_usd, 'r', volume_label),
  ]
  for ax, (series, colour, ylabel) in zip(axs, panels):
    ax.plot(series, colour)
    ax.set_ylabel(ylabel)
  fig.tight_layout(rect=[0, 0.03, 1, 0.95])
  plt.show()


def daily_preprocess(df):
  """Aggregate the raw trades to daily resolution, plot them, and return features.

  The frame is resampled per calendar day on its ``Timestamp`` column:
  ``Weighted_Price`` is averaged, ``Volume_(Currency)`` is summed and ``Close``
  keeps the last value of the day. Missing days are back-filled, and the
  series are then sliced to the module-level ``processing_from`` /
  ``processing_to`` range. Two figures are shown: the full history and the
  sliced range.

  Args:
    df: DataFrame with a datetime ``Timestamp`` column plus ``Weighted_Price``,
      ``Volume_(Currency)`` and ``Close`` columns.

  Returns:
    DataFrame indexed by day with the columns ``Weighted_Price`` and
    ``Volume_(Currency)``. The close price is only plotted, not returned.
  """
  by_day = df.resample('D', on='Timestamp')
  price = by_day[['Weighted_Price']].mean()
  volume_usd = by_day[['Volume_(Currency)']].sum()
  # Resampler.last() takes no offset argument; the former .last('T') passed a
  # stray positional value into an unrelated parameter.
  close_price = by_day[['Close']].last()
  _plot_daily_overview(price, close_price, volume_usd,
                       'Full BTC historical trading (daily)',
                       'BTC daily volume ($)')

  # removed NaN and slice data
  # Back-fill before slicing so a gap at the start of the range can borrow the
  # next available value; .bfill() replaces the deprecated fillna(method='bfill').
  price = price.bfill().loc[processing_from:processing_to]
  close_price = close_price.bfill().loc[processing_from:processing_to]
  volume_usd = volume_usd.bfill().loc[processing_from:processing_to]
  _plot_daily_overview(price, close_price, volume_usd,
                       'Sliced BTC historical trading (daily)',
                       'BTC volume (currency) ($)')

  daily = pd.concat([price, volume_usd], join = 'outer', axis = 1)
  return daily

Load the dataset from Google Drive and visualize it.

In [None]:
# Google Drive share link of the raw CSV; csv_reading extracts the file id from it.
dataset_url = 'https://drive.google.com/file/d/16AoK0vKAIshdyO96Td4Vfmnf_6cdTBko/view?usp=sharing'
btc = csv_reading(dataset_url, 1)
# The Timestamp column is interpreted as Unix epoch seconds.
btc['Timestamp'] = pd.to_datetime(btc.Timestamp, unit='s')
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
btc.info()
daily = daily_preprocess(btc)

Define models

In [None]:
def BTC_forecasting_model(num_of_input_features, Tx, num_of_out_features, Ty, LSTM_units, LSTM_dropout, dropout_rate):
  """Build and compile a single-output LSTM regressor.

  Architecture: Input(Tx, num_of_input_features) -> LSTM(LSTM_units, relu)
  -> Dense(num_of_out_features, sigmoid). The sigmoid output means predictions
  lie in (0, 1), which matches targets scaled with a min-max scaler.

  The optimizer type, learning rate and loss are read from the module-level
  ``optimizer``, ``learning_rate`` and ``cost_function`` settings.

  Args:
    num_of_input_features: Number of feature columns per time step.
    Tx: Number of input time steps.
    num_of_out_features: Width of the Dense output layer.
    Ty: Accepted for signature parity with the multi-step builder; unused here.
    LSTM_units: Number of units in the LSTM layer.
    LSTM_dropout: Currently unused (no dropout is applied).
    dropout_rate: Currently unused (no dropout layer is added).

  Returns:
    A compiled ``tf.keras`` ``Model`` named ``'BTC_forecasting'``.

  Raises:
    ValueError: If the module-level ``optimizer`` is neither ``'adam'`` nor
      ``'SGD'`` (previously this surfaced as an unbound-variable error).
  """
  X_input = Input((Tx, num_of_input_features))
  # In the functional API the input shape comes from X_input, so the LSTM does
  # not need its own input_shape argument.
  X = LSTM(LSTM_units, activation='relu', return_sequences=False, name='LSTM_1')(X_input)
  X = Dense(num_of_out_features, activation='sigmoid', name='Full_1')(X)
  model = Model(inputs = X_input, outputs = X, name='BTC_forecasting')

  if optimizer == 'adam':
    opt = optimizers.Adam(learning_rate=learning_rate)
  elif optimizer == 'SGD':
    opt = optimizers.SGD(learning_rate=learning_rate)
  else:
    raise ValueError("Unsupported optimizer {!r}: expected 'adam' or 'SGD'".format(optimizer))
  model.compile(optimizer=opt, loss=cost_function, metrics=[MeanSquaredError()])
  return model

def BTC_daily_forecasting_multi_steps_model(num_of_input_features, Tx, num_of_out_features, Ty, LSTM_units, LSTM_dropout, dropout_rate):
  """Build and compile an LSTM regressor that predicts ``Ty`` steps at once.

  Architecture: LSTM(LSTM_units, relu) over a (Tx, num_of_input_features)
  window -> Dense(Ty * num_of_out_features, sigmoid) -> Reshape to
  (Ty, num_of_out_features). The sigmoid output means predictions lie in
  (0, 1), which matches targets scaled with a min-max scaler.

  The optimizer type, learning rate and loss are read from the module-level
  ``optimizer``, ``learning_rate`` and ``cost_function`` settings.

  Args:
    num_of_input_features: Number of feature columns per time step.
    Tx: Number of input time steps.
    num_of_out_features: Number of predicted values per output step.
    Ty: Number of output time steps.
    LSTM_units: Number of units in the LSTM layer.
    LSTM_dropout: Currently unused (no dropout is applied).
    dropout_rate: Currently unused (no dropout layer is added).

  Returns:
    A compiled ``tf.keras`` ``Sequential`` model with output shape
    ``(batch, Ty, num_of_out_features)``.

  Raises:
    ValueError: If the module-level ``optimizer`` is neither ``'adam'`` nor
      ``'SGD'`` (previously this surfaced as an unbound-variable error).
  """
  model = Sequential()
  model.add(LSTM(LSTM_units, activation='relu', input_shape=(Tx, num_of_input_features)))
  # One flat Dense layer emits every step/feature pair; Reshape restores the time axis.
  model.add(Dense(Ty*num_of_out_features, activation='sigmoid'))
  model.add(Reshape((Ty, num_of_out_features)))

  if optimizer == 'adam':
    opt = optimizers.Adam(learning_rate=learning_rate)
  elif optimizer == 'SGD':
    opt = optimizers.SGD(learning_rate=learning_rate)
  else:
    raise ValueError("Unsupported optimizer {!r}: expected 'adam' or 'SGD'".format(optimizer))
  model.compile(optimizer=opt, loss=cost_function, metrics=[MeanSquaredError()])
  return model


Define data generators

In [None]:
class WindowGenerator():
  """Index bookkeeping for one sliding window over a data frame.

  A window is ``input_width + shift`` positions long. The inputs occupy the
  first ``input_width`` positions and the labels the last ``label_width``
  positions. The class only stores the frame and derives slices/indices; it
  does not cut any data itself.
  """

  def __init__(self, input_width, label_width, shift,
               input_df,
               label_columns=None):
    """Store the frame and derive window slices and index arrays.

    Args:
      input_width: Number of positions used as model input.
      label_width: Number of positions used as labels.
      shift: Offset from the end of the inputs to the end of the window.
      input_df: Source frame; its ``columns`` are indexed by position.
      label_columns: Optional iterable of label column names. When it is
        ``None`` the ``label_columns_indices`` attribute is not created.
    """
    # Keep a handle on the source frame.
    self.input_df = input_df

    # Name -> position lookups for the label columns and for every frame column.
    self.label_columns = label_columns
    if label_columns is not None:
      self.label_columns_indices = dict(
          (name, pos) for pos, name in enumerate(label_columns))
    self.column_indices = dict(
        (name, pos) for pos, name in enumerate(input_df.columns))

    # Window geometry.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift
    self.total_window_size = input_width + shift

    # Inputs: the leading input_width positions of the window.
    self.input_slice = slice(0, input_width)
    input_positions = np.arange(self.total_window_size)
    self.input_indices = input_positions[self.input_slice]

    # Labels: everything from label_start to the end of the window.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    label_positions = np.arange(self.total_window_size)
    self.label_indices = label_positions[self.labels_slice]

  def __repr__(self):
    """Return a four-line, human-readable summary of the window layout."""
    summary = (
        ('Total window size', self.total_window_size),
        ('Input indices', self.input_indices),
        ('Label indices', self.label_indices),
        ('Label column name(s)', self.label_columns),
    )
    return '\n'.join(f'{caption}: {value}' for caption, value in summary)
  
def sequence_data_generator(df, label_columns, input_len, output_len, shift, batch_size, dtype=np.float16):
  """Endlessly yield random (inputs, labels) batches of sliding windows.

  Each sample is a window of ``input_len + shift`` consecutive rows starting at
  a uniformly random row. The inputs are the first ``input_len`` rows (all
  columns); the labels are the last ``output_len`` rows of the window
  (``label_columns`` only).

  Args:
    df: Source DataFrame; every column is used as an input feature.
    label_columns: Column name or list of column names used as labels.
    input_len: Number of input rows per sample.
    output_len: Number of label rows per sample.
    shift: Offset from the end of the inputs to the end of the window.
    batch_size: Number of samples per yielded batch.
    dtype: dtype of the yielded arrays. The float16 default is kept for
      backward compatibility; pass ``np.float32`` for more precision.

  Yields:
    Tuples ``(x_batch, y_batch)`` with shapes
    ``(batch_size, input_len, n_columns)`` and
    ``(batch_size, output_len, n_label_columns)``. Every batch is a freshly
    allocated pair of arrays.

  Raises:
    ValueError: On the first ``next()`` if ``output_len`` exceeds the window
      length or ``df`` has fewer rows than one window.
  """
  total_window_size = input_len + shift
  if output_len > total_window_size:
    raise ValueError('output_len ({}) cannot exceed input_len + shift ({})'.format(
        output_len, total_window_size))

  inputs = np.array(df)
  labels = np.array(df[label_columns])
  if labels.ndim == 1:
    # A single column name selects a Series; give it an explicit feature axis.
    labels = labels.reshape(-1, 1)

  # Valid start rows are 0 .. len - total_window_size inclusive. The former
  # randint(len - total_window_size) never drew the last window and failed
  # outright when the frame was exactly one window long.
  num_windows = len(inputs) - total_window_size + 1
  if num_windows < 1:
    raise ValueError('df has {} rows but one window needs {}'.format(
        len(inputs), total_window_size))

  x_shape = (batch_size, input_len, inputs.shape[1])
  # Size the label axis from the data rather than assuming one label column.
  y_shape = (batch_size, output_len, labels.shape[1])

  while True:
    # Allocate per batch: reusing one buffer would overwrite batches the
    # consumer still holds (e.g. when batches are queued or prefetched).
    x_batch = np.zeros(shape=x_shape, dtype=dtype)
    y_batch = np.zeros(shape=y_shape, dtype=dtype)
    for i in range(batch_size):
      idx = np.random.randint(num_windows)
      window_end = idx + total_window_size
      x_batch[i] = inputs[idx:idx+input_len]
      y_batch[i] = labels[window_end-output_len:window_end]
    yield (x_batch, y_batch)

def convert_fit_data(df, label_columns, input_len, output_len, shift):
    """Cut a frame into every possible (input window, label window) pair.

    Window ``i`` spans rows ``i .. i + input_len + shift - 1``. Its inputs are
    the first ``input_len`` rows (all columns) and its labels are the last
    ``output_len`` rows of the window (``label_columns`` only).

    Args:
        df: Source DataFrame; every column is used as an input feature.
        label_columns: Column name or list of column names used as labels.
        input_len: Number of input rows per sample.
        output_len: Number of label rows per sample.
        shift: Offset from the end of the inputs to the end of the window.

    Returns:
        Tuple ``(data_x, data_y)`` of arrays with shapes
        ``(n_windows, input_len, n_columns)`` and
        ``(n_windows, output_len, n_label_columns)``. ``n_windows`` is
        ``len(df) - input_len - shift + 1`` (0 if the frame is too short).

    Raises:
        ValueError: If ``output_len`` exceeds ``input_len + shift``.
    """
    window_len = input_len + shift
    if output_len > window_len:
        raise ValueError('output_len ({}) cannot exceed input_len + shift ({})'.format(
            output_len, window_len))

    features = np.array(df)
    targets = np.array(df[label_columns])

    # The last valid start row is len(df) - window_len, so there is one more
    # window than the former range(len(df) - input_len - shift) produced.
    num_windows = max(len(df) - window_len + 1, 0)
    if num_windows == 0:
        # Keep the trailing dimensions so callers can still rely on the shape.
        empty_x = np.empty((0, input_len) + features.shape[1:], dtype=features.dtype)
        empty_y = np.empty((0, output_len) + targets.shape[1:], dtype=targets.dtype)
        return empty_x, empty_y

    data_x = np.stack([features[i:i + input_len] for i in range(num_windows)])
    data_y = np.stack([targets[i + window_len - output_len:i + window_len]
                       for i in range(num_windows)])
    return data_x, data_y

Normalize data and split into train and test set.

In [None]:
# Scale every column of the daily frame to [0, 1].
# NOTE(review): the scaler is fitted on the full date range, so the test
# period's min/max influence the scaling of the training data.
columns_name = daily.columns
scaler = preprocessing.MinMaxScaler()
btc_scaled = daily.copy()
print(btc_scaled.head())
scaler = scaler.fit(daily)
btc_scaled[columns_name] = scaler.transform(daily)
print(btc_scaled.head())

# Chronological 60 / 20 / 20 split (shuffle=False): train is the oldest data,
# then validation, then test. A random split of a time series destroys the row
# order that window generators rely on, and, for the overlapping windows in
# X / y, places near-identical samples on both sides of the split, which
# inflates validation and test scores.
train, test = train_test_split(btc_scaled, test_size=0.2, shuffle=False)
train, validate = train_test_split(train, test_size=0.25, shuffle=False)
X,y = convert_fit_data(btc_scaled, label_columns, Tx, Ty, shift)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.25, shuffle=False)

# save
joblib.dump(scaler, dataset_scaler_path)
saved_data = {'X_train':X_train,'y_train':y_train,'X_val':X_validate,'y_val':y_validate, 'X_test':X_test,'y_test':y_test}
# np.save stores the dict as a pickled object array; load it back with
# np.load(path, allow_pickle=True).item().
np.save(preprocessed_dataset_path, saved_data)
#
print('train size: ' + str(X_train.shape))
print('train label size: ' + str(y_train.shape))
print('validate size: ' + str(X_validate.shape))
print('test size: ' + str(X_test.shape))


Build, train, and validate the model.

In [None]:
def _inverse_scale(fitted_scaler, values):
  """Undo min-max scaling on an (N, steps, features) array in a single call."""
  flat = values.reshape(-1, values.shape[-1])
  return fitted_scaler.inverse_transform(flat).reshape(values.shape)


# Save the model whenever the validation loss improves. Validation data is
# passed to fit() below, so 'val_loss' is available; monitoring the training
# loss would keep the epoch that fits the training set best instead.
checkpoint = ModelCheckpoint(trained_model_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
model = BTC_daily_forecasting_multi_steps_model(input_features, Tx, output_features, Ty, LSTM_units, LSTM_dropout, dropout_rate)
model.summary()
tf.keras.utils.plot_model(model, to_file=trained_model_figure_path, show_shapes=True, show_layer_names=True, expand_nested=False, dpi=96)
history = model.fit(X_train, y_train,epochs=num_epochs, batch_size=BATCH_SIZE, callbacks=callbacks_list, shuffle=True, validation_data=(X_validate, y_validate))

# Learning curves.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# predict on train/test dataset
test_pred = model.predict(X_test)
train_pred = model.predict(X_train)
test_true = np.copy(y_test).reshape(test_pred.shape)
train_true = np.copy(y_train).reshape(train_pred.shape)
# MSE is computed on the scaled values, before converting back to prices.
train_mse = tf.keras.losses.MeanSquaredError()(train_true, train_pred).numpy()
test_mse = tf.keras.losses.MeanSquaredError()(test_true, test_pred).numpy()

# transform to original value
scaler = joblib.load(dataset_scaler_path)
# The scaler was fitted on every column of `daily`. For the label arrays, build
# a scaler that carries only the label columns' parameters, looked up by name
# instead of assuming the label is column 0.
label_positions = [daily.columns.get_loc(name) for name in label_columns]
inverse_scaler = preprocessing.MinMaxScaler()
inverse_scaler.min_, inverse_scaler.scale_ = scaler.min_[label_positions], scaler.scale_[label_positions]

X_train_org = _inverse_scale(scaler, X_train)
X_test_org = _inverse_scale(scaler, X_test)
test_pred = _inverse_scale(inverse_scaler, test_pred)
test_true = _inverse_scale(inverse_scaler, test_true)
train_pred = _inverse_scale(inverse_scaler, train_pred)
train_true = _inverse_scale(inverse_scaler, train_true)

# R2 is computed on the de-scaled values of the first label column.
test_r2 = r2_score(test_true[:,:,0], test_pred[:,:,0])
train_r2 = r2_score(train_true[:,:,0], train_pred[:,:,0])
#
font = {'family': 'serif',
        'color':  'darkred',
        'weight': 'normal',
        'size': 16,
        }
f, ax = plt.subplots(figsize=(20,10))
# Day positions inside one window: inputs cover 0..Tx-1 and labels cover the
# last Ty positions of the Tx+shift window. Deriving the label positions from
# `shift` keeps the plot truthful for any shift, not only shift == Ty.
input_days = np.arange(Tx)
label_days = np.arange(Tx + shift - Ty, Tx + shift)
price_col = label_positions[0]
rand_idx = random.sample(range(len(y_test)), min(3, len(y_test)))
for i, sample in enumerate(rand_idx):
  ax.plot(input_days, X_test_org[sample,:,price_col], 'C{n}+-'.format(n=i), label='input of test sample {spl}'.format(spl=sample))
  ax.plot(label_days, test_true[sample,:,0], 'C{n}o'.format(n=i), label='true output of test sample {spl}'.format(spl=sample))
  ax.plot(label_days, test_pred[sample,:,0], 'C{n}o'.format(n=i), markerfacecolor='none', label='pred output of test sample {spl}'.format(spl=sample))

ax.text(0.3, 0.95, "train MSE = {mse}".format(mse = round(train_mse, 5)), fontdict=font, transform=ax.transAxes)
ax.text(0.3, 0.90, "train R2 = {r2_score}".format(r2_score = round(train_r2, 5)), fontdict=font, transform=ax.transAxes)
ax.text(0.3, 0.85, "test MSE = {mse}".format(mse = round(test_mse, 5)), fontdict=font, transform=ax.transAxes)
ax.text(0.3, 0.80, "test R2 = {r2_score}".format(r2_score = round(test_r2, 5)), fontdict=font, transform=ax.transAxes)
ax.set_xlabel('Day')
ax.set_ylabel('Weighted price($)')
ax.legend()
plt.show()
