In [13]:
# importing required libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from pandas.plotting import lag_plot
from pandas import datetime, read_csv
! pip install scikit-learn
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sys

# for ARIMA
! pip install pmdarima
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller

# for LSTM
from math import sqrt
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
import seaborn as sns

from numpy import concatenate
from matplotlib import pyplot
from pandas import DataFrame
from pandas import concat



In [14]:
# Notebook-wide configuration and raw data loading.

# Window lengths (number of preceding observations) evaluated by both models
lags_arr = [24, 48, 72]

# Ticker symbols; each one is expected to have a matching "<SYMBOL>.csv" file
# in the current working directory
token_symbols = [
  'UNI', 'LINK', 'AAVE', 'MKR', 'LEO', 'COMP', 'GRT', 'HT', 'CEL',
  'CHZ', 'TEL', 'YFI', 'HOT', 'ENJ', 'MANA', 'QNT', 'BAT', 'SNX', 'NEXO',
  'BNT', 'CRV', 'CHSB', 'KCS', 'ZRX', 'UMA', 'ANKR', 'VGX', '1INCH',
]
number_of_tokens = len(token_symbols)

# One raw DataFrame per token, in the same order as token_symbols
raw_datasets = [read_csv(ticker + '.csv') for ticker in token_symbols]

In [15]:
# Scale every token's data and split it chronologically into training and test parts.
# The scaler is fitted on the training rows only and then applied to the whole series,
# so the test rows never influence the scaling parameters.
training_set_ratio = 0.80
data_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Trades']
scalers = []
training_datasets, test_datasets = [], []
training_datasets_dates, test_datasets_dates = [], []

for raw_dataset in raw_datasets:
  # number of leading rows that belong to the training part
  training_set_size = int(len(raw_dataset) * training_set_ratio)

  # timestamps are kept aside, split at the same row as the values
  training_datasets_dates.append(raw_dataset.iloc[:training_set_size]['Time'])
  test_datasets_dates.append(raw_dataset.iloc[training_set_size:]['Time'])

  # numeric feature columns only
  features = raw_dataset[data_columns].astype('float')

  # fit on the training rows, transform all rows
  scaler = MinMaxScaler()
  scaler.fit(features.iloc[:training_set_size])
  scaled_features = pd.DataFrame(scaler.transform(features), columns=data_columns)
  scalers.append(scaler)

  # chronological split of the scaled values
  training_datasets.append(scaled_features.iloc[:training_set_size])
  test_datasets.append(scaled_features.iloc[training_set_size:])

#ARIMA

In [16]:
# Select the (p, d, q) order of each token's ARIMA model with a stepwise auto-ARIMA search
# on the scaled training 'Close' series.
# documentation - http://alkaline-ml.com/pmdarima/1.8.1/modules/generated/pmdarima.arima.auto_arima.html#pmdarima.arima.auto_arima
ARIMA_params = []

# Search settings shared by every token
auto_arima_options = dict(
  test='adf',              # use adftest to find optimal 'd'
  d=None,                  # let the model determine 'd'
  start_p=1, start_q=1,
  max_p=3, max_q=3,        # maximum p=3 and q=3
  # trace=True,            # to see logs in the process
  error_action='ignore',
  suppress_warnings=True,
  stepwise=True,
)

for token, training_dataset in zip(token_symbols, training_datasets):
  best_model = pm.auto_arima(training_dataset['Close'], **auto_arima_options)
  print(token + ' - ' + str(best_model.order))
  ARIMA_params.append(best_model.order)

ENJ - (3, 1, 2)


In [None]:
# To ignore unrelated warnings
import warnings
warnings.filterwarnings("ignore")

# Walk-forward ARIMA forecasting: for every test point a fresh model is fitted on the
# window of scaled close prices that precedes it, and a one-step forecast is stored.
# Result layout: ARIMA_predictions_for_all_lags[lag_index][token_index][test_point_index]
ARIMA_predictions_for_all_lags = []

for lags in lags_arr:
  print('{} lags:'.format(lags))
  ARIMA_predictions = []

  for i in range(number_of_tokens):
    test_close = test_datasets[i]['Close'].to_numpy().tolist()
    train_close = training_datasets[i]['Close'].to_numpy().tolist()
    n_test_points = len(test_close)

    # Window for test point j: the values preceding it. Early test points do not have
    # `lags` test values before them yet, so the window is topped up from the tail of
    # the training series.
    windows = [
      test_close[(j - lags):j] if j >= lags
      else train_close[-(lags - j):] + test_close[0:j]
      for j in range(n_test_points)
    ]

    curr_predictions = []
    for j, window in enumerate(windows):
      # single-line progress indicator, rewritten in place via '\r'
      sys.stdout.write('\r' + '{} - {}/{}'.format(token_symbols[i], j, n_test_points))
      sys.stdout.flush()
      fitted_model = ARIMA(window, order=ARIMA_params[i]).fit()
      # forecast() with default arguments yields the next step; keep its first element
      curr_predictions.append(fitted_model.forecast()[0])

    ARIMA_predictions.append(curr_predictions)
    print(' - done')

  ARIMA_predictions_for_all_lags.append(ARIMA_predictions)
  print()

#LSTM

In [18]:
# prepare data
# Builds the LSTM inputs/targets for every lag setting and token.
# Result layout (outer index = position in lags_arr, inner index = token):
#   training_X_arr[l][i] : array of windows, each `lags` rows x all feature columns
#   training_y_arr[l][i] : scaled 'Close' value that follows each training window
#   test_X_arr[l][i]     : one window per test row, holding the rows that precede it
training_y_arr, training_X_arr, test_X_arr = [], [], []

# Convert each DataFrame to a NumPy matrix once. Slicing a DataFrame and calling
# .to_numpy().tolist() for every single window is loop-invariant work and dominates
# the runtime of this cell; slicing the matrices yields the same values.
training_matrices = [dataset.to_numpy() for dataset in training_datasets]
test_matrices = [dataset.to_numpy() for dataset in test_datasets]

for l in range(len(lags_arr)):
  lags = lags_arr[l]

  training_y = []
  training_X, test_X = [], []

  for i in range(number_of_tokens):
    train_matrix = training_matrices[i]
    test_matrix = test_matrices[i]

    # target of the window starting at row j is the close price at row j + lags
    training_y.append(training_datasets[i][lags:]['Close'].to_numpy())

    # sliding windows over the training rows
    curr_training_X = [train_matrix[j:j+lags] for j in range(0, len(train_matrix) - lags)]
    training_X.append(np.array(curr_training_X))

    curr_test_X = []
    for j in range(len(test_matrix)):
      if j >= lags:
        curr_test_X.append(test_matrix[(j-lags):j])
      else:
        # not enough preceding test rows yet: top the window up from the tail of the training rows
        curr_test_X.append(np.concatenate((train_matrix[-(lags-j):], test_matrix[0:j])))
    test_X.append(np.array(curr_test_X))

  training_y_arr.append(training_y)
  training_X_arr.append(training_X)
  test_X_arr.append(test_X)

In [None]:
# build models and predict test values
# One single-layer LSTM is trained per (lag setting, token) pair on the prepared windows
# and then used to predict every test point.
# Result layout: LSTM_predictions_for_all_lags[lag_index][token_index] -> 1-D array of predictions
LSTM_predictions_for_all_lags = []

# Scaling factor of the neuron-count rule of thumb used below
alpha = 2

# Fraction of the training windows held out for validation. Keras' `validation_split`
# is the share that is NOT trained on (taken from the end of the data), so the previous
# value of 0.9 trained each model on only the first 10% of its windows.
validation_fraction = 0.1

for l in range(len(lags_arr)):
  LSTM_predictions = []
  print('{} lags:'.format(lags_arr[l]))

  for i in range(number_of_tokens):
    # design network

    # determining number of neurons by a rule of thumb:
    #   samples / (alpha * (values per window + 1 output))
    # clamped to at least 1, because the integer truncation yields 0 for short
    # series / long windows and an LSTM layer with 0 units cannot be built
    shape = training_X_arr[l][i].shape
    neurons = max(1, int(shape[0] / (alpha * (shape[1] * shape[2] + 1))))

    model = Sequential()
    model.add(LSTM(neurons, input_shape=(shape[1], shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    # fit network
    # batch_size equal to the number of windows -> one batch per epoch;
    # shuffle=False keeps the chronological order of the windows
    history = model.fit(training_X_arr[l][i], training_y_arr[l][i], epochs=400, batch_size=len(training_X_arr[l][i]),
                        validation_split=validation_fraction, verbose=0, shuffle=False)

    # make a prediction
    yhat = model.predict(test_X_arr[l][i])
    LSTM_predictions.append(yhat.flatten('C'))
    print('{} - done, neurons - {}'.format(token_symbols[i], neurons))

  LSTM_predictions_for_all_lags.append(LSTM_predictions)
  print()

    # plot history
    # pyplot.plot(history.history['loss'], label='train')
    # pyplot.plot(history.history['val_loss'], label='test')
    # pyplot.legend()
    # pyplot.show()

#Calculate performance of ARIMA and LSTM

In [20]:
def calculate_MAPE(test_data, predicted_data):
    """Return the mean absolute percentage error (in percent).

    Parameters
    ----------
    test_data : array-like
        Actual values. Lists, tuples, pandas Series and NumPy arrays are accepted.
    predicted_data : array-like
        Predicted values, broadcast-compatible with ``test_data``.

    Returns
    -------
    float
        ``mean(|(actual - predicted) / actual|) * 100``.

    Notes
    -----
    The metric is undefined where an actual value is 0: NumPy's division then yields
    ``inf``/``nan`` and that value propagates into the result.
    """
    # Coerce to float arrays so that plain Python sequences work too
    # (list - list would raise a TypeError).
    actual = np.asarray(test_data, dtype=float)
    predicted = np.asarray(predicted_data, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

def calculate_performance(predictions, test_data, scaler, close_column_index=3):
  """Compute RMSE, MAE and MAPE between predictions and actual values on the original scale.

  Both inputs are expected in the scaler's transformed space and are mapped back
  with ``scaler.inverse_transform`` before the metrics are calculated.

  Parameters
  ----------
  predictions : array-like
      Scaled predicted values (1-D).
  test_data : array-like
      Scaled actual values (1-D), same length as ``predictions``.
  scaler : fitted scaler
      Object exposing ``inverse_transform`` and ``n_features_in_``.
  close_column_index : int, optional
      Position of the target column among the features the scaler was fitted on.
      Defaults to 3, the position of 'Close' in this notebook's ``data_columns``.

  Returns
  -------
  list
      ``[RMSE, MAE, MAPE]``.
  """
  def _rescale_back(scaled_values):
    # The scaler works on full feature rows, so the 1-D series is repeated into every
    # feature column; only the target column of the result is meaningful and kept.
    column = np.asarray(scaled_values, dtype=float).reshape(-1, 1)
    repeated = np.repeat(column, scaler.n_features_in_, axis=1)
    return scaler.inverse_transform(repeated)[:, close_column_index]

  # rescaling back predictions and test dataset
  predictions_scaled_back = _rescale_back(predictions)
  test_y_scaled_back = _rescale_back(test_data)

  # calculating performance metrics
  # The square root is taken explicitly: the `squared=False` argument of
  # mean_squared_error is deprecated and no longer exists in recent scikit-learn releases.
  RMSE = np.sqrt(mean_squared_error(test_y_scaled_back, predictions_scaled_back)) # Root mean square error
  MAE = mean_absolute_error(test_y_scaled_back, predictions_scaled_back) # Mean absolute error
  MAPE = calculate_MAPE(test_y_scaled_back, predictions_scaled_back) # Mean absolute percentage error
  return [RMSE, MAE, MAPE]

In [None]:
from google.colab import drive
from google.colab import files
drive.mount('/drive')

# Calculating and saving performance results
# For every lag setting: one row of ARIMA and LSTM metrics per token, printed,
# written to a CSV file and offered as a browser download.
result_columns = ['Token', 'ARIMA_RMSE','LSTM_RMSE', 'ARIMA_MAE', 'LSTM_MAE', 'ARIMA_MAPE', 'LSTM_MAPE']

for l in range(len(lags_arr)):
  lags = lags_arr[l]
  print('{} lags:'.format(lags))

  # Rows are collected in a list and turned into a DataFrame once:
  # DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
  result_rows = []

  for i in range(number_of_tokens):
    # calculating performance
    test_data = test_datasets[i]['Close'].to_numpy()
    ARIMA_performance = calculate_performance(ARIMA_predictions_for_all_lags[l][i], test_data, scalers[i])
    LSTM_performance = calculate_performance(LSTM_predictions_for_all_lags[l][i], test_data, scalers[i])

    result_rows.append({'Token': token_symbols[i],
                        'ARIMA_RMSE': ARIMA_performance[0], 'LSTM_RMSE': LSTM_performance[0],
                        'ARIMA_MAE': ARIMA_performance[1], 'LSTM_MAE': LSTM_performance[1],
                        'ARIMA_MAPE': ARIMA_performance[2], 'LSTM_MAPE': LSTM_performance[2]})

  results = DataFrame(result_rows, columns=result_columns)
  print(results)
  print()
  fileName = 'results_ARIMA_LSTM_{}_lags.csv'.format(lags)
  results.to_csv(fileName)
  files.download(fileName)

In [22]:
# test_set_range = df[int(len(df)*training_set_ratio):].index
# plt.plot(test_set_range, model_predictions, color='blue', marker='o', linestyle='dashed',label='Predicted Price')
# plt.plot(test_set_range, test_data, color='red', label='Actual Price')
# plt.title('UNI Prices Prediction')
# plt.xlabel('Date')
# plt.ylabel('Prices')
# plt.xticks(np.arange(3911,5587,200), df.Date[3911:5587:200])
# plt.xticks(rotation = 90)
# plt.rcParams["figure.figsize"] = (10,3)
# plt.legend()
# plt.show()