# LSTM

In [None]:
#import os
import gc
#from datetime import datetime
#import time

#import math
#from itertools import product

import numpy as np
import pandas as pd

#from statsmodels.tsa.ar_model import AutoReg
#from statsmodels.tsa.arima_model import ARMA, ARIMA
#from statsmodels.tsa.stattools import adfuller
#from statsmodels.tsa.statespace.sarimax import SARIMAX
#from statsmodels.tsa.seasonal import seasonal_decompose
#from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
#import statsmodels.api as sm

#from sklearn.model_selection import KFold
#from sklearn.metrics import mean_squared_error
#from sklearn.preprocessing import RobustScaler, StandardScaler

#from scipy import stats

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
#from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# https://github.com/philipperemy/keras-tcn
#from tcn import TCN

import tensorflow_probability as tfp

#from plotly.subplots import make_subplots
#import plotly.express as px
#import plotly.graph_objects as go
import matplotlib.pyplot as plt
#import seaborn as sns

In [None]:
# Root directory that contains the competition data folder; the commented-out
# line is the alternative location under /kaggle/input/.
#data_path = '/kaggle/input/'
data_path = '../data/'

## Artificial Neural Networks Methods

In [None]:
# Notebook-wide settings used by the data preparation, generators and models.
DEBUG = True  # NOTE(review): not referenced anywhere in the visible notebook
N_ASSETS = 1  # size of the asset axis in the reshaped data and in the model
WINDOW_SIZE = 15  # number of consecutive rows in one input window
BATCH_SIZE = 1024  # number of windows per generator batch
PCT_VALIDATION = 10 # last 10% of the data are used as validation set

### Data Preparation (Building the Time Series Model)

#### Loading

Loading data and converting timestamps

In [None]:
# Asset metadata table; its Asset_ID / Asset_Name columns are used later to
# label the evaluation output.
asset_details = pd.read_csv(data_path + 'g-research-crypto-forecasting/asset_details.csv')

In [None]:
# Display the metadata ordered by Asset_ID (the result is not assigned, so
# asset_details itself keeps its original order).
asset_details.sort_values(by='Asset_ID')

In [None]:
# Raw training table for all assets; it is narrowed to a single asset below.
train = pd.read_csv(data_path + "g-research-crypto-forecasting/train.csv")

In [None]:
# Show the column names of the raw training table.
train.columns

In [None]:
# Keep only the rows of Asset_ID 1. The explicit copy makes `train` an
# independent frame, so the column assignments in the following cells
# (timestamp conversion, is_real flag, engineered features) write to it
# reliably instead of triggering pandas' SettingWithCopy warning.
train = train[train.Asset_ID == 1].copy()

In [None]:
# Convert timestamp: the raw values are interpreted as seconds (unit='s') and
# replaced by pandas datetimes.
train['timestamp'] = pd.to_datetime(train['timestamp'], unit='s')
# Display the frame as the cell output.
train

#### Reindexing and sorting

Asset-wise reindexing, performing a forward fill when possible, else a backfill. We create a column `is_real` to note whether a row exists in the original dataset or was added by the reindexing.

In [None]:
# maybe target should not be filled. The filling has to be re-examined

In [None]:
# Use the timestamp as the index (the column itself is kept) and order the
# rows chronologically.
train = train.set_index('timestamp', drop=False).sort_index()
# Every row present at this point comes from the original data.
train['is_real'] = True
# Distinct timestamps; reindex() reads the overall time range from this.
ind = train.index.unique()

def reindex(df, start=None, end=None):
    """Put one asset's rows on a gap-free one-minute grid and fill the gaps.

    Args:
        df: Frame indexed by timestamp that contains an ``is_real`` column.
        start: First timestamp of the grid. Defaults to ``ind.min()``, the
            earliest timestamp of the module-level index ``ind``.
        end: Last timestamp of the grid. Defaults to ``ind.max()``.

    Returns:
        A new frame with one row per minute between ``start`` and ``end``.
        Rows added by the reindexing have ``is_real == False``; their other
        columns are forward-filled, then back-filled. ``timestamp`` is
        overwritten with the new index.
    """
    # Fall back to the notebook-wide time range when no bounds are passed.
    if start is None:
        start = ind.min()
    if end is None:
        end = ind.max()

    res = df.reindex(pd.date_range(start, end, freq='min'))
    # Added rows hold NaN here; comparing with True maps them to False and
    # gives a real boolean column. Assigning the result back (rather than a
    # chained in-place fillna) keeps working under pandas copy-on-write.
    res['is_real'] = res['is_real'].eq(True)
    res['timestamp'] = res.index
    # fillna(method=...) is deprecated; ffill()/bfill() are the replacements.
    res = res.ffill().bfill()
    return res

# Reindex each asset separately, drop the group-key index level that
# groupby/apply adds, then order the rows by time and asset.
train = (
    train.groupby('Asset_ID')
    .apply(reindex)
    .reset_index(level=0, drop=True)
    .sort_values(by=['timestamp', 'Asset_ID'])
)
gc.collect()

In [None]:
# Preview the first 10 rows after reindexing and filling.
train.head(10)

#### VWAP clipping

In [None]:
# Largest / smallest finite VWAP; only the VWAP column is selected, not the
# whole frame.
VWAP_max = train.loc[np.isfinite(train['VWAP']), 'VWAP'].max()
VWAP_min = train.loc[np.isfinite(train['VWAP']), 'VWAP'].min()
# Replace +inf / -inf by those finite extremes (nan_to_num also maps any
# remaining NaN to its default replacement).
train['VWAP'] = np.nan_to_num(train['VWAP'], posinf=VWAP_max, neginf=VWAP_min)

#### Feature engineering

In [None]:
def feature_eng(_df, row = False):
    """Add engineered candle features to ``_df`` in place and return it.

    Args:
        _df: Frame with ``Open``, ``High``, ``Low``, ``Close``, ``Volume``
            and ``Count`` columns. It is modified directly (no copy is made).
        row: Unused.

    Returns:
        The same ``_df`` object with the new feature columns appended.
    """
    open_, high, low, close = (_df[name] for name in ('Open', 'High', 'Low', 'Close'))
    volume, count = _df['Volume'], _df['Count']
    ohlc = _df[['Open', 'High', 'Low', 'Close']]

    # Absolute candle geometry.
    _df['Spread'] = high - low
    _df['Close-Open'] = close - open_
    _df['Upper_Shadow'] = high - np.maximum(close, open_)
    _df['Lower_Shadow'] = np.minimum(close, open_) - low

    # Price ratios, each followed by its logarithm.
    for label, ratio in (('High/Low', high / low), ('Close/Open', close / open_)):
        _df[label] = ratio
        _df['log_' + label] = np.log(ratio)

    # Volume per trade (+1 in the denominator avoids division by zero) and
    # log-compressed activity measures.
    _df['Volume/Count'] = volume / (count + 1)
    _df['LOGVOL'] = np.log(1. + volume)
    _df['LOGCNT'] = np.log(1. + count)

    # Row-wise centre of the four prices, with High and Low relative to it.
    for label, center in (('Mean', ohlc.mean(axis=1)), ('Median', ohlc.median(axis=1))):
        _df[label] = center
        _df['High/' + label] = high / center
        _df['Low/' + label] = low / center

    # NOTE: experimental features that were disabled in the original code:
    #   RNG = (High - Low) / VWAP
    #   MOV = (Close - Open) / VWAP
    #   CLS = (Close - VWAP) / VWAP
    #   gtrade  = Close-Open / Count
    #   shadow1 = Close-Open / (Volume + 1)
    #   shadow3 = Upper_Shadow / (Volume + 1)
    #   shadow5 = Lower_Shadow / (Volume + 1)
    #   diff1 = Volume - Count
    #   mean1 = (shadow5 + shadow3) / 2
    #   mean2 = (shadow1 + Volume) / 2
    #   mean3 = (Close-Open + gtrade) / 2
    #   mean4 = (diff1 + Upper_Shadow) / 2
    #   mean5 = (diff1 + Lower_Shadow) / 2

    return _df

Get the features

In [None]:
# feature_eng adds the columns to the frame it receives and returns that same
# frame, so this rebinding keeps `train` pointing at the enriched data.
train = feature_eng(train)

some params needed for the model, sampling, etc.

In [None]:
# Targets as a 2-D array with N_ASSETS columns.
# NOTE(review): Target values of rows added by reindex() were forward/back-
# filled like every other column - confirm that is acceptable for training.
targets = train['Target'].to_numpy().reshape(-1, N_ASSETS)

In [None]:
# Model inputs: every column except the asset id, the label, the timestamp
# and the is_real flag.
feature_cols = train.columns.drop(['Asset_ID', 'Target', 'timestamp', 'is_real'])

In [None]:
# Show the selected feature column names.
feature_cols

In [None]:
# Number of input features per time step and asset.
len(feature_cols)

#### Managing added timestamps

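Features of non-real rows (those added by the reindexing) are meant to be set to 0 so that the model's `Masking` layer skips them. Note that the cell below only previews the data; no zeroing step is performed in this notebook as written.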

In [None]:
# Preview the first 10 rows including the engineered features.
# NOTE(review): nothing in this cell sets the features of is_real == False
# rows to 0 - confirm whether that step was dropped intentionally.
train.head(10)

#### Generating training samples

In [None]:
# Feature matrix as a NumPy array, then split into an explicit asset axis:
# (rows, N_ASSETS, number of features).
train_data = train[feature_cols].to_numpy()
train_data = train_data.reshape(-1, N_ASSETS, len(feature_cols))

In [None]:
# Shape after the reshape above: (rows, N_ASSETS, number of features).
train_data.shape

Samples with a duration of WINDOW_SIZE records (minutes) will be formed from the train array. Each sample has a target vector corresponding to the final index of its WINDOW_SIZE-record window.

In [None]:
class sample_generator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of sliding windows.

    Window ``k`` is ``x_set[k : k + length]`` and its label is
    ``y_set[k + length - 1]``, i.e. the target of the window's last row.

    Args:
        x_set: Array-like indexed by time step along its first axis.
        y_set: Array-like of targets aligned row-for-row with ``x_set``.
        batch_size: Maximum number of windows per batch.
        length: Number of consecutive rows in one window.
    """

    def __init__(self, x_set, y_set, batch_size, length):
        # Initialise the Keras base class (newer Keras versions warn when a
        # subclass skips this call).
        super().__init__()
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.length = length
        self.size = len(x_set)

    def _n_windows(self):
        """Return how many complete windows fit into the data."""
        return max(self.size - self.length + 1, 0)

    def __len__(self):
        """Return the number of non-empty batches."""
        # Count windows, not rows: counting rows could add a final batch
        # without any sample when size % batch_size is smaller than length.
        return int(np.ceil(self._n_windows() / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return ``(windows, labels)`` for batch ``idx``.

        The last batch may hold fewer than ``batch_size`` windows.
        """
        first = self.batch_size * idx
        last = min(first + self.batch_size, self._n_windows())
        starts = range(first, last)
        batch_x = [self.x[s:s + self.length] for s in starts]
        batch_y = [self.y[s + self.length - 1] for s in starts]
        return np.array(batch_x), np.array(batch_y)

In [None]:
# PCT_VALIDATION is a percentage: hold out the last ceil(len * pct / 100)
# rows. The former `-len // PCT_VALIDATION` used the value as a divisor,
# which is only equivalent when it is exactly 10 (where both give the same
# split).
n_val = -(-len(train_data) * PCT_VALIDATION // 100)
# A positive split position avoids the `[:-0]` pitfall when n_val is 0.
split_at = len(train_data) - n_val
X_train, X_test = train_data[:split_at], train_data[split_at:]
y_train, y_test = targets[:split_at], targets[split_at:]

In [None]:
# Sliding-window batch generators for the training and validation parts.
train_generator = sample_generator(X_train, y_train, batch_size=BATCH_SIZE, length=WINDOW_SIZE)
val_generator = sample_generator(X_test, y_test, batch_size=BATCH_SIZE, length=WINDOW_SIZE)
# Build the first batch once and report the shapes of its two parts.
_first_x, _first_y = train_generator[0]
print(f'Sample shape: {_first_x.shape}')
print(f'Target shape: {_first_y.shape}')

### Metrics and Loss functions

In [None]:
#Correlations for predicted and real
def MaxCorrelation(y_true, y_pred):
    """Return minus the absolute correlation of predictions and targets.

    Minimising this value maximises the absolute correlation.
    NOTE(review): with sample_axis=None / event_axis=None tfp presumably
    pools all elements into one sample set - confirm against the tfp docs.
    """
    corr = tfp.stats.correlation(y_pred, y_true, sample_axis=None, event_axis=None)
    return tf.math.negative(tf.math.abs(corr))

def Correlation(y_true, y_pred):
    """Return the absolute correlation of predictions and targets.

    Used as a Keras metric; the sign of the correlation is discarded.
    """
    corr = tfp.stats.correlation(y_pred, y_true, sample_axis=None, event_axis=None)
    return tf.math.abs(corr)

#Masked losses
def masked_mse(y_true, y_pred):
    """Mean squared error over the entries whose target is non-zero.

    NOTE(review): if every target in a batch is 0 the masked tensors are
    empty - confirm the loss is well defined in that case.
    """
    keep = tf.math.not_equal(y_true, 0.)
    kept_true, kept_pred = (tf.boolean_mask(t, keep) for t in (y_true, y_pred))
    return tf.keras.losses.mean_squared_error(y_true=kept_true, y_pred=kept_pred)

def masked_mae(y_true, y_pred):
    """Mean absolute error over the entries whose target is non-zero.

    NOTE(review): if every target in a batch is 0 the masked tensors are
    empty - confirm the loss is well defined in that case.
    """
    keep = tf.math.not_equal(y_true, 0.)
    kept_true, kept_pred = (tf.boolean_mask(t, keep) for t in (y_true, y_pred))
    return tf.keras.losses.mean_absolute_error(y_true=kept_true, y_pred=kept_pred)

def masked_cosine(y_true, y_pred):
    """Keras cosine-similarity loss over the entries whose target is non-zero.

    Entries with a target of exactly 0 are removed from both tensors before
    the loss is computed.
    """
    keep = tf.math.not_equal(y_true, 0.)
    kept_true, kept_pred = (tf.boolean_mask(t, keep) for t in (y_true, y_pred))
    return tf.keras.losses.cosine_similarity(kept_true, kept_pred)

### LSTM

#### Multivariate 1-Layered LSTM with Global Average Pool

##### Model
The model is trained for at most `epochs` epochs (set in the training cell below). Early stopping monitors the validation loss and restores the weights of the best epoch. The training and validation loss of each epoch are displayed as text during training.

In [None]:
#Model
def get_model(n_assets=N_ASSETS):
    """Build and compile the one-layer LSTM model with global average pooling.

    Each asset gets its own branch (slice -> Masking -> BatchNormalization ->
    LSTM -> average over time); the branches are concatenated and passed
    through two Dense layers.

    Args:
        n_assets: Size of the asset axis of the input and number of outputs.

    Returns:
        A compiled ``keras.Model`` (Adam, ``masked_mae`` loss,
        ``Correlation`` metric).
    """
    # Build one sample batch only once to read (batch, window, asset, feature).
    sample_shape = train_generator[0][0].shape
    x_input = keras.Input(shape=(sample_shape[1], n_assets, sample_shape[-1]))
    branch_outputs = []

    for i in range(n_assets):
        # Bind i as a default argument: a plain closure would look i up when
        # the layer is executed again later, so every branch would slice the
        # last asset.
        a = layers.Lambda(lambda x, i=i: x[:, :, i])(x_input)  # slice the ith asset
        a = layers.Masking(mask_value=0.)(a)
        a = tf.keras.layers.BatchNormalization()(a)
        a = layers.LSTM(units=32, return_sequences=True)(a)
        # Average over the time axis so each branch yields one vector per
        # sample; without it the model emits one prediction per time step,
        # which does not match the (batch, n_assets) targets.
        a = layers.GlobalAvgPool1D()(a)
        branch_outputs.append(a)

    x = layers.Concatenate()(branch_outputs)
    # NOTE(review): no activation here, so the two Dense layers are jointly
    # linear - confirm this is intended.
    x = layers.Dense(units=128)(x)
    out = layers.Dense(units=n_assets)(x)
    model = keras.Model(inputs=x_input, outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=masked_mae, metrics=[Correlation])
    return model

# Instantiate the model with the default number of assets and print its
# layer summary.
model = get_model()
model.summary()

In [None]:
# Draw the layer graph of a freshly built single-asset model (a new instance,
# not the `model` created above).
# NOTE(review): plot_model presumably needs pydot/graphviz installed - confirm.
tf.keras.utils.plot_model(get_model(n_assets=1), show_shapes=True)

In [None]:
# NOTE(review): the seed is set after `model` was created in an earlier cell,
# so it presumably does not cover that model's weight initialisation - confirm.
tf.random.set_seed(0)
# Stop after 7 epochs without val_loss improvement and restore the best weights.
estop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=0, mode='min', restore_best_weights=True)
scheduler = keras.optimizers.schedules.ExponentialDecay(1e-3, (0.5 * len(X_train) / BATCH_SIZE), 1e-3)
# LearningRateScheduler expects a callable (epoch, lr) -> float. Passing the
# schedule object itself only worked through the callback's single-argument
# fallback and handed it a tensor; the wrapper gives the documented signature
# and return type while producing the same values.
# NOTE(review): the schedule is evaluated with the epoch index although its
# decay_steps is derived from batches per epoch - confirm the intended decay.
lr = keras.callbacks.LearningRateScheduler(lambda epoch, _lr: float(scheduler(epoch)), verbose=1)

##### Training and testing

In [None]:
# Upper bound on the number of epochs; early stopping may end training sooner.
epochs = 200
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    callbacks=[lr, estop],
)

In [None]:
def plot_training_history(history):
    """Plot loss and correlation curves of a Keras training run side by side.

    Args:
        history: Object whose ``history`` attribute maps the names ``loss``,
            ``val_loss``, ``Correlation`` and ``val_Correlation`` to
            per-epoch values (as returned by ``model.fit``).
    """
    fig, (loss_ax, corr_ax) = plt.subplots(1, 2, figsize=(16, 8))
    histories = pd.DataFrame(history.history)
    epochs = list(range(1, len(histories) + 1))

    # Look up all four curves before drawing anything.
    curves = {
        name: histories[name]
        for name in ('loss', 'val_loss', 'Correlation', 'val_Correlation')
    }
    panels = (
        (loss_ax, 'Losses', (('loss', 'Train Loss'), ('val_loss', 'Val Loss'))),
        (corr_ax, 'Correlations', (('Correlation', 'Train Correlation'),
                                   ('val_Correlation', 'Val Correlation'))),
    )
    for axis, title, members in panels:
        for key, label in members:
            axis.plot(epochs, curves[key], label=label)
        axis.set_title(title)
        axis.set_xlabel('Epoch')
        axis.legend(loc='upper right')
    fig.show()
gc.collect()

In [None]:
# Plot the loss and correlation curves of the run above.
plot_training_history(history)

In [None]:
# Predict on every validation window produced by val_generator.
predictions = model.predict(val_generator)

In [None]:
def prediction_details(predictions, y_test, asset_details, assets):
    """Print the per-asset correlation and plot predictions against targets.

    Args:
        predictions: Model output; column ``i`` belongs to the i-th asset.
        y_test: Validation targets with the same column layout.
        asset_details: Frame with ``Asset_ID`` and ``Asset_Name`` columns.
        assets: Asset_ID values, in column order.

    The correlation only uses the positions whose target is non-zero.
    """
    print('Asset:    Corr. coef.')
    print('---------------------')
    for col, asset_id in enumerate(assets):
        # The first WINDOW_SIZE - 1 targets have no label in val_generator
        # (no complete window ends there), so they are skipped.
        actual = np.squeeze(y_test[WINDOW_SIZE - 1:, col])
        predicted = np.squeeze(predictions[:, col])
        real_positions = np.flatnonzero(actual != 0)
        name_matches = asset_details.loc[asset_details.Asset_ID == asset_id, 'Asset_Name']
        asset_name = name_matches.item()
        corr = np.corrcoef(predicted[real_positions], actual[real_positions])[0, 1]
        print(f"{asset_name}: {corr:.4f}")
        plt.plot(actual, label='Target')
        plt.plot(predicted, label='Prediction')
        plt.xlabel('Time')
        plt.ylabel('Target')
        plt.title(asset_name)
        plt.legend()
        plt.show()

In [None]:
# `assets` are the distinct Asset_ID values found in train, cast to int so
# they can be matched against asset_details.Asset_ID.
prediction_details(predictions=predictions, y_test=y_test, asset_details=asset_details, assets=train.Asset_ID.astype(int).unique())

#### Multivariate 2-Layered Bidirectional LSTM

##### Model

In [None]:
#Model
def get_model(n_assets=N_ASSETS):
    """Build and compile the two-layer bidirectional LSTM model.

    Each asset gets its own branch (slice -> Masking -> BatchNormalization ->
    Bidirectional LSTM(32, sequences) -> Bidirectional LSTM(16)); the
    branches are concatenated and passed through two Dense layers.

    Args:
        n_assets: Size of the asset axis of the input and number of outputs.

    Returns:
        A compiled ``keras.Model`` (Adam, ``masked_mse`` loss,
        ``Correlation`` metric).
    """
    # Build one sample batch only once to read (batch, window, asset, feature).
    sample_shape = train_generator[0][0].shape
    x_input = keras.Input(shape=(sample_shape[1], n_assets, sample_shape[-1]))
    branch_outputs = []

    for i in range(n_assets):
        # Bind i as a default argument: a plain closure would look i up when
        # the layer is executed again later, so every branch would slice the
        # last asset.
        a = layers.Lambda(lambda x, i=i: x[:, :, i])(x_input)  # slice the ith asset
        a = layers.Masking(mask_value=0.)(a)
        a = tf.keras.layers.BatchNormalization()(a)
        a = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(a)
        # The second recurrent layer returns only its final state, so each
        # branch already yields one vector per sample.
        a = layers.Bidirectional(layers.LSTM(16))(a)
        branch_outputs.append(a)

    x = layers.Concatenate()(branch_outputs)
    # NOTE(review): no activation here, so the two Dense layers are jointly
    # linear - confirm this is intended.
    x = layers.Dense(units = 128)(x)
    out = layers.Dense(units = n_assets)(x)
    model = keras.Model(inputs=x_input, outputs=out)
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3), loss=masked_mse, metrics=[Correlation])
    return model

# Replace `model` with a fresh instance of the bidirectional architecture and
# print its layer summary.
model = get_model()
model.summary()

In [None]:
# Draw the layer graph of a freshly built single-asset model (a new instance,
# not the `model` created above).
# NOTE(review): plot_model presumably needs pydot/graphviz installed - confirm.
tf.keras.utils.plot_model(get_model(n_assets=1), show_shapes=True)

In [None]:
# NOTE(review): the seed is set after `model` was created in an earlier cell,
# so it presumably does not cover that model's weight initialisation - confirm.
tf.random.set_seed(0)
# Stop after 7 epochs without val_loss improvement and restore the best weights.
estop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=0, mode='min', restore_best_weights=True)
scheduler = keras.optimizers.schedules.ExponentialDecay(1e-3, (0.5 * len(X_train) / BATCH_SIZE), 1e-3)
# LearningRateScheduler expects a callable (epoch, lr) -> float. Passing the
# schedule object itself only worked through the callback's single-argument
# fallback and handed it a tensor; the wrapper gives the documented signature
# and return type while producing the same values.
# NOTE(review): the schedule is evaluated with the epoch index although its
# decay_steps is derived from batches per epoch - confirm the intended decay.
lr = keras.callbacks.LearningRateScheduler(lambda epoch, _lr: float(scheduler(epoch)), verbose=1)

##### Training and testing

In [None]:
# Upper bound on the number of epochs; early stopping may end training sooner.
epochs = 200
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    callbacks=[lr, estop],
)

In [None]:
# Plot the loss and correlation curves of the bidirectional run.
plot_training_history(history)

In [None]:
# Predict on every validation window produced by val_generator.
predictions = model.predict(val_generator)

In [None]:
# Pass the real Asset_ID values, as in the first evaluation. range(N_ASSETS)
# yields 0, but train only holds Asset_ID 1, so the report and plot title
# would carry the name of the wrong asset.
prediction_details(predictions=predictions, y_test=y_test, asset_details=asset_details, assets=train.Asset_ID.astype(int).unique())