<a href="https://colab.research.google.com/github/yashgupta-7/legendre-forecasters/blob/main/da_rnn/darnn_lmu_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/kaelzhang/DA-RNN-in-Tensorflow-2-and-PyTorch/

Cloning into 'DA-RNN-in-Tensorflow-2-and-PyTorch'...
remote: Enumerating objects: 242, done.[K
remote: Counting objects: 100% (242/242), done.[K
remote: Compressing objects: 100% (169/169), done.[K
remote: Total 242 (delta 118), reused 183 (delta 59), pack-reused 0[K
Receiving objects: 100% (242/242), 7.32 MiB | 16.43 MiB/s, done.
Resolving deltas: 100% (118/118), done.


In [None]:
cd /content/DA-RNN-in-Tensorflow-2-and-PyTorch/

/content/DA-RNN-in-Tensorflow-2-and-PyTorch


In [54]:
# !pip install keras_lmu

In [55]:
from typing import Optional
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import (
    Layer,
    LSTM,
    Dense,
    Permute
)
from tensorflow.keras.models import Model
from da_rnn.common import (
    check_T
)
import keras_lmu

In [None]:
class InputAttention(Layer):
    T: int

    def __init__(self, T, **kwargs):
        """
        Calculates the encoder attention weight Alpha_t at time t
        Args:
            T (int): the size (time steps) of the window
        """

        super().__init__(name='input_attention', **kwargs)

        self.T = T

        self.W_e = Dense(T)
        self.U_e = Dense(T)
        self.v_e = Dense(1)

    def call(
        self,
        hidden_state,
        cell_state,
        X
    ):
        """
        Args:
            hidden_state: hidden state of shape (batch_size, m) at time t - 1
            cell_state: cell state of shape (batch_size, m) at time t - 1
            X: the n driving (exogenous) series of shape (batch_size, T, n)
        Returns:
            The attention weights (Alpha_t) at time t, i.e.
            (a_t__1, a_t__2, ..., a_t__n)
        """

        n = X.shape[2]

        # [h_t-1; s_t-1]
        hs = K.repeat(
            tf.concat([hidden_state, cell_state], axis=-1),
            # -> (batch_size, m * 2)
            n
        )
        # -> (batch_size, n, m * 2)

        tanh = tf.math.tanh(
            tf.concat([
                self.W_e(hs),
                # -> (batch_size, n, T)

                self.U_e(
                    Permute((2, 1))(X)
                    # -> (batch_size, n, T)
                ),
                # -> (batch_size, n, T)
            ], axis=-1)
            # -> (batch_size, n, T * 2)
        )
        # -> (batch_size, n, T * 2)

        # Equation 8:
        e = self.v_e(tanh)
        # -> (batch_size, n, 1)

        # Equation: 9
        return tf.nn.softmax(
            Permute((2, 1))(e)
            # -> (batch_size, 1, n)
        )
        # -> (batch_size, 1, n)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'T': self.T
        })
        return config

In [215]:
class Encoder(Layer):
    T: int
    m: int

    def __init__(
        self,
        T: int,
        m: int,
        lmu=False,
        **kwargs
    ):
        """
        Generates the new input X_tilde for encoder
        Args:
            T (int): the size (time steps) of the window
            m (int): the number of the encoder hidden states
        """

        super().__init__(name='encoder_input', **kwargs)

        self.T = T
        self.m = m
        self.lmu = lmu

        if lmu:
          self.input_lstm = keras_lmu.LMUCell(memory_d=128,order=4, #LMU
                theta=m,
                hidden_cell=tf.keras.layers.SimpleRNNCell(m),
                hidden_to_memory=False,
                memory_to_memory=False,
                input_to_hidden=True,
                kernel_initializer="ones",
               #return_sequences = False
            )
        else:
          self.input_lstm = LSTM(m, return_state=True)
        self.input_attention = InputAttention(T)

    def call(self, X) -> tf.Tensor:
        """
        Args:
            X: the n driving (exogenous) series of shape (batch_size, T, n)
        Returns:
            The encoder hidden state of shape (batch_size, T, m)
        """

        batch_size = K.shape(X)[0]

        hidden_state = tf.zeros((batch_size, self.m))
        cell_state = tf.zeros((batch_size, self.m))

        X_encoded = []

        for t in range(self.T):
            Alpha_t = self.input_attention(hidden_state, cell_state, X)

            # Equation 10
            X_tilde_t = tf.multiply(
                Alpha_t,
                # TODO:
                # make sure it can share the underlying data
                X[:, None, t, :]
            )
            # -> (batch_size, 1, n)

            # Equation 11
            if self.lmu:
              _ , final_state = self.input_lstm(
                X_tilde_t,
                states = hidden_state + [cell_state])
              
              hidden_state = final_state[:-1] #[:, :-128*4]
              cell_state = final_state[-1] #[:, -128*4:]
            else:
              hidden_state, _, cell_state = self.input_lstm(
                X_tilde_t,
                initial_state=[hidden_state, cell_state]
            )

            X_encoded.append(
                hidden_state[:, None, :]
                # -> (batch_size, 1, m)
            )

        return tf.concat(X_encoded, axis=1)
        # -> (batch_size, T, m)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'T': self.T,
            'm': self.m
        })
        return config

In [216]:
class TemporalAttention(Layer):
    m: int

    def __init__(self, m: int, **kwargs):
        """
        Calculates the attention weights::
            Beta_t = (beta_t__1, ..., beta_t__i, ..., beta_t__T) (1 <= i <= T)
        for each encoder hidden state h_t at the time step t
        Args:
            m (int): the number of the encoder hidden states
        """

        super().__init__(name='temporal_attention', **kwargs)

        self.m = m

        self.W_d = Dense(m)
        self.U_d = Dense(m)
        self.v_d = Dense(1)

    def call(
        self,
        hidden_state,
        cell_state,
        X_encoded
    ):
        """
        Args:
            hidden_state: hidden state `d` of shape (batch_size, p)
            cell_state: cell state `s` of shape (batch_size, p)
            X_encoded: the encoder hidden states (batch_size, T, m)
        Returns:
            The attention weights for encoder hidden states (beta_t)
        """

        # Equation 12
        l = self.v_d(
            tf.math.tanh(
                tf.concat([
                    self.W_d(
                        K.repeat(
                            tf.concat([hidden_state, cell_state], axis=-1),
                            # -> (batch_size, p * 2)
                            X_encoded.shape[1]
                        )
                        # -> (batch_size, T, p * 2)
                    ),
                    # -> (batch_size, T, m)
                    self.U_d(X_encoded)
                ], axis=-1)
                # -> (batch_size, T, m * 2)
            )
            # -> (batch_size, T, m)
        )
        # -> (batch_size, T, 1)

        # Equation 13
        return tf.nn.softmax(l, axis=1)
        # -> (batch_size, T, 1)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'm': self.m
        })
        return config


In [217]:
class Decoder(Layer):
    T: int
    m: int
    p: int
    y_dim: int

    def __init__(
        self,
        T: int,
        m: int,
        p: int,
        y_dim: int,
        **kwargs
    ):
        """
        Calculates y_hat_T
        Args:
            T (int): the size (time steps) of the window
            m (int): the number of the encoder hidden states
            p (int): the number of the decoder hidden states
            y_dim (int): prediction dimentionality
        """

        super().__init__(name='decoder', **kwargs)

        self.T = T
        self.m = m
        self.p = p
        self.y_dim = y_dim

        self.temp_attention = TemporalAttention(m)
        self.dense = Dense(1)
        self.decoder_lstm = LSTM(p, return_state=True)

        self.Wb = Dense(p)
        self.vb = Dense(y_dim)

    def call(self, Y, X_encoded) -> tf.Tensor:
        """
        Args:
            Y: prediction data of shape (batch_size, T - 1, y_dim) from time 1 to time T - 1. See Figure 1(b) in the paper
            X_encoded: encoder hidden states of shape (batch_size, T, m)
        Returns:
            y_hat_T: the prediction of shape (batch_size, y_dim)
        """

        batch_size = K.shape(X_encoded)[0]
        hidden_state = tf.zeros((batch_size, self.p))
        cell_state = tf.zeros((batch_size, self.p))

        # c in the paper
        context_vector = tf.zeros((batch_size, 1, self.m))
        # -> (batch_size, 1, m)

        for t in range(self.T - 1):
            Beta_t = self.temp_attention(
                hidden_state,
                cell_state,
                X_encoded
            )
            # -> (batch_size, T, 1)

            # Equation 14
            context_vector = tf.matmul(
                Beta_t, X_encoded, transpose_a=True
            )
            # -> (batch_size, 1, m)

            # Equation 15
            y_tilde = self.dense(
                tf.concat([Y[:, None, t, :], context_vector], axis=-1)
                # -> (batch_size, 1, y_dim + m)
            )
            # -> (batch_size, 1, 1)

            # Equation 16
            hidden_state, _, cell_state = self.decoder_lstm(
                y_tilde,
                initial_state=[hidden_state, cell_state]
            )
            # -> (batch_size, p)

        concatenated = tf.concat(
            [hidden_state[:, None, :], context_vector], axis=-1
        )
        # -> (batch_size, 1, m + p)

        # Equation 22
        y_hat_T = self.vb(
            self.Wb(concatenated)
            # -> (batch_size, 1, p)
        )
        # -> (batch_size, 1, y_dim)

        return tf.squeeze(y_hat_T, axis=1)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'T': self.T,
            'm': self.m,
            'p': self.p,
            'y_dim': self.y_dim
        })
        return config

In [218]:
class DARNN(Model):
    def __init__(
        self,
        T: int,
        m: int,
        p: Optional[int] = None,
        y_dim: int = 1,
        lmu=False
    ):
        """
        Args:
            T (int): the size (time steps) of the window
            m (int): the number of the encoder hidden states
            p (:obj:`int`, optional): the number of the decoder hidden states. Defaults to `m`
            y_dim (:obj:`int`, optional): prediction dimentionality. Defaults to `1`
        Model Args:
            inputs: the concatenation of
            - n driving series (x_1, x_2, ..., x_T) and
            - the previous (historical) T - 1 predictions (y_1, y_2, ..., y_Tminus1, zero)
        `inputs` Explanation::
            inputs_t = (x_t__1, x_t__2, ..., x_t__n, y_t__1, y_t__2, ..., y_t__d)
            where
            - d is the prediction dimention
            - y_T__i = 0, 1 <= i <= d.
            Actually, the model will not use the value of y_T
        Usage::
            model = DARNN(10, 64, 64)
            y_hat = model(inputs)
        """

        super().__init__(name='DARNN')

        check_T(T)

        self.T = T
        self.m = m
        self.p = p or m
        self.y_dim = y_dim

        self.encoder = Encoder(T, m, lmu)
        self.decoder = Decoder(T, m, p, y_dim=y_dim)

    # Equation 1
    def call(self, inputs):
        X = inputs[:, :, :-self.y_dim]
        # -> (batch_size, T, n)

        # Y's window size is one less than X's
        # so, abandon `y_T`

        # By doing this, there are some benefits which makes it pretty easy to
        # process datasets
        Y = inputs[:, :, -self.y_dim:]
        # -> (batch_size, T - 1, y_dim)

        X_encoded = self.encoder(X)

        y_hat_T = self.decoder(Y, X_encoded)
        # -> (batch_size, y_dim)

        return y_hat_T

    def get_config(self):
        return {
            'T': self.T,
            'm': self.m,
            'p': self.p,
            'y_dim': self.y_dim
        }

In [219]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [220]:
dataroot = '/content/drive/MyDrive/aml_project/data/nasdaq100_padding.csv'
batchsize = 128
nhidden_encoder = 128
nhidden_decoder = 128
ntimestep = 10
lr = 0.001
epochs = 50

In [221]:
!ls '/content/drive/MyDrive/aml_project/data/nasdaq100_padding.csv'

/content/drive/MyDrive/aml_project/data/nasdaq100_padding.csv


In [222]:
import pandas as pd


def read_data(input_path, debug=True): 
#   Each feature item of the dataset should be of shape (batch_size, T, length_of_driving_series + y_dim)

# And each label item of the dataset should be of shape (batch_size, y_dim)

# Development
    """Read nasdaq stocks data.

    Args:
        input_path (str): directory to nasdaq dataset.

    Returns:
        X (np.ndarray): features.
        y (np.ndarray): ground truth.

    """
    df = pd.read_csv(input_path, nrows=250 if debug else None)
    # X = df.iloc[:, 0:-1].values
    X = df.loc[:, [x for x in df.columns.tolist() if x != 'NDX']].to_numpy() #values() #as_matrix()
    # y = df.iloc[:, -1].values
    y = np.array(df.NDX)

    return X, y

In [223]:
features = X

In [224]:
VALIDATION_RATIO = 0.1
def get_labels_from_features(features):
    return features[WINDOW_SIZE - 1:, -Y_DIM:][:, None, :]


def split_by_ratio(features):
    length = len(features)
    validation_length = int(VALIDATION_RATIO * length)
    
    return features[:-validation_length], features[-validation_length:]


training_features, validation_features = split_by_ratio(features)

print('training length', len(training_features))
print('validation length', len(validation_features))

training length 36504
validation length 4056


In [225]:
# !pip install get-rolling-window
BATCH_SIZE = 256
WINDOW_SIZE = 10
ENCODER_HIDDEN_STATES = 128
DECODER_HIDDEN_STATES = 128

Y_DIM = 1

DATA = 'nasdaq100_padding.csv'

VALIDATION_RATIO = 0.2

EPOCHS = 20

In [226]:
import tensorflow as tf
from get_rolling_window import rolling_window

train_f, train_l = rolling_window(training_features, WINDOW_SIZE, 1), get_labels_from_features(training_features)
train_ds = tf.data.Dataset.from_tensor_slices((train_f, train_l)).batch(BATCH_SIZE, drop_remainder=True)

print(train_f.shape, train_l.shape)

val_f, val_l = rolling_window(validation_features, WINDOW_SIZE, 1), get_labels_from_features(validation_features)
val_ds = tf.data.Dataset.from_tensor_slices((val_f, val_l)).batch(BATCH_SIZE, drop_remainder=True)

train_ds

(36495, 10, 81) (36495, 1, 1)


<BatchDataset shapes: ((256, 10, 81), (256, 1, 1)), types: (tf.float64, tf.float64)>

In [227]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Reshape

model = DARNN(
    WINDOW_SIZE,
    ENCODER_HIDDEN_STATES,
    DECODER_HIDDEN_STATES,
    Y_DIM,
    lmu = True
)

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae', 'mape']
)

In [228]:
feature_batch, label_batch = next(iter(train_ds))

print('feature, label shape', feature_batch.shape, label_batch.shape)

print('prediction shape', model(feature_batch).shape)

model(feature_batch[:1])


feature, label shape (256, 10, 81) (256, 1, 1)


InvalidArgumentError: ignored

In [170]:
# model = DARNN(T=int(10), m=int(128), p=128, lmu=False)

In [171]:
save_to = './checkpoint.hdf5'

history = model.fit(
    train_ds,
    validation_data=val_ds,
    callbacks=[
        # Save checkpoints on best validation loss
        tf.keras.callbacks.ModelCheckpoint(
            save_to,
            monitor='val_loss',
            save_best_only=True,
            verbose=1
        ),
        # Stop early if the model overfits
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
    ],
    epochs=EPOCHS,
    verbose=1
)

Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.95584, saving model to ./checkpoint.hdf5
Epoch 2/20

Epoch 00002: val_loss did not improve from 1.95584
Epoch 3/20
 10/142 [=>............................] - ETA: 1:30 - loss: 0.4256 - mae: 0.5614 - mape: 1.4549

KeyboardInterrupt: ignored

In [172]:
model.count_params()

254629

In [173]:
model.summary()

Model: "DARNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (Encoder)      multiple                  105249    
_________________________________________________________________
decoder (Decoder)            multiple                  149380    
Total params: 254,629
Trainable params: 254,609
Non-trainable params: 20
_________________________________________________________________


In [141]:
import numpy as np 
import pandas as pd 
# Read dataset
print("==> Load dataset ...")
X, y = read_data(dataroot, debug=False)

# Initialize model
print("==> Initialize DA-RNN model ...")

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss=tf.keras.losses.MeanSquaredError, metrics=["accuracy"]) 
#[12:37 AM] Mohammad Ali Rehan

model.fit(
    (X, y),
    validation_data=None,
    epochs=100,
    verbose=1
)

==> Load dataset ...
==> Initialize DA-RNN model ...
Epoch 1/100


TypeError: ignored