# Creating an RNN block

In [103]:
import numpy as np
import functools

In [88]:
def softmax(logits: np.ndarray) -> np.ndarray:
    """Return softmax probabilities, normalised along axis 0.

    Args:
        logits: Raw scores. For a 2-D array every column is treated as an
            independent distribution, because the normalising sum runs over
            axis 0.

    Returns:
        An array with the same shape as ``logits`` whose entries sum to 1
        along axis 0.
    """
    # Shift by the maximum of each axis-0 slice rather than the global maximum:
    # softmax is invariant to a per-slice shift, and a global shift can push a
    # whole column to exp(...) == 0, which then yields 0/0 = NaN.
    shifted = logits - np.max(logits, axis=0, keepdims=True)
    exps = np.exp(shifted)  # computed once and reused for numerator and denominator
    return exps / np.sum(exps, axis=0, keepdims=True)

In [160]:
def relu(x: np.ndarray) -> np.ndarray:
    """Apply the ReLU non-linearity element-wise.

    Every entry of ``x`` is compared against zero with ``np.fmax`` and the
    larger of the two is kept.
    """
    floor = 0
    rectified = np.fmax(floor, x)
    return rectified

In [89]:
# Smoke test: run softmax on a small 1-D logit vector and display the result.
logits = np.array((2.0, 1.0, 0.1))
softmax(logits)

array([0.65900114, 0.24243297, 0.09856589])

In [186]:
def cross_entropy(y_true: np.ndarray, y_hat: np.ndarray, eps: float = 1e-12) -> float:
    """Return the cross-entropy ``-sum(y_true * log(y_hat))`` over all entries.

    Args:
        y_true: Target distribution (e.g. one-hot), same shape as ``y_hat``.
        y_hat: Predicted probabilities.
        eps: Lower bound applied to ``y_hat`` before taking the logarithm.

    Returns:
        The summed cross-entropy as a scalar.
    """
    # A predicted probability of exactly 0 gives log(0) = -inf; multiplied by a
    # 0 target that becomes NaN, and by a non-zero target it becomes inf.
    # Clipping from below keeps the loss finite without touching values >= eps.
    safe_probs = np.clip(y_hat, eps, None)
    return -np.sum(y_true * np.log(safe_probs))

In [187]:
# Smoke test for the loss function.
# True distribution: one-hot target at index 1.
y_true = np.array((0, 1, 0, 0, 0))
# Predicted distribution over the same five classes.
y_pred = np.array((0.1, 0.6, 0.1, 0.15, 0.05))

print(f"Cross Entropy: {cross_entropy(y_true, y_pred):.2f}")

Cross Entropy: 0.51


In [145]:
class Module:
    """Minimal callable base class: calling an instance dispatches to ``forward``.

    Subclasses implement ``forward``; ``instance(*args, **kwargs)`` forwards
    every argument to it unchanged.
    """

    def __init__(self, cls=None) -> None:
        """Optionally copy wrapper metadata from ``cls`` onto this instance.

        Args:
            cls: Object whose metadata is copied with
                ``functools.update_wrapper``. ``None`` or the instance itself
                means "nothing to copy".
        """
        # Subclasses in this file call ``super().__init__(self)``. Wrapping an
        # object around itself would set ``self.__wrapped__ = self``, a wrapper
        # loop that ``inspect.unwrap`` rejects, so metadata is only copied from
        # a genuinely different object.
        if cls is not None and cls is not self:
            functools.update_wrapper(self, cls)

    def forward(self, *args, **kwargs):
        """Compute the module's output; must be overridden by subclasses."""
        raise NotImplementedError(
            f'{self.__class__.__name__} must implement forward()'
        )

    def __call__(self, *args, **kwargs):
        # Keyword arguments are passed through as well as positional ones.
        return self.forward(*args, **kwargs)

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}()'

In [268]:
class RNNCell(Module):
    """Single recurrent cell with a softmax read-out.

    Each call processes one time step: the input and the previous hidden state
    are stacked, passed through an affine map and the activation to produce
    the new hidden state, and a second affine map followed by softmax produces
    the prediction. Per-step results are recorded so that ``compute_loss`` and
    ``bptt`` can be called once the sequence has been fed through.
    """

    def __init__(
        self,
        dim_hidden_units: int,
        dim_input: int,
        batch_size: int,
        dim_output: int,
        activation=np.tanh,
        loss=cross_entropy
    ) -> None:
        """Create randomly initialised weights and empty per-sequence buffers.

        Args:
            dim_hidden_units: Size of the hidden state.
            dim_input: Number of input features per time step.
            batch_size: Accepted for interface compatibility; not used by the
                cell itself (the batch size is taken from the inputs).
            dim_output: Number of output classes.
            activation: Element-wise hidden activation (default ``np.tanh``).
            loss: Callable ``loss(y_true, y_hat)`` used by ``compute_loss``.
        """
        super().__init__(self)
        # Inputs are (n_x, m): n_x features by m batch columns. Wa acts on the
        # vertical stack [x; h_prev], hence dim_input + dim_hidden_units columns.
        self.Wa = np.random.randn(dim_hidden_units, dim_input+dim_hidden_units)
        self.Wy = np.random.randn(dim_output, dim_hidden_units)
        self.ba = np.zeros((dim_hidden_units, 1))
        self.by = np.zeros((dim_output, 1))
        self.activation = activation
        self.loss = loss
        # Per-time-step records, appended to by forward().
        self.caches = []
        self.outputs = []
        self.hidden_states = []
        # Gradients stored by the most recent bptt() call.
        self.grads = {}

    def forward(self, x: np.ndarray, hidden_state_prev: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Run one time step and record its intermediate values.

        Args:
            x: Input for this step, stacked on top of ``hidden_state_prev``.
            hidden_state_prev: Hidden state produced by the previous step.

        Returns:
            ``(hidden_state, y_hat)`` for this step.
        """
        stack = np.vstack((x, hidden_state_prev))
        z = self.Wa @ stack + self.ba
        hidden_state = self.activation(z)
        logits = self.Wy @ hidden_state + self.by
        y_hat = softmax(logits)

        self.caches.append({
            'x': stack,
            'z': z,
            'hidden_state': hidden_state,
            'y_hat': y_hat,
        })
        # compute_loss() and bptt() read these two lists.
        self.hidden_states.append(hidden_state)
        self.outputs.append(y_hat)

        return hidden_state, y_hat

    def reset_sequence(self) -> None:
        """Forget everything recorded for the current sequence."""
        self.caches = []
        self.outputs = []
        self.hidden_states = []

    def compute_loss(self, y_true: np.ndarray | list[float | int]) -> float:
        """Return the loss over the recorded sequence, divided by its length.

        Args:
            y_true: Targets laid out like the stacked predictions, i.e. with
                the time axis last.

        Raises:
            ValueError: If no time step has been run yet.
        """
        if not self.outputs:
            raise ValueError("compute_loss() requires at least one forward step")
        y_hats = np.stack(tuple(self.outputs), axis=-1)
        return np.sum(self.loss(y_true, y_hats)) / len(self.outputs)

    def bptt(self, y_pred: np.ndarray, y_true: np.ndarray, z: np.ndarray | None = None) -> None:
        """Compute and print gradients for the last recorded time step.

        Only the final step is differentiated here; the results are stored in
        ``self.grads`` and printed for inspection.

        Args:
            y_pred: Prediction of the last time step.
            y_true: Targets with the time axis last; only ``[:, :, -1]`` is used.
            z: Pre-activation of the last time step. Defaults to the value
                cached by the most recent ``forward`` call.

        Raises:
            ValueError: If no time step has been run yet.
        """
        T = len(self.outputs)
        if T == 0:
            raise ValueError("bptt() requires at least one forward step")
        if z is None:
            z = self.caches[-1]['z']
        y_true = np.asarray(y_true)

        # actually this is dL_dlogits since we integrate the derivative of the softmax inside the CEloss
        dL_dyhat = 1/T * (y_pred - y_true[:, :, -1])
        dL_dWy = dL_dyhat @ self.hidden_states[-1].T
        dL_dby = np.sum(dL_dyhat, axis=-1, keepdims=True)
        dL_ht = self.Wy.T @ dL_dyhat
        # The activation derivative is applied element-wise (not as a matrix
        # product). NOTE: 1 - activation(z)**2 is the derivative of tanh, so
        # this is only correct for the default activation.
        dL_dtanh = dL_ht * (1 - self.activation(z)**2)

        self.grads = {
            'dL_dyhat': dL_dyhat,
            'dL_dWy': dL_dWy,
            'dL_dby': dL_dby,
            'dL_ht': dL_ht,
            'dL_dtanh': dL_dtanh,
        }

        print(f"dL_dyhat: {dL_dyhat}, y_hat shape: {y_pred.shape}, dL_dyhat shape: {dL_dyhat.shape}")
        print(f"dL_dWy: {dL_dWy}, Wy shape: {self.Wy.shape}, dL_dWy shape: {dL_dWy.shape}")
        print(f"dL_dby: {dL_dby}, by shape: {self.by.shape}, dL_dby shape: {dL_dby.shape}")
        print(f"dL_ht: {dL_ht}, ht shape: {self.hidden_states[-1].shape}, dL_ht shape: {dL_ht.shape}")

In [265]:
# End-to-end check: feed a short random sequence through the cell one time
# step at a time, then report the loss and run the backward pass.
np.random.seed(42)

batch_size = 2
seq_length = 3
input_size = 4
hidden_size = 5
output_size = 3
cell = RNNCell(
    dim_hidden_units=hidden_size,
    dim_input=input_size,
    batch_size=batch_size,
    dim_output=output_size,
)

# Inputs are laid out as (features, batch, time); the hidden state starts at zero.
input_sequence = np.random.rand(input_size, batch_size, seq_length)
hidden_state = np.zeros((hidden_size, batch_size))

for t in range(seq_length):
    input_t = input_sequence[..., t]
    hidden_state, y_hat = cell(input_t, hidden_state)

y_true = np.array([
    [[1, 0, 0], [0, 1, 0]],
    [[1, 0, 0], [0, 0, 1]],
    [[0, 1, 0], [0, 0, 1]],
])
print(f"Loss at end of sequence: {cell.compute_loss(y_true)}")
cell.bptt(y_hat, y_true)

Loss at end of sequence: 3.145069063049545
dL_dyhat: [[ 0.09814336  0.12502991]
 [ 0.06892655 -0.24618589]
 [ 0.16626342 -0.21217735]], y_hat shape: (3, 2), dL_dyhat shape: (3, 2)
dL_dWy: [[ 0.17917978 -0.19386396 -0.18667072  0.04367452 -0.2103113 ]
 [-0.09277223  0.12225986  0.18015811 -0.12507178  0.17825159]
 [ 0.02603107 -0.00038286  0.07888883 -0.12372106  0.05749588]], Wy shape: (3, 5), dL_dWy shape: (3, 5)
dL_dby: [[ 0.22317326]
 [-0.17725934]
 [-0.04591392]], by shape: (3, 1), dL_dby shape: (3, 1)
dL_ht: [[ 0.10652788 -0.3673835 ]
 [-0.21128221  0.21527115]
 [ 0.00568092  0.3644284 ]
 [ 0.13096158 -0.17791011]
 [ 0.06023004 -0.68123886]], ht shape: (5, 2), dL_ht shape: (5, 2)


In [222]:
from collections.abc import Iterable

class OneHotEncoder(Module):
    """Encode one or more class indices as a 0/1 vector of length ``num_classes``."""

    def __init__(self, num_classes: int) -> None:
        """Store the length of the encoding vector."""
        super().__init__(self)
        self.num_classes = num_classes

    def forward(self, indices: int | np.ndarray | list[int]) -> np.ndarray:
        """Return a vector with ones at ``indices`` and zeros elsewhere.

        Args:
            indices: A single index or any iterable of indices. Repeated
                indices are allowed and set the same position once.

        Returns:
            A float array of shape ``(num_classes,)``.

        Raises:
            ValueError: If more distinct indices are given than there are
                classes.
        """
        # Materialise generic iterables (tuples, generators, ...) so they are
        # treated as a list of positions rather than a multi-dimensional index
        # and so they can be counted without relying on len().
        if isinstance(indices, Iterable) and not isinstance(indices, np.ndarray):
            indices = list(indices)
        positions = np.atleast_1d(np.asarray(indices))
        if positions.size == 0:
            # An empty selection must keep an integer dtype to be a valid index.
            positions = positions.astype(np.intp)

        # Count distinct positions: duplicates do not add extra ones.
        if np.unique(positions).size > self.num_classes:
            raise ValueError("Cannot have more 1s than number of classes")

        encoding = np.zeros((self.num_classes,))
        encoding[positions] = 1
        return encoding


In [183]:
# Smoke test for OneHotEncoder: encode a single index and a list of indices.
encoder = OneHotEncoder(num_classes=5)
encoder(3), encoder([1, 2])

(array([0., 0., 0., 1., 0.]), array([0., 1., 1., 0., 0.]))