In [4]:
import numpy as np

In [None]:
def ReLU(x: float) -> float:
    '''Rectified linear unit: x when x is positive, otherwise 0.0.

       x: scalar pre-activation value
       returns: x itself if x > 0, else the float 0.0. A float (not the int 0) is
                returned so that arrays built from these outputs always get a
                floating-point dtype, even when every activation is zero.'''
    return x if x > 0 else 0.0

def ReLU_(x: float) -> float:
    '''Derivative of the ReLU function evaluated at x.

       x: scalar pre-activation value
       returns: 1.0 for strictly positive x, 0.0 otherwise (so the value at
                x == 0 is taken to be 0.0)'''
    if x > 0.0:
        return 1.0
    return 0.0

def softmax(Z: np.ndarray) -> np.ndarray:
    '''Softmax of Z, normalised over ALL entries of Z.

       Z: array of scores (logits); for the usual per-sample use this is 1D of shape (k,)
       returns: array of the same shape as Z whose entries are positive and sum to 1
                (an empty input gives an empty output)'''
    Z = np.asarray(Z)
    if Z.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return np.exp(Z)
    # Subtracting the maximum leaves the result mathematically unchanged but keeps
    # np.exp from overflowing to inf (and the quotient from becoming nan) for large scores.
    exp_Z = np.exp(Z - np.max(Z))
    return exp_Z / np.sum(exp_Z)

def CE_loss(y: np.ndarray, y_hat: np.ndarray) -> float:
    '''Cross-entropy loss -sum_i y[i] * log(y_hat[i]).

       y: 1D array of the one-hot coded target label of shape (k,)
       y_hat: 1D array of the model's predicted probability of each class of shape (k,)
       returns: the scalar loss; always finite for probabilities in [0, 1]'''
    # An exact zero probability would give log(0) = -inf, and 0 * -inf = nan would then
    # poison the dot product even for classes whose target is 0. Flooring at the
    # smallest positive normal float changes no other value.
    floored = np.maximum(np.asarray(y_hat, dtype=float), np.finfo(float).tiny)
    return -np.dot(y, np.log(floored))

# In the docstrings of the three partial-derivative functions below, only the arguments each function actually uses are documented.
def partial_U(H, H_, U, W, V, X, t, y, y_hat):
    '''Partial derivative with respect to U: the outer product of the hidden state
       with the output error (y_hat - y).

       H: 1D array of the current hidden state of shape (m,)
       y: 1D array of the one-hot coded target label of shape (k,)
       y_hat: 1D array of the model's predicted probability of each class of shape (k,)
       returns: 2D array of shape (m, k); entry [i, c] is H[i] * (y_hat[c] - y[c])'''
    output_error = y_hat - y
    hidden_column = np.ravel(H)[:, np.newaxis]
    error_row = np.ravel(output_error)[np.newaxis, :]
    return hidden_column * error_row

def partial_W(H, H_, U, W, V, X, t, y, y_hat):
    '''Partial derivative with respect to the input-to-hidden weights W at time step t.

       Only the direct dependence of the step-t hidden state on W is differentiated;
       the previous hidden state H_ is treated as a constant.

       H_: 1D array of the previous hidden state of shape (m,)
       X: 2D data array of shape (T, num_features) where T is the total length of the time series
       t: specified time step
       U: weight matrix connecting the hidden layer to the output layer of shape (m, k)
       W: weight matrix connecting the input layer to the hidden layer of shape (m, num_features)
       V: recurrent weight matrix of shape (m ,m)
       y: 1D array of the one-hot coded target label of shape (k,)
       y_hat: 1D array of the model's predicted probability of each class of shape (k,)
       returns: 2D array of shape (m, num_features), the same shape as W'''
    x_t = np.asarray(X[t])
    pre_activation = (W @ x_t) + (V @ H_)
    # ReLU derivative: 1.0 where the unit is active, 0.0 elsewhere
    relu_grad = (pre_activation > 0).astype(float)
    # Error reaching each hidden unit. This must stay a length-m vector: summing it
    # to a single scalar would give every row of the gradient the same (wrong) factor.
    hidden_error = (U @ (y_hat - y)) * relu_grad
    # entry [i, j] = hidden_error[i] * x_t[j]
    return np.outer(hidden_error, x_t)
   

def partial_V(H, H_, U, W, V, X, t, y, y_hat):
    '''Partial derivative with respect to the recurrent weights V at time step t.

       Only the direct dependence of the step-t hidden state on V is differentiated;
       the previous hidden state H_ is treated as a constant.

       H_: 1D array of the previous hidden state of shape (m,)
       X: 2D data array of shape (T, num_features) where T is the total length of the time series
       t: specified time step
       U: weight matrix connecting the hidden layer to the output layer of shape (m, k)
       W: weight matrix connecting the input layer to the hidden layer of shape (m, num_features)
       V: recurrent weight matrix of shape (m ,m)
       y: 1D array of the one-hot coded target label of shape (k,)
       y_hat: 1D array of the model's predicted probability of each class of shape (k,)
       returns: 2D array of shape (m, m), the same shape as V'''
    pre_activation = (W @ np.asarray(X[t])) + (V @ H_)
    # ReLU derivative: 1.0 where the unit is active, 0.0 elsewhere
    relu_grad = (pre_activation > 0).astype(float)
    # Error reaching each hidden unit, kept as a length-m vector (not summed to a scalar)
    hidden_error = (U @ (y_hat - y)) * relu_grad
    # V multiplies H_, so the gradient pairs each hidden unit with each entry of H_:
    # entry [i, j] = hidden_error[i] * H_[j], giving shape (m, m) rather than (m, num_features)
    return np.outer(hidden_error, H_)
    

def mini_batch_GD(data: np.ndarray, alpha, batch_size, num_epochs, epsilon,
                  H, H_, U, X, t, y, y_hat, *partials, weight_inits=(), W=None, V=None):
    '''JUST A DRAFT
       Mini-batch gradient descent: for every mini-batch, average each partial over the
       batch's samples and take one step of size alpha on the matching weight.

       data: 3D array of shape (num_samples, num_time_steps, num_features); it is not modified
       alpha: learning rate
       batch_size: number of samples per mini-batch (positive); samples left over after the
                   last full batch are not used in that epoch
       num_epochs: number of passes over the data
       epsilon: currently unused
       H, H_, U, t, y, y_hat: passed unchanged to every partial
       X: currently unused; each sample of the mini-batch is passed to the partials in its place
       partials: any number of partial derivative functions, one per weight to update, each
                 called as p(H, H_, U, W, V, sample, t, y, y_hat) and returning an array with
                 the shape of its weight
       weight_inits: keyword-only sequence of 2D arrays of weight initializations, in the same
                     order as partials (a function can only take one * parameter, so this
                     cannot be a second variadic argument)
       W, V: keyword-only weight matrices passed unchanged to every partial
       returns: 1D object array holding the updated weights, in the order of weight_inits
       raises: ValueError if batch_size is not positive or the number of partials and
               weight initializations differ'''
    weight_inits = list(weight_inits)
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    if len(partials) != len(weight_inits):
        raise ValueError('partials and weight_inits must have the same length')

    # float copies, so the caller's initial weights are left untouched
    weights = [np.array(w, dtype=float) for w in weight_inits]
    num_samples = data.shape[0]
    num_batches = num_samples // batch_size

    # NOTE(review): the partials are always evaluated with the U, W, V given here, not with
    # the weights being updated below - confirm whether the updated weights should be fed back.
    for _ in range(num_epochs):
        # shuffle through an index permutation instead of reordering the caller's array in place
        order = np.random.permutation(num_samples)
        for i in range(num_batches):
            # mini_batch is a 3D array of shape (batch_size, num_time_steps, num_features)
            mini_batch = data[order[i*batch_size:(i+1)*batch_size]]
            # one accumulator per weight, since the weights may have different shapes
            grads = [np.zeros_like(w) for w in weights]
            for sample in mini_batch:
                for grad, partial in zip(grads, partials):
                    grad += partial(H, H_, U, W, V, sample, t, y, y_hat)
            for weight, grad in zip(weights, grads):
                weight -= alpha*grad/batch_size # average over batch size

    # filled element by element so that equally-shaped weights are not merged into one array
    weights_arr = np.empty(len(weights), dtype=object)
    for i, weight in enumerate(weights):
        weights_arr[i] = weight
    return weights_arr
                
            

class Input:
    '''Input layer: holds the data set that the hidden layer reads from.'''

    def __init__(self, data: np.ndarray) -> None:
        '''data: 3D array of shape (num_samples, num_time_steps, num_features);
                 stored as-is (not copied) in self.data'''
        self.data = data

class Hidden:
    '''Recurrent hidden layer with ReLU activation. It keeps the hidden state of every
       sample and the current time step, and advances both on each call to
       get_current_state.'''

    def __init__(self, input, m, W, V, pre_H, t=1):
        '''input: an instance of class Input
           m: number of neurons
           W: weight matrix connecting the input layer to the hidden layer of shape (m, num_features)
           V: recurrent weight matrix of shape (m ,m)
           pre_H: 2D array of shape (num_samples, m)
           t: initialized time step, which typically starts from 1; it is used directly as
              the index into the time axis of the data'''
        self.input = input
        self.m = m
        self.W = W
        self.V = V
        self.pre_H = pre_H
        self.t = t
        # current_H not set as an attribute in initialization because the first step (t=1) of the iteration
        # does not need the current hidden state to be set

    def get_current_state(self):
        '''Advance every sample by one time step.

           Computes ReLU(W x_t + V h_prev) for all samples at once, stores the result in
           both self.current_H and self.pre_H (the same array object, of shape
           (num_samples, m)), and increments self.t. Returns nothing.'''
        pre_H = np.asarray(self.pre_H)
        num_samples = pre_H.shape[0]
        # inputs of all samples at the current time step: shape (num_samples, num_features);
        # only as many samples as there are rows in pre_H are used
        x_t = np.asarray(self.input.data)[:num_samples, self.t, :]
        # row j is W @ x_t[j] + V @ pre_H[j], computed with two matrix products instead of
        # a Python loop over every (sample, neuron) pair
        pre_activation = x_t @ np.asarray(self.W).T + pre_H @ np.asarray(self.V).T
        # ReLU; the 0.0 keeps the state floating-point even when every unit is inactive
        current_H = np.where(pre_activation > 0, pre_activation, 0.0)

        self.pre_H, self.current_H = current_H, current_H
        self.t += 1 # updates current time

class Output:
    '''Softmax output layer: turns the hidden state at every time step into class
       probabilities for every sample.'''

    def __init__(self, hidden, y, k, U, T):
        '''hidden: an instance of class Hidden
           y: 2D one-hot coded target labels array of shape (num_samples, num_classes)
           k: number of classes = number of neurons in the output layer
           U: weight matrix connecting the hidden layer to the output layer of shape (m, k)
           T: total length of the time series/sequence'''
        self.hidden = hidden
        self.y = y
        self.k = k
        self.U = U
        self.T = T

    def get_yhat(self):
        '''Run the hidden layer forward for T steps and store the predictions.

           Sets self.yhat to an array of shape (num_samples, T, k) where yhat[i][j] is the
           softmax of (hidden state of sample i after the j-th step) @ U. Returns nothing.
           The hidden layer is advanced as a side effect.'''
        hidden, T, U = self.hidden, self.T, self.U
        num_samples = hidden.pre_H.shape[0]
        yhat = np.empty((num_samples, T, self.k))
        for step in range(T):
            # One call advances ALL samples by one time step, so it must run once per
            # step - not once per (sample, step) pair, which would push the hidden layer
            # num_samples times too far through the sequence.
            hidden.get_current_state()
            logits = hidden.current_H @ U  # shape (num_samples, k)
            # row-wise softmax; subtracting each row's maximum avoids overflow in np.exp
            exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
            yhat[:, step, :] = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
        self.yhat = yhat
    
    
   # def compute_loss(self):
        #self.get_prediction()
        #self.loss = CE_loss(self.y, self.prediction)
        
   # def update_weights(self, alpha, num_epochs, max_iter):