In [None]:
import numpy as np

In [None]:
def generate_vocab(sentences):
  """
    Build a whitespace-token vocabulary plus the reserved tokens "[PAD]" and "[UNK]".

    sentences: (list of str) sentences
    unique_words: (list of str) sorted unique words in the training examples, followed by "[PAD]" and "[UNK]"
    V: (int) Size of Vocabulary (including the two reserved tokens)
    word_to_id: (dict) mapping of words to ids
    id_to_word: (dict) mapping of ids to words
    returns: (V, word_to_id, id_to_word)
  """
  special_tokens = ["[PAD]", "[UNK]"]
  corpus_words = set(" ".join(sentences).split())
  # Sorting gives the same ids on every run (set iteration order depends on string hashing).
  # Reserved tokens are removed from the corpus set first so they can never appear twice,
  # which would make V disagree with the number of distinct ids.
  unique_words = sorted(corpus_words.difference(special_tokens)) + special_tokens
  V = len(unique_words)

  word_to_id = {word: idx for idx, word in enumerate(unique_words)}
  id_to_word = {idx: word for word, idx in word_to_id.items()}
  return V, word_to_id, id_to_word

In [None]:
def tokenization(sentences, d=32, padding_size=4, decoder=False):
  """
    Tokenize sentences on whitespace, pad/truncate them to a fixed length and
    build randomly initialised embeddings for the resulting vocabulary.

    sentences: (list of str) sentences
    d: (int) embedding dimension
    padding_size: (int) fixed sequence length n; longer sentences are truncated, shorter ones padded with "[PAD]"
    decoder: (bool) when True every sentence is wrapped as "<START> ... <END>" and the
             decoder-input variants are returned as well

    V: (int) Size of Vocabulary
    word_to_id: (dict) mapping of word to ids
    id_to_word: (dict) mapping of ids to words
    embeddings: (numpy array) array of size (V, d), uniform in [-sqrt(6)/sqrt(d), sqrt(6)/sqrt(d)]
    m: (int) number of training examples
    n: (int) sequence length in a single example
    word_ids: (numpy array) tokenized word IDs of shape (m, n)
    one_hot: (numpy array) one-hot encoded representation of shape (m, n, V)
    word_ids_shifted: (numpy array, decoder only) word_ids without the last position, shape (m, n-1)
    one_hot_shifted: (numpy array, decoder only) one-hot encoding of word_ids_shifted, shape (m, n-1, V)

    returns:
      decoder=False: embeddings, word_to_id, id_to_word, V, d, n, m, one_hot, word_ids
      decoder=True:  embeddings, word_to_id, id_to_word, V, d, n, m, one_hot, one_hot_shifted, word_ids_shifted, word_ids
  """
  if decoder:
    sentences = ["<START> "+s+" <END>" for s in sentences]

  V, word_to_id, id_to_word = generate_vocab(sentences)
  limit = np.sqrt(6) / np.sqrt(d)
  embeddings = np.random.uniform(low=-limit, high=limit, size=(V, d))
  m = len(sentences)
  n = padding_size

  pad_id = word_to_id["[PAD]"]
  unk_id = word_to_id["[UNK]"]
  word_ids = np.full((m, n), pad_id, dtype=int)

  for i, sentence in enumerate(sentences):
    truncated = sentence.split()[:n]
    word_ids[i, :len(truncated)] = [word_to_id.get(word, unk_id) for word in truncated]

  one_hot = np.eye(V)[word_ids]
  if decoder:
    # The previous implementation wrote "<END>"-stripped ids into a *view* of
    # word_ids inside the loop; those writes were immediately overwritten, so the
    # value actually returned was always word_ids[:, :-1]. That result is kept, but
    # it is now computed once after the loop and as an independent copy. This removes
    # the ValueError raised when a truncated sentence filled all n slots, and the
    # unbound-name error raised when `sentences` was empty.
    word_ids_shifted = word_ids[:, :-1].copy()
    one_hot_shifted = np.eye(V)[word_ids_shifted]
    return embeddings, word_to_id, id_to_word, V, d, n, m, one_hot, one_hot_shifted, word_ids_shifted, word_ids
  return embeddings, word_to_id, id_to_word, V, d, n, m, one_hot, word_ids

def get_embeddings(one_hot, embeddings, one_hot_shifted=None, decoder=False):
  """
    Look up embeddings by multiplying one-hot encodings with the embedding matrix.

    one_hot: (numpy array) one-hot encoded tokens, last axis of size V
    embeddings: (numpy array) embedding matrix of shape (V, d)
    one_hot_shifted: (numpy array, optional) one-hot encoded decoder-input tokens; only used when decoder is True
    decoder: (bool) when True the shifted embeddings are returned as well
    returns: X, or the pair (X, X_shifted) when decoder is True
  """
  X = np.matmul(one_hot, embeddings)
  if not decoder:
    return X
  return X, np.matmul(one_hot_shifted, embeddings)

In [None]:
def get_pos_enc(d,n):
  """
    Sinusoidal positional encoding.

    d: (int) embedding dimension
    n: (int) sequence length in a single example
    position: (numpy array) array of positions of shape (n, 1)
    div_term: (numpy array) per-column wavelength divisor of shape (d,)
    pe: (numpy array) positional encoding array of shape (n, d), with sine and cosine applied
    returns (numpy array) array of shape (n, d), containing positional encodings

    Columns 2i and 2i+1 share the divisor 10000 ** (2i / d):
      pe[pos, 2i]   = sin(pos / 10000 ** (2i / d))
      pe[pos, 2i+1] = cos(pos / 10000 ** (2i / d))
    Previously every column used its own exponent (j / d), so the sine and cosine
    of a pair had different frequencies.
  """
  position = np.arange(n)[:, np.newaxis]
  # Integer-divide the column index by 2 so each (sin, cos) pair gets the same exponent.
  pair_index = np.arange(d) // 2
  div_term = 10000 ** (2 * pair_index / d)
  pe = position / div_term
  pe[:, 0::2] = np.sin(pe[:, 0::2])
  pe[:, 1::2] = np.cos(pe[:, 1::2])
  return pe

In [None]:
def softmax(X):
  """
    Row-wise softmax of a 2-D array of logits.

    X: (numpy array) logits; normalisation is along axis 1
    returns: (numpy array) same shape as X, each row summing to 1

    The row maximum is subtracted before exponentiating so that large logits do
    not overflow to inf/nan; the result is mathematically unchanged.
    NOTE: the class named `softmax` defined in the next cell rebinds this name, so
    after that cell runs this function is no longer reachable as `softmax`.
  """
  exps = np.exp(X - np.max(X, axis=1, keepdims=True))
  s_X = exps / np.sum(exps, axis=1, keepdims=True)
  return s_X

In [None]:
class softmax:
  # Class-level defaults; both are replaced per instance (name in __init__, X in forward).
  name=""
  X=None

  def __init__(self, name):
    """
      name: (str) name identifier for the softmax layer
      Stores the name on the instance.
    """
    self.name = name

  def forward(self, X):
    """
      X: (numpy array) logits of shape (sequence_length, num_classes)
      self.X: (numpy array) cached softmax output, same shape as the input
      returns: (numpy array) row-wise probabilities (each row sums to 1)

      The row maximum is subtracted before exponentiating for numerical stability.
    """
    row_peak = np.max(X, axis=1, keepdims=True)
    exps = np.exp(X - row_peak)
    self.X = exps / np.sum(exps, axis=1, keepdims=True)
    return self.X

In [None]:
class attention_head:
  """
    Single scaled dot-product attention head with a hand-written backward pass.
    Works as self-attention (X2 is None) or cross-attention (keys/values come from X2).
  """
  def __init__(self, name,d, d_k, d_v, masked=False):
    """
      name: (str) name identifier for the attention head
      d: (int) embedding dimension
      d_k: (int) dimension of the query and key vectors
      d_v: (int) dimension of the value vectors
      masked: (bool, optional) whether to apply a causal mask to the attention scores
      Stores the configuration; the softmax sub-layer is created lazily in forward().
    """
    self.name=name
    self.d=d
    self.d_k=d_k
    self.d_v=d_v
    self.masked=masked
    self.layers={}

  def forward(self, X, weights, X2=None):
    """
      X: (numpy array) input array of shape (sequence_length, d); source of the queries
      weights: (dict) dictionary containing weight matrices 'W_Q', 'W_K', 'W_V'
      X2: (numpy array, optional) secondary input of shape (sequence_length_2, d); when given,
          keys and values are computed from it (cross-attention), otherwise from X
      self.attentions: (numpy array) softmax attention weights, one row per query
      self.attention: (numpy array) output array of shape (sequence_length, d_v), attention-weighted values
      returns: (numpy array) self.attention

      Inputs and intermediates (Q, K, V, attentions) are cached on the instance for backward().
    """
    self.X=X
    self.X2=X2
    self.weights=weights
    W_Q = self.weights['W_Q']
    W_K = self.weights['W_K']
    W_V = self.weights['W_V']

    kv_source = self.X if self.X2 is None else self.X2
    self.Q = np.matmul(self.X, W_Q)
    self.K = np.matmul(kv_source, W_K)
    self.V = np.matmul(kv_source, W_V)

    self.dot_Q_K_norm = np.matmul(self.Q, self.K.T) / np.sqrt(self.d_k)
    if self.masked:
      # Additive causal mask: 0 on and below the diagonal, -inf strictly above it,
      # so position i cannot attend to positions j > i. Built from the actual
      # (rows, cols) shape so a non-square score matrix cannot index out of bounds.
      n_rows, n_cols = self.dot_Q_K_norm.shape
      self.M = np.zeros((n_rows, n_cols))
      self.M[np.triu_indices(n_rows, k=1, m=n_cols)] = -np.inf
      self.dot_Q_K_norm = self.dot_Q_K_norm + self.M

    if 'softmax' not in self.layers:
      self.layers['softmax']=softmax('softmax')

    self.attentions = self.layers['softmax'].forward(self.dot_Q_K_norm)
    self.attention = np.matmul(self.attentions, self.V)

    return self.attention

  def backward(self, dLoss_dattention, weights, gradients):
    """
      dLoss_dattention: (numpy array) gradient of loss with respect to the attention output, shape (sequence_length, d_v)
      weights: (dict) dictionary containing weight matrices 'W_Q', 'W_K', 'W_V'
      gradients: (dict) dictionary that receives the gradients under 'W_Q', 'W_K', 'W_V'
      returns: (numpy array) gradient of loss with respect to the input X, shape (sequence_length, d)
              (numpy array or None) gradient of loss with respect to X2 if X2 was provided, else None
              (dict) the gradients dictionary

      Must be called after forward(); it reads the cached X, X2, Q, K, V and attentions.
    """
    scale = np.sqrt(self.d_k)

    # output = attentions @ V
    dLoss_dV = np.matmul(self.attentions.T, dLoss_dattention)
    dLoss_dweights = np.matmul(dLoss_dattention, self.V.T)

    # Row-wise softmax Jacobian: dS = A * (dA - sum(dA * A, axis=1)).
    # Entries masked to probability 0 in forward() therefore receive zero gradient.
    weighted_sum = np.sum(dLoss_dweights * self.attentions, axis=1, keepdims=True)
    dLoss_dscores = self.attentions * (dLoss_dweights - weighted_sum)

    # scores = Q @ K.T / sqrt(d_k), so the same 1/sqrt(d_k) factor applies to both gradients.
    dLoss_dQ = np.matmul(dLoss_dscores, self.K) / scale
    dLoss_dK = np.matmul(dLoss_dscores.T, self.Q) / scale

    dLoss_dWQ = np.matmul(self.X.T, dLoss_dQ)
    if self.X2 is None:
      prev = self.X
      dLoss_dX = np.matmul(dLoss_dQ, weights['W_Q'].T) + np.matmul(dLoss_dK, weights['W_K'].T) + np.matmul(dLoss_dV, weights['W_V'].T)
      dLoss_dX2 = None
    else:
      # Cross-attention: queries come from X, keys/values from X2.
      prev = self.X2
      dLoss_dX = np.matmul(dLoss_dQ, weights['W_Q'].T)
      dLoss_dX2 = np.matmul(dLoss_dK, weights['W_K'].T) + np.matmul(dLoss_dV, weights['W_V'].T)

    dLoss_dWK = np.matmul(prev.T, dLoss_dK)
    dLoss_dWV = np.matmul(prev.T, dLoss_dV)
    gradients['W_Q'] = dLoss_dWQ
    gradients['W_K'] = dLoss_dWK
    gradients['W_V'] = dLoss_dWV
    return dLoss_dX, dLoss_dX2, gradients

In [None]:
class multi_head:
  def __init__(self, name, d, h=2, masked=False):
    """
      name: (str) name identifier for the multi-head attention module
      d: (int) embedding dimension
      h: (int, optional) number of attention heads (default is 2)
      masked: (bool, optional) whether the heads apply a mask to their attention scores
      Stores the configuration; the individual heads are created lazily in forward().
    """
    self.name = name
    self.d = d
    self.h = h
    self.masked = masked
    self.layers = {}

  def forward(self, X, weights, X2=None):
    """
      X: (numpy array) input array of shape (sequence_length, d)
      weights: (dict) per-head weights under 'attn_w{i}' and the output projection under 'multihead'
      X2: (numpy array, optional) secondary input for cross-attention, forwarded to every head
      self.m_hs: (numpy array) head outputs concatenated along axis 1
      self.m_h: (numpy array) self.m_hs multiplied by the output projection
      returns: (numpy array) self.m_h
    """
    self.X2 = X2
    self.weights = weights
    self.X = X
    self.d_k = int(self.d/self.h)
    self.d_v = self.d_k

    head_keys = [f"attn_w{i}" for i in range(self.h)]

    # Create any head that does not exist yet; existing heads are reused across calls.
    for key in head_keys:
      if key not in self.layers:
        self.layers[key] = attention_head(key, self.d, self.d_k, self.d_v , masked=self.masked)

    self.attention_heads = []
    for key in head_keys:
      head_out = self.layers[key].forward(self.X, self.weights[key], X2=self.X2)
      self.attention_heads.append(head_out)

    self.m_hs = np.concatenate(self.attention_heads, axis=1)
    self.m_h = np.matmul(self.m_hs, self.weights['multihead'])
    return self.m_h

  def backward(self, dLoss_dm_h, weights, gradients):
    """
      dLoss_dm_h: (numpy array) gradient of loss with respect to the multi-head output
      weights: (dict) per-head weights under 'attn_w{i}' and the output projection under 'multihead'
      gradients: (dict) receives the gradients; it must already contain an entry for every 'attn_w{i}'
      returns: (numpy array) gradient of loss with respect to the input X
              (numpy array or None) gradient of loss with respect to X2 if X2 was provided, else None
              (dict) the gradients dictionary
    """
    cross = self.X2 is not None

    gradients['multihead'] = np.matmul(self.m_hs.T, dLoss_dm_h)
    dLoss_m_hs = np.matmul(dLoss_dm_h, weights['multihead'].T)

    # Undo the concatenation: one equally sized column slice per head.
    per_head = np.split(dLoss_m_hs, self.h, axis=1)

    dLoss_dX = np.zeros((self.X.shape))
    if cross:
      dLoss_dX2 = np.zeros((self.X2.shape))

    for i, head_grad in enumerate(per_head):
      key = f'attn_w{i}'
      returned = self.layers[key].backward(head_grad, weights[key], gradients[key])
      dLoss_dX += returned[0]
      if cross:
        dLoss_dX2 += returned[1]
      gradients[key] = returned[2]

    if cross:
      return dLoss_dX, dLoss_dX2, gradients
    return dLoss_dX, None, gradients

In [None]:
class res_add_layer_norm:
  """
    Residual addition followed by layer normalization over axis 1, with a
    learnable scale (gamma) and shift (beta).
  """
  # Small constant added to the variance before the square root to avoid division by zero.
  eps = 0.000001

  def __init__(self,name, d):
    """
      name: (str) name identifier for the layer
      d: (int) embedding dimension
    """
    self.name=name
    self.d=d

  def forward(self, prev_X, X, weights):
    """
      prev_X: (numpy array) residual (skip) input, same shape as X
      X: (numpy array) output of the sub-layer being wrapped, shape (sequence_length, d)
      weights: (dict) dictionary containing 'gamma' and 'beta'
      self.out: (numpy array) normalized sum before scale and shift
      returns: (numpy array) gamma * normalize(X + prev_X) + beta

      The sum, mean, variance and normalized value are cached for backward().
    """
    self.weights=weights
    self.X=X
    self.prev_X=prev_X
    self.sum = self.X + self.prev_X
    self.var = np.var(self.sum, axis = 1, keepdims = True)
    self.mean = np.mean(self.sum, axis = 1, keepdims = True)
    self.sqrt_learning=np.sqrt(self.var + self.eps)
    self.out = (self.sum - self.mean) / self.sqrt_learning

    gamma = self.weights['gamma']
    beta = self.weights['beta']

    self.out_g = self.out * gamma
    self.out_b_g = self.out_g + beta
    return self.out_b_g

  def backward(self, dLoss_dout_b_g, weights, gradients):
    """
      dLoss_dout_b_g: (numpy array) gradient of loss with respect to the layer output
      weights: (dict) dictionary containing 'gamma'
      gradients: (dict) receives the gradients under 'gamma' and 'beta'
      returns: (numpy array) gradient of loss with respect to the summed input (X + prev_X);
               because the sum has unit derivative in both operands, this is also the
               gradient for X and for prev_X individually
              (dict) the gradients dictionary
    """
    # out_b_g = out * gamma + beta, so dgamma is a sum over rows of (upstream * out),
    # not the product of two separate sums.
    dLoss_dgamma = np.sum(dLoss_dout_b_g * self.out, axis=0)
    dLoss_dbeta = np.sum(dLoss_dout_b_g, axis=0)
    gradients["gamma"]=dLoss_dgamma
    gradients["beta"]=dLoss_dbeta

    dLoss_dout = dLoss_dout_b_g * weights['gamma']

    # The normalized quantity is the residual sum, so every centred term uses
    # self.sum (the original used self.X, i.e. only one of the two addends).
    centered = self.sum - self.mean
    width = self.sum.shape[1]
    dLoss_dvar = np.sum(dLoss_dout * centered, axis=1, keepdims=True) * -0.5 * np.power(self.var + self.eps, -1.5)
    dLoss_dmean = np.sum(dLoss_dout * -1/self.sqrt_learning, axis=1, keepdims=True) + (dLoss_dvar * (-2/width) * np.sum(centered, axis=1, keepdims=True))
    dLoss_dX = (dLoss_dout * 1/self.sqrt_learning) + (dLoss_dvar * 2*centered/width) + (dLoss_dmean * 1/width)
    return dLoss_dX, gradients

In [None]:
class ffn:
  def __init__(self, name, d):
    """
      name: (str) name identifier for the feed-forward module
      d: (int) embedding dimension
      Stores the name and embedding dimension; the weights are supplied on each call.
    """
    self.name = name
    self.d = d

  def forward(self, X, weights):
    """
      X: (numpy array) input of shape (sequence_length, d)
      weights: (dict) dictionary containing 'W1', 'B1', 'W2', 'B2'
      self.X1: (numpy array) pre-activation of the first linear layer
      self.relu_X1: (numpy array) ReLU of self.X1
      self.X2: (numpy array) output of the second linear layer
      returns: (numpy array) self.X2, i.e. relu(X @ W1 + B1) @ W2 + B2
    """
    self.X = X
    self.weights = weights

    first_w, first_b = self.weights['W1'], self.weights['B1']
    self.X1 = np.matmul(self.X, first_w) + first_b
    self.relu_X1 = np.maximum(0, self.X1)

    second_w, second_b = self.weights['W2'], self.weights['B2']
    self.X2 = np.matmul(self.relu_X1, second_w) + second_b
    return self.X2

  def backward(self, dLoss_dlayer_norm, weights, gradients):
    """
      dLoss_dlayer_norm: (numpy array) gradient of loss with respect to this module's output
      weights: (dict) dictionary containing 'W1' and 'W2'
      gradients: (dict) receives the gradients under 'W1', 'B1', 'W2', 'B2'
      returns: (numpy array) gradient of loss with respect to the input X
              (dict) the gradients dictionary

      Must be called after forward(); it reads the cached X, X1 and relu_X1.
    """
    # Second linear layer.
    gradients['W2'] = np.matmul(self.relu_X1.T, dLoss_dlayer_norm)
    gradients['B2'] = np.sum(dLoss_dlayer_norm, axis=0)

    # ReLU passes gradient only where its input was strictly positive.
    upstream_relu = np.matmul(dLoss_dlayer_norm, weights['W2'].T)
    active = self.X1 > 0
    dLoss_dX1 = upstream_relu * active

    # First linear layer.
    gradients['W1'] = np.matmul(self.X.T, dLoss_dX1)
    gradients['B1'] = np.sum(dLoss_dX1, axis=0)

    return np.matmul(dLoss_dX1, weights['W1'].T), gradients

In [None]:
def encoder(input, d, h, name, layers, weights):
  """
    One encoder block: multi-head self-attention -> residual add & layer norm ->
    feed-forward network -> residual add & layer norm.

    input: (numpy array) input data for the encoder, with shape (sequence_length, d)
    d: (int) embedding dimension
    h: (int) number of attention heads
    name: (str) unique identifier for the encoder layer; used as the key in `layers` and `weights`
    layers: (dict) layer instances keyed by block name; layers[name] is created and filled on first use
    weights: (dict) weights keyed by block name; weights[name] is initialized on first use

    Weight layout created under weights[name]:
      'layer_norm1', 'layer_norm2': {'beta', 'gamma'}
      'multi_head_weight': {'attn_w0'..'attn_w{h-1}': {'W_Q', 'W_K', 'W_V'}, 'multihead'}
      'feed_forward': {'W1', 'W2', 'B1', 'B2'}

    Both dictionaries passed in are mutated in place.

    returns:
      (numpy array) final output of the encoder layer, with shape (sequence_length, d)
      (dict) layers[name], the layer instances of this block
      (dict) a shallow copy of `weights` (the nested per-block dictionaries are shared with the original)
  """
  block_layers = layers.setdefault(name, {})

  if name not in weights:
    head_dim = int(d/h)

    weights[name] = {}
    for i in range(1, 3):
      weights[name][f'layer_norm{i}'] = {'beta': init_Bias(d), 'gamma': init_gamma(d)}

    attention_weights = {}
    for i in range(h):
      attention_weights[f'attn_w{i}'] = {
          'W_Q': init_Weights(d, head_dim),
          'W_K': init_Weights(d, head_dim),
          'W_V': init_Weights(d, head_dim)
      }
    attention_weights['multihead'] = init_Weights(head_dim * h, d)

    feed_forward_weights = {
        'W1': init_Weights(d, d * 4),
        'W2': init_Weights(d * 4, d),
        'B1': init_Bias(d * 4),
        'B2': init_Bias(d)
    }

    weights[name]['multi_head_weight'] = attention_weights
    weights[name]['feed_forward'] = feed_forward_weights

  weights_snapshot = weights.copy()
  block_weights = weights[name]

  # Self-attention sub-layer.
  if 'multi_head' not in block_layers:
    block_layers['multi_head'] = multi_head("multi_head", d, h=h)
  attended = block_layers['multi_head'].forward(input, block_weights['multi_head_weight'])

  if 'layer_norm1' not in block_layers:
    block_layers['layer_norm1'] = res_add_layer_norm('layer_norm1', d)
  normed_attention = block_layers['layer_norm1'].forward(input, attended, block_weights['layer_norm1'])

  # Feed-forward sub-layer.
  if 'feed_forward' not in block_layers:
    block_layers['feed_forward'] = ffn('feed_forward', d)
  transformed = block_layers['feed_forward'].forward(normed_attention, block_weights['feed_forward'])

  if 'layer_norm2' not in block_layers:
    block_layers['layer_norm2'] = res_add_layer_norm('layer_norm2', d)
  block_output = block_layers['layer_norm2'].forward(prev_X=normed_attention, X=transformed, weights=block_weights['layer_norm2'])

  return block_output, layers[name], weights_snapshot

In [None]:
def decoder(input, d, h, input_enc, name, layers, weights):
  """
      One decoder block: masked multi-head self-attention -> add & norm ->
      multi-head cross-attention over the encoder output -> add & norm ->
      feed-forward network -> add & norm.

      input: (numpy array) input data for the decoder, with shape (sequence_length, d)
      d: (int) embedding dimension
      h: (int) number of attention heads
      input_enc: (numpy array) output from the encoder, used as keys/values of the cross-attention
      name: (str) unique identifier for the decoder layer; used as the key in `layers` and `weights`
      layers: (dict) layer instances keyed by block name; layers[name] is created and filled on first use
      weights: (dict) weights keyed by block name; weights[name] is initialized on first use

      Weight layout created under weights[name]:
        'layer_norm1'..'layer_norm3': {'beta', 'gamma'}
        'multi_head_weight1': weights of the masked self-attention
        'multi_head_weight2': weights of the cross-attention
        'feed_forward': {'W1', 'W2', 'B1', 'B2'}

      The two attention weight sets are initialized independently. Previously both
      keys referenced the same dictionary, so the masked self-attention and the
      cross-attention silently shared (and jointly updated) one set of matrices.

      Both dictionaries passed in are mutated in place.

      returns:
        (numpy array) final output of the decoder layer, with shape (sequence_length, d)
        (dict) layers[name], the layer instances of this block
        (dict) a shallow copy of `weights` (the nested per-block dictionaries are shared with the original)
  """

  if name not in layers:
    layers[name]={}
  if name not in weights:
    b=int(d/h)

    def new_attention_weights():
      # Fresh, independently drawn projections for one multi-head attention module.
      att_W = {
          f'attn_w{i}': {
              'W_Q': init_Weights(d, b),
              'W_K': init_Weights(d, b),
              'W_V': init_Weights(d, b)
          }
          for i in range(h)
      }
      att_W['multihead'] = init_Weights(b * h, d)
      return att_W

    weights[name]={
        f'layer_norm{i}':{
            'beta': init_Bias(d),
            'gamma': init_gamma(d)
        }
        for i in range(1, 4)
    }

    weights[name]['multi_head_weight1'] = new_attention_weights()
    weights[name]['multi_head_weight2'] = new_attention_weights()
    weights[name]['feed_forward'] = {
        'W1': init_Weights(d, d * 4),
        'W2': init_Weights(d * 4, d),
        'B1': init_Bias(d * 4),
        'B2': init_Bias(d)
    }
  weightss = weights.copy()

  # Masked self-attention sub-layer.
  if 'multi_head_masked' not in layers[name]:
    layers[name]['multi_head_masked']=multi_head('multi_head_masked', d, h=h, masked=True)
  mha = layers[name]['multi_head_masked'].forward(input, weights[name]['multi_head_weight1'])

  if 'layer_norm1' not in layers[name]:
    layers[name]['layer_norm1']=res_add_layer_norm('layer_norm1', d)
  input = layers[name]['layer_norm1'].forward(input, mha, weights[name]['layer_norm1'])

  # Cross-attention sub-layer: queries from the decoder, keys/values from the encoder output.
  if 'multi_head' not in layers[name]:
    layers[name]['multi_head']=multi_head("multi_head", d, h=h)
  mha = layers[name]['multi_head'].forward(input, weights[name]['multi_head_weight2'], X2=input_enc)

  if 'layer_norm2' not in layers[name]:
    layers[name]['layer_norm2']=res_add_layer_norm('layer_norm2', d)
  input = layers[name]['layer_norm2'].forward(input, mha, weights[name]['layer_norm2'])

  # Feed-forward sub-layer.
  if 'feed_forward' not in layers[name]:
    layers[name]['feed_forward'] = ffn('feed_forward', d)
  feed_forward_dec = layers[name]['feed_forward'].forward(input, weights[name]['feed_forward'])

  if 'layer_norm3' not in layers[name]:
    layers[name]['layer_norm3']=res_add_layer_norm('layer_norm3', d)
  input = layers[name]['layer_norm3'].forward(input, feed_forward_dec, weights[name]['layer_norm3'])

  return input, layers[name], weightss

In [None]:
class linear:
  def __init__(self, name, d, V):
    """
      name: (str) unique identifier for the linear layer; also the key of its entry in the weights dictionary
      d: (int) number of input features (dimension of the input)
      V: (int) number of output features (dimension of the output)

      Stores the name and the two dimensions.
    """
    self.name = name
    self.d = d
    self.V = V

  def forward(self, X, weights):
    """
      X: (numpy array) input data for the linear layer, with shape (sequence_length, d)
      weights: (dict) weights dictionary; weights[self.name] holds 'W_out' and 'B_out'

      Computes Z = X @ W_out + B_out. If weights[self.name] does not exist yet it is
      created in place (weights via init_Weights, bias via init_Bias).

      returns:
        (numpy array) output of the linear layer, with shape (sequence_length, V)
        (dict) the same weights dictionary that was passed in
    """
    self.X = X
    if self.name not in weights:
      weights[self.name] = {"W_out": init_Weights(self.d, self.V), "B_out": init_Bias(self.V)}

    params = weights[self.name]
    projection = params['W_out']
    offset = params['B_out']

    self.Z = np.matmul(self.X, projection) + offset
    return self.Z, weights

  def backward(self, dloss_dZ, weights):
    """
      dloss_dZ: (numpy array) gradient of the loss with respect to the output Z, with shape (sequence_length, V)
      weights: (dict) weights dictionary; weights[self.name]['W_out'] is read

      Must be called after forward(); it reads the cached input X.

      returns:
        (numpy array) gradient of the loss with respect to W_out, with shape (d, V)
        (numpy array) gradient of the loss with respect to B_out, with shape (V,)
        (numpy array) gradient of the loss with respect to the input X, with shape (sequence_length, d)
    """
    weight_grad = np.matmul(self.X.T, dloss_dZ)
    bias_grad = np.sum(dloss_dZ, axis=0)
    input_grad = np.matmul(dloss_dZ, weights[self.name]['W_out'].T)
    return weight_grad, bias_grad, input_grad

In [None]:
def cross_entropy_loss(Y_hat, Y, epsilon=0.000000000000001):
  """
    Summed cross-entropy between predicted probabilities and one-hot targets.

    Y_hat: (numpy array) predicted probabilities for each class, with shape (sequence_length, num_classes)
    Y: (numpy array) true labels, one-hot encoded, with shape (sequence_length, num_classes)
    epsilon: (float) small constant added to avoid logarithm of zero (default: 1e-15)

    For every row the largest entry of Y_hat * Y is taken; with one-hot Y and
    non-negative Y_hat this is the probability assigned to the true class. The loss
    is the sum over rows of -log(that probability + epsilon); it is not averaged.

    returns:
      (float) total cross-entropy loss for the batch
  """
  true_class_prob = np.max(Y_hat * Y, axis=1)
  per_row_loss = -1 * np.log(true_class_prob + epsilon)
  return np.sum(per_row_loss)

In [None]:
def init_Weights(a, b):
  """
    Draw an (a, b) weight matrix uniformly from [-sqrt(6)/sqrt(b), sqrt(6)/sqrt(b)).

    a: (int) number of rows
    b: (int) number of columns; also sets the sampling bound
    returns: (numpy array) array of shape (a, b), drawn from NumPy's global random state
  """
  bound = np.sqrt(6) / np.sqrt(b)
  return np.random.uniform(low=-1 * bound, high=bound, size=(a, b))

In [None]:
def init_Bias(a):
  """
    a: (int) length of the bias vector
    returns: (numpy array) zero vector of shape (a,)
  """
  shape = (a,)
  return np.zeros(shape)

In [None]:
def init_gamma(a):
  """
    a: (int) length of the scale vector
    returns: (numpy array) vector of ones of shape (a,)
  """
  shape = (a,)
  return np.ones(shape)

In [None]:
def network(input_enc,input_dec,d,h,V_ger, layers, weights):
  """
    Forward pass of the full model: six encoder blocks, six decoder blocks, a
    linear projection and a softmax.

    input_enc: (numpy array) input to the encoder, shape (sequence_length, d)
    input_dec: (numpy array) input to the decoder, shape (sequence_length, d)
    d: (int) embedding dimension
    h: (int) number of attention heads
    V_ger: (int) size of the output vocabulary
    layers: (dict) dictionary to hold the network layers (filled on first call, reused afterwards)
    weights: (dict) dictionary to hold the weights of the layers (filled on first call, reused afterwards)

    The encoder blocks 'encoder1'..'encoder6' are applied in sequence. Every decoder
    block 'decoder1'..'decoder6' receives the output of the last encoder block as its
    encoder input. The last decoder output goes through the 'linear' layer and then
    the 'softmax' layer.

    returns:
      y_hat: (numpy array) predicted probabilities for each class, shape (sequence_length, V_ger)
      layers: (dict) updated dictionary of layers
      weights: (dict) updated dictionary of weights (use this returned value; it may be a different
               dictionary object from the one passed in)
  """
  num_blocks = 6

  encoded = input_enc
  for idx in range(1, num_blocks + 1):
    key = f'encoder{idx}'
    encoded, layers[key], weights = encoder(encoded, d, h, key, layers, weights)

  decoded = input_dec
  for idx in range(1, num_blocks + 1):
    key = f'decoder{idx}'
    decoded, layers[key], weights = decoder(decoded, d, h, encoded, key, layers, weights)

  if 'linear' not in layers:
    layers['linear'] = linear('linear', d, V_ger)
  logits, weights = layers['linear'].forward(decoded, weights)

  if 'softmax' not in layers:
    layers['softmax'] = softmax('softmax')
  y_hat = layers['softmax'].forward(logits)
  return y_hat, layers, weights

In [None]:
def normalize_gradient(X):
  """
    Standardize a gradient array over all of its elements.

    X: (numpy array) gradient values
    returns: (numpy array) (X - mean(X)) / sqrt(var(X) + 1e-5), same shape as X
  """
  centered = X - np.mean(X)
  spread = np.sqrt(np.var(X) + 0.00001)
  return centered / spread

In [None]:
def back_decoder(gradients, dLoss_dD, layers, weights):
  """
    Backward pass through one decoder block.

    gradients: (dict) gradients of this block; must already contain the keys 'layer_norm1'..'layer_norm3',
               'feed_forward', 'multi_head_weight1' and 'multi_head_weight2'
    dLoss_dD: (numpy array) gradient of the loss with respect to the block output, shape (sequence_length, d)
    layers: (dict) layer instances of this decoder block
    weights: (dict) weights of this decoder block

    The sub-layers are visited in reverse order of the forward pass: layer_norm3,
    feed_forward, layer_norm2, multi_head (cross-attention), layer_norm1,
    multi_head_masked. The gradient handed to each sub-layer is first passed through
    normalize_gradient. At each residual connection the gradient coming out of the
    layer norm is added to the gradient coming out of the wrapped sub-layer.

    returns:
      dLoss_ddecoderin: (numpy array) gradient of the loss with respect to the block input, shape (sequence_length, d)
      dLoss_dmulti_head_encoder: (numpy array) gradient of the loss with respect to the encoder output used by the cross-attention
      gradients: (dict) updated dictionary of gradients for each layer
  """

  def run_backward(layer_key, weight_key, upstream):
    # Normalize the incoming gradient, then delegate to the sub-layer's backward().
    return layers[layer_key].backward(normalize_gradient(upstream), weights[weight_key], gradients[weight_key])

  # Feed-forward sub-layer with its residual connection.
  grad_norm3, gradients['layer_norm3'] = run_backward('layer_norm3', 'layer_norm3', dLoss_dD)
  grad_ffn, gradients['feed_forward'] = run_backward('feed_forward', 'feed_forward', grad_norm3)
  upstream_norm2 = grad_norm3 + grad_ffn

  # Cross-attention sub-layer with its residual connection.
  grad_norm2, gradients['layer_norm2'] = run_backward('layer_norm2', 'layer_norm2', upstream_norm2)
  grad_cross, dLoss_dmulti_head_encoder, gradients['multi_head_weight2'] = run_backward('multi_head', 'multi_head_weight2', grad_norm2)
  upstream_norm1 = grad_norm2 + grad_cross

  # Masked self-attention sub-layer with its residual connection.
  grad_norm1, gradients['layer_norm1'] = run_backward('layer_norm1', 'layer_norm1', upstream_norm1)
  grad_masked, _, gradients['multi_head_weight1'] = run_backward('multi_head_masked', 'multi_head_weight1', grad_norm1)

  dLoss_ddecoderin = grad_norm1 + grad_masked
  return dLoss_ddecoderin, dLoss_dmulti_head_encoder, gradients

In [None]:
def back_encoder(gradients, dLoss_dDecoder, layers, weights):
  """
    Backward pass through a single encoder block.

    gradients: (dict) gradient entries of this block; the keys 'layer_norm1', 'layer_norm2', 'feed_forward' and 'multi_head_weight' are overwritten with whatever the layers' backward() return
    dLoss_dDecoder: (numpy array) upstream gradient arriving at this block's output, presumably of shape (sequence_length, d) -- TODO confirm
    layers: (dict) layer objects of this block ('layer_norm1', 'layer_norm2', 'feed_forward', 'multi_head'), each exposing backward()
    weights: (dict) weights of this block ('layer_norm1', 'layer_norm2', 'feed_forward', 'multi_head_weight')

    The gradient is pushed through layer_norm2, the feed-forward layer, layer_norm1 and the multi-head attention layer, in that order. Every gradient is passed through normalize_gradient before it enters a layer, and the two residual connections are handled by summing the gradient of the skipped path with the gradient of the layer path. The second value returned by the attention layer's backward() is not used here.

    returns:
      dLoss_dencoder: (numpy array) gradient of the loss with respect to this block's input
      gradients: (dict) the same dictionary that was passed in, with updated entries
  """

  def propagate(layer_name, param_name, upstream):
    # Normalise the incoming gradient, then delegate to the layer's own backward().
    return layers[layer_name].backward(normalize_gradient(upstream), weights[param_name], gradients[param_name])

  norm2_grad, gradients['layer_norm2'] = propagate('layer_norm2', 'layer_norm2', dLoss_dDecoder)
  ff_grad, gradients['feed_forward'] = propagate('feed_forward', 'feed_forward', norm2_grad)

  # Residual connection around the feed-forward layer.
  norm1_upstream = norm2_grad + ff_grad
  norm1_grad, gradients['layer_norm1'] = propagate('layer_norm1', 'layer_norm1', norm1_upstream)
  attn_grad, _, gradients['multi_head_weight'] = propagate('multi_head', 'multi_head_weight', norm1_grad)

  # Residual connection around the attention layer.
  return attn_grad + norm1_grad, gradients

In [None]:
def backward(y_hat, Y_one_hot_shifted, layers, gradients, weights, V_en,V_dec, n, word_ids_en, word_ids_dec):
  """
    Backward pass through the whole model (output layer, 6 decoder blocks, 6 encoder blocks, embeddings).

    y_hat: (numpy array) predicted output from the model, shape (sequence_length, vocab_size)
    Y_one_hot_shifted: (numpy array) one-hot encoded true labels, same shape as y_hat
    layers: (dict) layers of the model: 'linear', 'decoder1'..'decoder6', 'encoder1'..'encoder6'
    gradients: (dict) gradient dictionary with the same keys as layers; updated in place
    weights: (dict) weights of the model; passed as a whole to the linear layer and per block to back_decoder / back_encoder
    V_en: (int) vocabulary size for the encoder
    V_dec: (int) vocabulary size for the decoder
    n: unused; kept so existing calls keep working (the training loop in this notebook passes the embedding dimension here)
    word_ids_en: (numpy array) row indices into the encoder embedding table, one per input position
    word_ids_dec: (numpy array) row indices into the decoder embedding table, one per input position

    The output gradient is y_hat - Y_one_hot_shifted. It is sent through the linear layer and then through decoder6..decoder1. Each decoder block also returns a gradient with respect to the encoder output; these are summed and sent through encoder6..encoder1. The gradients reaching the two embedding layers are normalized and scattered into zero tables with np.add.at, so that a word id occurring several times accumulates all its contributions.

    The embedding dimension is taken from the last axis of the gradient that reaches the embeddings, so the function no longer depends on a notebook-level global `d`.

    returns:
      dLoss_dembeddings_en: (numpy array) gradient of the loss with respect to the encoder embeddings, shape (V_en, d)
      dLoss_dembeddings_dec: (numpy array) gradient of the loss with respect to the decoder embeddings, shape (V_dec, d)
      gradients: (dict) updated dictionary of gradients for each layer
  """

  dloss_dZ = y_hat - Y_one_hot_shifted
  gradients['linear']['W_out'], gradients['linear']['B_out'], dloss_dD = layers['linear'].backward(dloss_dZ, weights)

  # Decoder stack, last block first. Every block contributes to the encoder-output gradient.
  dLoss_ddecoder = dloss_dD
  dLoss_dencoder_out = None
  for idx in range(6, 0, -1):
    name = f'decoder{idx}'
    dLoss_ddecoder, dencoder_out, gradients[name] = back_decoder(gradients[name], dLoss_ddecoder, layers[name], weights[name])
    if dLoss_dencoder_out is None:
      dLoss_dencoder_out = dencoder_out
    else:
      dLoss_dencoder_out += dencoder_out

  decoder_embed_grad = normalize_gradient(dLoss_ddecoder)
  dLoss_dembeddings_dec = np.zeros((V_dec, np.shape(decoder_embed_grad)[-1]))
  # np.add.at accumulates over repeated ids; plain fancy-index assignment would keep only one row per id.
  np.add.at(dLoss_dembeddings_dec, word_ids_dec, decoder_embed_grad)

  # Encoder stack, last block first.
  dLoss_dencoder = dLoss_dencoder_out
  for idx in range(6, 0, -1):
    name = f'encoder{idx}'
    dLoss_dencoder, gradients[name] = back_encoder(gradients[name], dLoss_dencoder, layers[name], weights[name])

  encoder_embed_grad = normalize_gradient(dLoss_dencoder)
  dLoss_dembeddings_en = np.zeros((V_en, np.shape(encoder_embed_grad)[-1]))
  np.add.at(dLoss_dembeddings_en, word_ids_en, encoder_embed_grad)
  return dLoss_dembeddings_en, dLoss_dembeddings_dec, gradients

In [None]:
def create_gradient_dict(weights):
  """
    weights: (dict) weights of the model, keyed 'encoder1'..'encoder6', 'decoder1'..'decoder6' and 'linear'; every block holds per-layer dictionaries of numpy arrays ('beta', 'gamma', 'W1', 'B1', 'W_Q', 'W_K', ...)

    Builds a dictionary of zero-filled gradients that mirrors the part of the weights structure used by the backward pass: two layer norms, the feed-forward layer and two attention heads ('attn_w0', 'attn_w1') per encoder block; three layer norms, the feed-forward layer and two attention heads for each of 'multi_head_weight1' and 'multi_head_weight2' per decoder block; and the final linear layer.

    Every entry is its own freshly allocated array whose shape is read from the matching weight. Encoder and decoder attention gradients are therefore independent of each other, and decoder attention shapes come from the decoder weights rather than from the encoder's.

    returns:
      g: (dict) a dictionary of gradients for each layer in the model, with all values initialized to zero
  """
  def zeros_for(params, keys):
    # One new zero array per requested parameter, shaped like that parameter.
    return {key: np.zeros(params[key].shape) for key in keys}

  attention_keys = ('W_Q', 'W_K', 'W_V')
  g={}
  for i in range(1,7):
    enc_w = weights[f'encoder{i}']
    dec_w = weights[f'decoder{i}']
    enc_g = {"feed_forward":{}, 'multi_head_weight':{}}
    dec_g = {"feed_forward":{}, 'multi_head_weight1':{}, 'multi_head_weight2':{}}

    # j doubles as layer-norm index, feed-forward layer index and (j-1) attention head index.
    for j in range(1,3):
      head = f'attn_w{j-1}'
      enc_g[f'layer_norm{j}'] = zeros_for(enc_w[f'layer_norm{j}'], ('beta', 'gamma'))
      enc_g['feed_forward'].update(zeros_for(enc_w['feed_forward'], (f'W{j}', f'B{j}')))
      enc_g['multi_head_weight'][head] = zeros_for(enc_w['multi_head_weight'][head], attention_keys)

      for k in (1, 2):
        dec_g[f'multi_head_weight{k}'][head] = zeros_for(dec_w[f'multi_head_weight{k}'][head], attention_keys)
      dec_g['feed_forward'].update(zeros_for(dec_w['feed_forward'], (f'W{j}', f'B{j}')))

    for j in range(1,4):
      dec_g[f'layer_norm{j}'] = zeros_for(dec_w[f'layer_norm{j}'], ('beta', 'gamma'))

    g[f'encoder{i}'] = enc_g
    g[f'decoder{i}'] = dec_g

  # The output layer exists once, so it is initialised once, outside the per-block loop.
  g['linear'] = zeros_for(weights['linear'], ('W_out', 'B_out'))
  return g

In [None]:
def init_adam(d):
  """
    d: (int) model dimension; forwarded to every adam_optimizer, which uses it for its learning-rate schedule

    Creates one adam_optimizer per trainable parameter and arranges them in a nested dictionary with the same keys as the gradient dictionary: two layer norms, the feed-forward layer and two attention heads per encoder block; three layer norms, the feed-forward layer and two attention heads for each of 'multi_head_weight1' and 'multi_head_weight2' per decoder block; the final linear layer; and the two embedding tables.

    Each parameter gets its own optimizer instance. An optimizer keeps running moment estimates of the gradients it has seen, so sharing one instance between several parameters would mix their statistics.

    returns:
      adj: (dict) a dictionary containing a freshly initialised adam_optimizer for each model parameter
  """
  def fresh(*names):
    # A new, independent optimizer for every listed parameter name.
    return {name: adam_optimizer(d) for name in names}

  adj = {}
  for i in range(1, 7):
    enc = {"feed_forward":{}, 'multi_head_weight':{}}
    dec = {"feed_forward":{}, 'multi_head_weight1':{}, 'multi_head_weight2':{}}

    # j doubles as layer-norm index, feed-forward layer index and (j-1) attention head index.
    for j in range(1, 3):
      head = f'attn_w{j-1}'
      enc[f'layer_norm{j}'] = fresh('beta', 'gamma')
      enc['feed_forward'].update(fresh(f'W{j}', f'B{j}'))
      enc['multi_head_weight'][head] = fresh('W_Q', 'W_K', 'W_V')
      dec['multi_head_weight1'][head] = fresh('W_Q', 'W_K', 'W_V')
      dec['multi_head_weight2'][head] = fresh('W_Q', 'W_K', 'W_V')
      dec['feed_forward'].update(fresh(f'W{j}', f'B{j}'))

    for j in range(1,4):
      dec[f'layer_norm{j}'] = fresh('beta', 'gamma')

    adj[f'encoder{i}'] = enc
    adj[f'decoder{i}'] = dec

  # Parameters that exist once for the whole model.
  adj['linear'] = fresh('W_out', 'B_out')
  adj['embeddings_en']=adam_optimizer(d)
  adj['embeddings_dec']=adam_optimizer(d)
  return adj

In [None]:
def get_learningrate(d, t, warmup_steps):
  """
    d: (int) model dimension
    t: (int) current step; must be positive, because t**(-0.5) is evaluated
    warmup_steps: (int) number of steps over which the rate grows linearly

    Learning-rate schedule d^-0.5 * min(t^-0.5, t * warmup_steps^-1.5): the rate rises linearly with t up to t == warmup_steps and decays proportionally to 1/sqrt(t) afterwards.

    returns:
      learning_rate: (float) learning rate to use at step t
  """
  decay_term = t ** (-0.5)
  warmup_term = t * warmup_steps ** (-1.5)
  model_scale = np.power(d, -0.5)
  return model_scale * min(decay_term, warmup_term)

In [None]:
class adam_optimizer:
  def __init__(self,d , epsilon=10**(-8), beta1 = 0.9, beta2 = 0.999, warmup_steps=2000):
    """
      Initializes the Adam state for ONE parameter (array).

      Attributes:
          m: First moment estimate (running mean of gradients), starts at 0.
          v: Second moment estimate (running mean of squared gradients), starts at 0.
          n_updates: (int) Number of compute() calls made so far; drives the bias correction.
          beta1: (float) The exponential decay rate for the first moment estimate.
          beta2: (float) The exponential decay rate for the second moment estimate.
          d: (int) Model dimension, forwarded to get_learningrate.
          warmup_steps: (int) The number of warmup steps, forwarded to get_learningrate.
          epsilon: (float) A small constant to prevent division by zero.

      Args:
          d: (int) Model dimension used by the learning-rate schedule.
          epsilon: (float) A small value added to the denominator for numerical stability (default: 1e-8).
          beta1: (float) The decay rate for the first moment (default: 0.9).
          beta2: (float) The decay rate for the second moment (default: 0.999).
          warmup_steps: (int) The number of warmup steps for the learning rate (default: 2000).
    """
    self.m = 0
    self.v = 0
    self.n_updates = 0
    self.beta1 = beta1
    self.beta2 = beta2
    self.d = d
    self.warmup_steps=warmup_steps
    self.epsilon=epsilon

  def compute(self, t, gradient, weight):
    """
      Applies one Adam step and returns the updated weight.

      The moment estimates start at zero and are therefore biased towards zero; they are
      corrected by dividing by (1 - beta**k), where k is the number of updates this
      optimizer has performed. k is tracked internally instead of reusing t, because t
      only drives the learning-rate schedule and callers may offset it or repeat it.

      Args:
          t: (int) The current time step (iteration) used to calculate the learning rate.
          gradient: (numpy.ndarray) The gradient of the loss with respect to the weight.
          weight: (numpy.ndarray) The current value of the weight being updated.

      Returns:
          weight: (numpy.ndarray) The updated weight after applying the Adam optimization step.
    """
    self.n_updates += 1
    self.m = self.beta1 * self.m + (1-self.beta1) * gradient
    self.v = self.beta2 * self.v + (1-self.beta2) * np.power(gradient, 2)

    # Bias-corrected moment estimates (kept as attributes, as before).
    self.b_m = self.m / (1 - self.beta1 ** self.n_updates)
    self.b_v = self.v / (1 - self.beta2 ** self.n_updates)

    learning_rate = get_learningrate(self.d, t, self.warmup_steps)
    return weight - learning_rate * self.b_m / (np.sqrt(self.b_v) + self.epsilon)

In [None]:
def weights_adjustment(adj, weights, gradients, embeddings_en, embeddings_dec, d = 8, t=1):
  """
      Updates model weights and embeddings with the per-parameter optimizers in adj.

      For all six encoder and six decoder blocks, every layer-norm, feed-forward and
      attention parameter is replaced by optimizer.compute(t, gradient, weight). The linear
      output layer and the two embedding tables are updated once, after all blocks have
      been processed.

      Args:
          adj: (dict) Optimizers for each parameter, keyed like weights, plus 'embeddings_en' and 'embeddings_dec'.
          weights: (dict) The current weights of the model; entries are replaced in place.
          gradients: (dict) Gradients keyed like weights, plus 'd_embeddings_en' and 'd_embeddings_dec'.
          embeddings_en: (numpy.ndarray) The current embeddings for the encoder.
          embeddings_dec: (numpy.ndarray) The current embeddings for the decoder.
          d: (int, optional) Unused; kept for backward compatibility (default: 8).
          t: (int, optional) The current time step for learning rate computation (default: 1).

      Returns:
          weights: (dict) The updated weights after applying the adjustments.
          embeddings_en: (numpy.ndarray) The updated encoder embeddings.
          embeddings_dec: (numpy.ndarray) The updated decoder embeddings.
  """
  def update(path, keys):
    # Walk the three parallel dictionaries down the same path, then step each listed parameter.
    optimizers, grads, params = adj, gradients, weights
    for part in path:
      optimizers, grads, params = optimizers[part], grads[part], params[part]
    for key in keys:
      params[key] = optimizers[key].compute(t, grads[key], params[key])

  attention_keys = ("W_Q", "W_K", "W_V")
  for i in range(1, 7):
    enc = f'encoder{i}'
    dec = f'decoder{i}'

    # j doubles as layer-norm index, feed-forward layer index and (j-1) attention head index.
    for j in range(1, 3):
      head = f'attn_w{j-1}'
      update((enc, f'layer_norm{j}'), ('beta', 'gamma'))
      update((enc, 'feed_forward'), (f'W{j}', f'B{j}'))
      update((enc, 'multi_head_weight', head), attention_keys)

      for k in range(1,3):
        update((dec, f'multi_head_weight{k}', head), attention_keys)

      update((dec, 'feed_forward'), (f'W{j}', f'B{j}'))

    for j in range(1,4):
      update((dec, f'layer_norm{j}'), ('beta', 'gamma'))

  # Model-wide parameters: exactly one optimizer step per call, after every block was updated.
  update(('linear',), ('W_out', 'B_out'))
  embeddings_en = adj['embeddings_en'].compute(t, gradients['d_embeddings_en'], embeddings_en)
  embeddings_dec = adj['embeddings_dec'].compute(t, gradients['d_embeddings_dec'], embeddings_dec)

  return weights, embeddings_en, embeddings_dec

In [None]:
# Toy parallel corpus: five English source sentences and their Spanish translations (same order).
sentences_eng=['the dog is not sitting','the cat is sitting','a cat is playing','the dog is not playing','a cat is not sitting']
sentences_spa=['el perro no esta sentado','el gato esta sentado','un gato esta jugando','el perro no esta jugando','un gato no esta sentado']

# Encoder side: embedding dimension 8, every sentence padded to 5 tokens.
word_embeddings_en, word_to_id_en, id_to_word_en, V_en, d, n_en ,m, one_hot_en, word_ids_en = tokenization(sentences_eng, d=8, padding_size=5)

pos_enc_en = get_pos_enc(d,n_en)

# Decoder side: decoder=True makes tokenization wrap each sentence in <START> ... <END>,
# hence the larger padding size. `d` and `m` are reassigned here by the second call.
# NOTE(review): the unpacking order of the last four values follows tokenization's return
# statement, which is not visible in this part of the notebook -- confirm the names match.
word_embeddings_spa, word_to_id_spa, id_to_word_spa, V_spa, d, n_spa ,m, one_hot, one_hot_shifted, word_ids_shifted, word_ids_dec = tokenization(sentences_spa, d=8, padding_size=7, decoder=True)

# Presumably one position shorter because the shifted decoder sequences drop a token -- TODO confirm.
pos_enc_dec = get_pos_enc(d,n_spa-1)

# Start empty; the training loop passes both to network() and rebinds them to what it returns.
weights={}
layers={}

In [None]:
import time

start = time.time()

ada = init_adam(d)
# One forward/backward/update per training sentence and epoch.
num_examples = len(sentences_eng)
for epoch in range(500):
  # Embeddings are looked up again every epoch because the embedding tables are trained too.
  X = get_embeddings(one_hot_en, word_embeddings_en)
  X_dec, X_shifted= get_embeddings(one_hot, word_embeddings_spa, one_hot_shifted=one_hot_shifted, decoder=True)

  # NOTE(review): input_embed_dec is not used anywhere in this cell.
  input_embed_dec = get_pos_enc(d,n_spa) + X_dec

  input_embed = pos_enc_en + X
  input_embed_shifted = pos_enc_dec + X_shifted
  loss=0
  for i in range(num_examples):
    # The decoder vocabulary is the Spanish one built in the setup cell (V_spa).
    # NOTE(review): the literal 2 is presumably the number of attention heads -- confirm against network().
    preds, layers, weights = network(input_embed[i], input_embed_shifted[i],d,2,V_spa, layers, weights)

    loss += cross_entropy_loss(preds, one_hot_shifted[i])
    if epoch==0:
      # weights only exist after the first network() call, so the gradient dict is built here.
      gradients = create_gradient_dict(weights)

    gradients["d_embeddings_en"], gradients["d_embeddings_dec"], gradients = backward(preds, one_hot_shifted[i], layers, gradients, weights, V_en, V_spa,d , word_ids_en[i], word_ids_shifted[i])

    # NOTE(review): t is offset by 11000, i.e. the schedule starts well past the 2000 warm-up steps.
    weights, word_embeddings_en, word_embeddings_spa = weights_adjustment(ada, weights, gradients, word_embeddings_en, word_embeddings_spa, t=epoch+11000)

  print(f"epoch {epoch}: {loss/num_examples}")
end = time.time()

print(end - start)

epoch 0: 13.332241699130517
epoch 1: 13.24849063612397
epoch 2: 13.285800714383182
epoch 3: 13.359219363733498
epoch 4: 13.370402393361626
epoch 5: 13.389098826893349
epoch 6: 13.402751791533223
epoch 7: 13.364053349966202
epoch 8: 13.357034165497575
epoch 9: 13.409020901478907
epoch 10: 13.400364327471852
epoch 11: 13.14627033970628
epoch 12: 12.989356700964251
epoch 13: 12.953464312278067
epoch 14: 13.078145780641417
epoch 15: 13.168440162877465
epoch 16: 13.363675521676143
epoch 17: 13.399925124817553
epoch 18: 13.390486396866057
epoch 19: 13.377733168398686
epoch 20: 13.356201686716878
epoch 21: 13.336165894278192
epoch 22: 13.32030124719814
epoch 23: 13.30207984585565
epoch 24: 13.278428381504895
epoch 25: 13.257410163926068
epoch 26: 13.237706599202568
epoch 27: 13.23266997934534
epoch 28: 13.226087212308897
epoch 29: 13.223135755034935
epoch 30: 13.229619382159502
epoch 31: 13.232441447619488
epoch 32: 13.229479903027677
epoch 33: 13.227649050603025
epoch 34: 13.221918885113912


In [None]:
# Persist the trained layer weights.
# NOTE(review): hard-coded Google Drive path; the directory must already exist (Drive mounted).
import pickle
with open('/content/drive/MyDrive/Dataset/weights/weights.pkl', 'wb') as f:
    pickle.dump(weights, f)

In [None]:
# Persist the trained encoder (English) embedding table under the key "embeddings_en".
# NOTE(review): hard-coded Google Drive path; the directory must already exist (Drive mounted).
import pickle
with open('/content/drive/MyDrive/Dataset/weights/embeddings_en.pkl', 'wb') as f:
    pickle.dump({"embeddings_en":word_embeddings_en}, f)

In [None]:
# Persist the trained decoder (Spanish) embedding table under the key "embeddings_dec".
# The training loop keeps the decoder embeddings in word_embeddings_spa.
# NOTE(review): hard-coded Google Drive path; the directory must already exist (Drive mounted).
import pickle
with open('/content/drive/MyDrive/Dataset/weights/embeddings_dec.pkl', 'wb') as f:
    pickle.dump({"embeddings_dec":word_embeddings_spa}, f)