In [1]:
# build transfomer model from scratch

In [2]:
import numpy as np


def Relu(x):
    return np.maximum(0,x)
def Relu_derivative(x):
    return np.where(x>0,1,0)
class AddAndNorm():
    def __init__(self):
        pass

class MultiHeadAttention():
    def __init__(self,d_model=4,num_heads=2,layer_name="MHA"):
        """
        Create a multi-head attention object, storing the weights of the layer
        -- It is necessary to initialize the weights of the model before using it
        Parameters:
        d_model: int, the dimension of the model
        num_heads: int, the number of heads in the model
        """
        
        # check if the number of heads is a factor of the model dimension
        assert d_model % num_heads == 0
        
        self.d_model = d_model
        self.Wq = []
        self.Wk = []
        self.Wv = []
        self.num_heads = num_heads
        self.Wo = None
        self.d_model_head = d_model // num_heads
        self.layer_name=layer_name

        
        
    def init_weights(self,weights_mode="random",weights_q_list=None,weights_k_list=None,weights_v_list=None,weights_o=None):
        """
        Initialize the weights of the model
        
        Parameters:
        
        weights_mode: str, the mode of the weights initialization, can be "random" or "pretrained"
        weights_q_list: list of numpy arrays, the weights of the query matrix for each head
        weights_k_list: list of numpy arrays, the weights of the key matrix for each head
        weights_v_list: list of numpy arrays, the weights of the value matrix for each head
        
        """
        
        if weights_mode=="random":
            for _ in range(self.num_heads):
                self.Wq.append(np.random.randn(self.d_model,self.d_model_head))
                self.Wk.append(np.random.randn(self.d_model,self.d_model_head))
                self.Wv.append(np.random.randn(self.d_model,self.d_model_head))
                
                self.Wo = np.random.randn(self.d_model,self.d_model)
                

                
        elif weights_mode=="pretrained":
            # check if the weights are provided and if they are of the correct shape
            if (weights_q_list is None or weights_k_list is None or 
                weights_v_list is None or weights_o is None):
                raise ValueError("weights_q_list, weights_k_list, weights_v_list, and weights_o must be provided for pretrained mode.")
            assert weights_o.shape == (self.d_model,self.d_model)
            self.Wo = weights_o
            
            
            for ind in range(self.num_heads):
                assert weights_q_list[ind].shape == (self.d_model,self.d_model_head)
                assert weights_k_list[ind].shape == (self.d_model,self.d_model_head)
                assert weights_v_list[ind].shape == (self.d_model,self.d_model_head)
                
                
                # append the weights and biases to the list
                self.Wq.append(weights_q_list[ind])
                self.Wk.append(weights_k_list[ind])
                self.Wv.append(weights_v_list[ind])

                
            
        elif weights_mode == "null":
            self.Wo = np.zeros((self.d_model,self.d_model))
            self.Bo = np.zeros((1,self.d_model))
            for _ in range(self.num_heads):
                self.Wq.append(np.zeros((self.d_model,self.d_model_head)))
                self.Wk.append(np.zeros((self.d_model,self.d_model_head)))
                self.Wv.append(np.zeros((self.d_model,self.d_model_head)))
            
                
    def softmax_single_batch(self,x):
        """
        Compute the softmax of the input matrix x of shape (seq_len,d_model)
        """
        exp_x = np.exp(x - np.max(x,axis=1,keepdims=True)) # subtract the max to avoid numerical instability
        return exp_x / np.sum(exp_x,axis=1,keepdims=True) # sum along the rows to get the softmax
    
    def softmax(self,x):
        """
        Compute the softmax of the input matrix x of shape (batch_size,seq_len,d_model)
        """
        exp_x = np.exp(x - np.max(x,axis=-1,keepdims=True)) # subtract the max to avoid numerical instability
        return exp_x / np.sum(exp_x,axis=-1,keepdims=True) 
    def scaled_dot_product_attention_single_batch(self,Q,K,V):
        """ 
        Compute the scaled dot product attention
        parameters:
        Q: the query matrix of shape (seq_len,d_model)
        K: the key matrix of shape (seq_len,d_model)
        V: the value matrix of shape (seq_len,d_model)
        Returns:
        the attention matrix of shape (seq_len,d_model)
        """
        d_k=K.shape[1]
        
        # compute the dot product
        dot_product = np.dot(Q,K.T)
        
        # scale the dot product
        scaled_dot_product = dot_product / np.sqrt(d_k)
        # apply the softmax
        attention_matrix = self.softmax_single_batch(scaled_dot_product)

        #multiply the attention matrix by the value matrix
        attention_matrix = np.dot(attention_matrix,V)
        
        return attention_matrix
    def scaled_dot_product_attention(self,Q,K,V):
        """ 
        Compute the scaled dot product attention
        parameters:
        Q: the query matrix of shape (batch_size, seq_len, d_model)
        K: the key matrix of shape (batch_size, seq_len, d_model)
        V: the value matrix of shape (batch_size, seq_len, d_model)

        Returns:
        the attention matrix of shape (seq_len,d_model)
        """
        d_k=K.shape[1]
        
        # compute the dot product
        dot_product = np.matmul(Q, K.transpose(0, 2, 1)) 
        
        # scale the dot product
        scaled_dot_product = dot_product / np.sqrt(d_k)
        # apply the softmax
        attention_weights  = self.softmax(scaled_dot_product)

        #multiply the attention matrix by the value matrix
        attention_output = np.matmul(attention_weights, V)
        
        return attention_output
    
    def scaled_dot_product_masked_attention(self,Q,K,V):
        """ 
        Compute the scaled dot product attention
        parameters:
        Q: the query matrix of shape (batch_size, seq_len, d_model)
        K: the key matrix of shape (batch_size, seq_len, d_model)
        V: the value matrix of shape (batch_size, seq_len, d_model)
        
        Returns:
        the attention matrix of shape (seq_len,d_model)
        """
        d_k=K.shape[1]
        
        # compute the dot product
        dot_product = np.matmul(Q, K.transpose(0, 2, 1)) 
        # scale the dot product
        scaled_dot_product = dot_product / np.sqrt(d_k)
        # apply the mask
        
        mask = np.triu(np.ones((Q.shape[1],K.shape[1])),k=1)
        mask = mask == 0
        scaled_dot_product = np.where(mask,scaled_dot_product,-np.inf)
        
        
        # apply the softmax
        attention_weights  = self.softmax(scaled_dot_product)

        
        #multiply the attention matrix by the value matrix
        attention_output = np.matmul(attention_weights, V)
        return attention_output
    
    def compute_MHA(self,inputs):
        """
        Compute the multi-head attention
        Parameters:
        
        inputs: the input matrix of shape (batch_size, seq_len, d_model)
        
        Returns:
        the output of the multi-head attention layer of shape (batch_size, seq_len, d_model)
        """
        
        attention_matrix_list=[]
        
        for ind in range(self.num_heads):
            Wq = self.Wq[ind]
            Wk = self.Wk[ind]
            Wv = self.Wv[ind]
            
            Q_head = np.dot(inputs,Wq)
            K_head = np.dot(inputs,Wk)
            V_head = np.dot(inputs,Wv)
            
            # compute the scaled dot product attention
            attention_matrix_head = self.scaled_dot_product_attention(Q_head,K_head,V_head)

            attention_matrix_list.append(attention_matrix_head)
        
        # concatenate the attention matrices
        attention_matrix = np.concatenate(attention_matrix_list,axis=-1)  # (batch_size, seq_len, d_model)
        
        # apply the final linear transformation Wo
        attention_matrix = np.dot(attention_matrix,self.Wo) 
        return attention_matrix
    def compute_MHA_single_batch(self,inputs):
        """
        Compute the multi-head attention
        Parameters:
        
        inputs: the input matrix of shape (seq_len,d_model)
        
        Returns:
        the output of the multi-head attention layer of shape (seq_len,d_model)
        """
        
        attention_matrix_list=[]
        
        for ind in range(self.num_heads):

            Wq = self.Wq[ind]
            Wk = self.Wk[ind]
            Wv = self.Wv[ind]
            
            Q_head = np.dot(inputs,Wq)
            K_head = np.dot(inputs,Wk)
            V_head = np.dot(inputs,Wv)

            # compute the scaled dot product attention
            attention_matrix_head = self.scaled_dot_product_attention_single_batch(Q_head,K_head,V_head)

            attention_matrix_list.append(attention_matrix_head)
        
        # concatenate the attention matrices
        attention_matrix = np.concatenate(attention_matrix_list,axis=1)
        
        
        # apply the final linear transformation Wo
        attention_matrix = np.dot(attention_matrix,self.Wo) 
        return attention_matrix
        
class FeedForward():
    def __init__(self,d_model=4,max_seq_len=100,units=2048,name="FF_layer_X"):
        self.d_model = d_model
        self.max_seq_len = max_seq_len
        self.weights1 = None
        self.weights2 = None
        self.bias1 = None
        self.bias2 = None
        self.name = name
        self.units = units # the number of units in the hidden layer
    def init_weights(self,weights_mode="random",weights1=None,weights2=None,bias1=None,bias2=None):
        """
        Initialize the weights of the model
        
        Parameters:
        
        weights_mode: str, the mode of the weights initialization, can be "random" or "pretrained"
        weights1: numpy array, the weights of the first layer
        weights2: numpy array, the weights of the second layer
        bias1: numpy array, the bias of the first layer
        bias2: numpy array, the bias of the second layer
        
        """
        
        if weights_mode=="random":
            self.weights1 = np.random.randn(self.d_model,self.units)
            self.weights2 = np.random.randn(self.units,self.d_model)
            self.bias1 = np.random.randn(1,self.units)
            self.bias2 = np.random.randn(1,self.d_model)
        elif weights_mode=="pretrained":
            # check if the weights are provided and if they are of the correct shape
            if (weights1 is None or weights2 is None or 
                bias1 is None or bias2 is None):
                raise ValueError("weights1, weights2, bias1, and bias2 must be provided for pretrained mode.")
            assert weights1.shape == (self.d_model,self.units)
            assert weights2.shape == (self.units,self.d_model)
            assert bias1.shape == (1,self.units)
            assert bias2.shape == (1,self.d_model)
            
            self.weights1 = weights1
            self.weights2 = weights2
            self.bias1 = bias1
            self.bias2 = bias2
            
        elif weights_mode == "null":
            self.weights1 = np.zeros((self.d_model,self.units))
            self.weights2 = np.zeros((self.units,self.d_model))
            self.bias1 = np.zeros((1,self.units))
            self.bias2 = np.zeros((1,self.d_model))
    def forward_batch_sequence(self,inputs):
        
        """
        inputs: numpy array, dimensions (batch_size, max_seq_len, d_model), the input data
        
        Returns:
        numpy array, dimensions (batch_size, max_seq_len, d_model), the output of the layer
        """
        
        # X shape: (batch_size, max_seq_len, d_model)
        self.batch_size, self.max_seq_len, _ = inputs.shape
        # Reshape X to 2D for easier computation
        X_reshaped = inputs.reshape(-1, self.d_model)
        # First layer
        self.Z1 = np.dot(X_reshaped, self.weights1) + self.bias1
        
        # apply the activation function 
        self.A1 = np.maximum(0, self.Z1)  # ReLU activation
            
        # Second layer
        Z2 = np.dot(self.A1, self.weights2) + self.bias2
            
        # Reshape output back to 3D
        return Z2.reshape(self.batch_size, self.max_seq_len, self.d_model)
        
    def backprop_batch_sequence(self, dA2):
        pass
    
    def update_weights(self,learning_rate=0.01):
        pass


class TransformerModel():
    # model parameters
    def __init__(self,embedding_dict_path,vocab_size=1000,d_model=4): 
        self.vocab_size = vocab_size
        self.d_model = d_model # the dimension of the embedding space, aka d_model
        self.max_seq_len = 100
        # read the embedding dictionary
        # the file is a csv file with the first column as the word and the rest as the embedding vector
        embedding_dict={}
        with open(embedding_dict_path,'r') as f:
            # create a dictionary with the word as the key and the embedding vector as the value
            # force the value to float64
            for line in f:
                elements=line.strip().split(',')
                word=elements[0]
                vector=np.array(elements[1:],dtype=np.float64)
                embedding_dict[word] = vector
        self.embedding_dict = embedding_dict
        
        self.MultiHeadAttention_obj_list = []
        self.FeedForward_obj_list = []

    
    def inputs_layer(self,inputs): 
        """
        
        input is a list of valid strings, return a tensor representation of the input in the embedding space
        
        iterate through the list of strings and convert them to numpy arrays through the embedding dictionary
        
        """
        embeddings = [self.embedding_dict[word] for word in inputs]
        tensor = np.array(embeddings)
        return tensor
    
    def positional_encoding(self,seq_len):
        """
        create a positional encoding for the input sequence ( it is independent of the input sequence.
        However, it is dependent on the length of the sequence) 
        
        Parameters: 
        seq_len: the length of the sequence
        Returns : 
        Tensor of shape (seq_len,d_model) 
        """
        pos_encodings = np.zeros((seq_len, self.d_model))


        positions = np.arange(seq_len)[:, np.newaxis]
        dimensions = np.arange(self.d_model)[np.newaxis, :]


        # Compute angles, integer division of indices by 2 is used to fix a pair wise frequency (same frequency for even and odd indices)
        angle_rates = 1 / np.power(10000, (2 * (dimensions // 2)) / np.float64(self.d_model))
        angle_rads = positions * angle_rates

        # Apply sin and cos to odd and even indices
        pos_encodings[:, 0::2] = np.sin(angle_rads[:, 0::2])
        pos_encodings[:, 1::2] = np.cos(angle_rads[:, 1::2])
        
        return pos_encodings
    
    def add_multi_head_attention(self,num_heads=8,weights_mode="random",layer_name="MHA",weights_q_list=None,weights_k_list=None,weights_v_list=None,weights_o=None):
        """
        Create and add a multi-head attention layer
        
        """
        MHA= MultiHeadAttention(d_model=self.d_model,num_heads=num_heads,layer_name=layer_name)
        MHA.init_weights(weights_mode=weights_mode,weights_q_list=weights_q_list,weights_k_list=weights_k_list,weights_v_list=weights_v_list,weights_o=weights_o)
        self.MultiHeadAttention_obj_list.append(MHA)
    
    
    def LayerNormalization(self,inputs,epsilon=1e-6):
        """
        Apply layer normalization to the input tensor
        Parameters:
        inputs: the input tensor of shape (batch_size,seq_len,d_model)
        epsilon: a small value to avoid division by zero
        Returns:
        Normalized tensor of shape (batch_size,seq_len,d_model)
        
        We could also consider adding 2 learnable parameters to the layer normalization
        gamma and beta
        """
        mean = np.mean(inputs,axis=-1,keepdims=True)
        std = np.std(inputs,axis=-1,keepdims=True)
        normalized_inputs = (inputs - mean) / (std + epsilon)
        return normalized_inputs
    
    def AddAndNorm(self,inputs,sublayer_output,epsilon=1e-6):
        """
        Add the input tensor to the sublayer output and apply layer normalization
        Parameters:
        inputs: the input tensor of shape (batch_size,seq_len,d_model)
        sublayer_output: the output of the sublayer of shape (batch_size,seq_len,d_model)
        Returns:
        Normalized sum tensor of shape (batch_size,seq_len,d_model)
        """
        return self.LayerNormalization(inputs + sublayer_output,epsilon=epsilon)
    def encoder(self,inputs):
        """
        1- create the input layer
        2- create the positional encoding
        3- sum the input layer and the positional encoding
        etc
        """
        # create the input layer / embeddings
        inputs_embeddings = self.inputs_layer(inputs)
        pos_encodings = self.positional_encoding(len(inputs))
        encoder_input = inputs_embeddings + pos_encodings
        
        # pass the input through the multi-head attention layers
        
        return encoder_input
        
     
    def decoder(self):
        pass
    def forward(self):
        pass
    def train(self):
        pass
    def inference(self):
        pass
    def save(self):
        pass
    def load(self):
        pass
    def evaluate(self):
        pass




class PositionalEncoding:
    def __init__(self,seqlen,d_model,batch_size):
        self.seq_len = seqlen
        self.d_model = d_model
        self.batch_size = batch_size
        self._positional_encoding = None  # Cache for the positional encoding
    def positional_encoding(self):
        """
        create a positional encoding for the input sequence ( it is independent of the input sequence.
        However, it is dependent on the length of the sequence) 
        This is computed once and then repeated for the batch size.
        

        Returns : 
        Tensor of shape (batch_size,seq_len,d_model) 
        """ 
        if self._positional_encoding is None:
            # Compute positional encoding for a single sequence
            pos_encoding = np.zeros((self.seq_len, self.d_model))
            positions = np.arange(self.seq_len)[:, np.newaxis]
            dimensions = np.arange(self.d_model)[np.newaxis, :]
            angle_rates = 1 / np.power(10000, (2 * (dimensions // 2)) / np.float64(self.d_model))
            angle_rads = positions * angle_rates

            pos_encoding[:, 0::2] = np.sin(angle_rads[:, 0::2])
            pos_encoding[:, 1::2] = np.cos(angle_rads[:, 1::2])

            # Repeat for batch size and cache
            self._positional_encoding = np.repeat(pos_encoding[np.newaxis, :, :], self.batch_size, axis=0)

        return self._positional_encoding
    def positional_encoding_single_sequence(self):
        """
        create a positional encoding for the input sequence ( it is independent of the input sequence.
        However, it is dependent on the length of the sequence) 
        
        Parameters: 
        seq_len: the length of the sequence
        Returns : 
        Tensor of shape (seq_len,d_model) 
        """
        pos_encodings = np.zeros((self.seq_len, self.d_model))


        positions = np.arange(self.seq_len)[:, np.newaxis]
        dimensions = np.arange(self.d_model)[np.newaxis, :]


        # Compute angles, integer division of indices by 2 is used to fix a pair wise frequency (same frequency for even and odd indices)
        angle_rates = 1 / np.power(10000, (2 * (dimensions // 2)) / np.float64(self.d_model))
        angle_rads = positions * angle_rates

        # Apply sin and cos to odd and even indices
        pos_encodings[:, 0::2] = np.sin(angle_rads[:, 0::2])
        pos_encodings[:, 1::2] = np.cos(angle_rads[:, 1::2])
        
        return pos_encodings

class InputEmbedding:
    def __init__(self):
        pass
    def input_embedding(self,inputs,embedding_dict):
        """
        Convert the input batch of strings to its corresponding embedding vector
        
        parameters:
        inputs: list of strings, the input batch, the length of the strings is seq_len
        Returns:
        numpy array of shape (batch_size,seq_len,d_model)
        """
        batch_sequences = []
        for input in inputs:
            single_sequence = []
            for word in input:
                if word not in embedding_dict:
                    raise ValueError(f"{word} not found in the embedding dictionary")
                single_sequence.append(embedding_dict[word])
            batch_sequences.append(single_sequence)
        # return a tensor of shape (batch_size,seq_len,d_model)
        return np.array(batch_sequences)
        
class OutputEmbedding:
    def __init__(self):
        pass
class Encoder:
    def __init__(self):
        pass
class Decoder:
    def __init__(self):
        pass

In [3]:
# test the model

transformer=TransformerModel('data/word_embedding.csv')
# test inputs_layer

transformer.inputs_layer(['in','is','the'])

# test positional_encoding

transformer.positional_encoding(10)
transformer.encoder(['in','is','the'])

# test softmax
MHA=MultiHeadAttention()
x=np.array([[1,2,3],[0,0,0],[-1,0,1],[1,1,1],[0,1,0]])
MHA.softmax(x)

# test single head attention

MHA=MultiHeadAttention()
Q_test=np.array([[[1,1,1],[1,1,1],[1,1,1]],[[1,1,1],[1,1,1],[1,1,1]]])
K_test=np.array([[[5,5,5],[5,5,5],[5,5,5]],[[5,5,5],[5,5,5],[5,5,5]]])
V_test=np.array([[[3,3,3],[3,3,3],[3,3,3]],[[3,3,3],[3,3,3],[3,3,3]]])
print(Q_test.shape,K_test.shape,V_test.shape)
MHA.scaled_dot_product_attention(Q_test,K_test,V_test)


# test multi head attention


FileNotFoundError: [Errno 2] No such file or directory: 'data/word_embedding.csv'

In [4]:
# test multi head masked attention
MHA=MultiHeadAttention()
Q_test=np.array([[[0,-1,5],[1,5,1],[1,-7,1]],[[1,1,1],[1,1,1],[1,1,1]]])
K_test=np.array([[[2,4,-1],[0,-1,5],[5,5,5]],[[5,5,5],[5,5,5],[5,5,5]]])
V_test=np.array([[[3,3,3],[3,3,3],[3,3,3]],[[3,3,3],[3,3,3],[3,3,3]]])
MHA.scaled_dot_product_masked_attention(Q_test,K_test,V_test)

array([[[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]],

       [[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]]])

In [5]:
mask_test=np.triu(np.ones((Q_test.shape[1],K_test.shape[1])),k=1)
mask_test

array([[0., 1., 1.],
       [0., 0., 1.],
       [0., 0., 0.]])

In [6]:
# test softmax
MHA=MultiHeadAttention()
x=np.array([
    [[1,2,3],[0,0,0],[-1,0,1],[1,1,1],[0,1,0] ],[[1,2,3],[0,0,0],[-1,0,1],[1,1,1],[0,1,0]]])
print(x.shape)
MHA.softmax(x)

(2, 5, 3)


array([[[0.09003057, 0.24472847, 0.66524096],
        [0.33333333, 0.33333333, 0.33333333],
        [0.09003057, 0.24472847, 0.66524096],
        [0.33333333, 0.33333333, 0.33333333],
        [0.21194156, 0.57611688, 0.21194156]],

       [[0.09003057, 0.24472847, 0.66524096],
        [0.33333333, 0.33333333, 0.33333333],
        [0.09003057, 0.24472847, 0.66524096],
        [0.33333333, 0.33333333, 0.33333333],
        [0.21194156, 0.57611688, 0.21194156]]])

In [7]:
# test a single batch of multi head attention
MHA=MultiHeadAttention(d_model=4,num_heads=2)
MHA.init_weights(weights_mode="random")
x=np.array([[1,2,3,4],[0,0,0,0],[-1,0,0,1],[1,1,1,1],[1,0,1,0]])

MHA.compute_MHA_single_batch(x)

array([[ 4.79869662,  5.59041132, -1.58126427,  1.32009245],
       [ 0.69047119,  1.97661677, -1.54997988,  2.45496703],
       [ 0.83407547,  3.93658877, -3.49740973,  5.78633289],
       [ 4.83756115,  5.69753497, -1.45059594,  1.13045698],
       [ 3.97021874,  4.5865725 , -1.27075659,  1.20886365]])

In [8]:
# test multi head attention with a batch of inputs

x=np.array([
    [[1,2,3,4],[0,0,0,0],[-1,0,0,1],[1,1,1,1]],[[1,2,3,4],[0,0,0,0],[-1,0,0,1],[1,1,1,1]]])
print(x.shape)

MHA=MultiHeadAttention(d_model=4,num_heads=2)
MHA.init_weights(weights_mode="random")
answer=MHA.compute_MHA(x)
print(answer.shape)
print(answer)

(2, 4, 4)
(2, 4, 4)
[[[-5.68512693 -1.40290013 -1.09531664 11.79513054]
  [-2.62975504 -1.43059249 -0.89214632  4.31637048]
  [-2.92025685 -2.724588   -1.03785132  2.91894725]
  [-4.76266657 -1.1561036  -1.02551252  9.92855121]]

 [[-5.68512693 -1.40290013 -1.09531664 11.79513054]
  [-2.62975504 -1.43059249 -0.89214632  4.31637048]
  [-2.92025685 -2.724588   -1.03785132  2.91894725]
  [-4.76266657 -1.1561036  -1.02551252  9.92855121]]]


In [9]:
# generate a matrix of shape 4,2 and another of shape 2 4 4 and multiply them

a=np.array([[1,2],[3,4],[5,6],[7,8]])
b=np.array([[[1,2,3,4],[5,6,7,8]],[[9,10,11,12],[13,14,15,16]]])
print("a shape",a.shape,"b shape",b.shape)
answer=np.dot(b,a)
print(answer)
print(answer.shape)

a shape (4, 2) b shape (2, 2, 4)
[[[ 50  60]
  [114 140]]

 [[178 220]
  [242 300]]]
(2, 2, 2)


In [10]:
# test the feed forward layer

FF=FeedForward(d_model=4,max_seq_len=100,units=1024,name="FF_layer_X")

FF.init_weights(weights_mode="random")
x=np.array([[1,2,3,4],[0,0,0,0],[-1,0,0,1],[1,1,1,1],[1,0,1,0]])
FF.forward(x)

array([[  12.78858908, -233.84729112,   98.4903025 ,  -41.54592723],
       [  28.18929749,    0.82212271,   -5.31149212,   12.68068014],
       [  39.9695274 ,  -15.04990048,   30.53381561,   -9.88624043],
       [   5.77393591,  -65.29353805,    9.35475249,    2.85573199],
       [ -26.88110994,   -9.49735534,  -24.20271546,    0.69148265]])

In [11]:
# Test layer normalization
x=np.array([[[1,2,3,4],[0,0,0,0],[-1,0,0,1],[1,1,1,1],[1,0,1,0]],[[1,2,3,4],[0,0,0,0],[-1,0,0,1],[1,1,1,1],[1,0,1,0]]])
print("x shape",x.shape)
TF=TransformerModel('data/word_embedding.csv')
TF.LayerNormalization(x)

x shape (2, 5, 4)


array([[[-1.34163959, -0.4472132 ,  0.4472132 ,  1.34163959],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [-1.41421156,  0.        ,  0.        ,  1.41421156],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.999998  , -0.999998  ,  0.999998  , -0.999998  ]],

       [[-1.34163959, -0.4472132 ,  0.4472132 ,  1.34163959],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [-1.41421156,  0.        ,  0.        ,  1.41421156],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.999998  , -0.999998  ,  0.999998  , -0.999998  ]]])