In [1]:
pip install torch==2.8.0 transformers==4.55.2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModel
from huggingface_hub import login
import os
from math import sqrt, inf
import numpy as np



# Code from https://www.tensorflow.org/tutorials/text/transformer
def get_angles(pos, i, d_model):
    """Return the raw (pre-sin/cos) angles of the sinusoidal positional encoding.

    Computes ``pos / 10000 ** (2 * (i // 2) / d_model)``. Consecutive even/odd
    dimension indices share the same rate because of the ``i // 2``.

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of indices.
        d_model: Embedding width used to scale the exponent.

    Returns:
        ``pos`` multiplied by the per-dimension angle rate (NumPy broadcasting
        applies when ``pos`` and ``i`` are arrays).
    """
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    rate_per_dim = 1 / np.power(10000, exponent)
    return pos * rate_per_dim

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Embedding width (columns).

    Returns:
        A ``torch`` tensor of shape ``(1, position, d_model)`` whose even
        columns hold sines and odd columns hold cosines of the angles
        produced by ``get_angles``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_dims = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_dims, d_model)

    # Even columns (2i) carry the sine component.
    table[:, 0::2] = np.sin(table[:, 0::2])

    # Odd columns (2i+1) carry the cosine component.
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Prepend a length-1 axis so the table broadcasts over a leading batch dimension.
    return torch.tensor(table[np.newaxis, ...])



class SelfAttention(torch.nn.Module):
    """Multi-head scaled dot-product self-attention with one weight matrix per head.

    Every head projects the input to queries, keys and values of width
    ``embedding_size`` and attends over the sequence. The head outputs are
    concatenated on the last axis and projected back to ``token_dim`` by
    ``w_agg``.

    Args:
        token_dim: Size of the last dimension of the input (and of the output).
        embedding_size: Width of each head's query/key/value projection.
        num_heads: Number of attention heads.
    """

    def __init__(self, token_dim: int, embedding_size: int, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.embedding_size = embedding_size
        self.token_dim = token_dim
        # ParameterList (instead of a plain Python list) registers the per-head
        # weights with the module, so they show up in parameters()/state_dict()
        # and follow .to(device). A plain list hides them from the optimizer.
        self.w_queries = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, token_dim, embedding_size)) for _ in range(num_heads)]
        )
        self.w_keys = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, token_dim, embedding_size)) for _ in range(num_heads)]
        )
        self.w_values = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, token_dim, embedding_size)) for _ in range(num_heads)]
        )
        self.w_agg = torch.nn.Parameter(torch.rand(embedding_size * num_heads, token_dim))

    def forward(self, x, attention_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, token_dim)``.
            attention_mask: Optional tensor, assumed to be ``(batch, seq)`` with
                0 marking padding (the layout produced by the Hugging Face
                tokenizer used elsewhere in this notebook). Padded positions
                are excluded as keys. ``None`` disables masking.

        Returns:
            Tensor of shape ``(batch, seq, token_dim)``.
        """
        padded_keys = None
        if attention_mask is not None:
            # (batch, 1, seq): broadcast over the query axis so that no query
            # attends to a padded key. A sequence that is entirely padding
            # would yield NaN rows, since every score becomes -inf.
            padded_keys = (attention_mask == 0).unsqueeze(1)

        scale = sqrt(self.embedding_size)
        attention_heads = []
        for w_q, w_k, w_v in zip(self.w_queries, self.w_keys, self.w_values):
            # b, s, token_dim -> b, s, embedding_size
            Q = x @ w_q
            K = x @ w_k
            V = x @ w_v

            # b, s_query, s_key
            scores = (Q @ K.transpose(2, 1)) / scale
            if padded_keys is not None:
                scores = scores.masked_fill(padded_keys, float("-inf"))

            # Normalise over the key axis (last dim) so each query's weights sum to 1.
            weights = torch.softmax(scores, dim=-1)
            attention_heads.append(weights @ V)

        multiple_heads = torch.cat(attention_heads, dim=-1)
        return multiple_heads @ self.w_agg

class FeedForwardNeuralNetwork(torch.nn.Module):
    """Stack of ``num_layers`` square linear layers, each followed by LeakyReLU.

    Args:
        num_layers: Number of ``Linear(token_dim, token_dim)`` layers.
        token_dim: Input and output width of every layer.
        dropout_p: Dropout probability applied after each activation;
            dropout is skipped entirely when it is 0.
    """

    def __init__(self, num_layers: int, token_dim: int, dropout_p = 0.0):
        super().__init__()
        # ModuleList (instead of a plain Python list) registers the layers as
        # sub-modules, so their weights appear in parameters()/state_dict(),
        # follow .to(device) and are seen by train()/eval().
        self.linears = torch.nn.ModuleList(
            [torch.nn.Linear(token_dim, token_dim) for _ in range(num_layers)]
        )
        self.dropout_p = dropout_p
        self.dropout = torch.nn.Dropout(dropout_p)
        self.leaky_relu = torch.nn.LeakyReLU(negative_slope=0.01)

    def forward(self, x):
        """Run ``x`` through every linear layer, activation and optional dropout.

        Args:
            x: Tensor whose last dimension is ``token_dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        for layer in self.linears:
            x = layer(x)
            x = self.leaky_relu(x)
            if self.dropout_p > 0:
                x = self.dropout(x)
        return x
        
class EncoderDecoderAttention(torch.nn.Module):
    """Multi-head cross-attention: queries come from ``x``, keys/values from the encoder.

    Every head projects ``x`` to queries and ``encoder_context`` to keys and
    values of width ``embedding_size``. The head outputs are concatenated on
    the last axis and projected back to ``token_dim`` by ``w_agg``.

    Args:
        token_dim: Size of the last dimension of both inputs (and of the output).
        embedding_size: Width of each head's query/key/value projection.
        num_heads: Number of attention heads.
    """

    def __init__(self, token_dim: int, embedding_size: int, num_heads: int):
        super().__init__()
        self.num_heads = num_heads
        self.embedding_size = embedding_size
        self.token_dim = token_dim
        # ParameterList (instead of a plain Python list) registers the per-head
        # weights with the module, so they show up in parameters()/state_dict()
        # and follow .to(device).
        self.w_queries = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, token_dim, embedding_size)) for _ in range(num_heads)]
        )
        self.w_keys = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, token_dim, embedding_size)) for _ in range(num_heads)]
        )
        self.w_values = torch.nn.ParameterList(
            [torch.nn.Parameter(torch.rand(1, token_dim, embedding_size)) for _ in range(num_heads)]
        )
        self.w_agg = torch.nn.Parameter(torch.rand(embedding_size * num_heads, token_dim))

    def forward(self, x, encoder_context, attention_mask, current_index):
        """Attend from the decoder sequence ``x`` over ``encoder_context``.

        Args:
            x: Decoder-side tensor of shape ``(batch, seq_q, token_dim)``.
            encoder_context: Encoder-side tensor of shape
                ``(batch, seq_k, token_dim)``; ``seq_k`` may differ from ``seq_q``.
            attention_mask: Currently unused; kept so existing callers keep working.
                NOTE(review): a key-padding mask probably belongs here, but the
                notebook does not show whether this mask describes ``x`` or
                ``encoder_context`` - confirm before applying it.
            current_index: Index of the decoding step. Query rows from
                ``min(current_index + 1, seq_q - 1)`` onward are treated as
                not-yet-generated and produce all-zero output.

        Returns:
            Tensor of shape ``(batch, seq_q, token_dim)``.
        """
        seq_len = x.shape[1]
        first_future = min(current_index + 1, seq_len - 1)
        # 1.0 for query rows that are already generated, 0.0 for future rows.
        # Built once (not per head) and on x's device/dtype. Multiplying the
        # softmax output by it zeroes the future rows without putting -inf
        # into the softmax, which would give NaN now that it runs over keys.
        keep_query = (
            (torch.arange(seq_len, device=x.device) < first_future)
            .to(x.dtype)
            .view(1, seq_len, 1)
        )

        scale = sqrt(self.embedding_size)
        attention_heads = []
        for w_q, w_k, w_v in zip(self.w_queries, self.w_keys, self.w_values):
            # b, s, token_dim -> b, s, embedding_size
            Q = x @ w_q
            K = encoder_context @ w_k
            V = encoder_context @ w_v

            # b, seq_q, seq_k - normalised over the key axis so each query's weights sum to 1.
            weights = torch.softmax((Q @ K.transpose(2, 1)) / scale, dim=-1)
            attention_heads.append((weights * keep_query) @ V)

        multiple_heads = torch.cat(attention_heads, dim=-1)
        return multiple_heads @ self.w_agg
        
class Decoder(torch.nn.Module):
    """Single decoder layer.

    Chains self-attention, encoder-decoder attention and a feed-forward stack.
    Each stage is wrapped in a residual connection followed by LayerNorm over
    the last (``token_dim``) axis.
    """

    def __init__(self, token_dim: int, embedding_size: int, num_heads: int, num_layers: int, dropout_p: float = 0.0):
        super().__init__()
        attention_args = (token_dim, embedding_size, num_heads)
        norm_shape = (token_dim,)
        self.self_attention = SelfAttention(*attention_args)
        self.encoder_decoder_attention = EncoderDecoderAttention(*attention_args)
        self.feed_forward = FeedForwardNeuralNetwork(num_layers, token_dim, dropout_p=dropout_p)
        self.layer_norm_1 = torch.nn.LayerNorm(norm_shape)
        self.layer_norm_2 = torch.nn.LayerNorm(norm_shape)
        self.layer_norm_3 = torch.nn.LayerNorm(norm_shape)

    def forward(self, original_context, x, attention_mask, current_index):
        """Run one decoder layer.

        Args:
            original_context: Tensor handed to the encoder-decoder attention as
                its encoder context.
            x: Decoder-side input tensor.
            attention_mask: Forwarded unchanged to both attention blocks.
            current_index: Forwarded to the encoder-decoder attention.

        Returns:
            The normalised output of the feed-forward stage.
        """
        # Stage 1: self-attention + residual + norm.
        self_out = self.self_attention(x, attention_mask)
        stage_1 = self.layer_norm_1(self_out + x)

        # Stage 2: cross-attention over the encoder context + residual + norm.
        cross_out = self.encoder_decoder_attention(
            stage_1, original_context, attention_mask, current_index
        )
        stage_2 = self.layer_norm_2(stage_1 + cross_out)

        # Stage 3: feed-forward + residual + norm.
        ff_out = self.feed_forward(stage_2)
        return self.layer_norm_3(stage_2 + ff_out)

class Encoder(torch.nn.Module):
    """Single encoder layer.

    Self-attention followed by a feed-forward stack. Each stage is wrapped in a
    residual connection followed by LayerNorm over the last (``token_dim``) axis.
    """

    def __init__(self, token_dim: int, embedding_size: int, num_heads: int, num_layers: int, dropout_p: float = 0.0):
        super().__init__()
        norm_shape = (token_dim,)
        self.self_attention = SelfAttention(token_dim, embedding_size, num_heads)
        self.feed_forward = FeedForwardNeuralNetwork(num_layers, token_dim, dropout_p=dropout_p)
        self.layer_norm_1 = torch.nn.LayerNorm(norm_shape)
        self.layer_norm_2 = torch.nn.LayerNorm(norm_shape)

    def forward(self, x, attention_mask):
        """Run one encoder layer.

        Args:
            x: Input tensor.
            attention_mask: Forwarded unchanged to the self-attention block.

        Returns:
            The normalised output of the feed-forward stage.
        """
        # Stage 1: self-attention + residual + norm.
        self_out = self.self_attention(x, attention_mask)
        stage_1 = self.layer_norm_1(self_out + x)

        # Stage 2: feed-forward + residual + norm.
        ff_out = self.feed_forward(stage_1)
        return self.layer_norm_2(stage_1 + ff_out)
        


# Build the model and run it on two sample sentences.
# NOTE(review): no `Transformer` class is defined in the cells shown in this
# notebook, which is why this cell's recorded output is
# "NameError: name 'Transformer' is not defined". Every later cell that reads
# `transformer` depends on this cell succeeding.
transformer = Transformer(
        embedding_model="bert-base-uncased", 
        num_heads= 2,
        num_encoders= 6,
        num_nn_layers = 3,
        embedding_size = 128,
        device="cpu",
        max_length = 20
    )


# Two inputs of different length; the call is unpacked into three results.
t_input = ["hello how are ", "hello how are you im fine and you?"]
output_logits, outputs, ended = transformer(t_input)


NameError: name 'Transformer' is not defined

In [None]:
output_logits[0]

In [None]:
output_logits[1]

In [None]:

# Scratch version of a step-by-step decoding loop, driven by hand against `transformer`.
# NOTE(review): this cell depends on hidden kernel state. `tokens` is only
# assigned in later cells, and `preds` is never assigned inside this loop (it
# is computed from `probs` in a separate cell below), so it fails on a fresh
# kernel run top to bottom.
output = ""
current_token = -1
# One slot per (input sentence, decoding step): decoded text and probabilities.
outputs = [[None for _ in range(transformer.max_length)] for j in range(len(tokens))]
output_logits = [[None for _ in range(transformer.max_length)] for j in range(len(tokens))]
# Per-sentence flag set when that sentence is considered finished.
ended = [False for j in range(len(tokens))]
i = 0
while i < transformer.max_length:
    print("b")
    # Tokenise the text generated so far (padded to max_length) and embed it.
    # NOTE(review): `output` is never updated inside this cell, so every
    # iteration embeds the same empty string.
    output_tokens = transformer.tokenizer([output], return_tensors="pt", padding='max_length', max_length=transformer.max_length)
    outputs_tokens = transformer.embedding_model(**output_tokens)
    token_embeddings = outputs_tokens.last_hidden_state
    attention_mask = output_tokens['attention_mask']
    
    # Work on a copy so `token_embeddings` can be passed unchanged to every decoder.
    current_embedding = torch.clone(token_embeddings)
    
    # Each decoder receives the untouched embeddings as context, plus the running state.
    for decoder in transformer.decoders:
        current_embedding = decoder(token_embeddings, current_embedding, attention_mask, i)
    # to vocab size
    # Flattens each batch item to a single row before the projection.
    logits = current_embedding.view(current_embedding.shape[0], -1) @ transformer.linear 
    probs = torch.nn.functional.softmax(logits, dim=1)
    # fill in outputs
    for j in range(len(tokens)):
        # The same `preds` is decoded for every j, so all sentences get identical text.
        outputs[j][i] = transformer.tokenizer.decode(preds, skip_special_tokens=True)
        # NOTE(review): decode() is called with skip_special_tokens=True, so the
        # text presumably never equals the EOS token and this branch may never
        # fire - confirm.
        if outputs[j][i] == transformer.tokenizer.eos_token:
            # This sentence is ended
            ended[j] = True
        # Stores the softmax probabilities (not raw logits), despite the name.
        output_logits[j][i] = probs.squeeze()
    if all(ended):
        break
    i += 1
    
    

In [None]:
output_logits[0][0].shape

In [None]:
transformer.tokenizer.decode(preds, skip_special_tokens=True)

In [None]:
# Turn the last `logits` left in the kernel into probabilities along dim 1.
# NOTE(review): `logits` comes from the decoding-loop cell above (hidden state).
probs = torch.nn.functional.softmax(logits, dim=1)

In [None]:
# Index of the highest probability along dim 1 for each row.
# NOTE(review): the decoding-loop cell above reads `preds`, but this is the
# only place it is assigned, so that cell only works after this one has run.
preds = probs.argmax(1)

In [None]:
# Decode the predicted ids to text, dropping special tokens.
current_token = transformer.tokenizer.decode(preds, skip_special_tokens=True)

In [None]:
current_token

In [None]:
    # Scratch fragment: the still-indented tail of a loop body, pasted into its
    # own cell. It relies on `logits`, `output` and `i` already existing in the kernel.
    # NOTE(review): softmax is called without `dim` here, unlike the other cells
    # in this notebook, which pass dim=1.
    probs = torch.nn.functional.softmax(logits)
    preds = probs.argmax(1)
    # Special tokens are kept here (no skip_special_tokens), unlike the other decode calls.
    current_token = transformer.tokenizer.decode(preds)
    output += current_token 
    print("a",current_token)
    
    i += 1

In [None]:
# Tokenise the two sample sentences (padded to transformer.max_length) and run
# the pretrained embedding model on them.
# NOTE(review): rebinds `outputs`, which the decoding-loop cell uses for a
# list of decoded strings.
t_input = ["hello how are ", "hello how are you im fine and you?"]
tokens = transformer.tokenizer(t_input, return_tensors="pt", padding='max_length', max_length=transformer.max_length)
outputs = transformer.embedding_model(**tokens)

In [None]:
# Demo: keep the lower triangle (including the diagonal) of a 3x3 matrix and
# overwrite everything above the diagonal with -inf.
matrix = torch.arange(1, 10).reshape(3, 3).float()

# True on and below the main diagonal, False above it.
lower_triangular_mask = torch.tril(torch.ones_like(matrix, dtype=torch.bool), diagonal=0)
matrix.masked_fill_(lower_triangular_mask.logical_not(), float('-inf'))

In [None]:
matrix

In [None]:
# Run the full model on the two sample sentences.
# NOTE(review): the result is stored in `tokens`, the name other cells use for
# the tokenizer output. An earlier cell unpacks this same call into three
# values (output_logits, outputs, ended).
t_input = ["hello how are ", "hello how are you im fine and you?"]
tokens = transformer(t_input)


In [None]:
# Tokenise the two sample sentences (padded to transformer.max_length), embed
# them with the pretrained model, then add sinusoidal positional encodings.
t_input = ["hello how are ", "hello how are you im fine and you?"]
tokens = transformer.tokenizer(t_input, return_tensors="pt", padding='max_length', max_length=transformer.max_length)
outputs = transformer.embedding_model(**tokens)
token_embeddings = outputs.last_hidden_state
# The last two dims of token_embeddings are passed as (position, d_model).
positional_embeddings = positional_encoding(*token_embeddings.shape[-2:])
# In-place add: this also modifies outputs.last_hidden_state (same tensor), and
# re-running only this line adds the encodings a second time.
token_embeddings += positional_embeddings

In [None]:
token_embeddings.shape

In [None]:
# Same tokenisation as the cells above, but with max_length hard-coded to 20
# instead of read from transformer.max_length.
tokens = transformer.tokenizer(t_input, return_tensors="pt", padding='max_length', max_length=20)

In [None]:
transformer.tokenizer.vocab_size

In [None]:
# Run the pretrained embedding model on the tokenised batch left in `tokens`.
outputs = transformer.embedding_model(**tokens)

In [None]:
# Stand-alone encoder layer for experimentation; the two positional 3s are
# num_heads and num_layers, in that order.
encoder = Encoder(transformer.token_dim, transformer.embedding_size, 3,3).to("cpu")

In [None]:
# NOTE(review): `embeddings` is not assigned in any cell shown in this notebook.
# Encoder.forward is also declared as forward(x, attention_mask), so this call
# is missing its second argument.
encoded = encoder(embeddings)

In [None]:
encoded.shape

In [None]:
# Take the embedding model's last hidden state from `outputs`.
# This is a reference to the same tensor, not a copy.
token_embeddings = outputs.last_hidden_state

In [None]:
token_embeddings.shape[-1]

In [None]:
# Interactive Google OAuth sign-in. Nothing else in the visible notebook reads `credentials`.
# NOTE(review): this import sits mid-notebook rather than in the import cell,
# and google_auth_oauthlib is not among the packages installed in the first cell.
from google_auth_oauthlib.flow import InstalledAppFlow
# Scopes requested: the account's e-mail address and OpenID sign-in.
SCOPES = ['https://www.googleapis.com/auth/userinfo.email', 'openid']
# Reads the OAuth client configuration from client_secret.json, resolved
# relative to the working directory.
flow = InstalledAppFlow.from_client_secrets_file('client_secret.json', SCOPES)
# NOTE(review): port=0 presumably lets the OS pick a free port for the local
# redirect server - confirm against the library docs.
credentials = flow.run_local_server(port=0)

In [None]:
pip install 