## Transformer

In [None]:
import torch 
import os
import re 
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
import math

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Name of the pretrained checkpoint; its tokenizer and embedding tables are reused below.
model_name = 'bert-base-multilingual-cased'
# Download the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BERT is an encoder-only checkpoint, so it has to be loaded with the masked-LM
# auto class; the seq2seq auto class has no model registered for a BERT config.
# Later cells also reach into `pre_trained.bert`, which the masked-LM model exposes.
pre_trained = AutoModelForMaskedLM.from_pretrained(model_name)
# Reuse the pretrained token-embedding table and freeze it so training does not update it.
embedding_layer = pre_trained.get_input_embeddings()
embedding_layer = embedding_layer.requires_grad_(False)



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Select the compute device first, falling back to the CPU when CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Only query the GPU when one exists: calling this unconditionally raised on
    # CPU-only machines before the fallback above could take effect.
    torch.cuda.get_device_name(device)

In [None]:
class TranslationDataset(Dataset):
    """Line-aligned parallel corpus tokenized on demand.

    Line ``i`` of ``src_file`` is paired with line ``i`` of ``tgt_file``.

    Args:
        src_file: Path to the UTF-8 source-language text file (one sentence per line).
        tgt_file: Path to the UTF-8 target-language text file (one sentence per line).
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(text, add_special_tokens=True, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` and must return a mapping with an
            ``'input_ids'`` entry whose first dimension has size 1.
        max_len: Length every sentence is padded / truncated to.
        src_lang: Source language tag (stored, not otherwise used here).
        tgt_lang: Target language tag (stored, not otherwise used here).

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """

    def __init__(self, src_file, tgt_file, tokenizer, max_len, src_lang, tgt_lang):
        self.src_file = src_file
        self.tgt_file = tgt_file
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.src_lang_text = []
        self.tgt_lang_text = []
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self._build()

    def _build(self):
        """Read both files into memory and verify that they are line-aligned."""
        with open(self.src_file, 'r', encoding='utf-8') as f:
            # Drop the line terminator so it never reaches the tokenizer.
            self.src_lang_text = [line.rstrip('\n') for line in f]
        with open(self.tgt_file, 'r', encoding='utf-8') as f:
            self.tgt_lang_text = [line.rstrip('\n') for line in f]

        # A length mismatch means the sentence pairs are misaligned; without this
        # check it surfaced later as an IndexError or as silently ignored lines.
        if len(self.src_lang_text) != len(self.tgt_lang_text):
            raise ValueError(
                f"Parallel files are not line-aligned: {self.src_file} has "
                f"{len(self.src_lang_text)} lines but {self.tgt_file} has "
                f"{len(self.tgt_lang_text)}"
            )

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_lang_text)

    def _tokenize(self, text):
        """Tokenize one sentence and drop the leading batch dimension of ``input_ids``."""
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis, so the result stays 1-D even
        # when max_len == 1 (a bare squeeze() would collapse it to a scalar).
        encoded['input_ids'] = encoded['input_ids'].squeeze(0)
        return encoded

    def __getitem__(self, idx):
        """Return ``(source_encoding, target_encoding)`` for sentence pair ``idx``."""
        tokenized_src_lang_text = self._tokenize(self.src_lang_text[idx])
        tokenized_tgt_lang_text = self._tokenize(self.tgt_lang_text[idx])
        return tokenized_src_lang_text, tokenized_tgt_lang_text

In [5]:
# Locations of the parallel corpus files, grouped by split as (English, Urdu) pairs.
train_en_path, train_ur_path = (
    'umc005-corpus/quran/train.en',
    'umc005-corpus/quran/train.ur',
)
dev_en_path, dev_ur_path = (
    'umc005-corpus/quran/dev.en',
    'umc005-corpus/quran/dev.ur',
)
test_en_path, test_ur_path = (
    'umc005-corpus/quran/test.en',
    'umc005-corpus/quran/test.ur',
)

In [6]:
# Every sentence is padded / truncated to this many tokens.
max_len = 256
# NOTE(review): `train` is built from the *dev* split files, not train_en_path /
# train_ur_path. Kept as-is in case this is a deliberate small-data run; confirm.
train = TranslationDataset(dev_en_path, dev_ur_path, tokenizer, max_len, 'en', 'ur')
# Later cells iterate over `train_loader`, so it must be defined here; with the
# line commented out a fresh-kernel run failed with a NameError.
train_loader = DataLoader(train, batch_size=32, shuffle=True)

({'input_ids': tensor([   101,  12689,  10261,    113,  10105,    175,  12090,  25087,  81520,
            114,  10124,  10472,  10160,  10435,  15858,  52347,  10106,    113,
          27224,  13788,  10230,    114,  10105,  11038,  20645,    119,    113,
          10117,  13440,  10108,  10105,  51635,  34962,  10393,  10472,  12153,
          10142,  10957,  11178, 100745, 108787,    119,    114,    102,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              

In [7]:
def to_word(input_ids, tokenizer):
    """Translate token ids back into their token strings.

    Args:
        input_ids: Token ids, passed straight to ``tokenizer.convert_ids_to_tokens``.
        tokenizer: Object providing ``convert_ids_to_tokens``.

    Returns:
        Whatever ``tokenizer.convert_ids_to_tokens(input_ids)`` returns.
    """
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    return tokens

In [8]:
# Peek at one batch only and show the word pieces of its first sentence pair.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    tokens_en, tokens_ur = first_batch

    # English side
    input_ids_en = tokens_en['input_ids'][0]
    tokens_en_tokens = to_word(input_ids_en, tokenizer)
    print("Tokens for English text:")
    print(tokens_en_tokens)

    # Urdu side
    input_ids_ur = tokens_ur['input_ids'][0]
    tokens_ur_tokens = to_word(input_ids_ur, tokenizer)
    print("Tokens for Urdu text:")
    print(tokens_ur_tokens)
    print()

Tokens for English text:
['[CLS]', 'So', ',', 'continuously', 'ad', '##moni', '##sh', 'them', ',', 'for', 'you', 'are', 'but', 'an', 'ad', '##moni', '##sher', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

In [9]:
# Take the position-embedding table from the pretrained model and freeze it in
# one step; the frozen module is assigned back to the name, as before.
pos_embeddings = pre_trained.bert.embeddings.position_embeddings.requires_grad_(False)
pos_embeddings

Embedding(512, 768)

In [10]:
class LearnedPositionalEncoding(nn.Module):
    """Adds a looked-up position embedding to every token vector.

    Args:
        pos_embeddings: Embedding module indexed by position (0 .. seq_len - 1).
    """

    def __init__(self, pos_embeddings):
        super().__init__()
        self.pos_embedding = pos_embeddings

    def forward(self, x):
        """Return ``x`` plus the embedding of each token's position.

        ``x`` must be 3-D; its first two dimensions are used as batch size and
        sequence length.
        """
        n_batch, n_tokens, _ = x.shape
        # One row of position indices per batch element, on the same device as x.
        index_grid = torch.arange(n_tokens, device=x.device).repeat(n_batch, 1)
        return x + self.pos_embedding(index_grid)

In [11]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with optional causal and padding masks.

    Args:
        embed_size: Size of the input/output feature dimension; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        pad_token_id: Token id treated as padding. When set (and ``input_ids`` is
            passed to ``forward``), key positions holding this id are masked out.
        apply_mask: When true, a causal mask stops each query position from
            attending to later key positions.
    """

    def __init__(self, embed_size, num_heads, pad_token_id=None, apply_mask=False):
        super(MultiHeadAttention, self).__init__()
        assert embed_size % num_heads == 0, "Embedding size must be divisible by number of heads"

        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads
        self.pad_token_id = pad_token_id
        self.mask = apply_mask

        # Linear layers to project queries, keys, and values
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)

        # Output linear layer
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, q, k, v, input_ids=None):
        """Attend from ``q`` over ``k`` / ``v``.

        Args:
            q: Query tensor of shape [batch, q_len, embed_size].
            k: Key tensor of shape [batch, k_len, embed_size].
            v: Value tensor of shape [batch, k_len, embed_size].
            input_ids: Optional token ids of shape [batch, k_len] for the *key*
                sequence; positions equal to ``pad_token_id`` receive no attention.

        Returns:
            Tensor of shape [batch, q_len, embed_size].
        """
        batch_size, q_len, embed_size = q.size()
        k_len = k.size(1)

        # Project, then split the feature dimension into heads:
        # [batch, len, embed] -> [batch, heads, len, head_dim]
        queries = self.query(q).view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = self.key(k).view(batch_size, k_len, self.num_heads, self.head_dim).transpose(1, 2)
        values = self.value(v).view(batch_size, k_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product scores: [batch, heads, q_len, k_len]
        attention_scores = torch.matmul(queries, keys.transpose(-1, -2)) / (self.head_dim ** 0.5)

        if self.mask:
            # Build the mask on the same device as the inputs (not a module-level
            # global) so the layer works wherever the model is placed, and size
            # it [q_len, k_len] so it always matches the score matrix.
            causal_mask = torch.triu(
                torch.ones(q_len, k_len, device=q.device), diagonal=1
            ).bool()
            causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)  # [1, 1, q_len, k_len]
            attention_scores = attention_scores.masked_fill(causal_mask, float('-inf'))

        if input_ids is not None and self.pad_token_id is not None:
            # [batch, 1, 1, k_len]: broadcasts over heads and query positions.
            padding_mask = (input_ids == self.pad_token_id).unsqueeze(1).unsqueeze(2)
            attention_scores = attention_scores.masked_fill(padding_mask, float('-inf'))

        # Normalize over key positions and mix the values.
        attention_weights = F.softmax(attention_scores, dim=-1)
        out = torch.matmul(attention_weights, values)  # [batch, heads, q_len, head_dim]

        # Merge heads back: [batch, q_len, embed_size]
        out = out.transpose(1, 2).contiguous().view(batch_size, q_len, embed_size)
        out = self.fc_out(out)

        return out

In [12]:
# class MultiHeadAttention(nn.Module):
#     def __init__(self, embed_size, num_heads, pad_token_id=None, apply_mask=False):
#         super(MultiHeadAttention, self).__init__()
#         assert embed_size % num_heads == 0, "Embedding size must be divisible by number of heads"

#         self.embed_size = embed_size
#         self.num_heads = num_heads
#         self.head_dim = embed_size // num_heads
#         self.pad_token_id = pad_token_id
#         self.mask = apply_mask

#         # Linear layers to project queries, keys, and values
#         self.query = nn.Linear(embed_size, embed_size)
#         self.key = nn.Linear(embed_size, embed_size)
#         self.value = nn.Linear(embed_size, embed_size)

#         # Output linear layer
#         self.fc_out = nn.Linear(embed_size, embed_size)

#     def forward(self, q, k, v, input_ids=None):
#         batch_size, seq_length, embed_size = q.size()
        
#         queries = self.query(q)
#         keys = self.key(k)
#         values = self.value(v)

#         # Reshape for multi-head attention
#         queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
#         keys = keys.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
#         values = values.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

#         # Scaled Dot-Product Attention
#         attention_scores = torch.matmul(queries, keys.transpose(-1, -2)) / (self.head_dim ** 0.5)

#         # Apply causal mask for decoder self-attention
#         if self.mask:
#             causal_mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool()
#             causal_mask = causal_mask.unsqueeze(0).unsqueeze(0).to(x.device)  # Shape: [1, 1, seq_length, seq_length]
#             attention_scores = attention_scores.masked_fill(causal_mask, float('-inf'))

#         # Apply padding mask to ignore padding tokens
#         if input_ids is not None and self.pad_token_id is not None:
#             padding_mask = (input_ids == self.pad_token_id).unsqueeze(1).unsqueeze(2)  # Shape: [batch_size, 1, 1, seq_length]
#             attention_scores = attention_scores.masked_fill(padding_mask, float('-inf'))

#         # Calculate attention weights and apply to values
#         attention_weights = F.softmax(attention_scores, dim=-1)
#         out = torch.matmul(attention_weights, values)  # Shape: [batch_size, num_heads, seq_length, head_dim]

#         # Reshape back to [batch_size, seq_length, embed_size]
#         out = out.transpose(1, 2).contiguous().view(batch_size, seq_length, embed_size)
#         out = self.fc_out(out)

#         return out

# # Example usage
# embed_size = 768
# num_heads = 8
# batch_size = 32
# seq_length = 128
# pad_token_id = 0  # Assume 0 is the padding token ID

# # Create dummy input tensor and encoder outputs
# x = torch.rand(batch_size, seq_length, embed_size)
# encoder_outputs = torch.rand(batch_size, seq_length, embed_size)
# input_ids = torch.randint(0, 20, (batch_size, seq_length))  # Dummy input IDs

# # Encoder Self-Attention
# encoder_mha = MultiHeadAttention(embed_size, num_heads, pad_token_id=pad_token_id, apply_mask=False)
# encoder_output = encoder_mha(x,x,x, input_ids=input_ids)

# # Decoder Self-Attention
# decoder_self_mha = MultiHeadAttention(embed_size, num_heads, pad_token_id=pad_token_id, apply_mask=True)
# decoder_self_output = decoder_self_mha(x,x,x, input_ids=input_ids)

# # Decoder Cross-Attention
# decoder_cross_mha = MultiHeadAttention(embed_size, num_heads, pad_token_id=pad_token_id, apply_mask=False)
# decoder_cross_output = decoder_cross_mha(x, encoder_outputs, encoder_outputs, input_ids=input_ids)

# print("Encoder Self-Attention Output Shape:", encoder_output.shape)
# print("Decoder Self-Attention Output Shape:", decoder_self_output.shape)
# print("Decoder Cross-Attention Output Shape:", decoder_cross_output.shape)


In [13]:
# mlh = MultiHeadAttention(12, apply_mask=True)
# q = torch.randn(32, 128, 768)
# k = torch.randn(32, 128, 768)
# v = torch.randn(32, 128, 768)

# output = mlh.forward(q, k, v)

In [14]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a feed-forward net, each followed by
    a residual connection and layer normalization.

    Args:
        num_heads: Number of attention heads for the self-attention sub-layer.
    """

    def __init__(self, num_heads):
        super().__init__()
        # Non-causal self-attention that ignores the tokenizer's padding id.
        self.multi_head_attention = MultiHeadAttention(
            768, num_heads, tokenizer.pad_token_id, False
        )
        self.layer_norm1 = nn.LayerNorm(768)
        expand = nn.Linear(768, 2048)
        activation = nn.ReLU()
        project = nn.Linear(2048, 768)
        self.feed_forward = nn.Sequential(expand, activation, project)
        self.layer_norm2 = nn.LayerNorm(768)

    def forward(self, x, input_ids):
        """Run the block on ``x``; ``input_ids`` is handed to the attention layer."""
        # Self-attention sub-layer with residual + norm
        attended = self.multi_head_attention(x, x, x, input_ids)
        hidden = self.layer_norm1(x + attended)

        # Position-wise feed-forward sub-layer with residual + norm
        return self.layer_norm2(hidden + self.feed_forward(hidden))

In [15]:
class Encoder(nn.Module):
    """A stack of ``num_layers`` encoder blocks applied one after another.

    Args:
        num_layers: Number of ``EncoderBlock`` instances to stack.
        num_heads: Number of attention heads in each block.
    """

    def __init__(self, num_layers, num_heads):
        super().__init__()
        stack = [EncoderBlock(num_heads) for _ in range(num_layers)]
        self.encoder_blocks = nn.ModuleList(stack)

    def forward(self, x, src_mask):
        """Feed ``x`` through every block in order.

        ``src_mask`` is passed unchanged as the second argument of each block,
        which ``EncoderBlock.forward`` names ``input_ids``.
        """
        hidden = x
        for layer in self.encoder_blocks:
            hidden = layer(hidden, src_mask)
        return hidden

In [16]:
class DecoderBlock(nn.Module):
    """One decoder layer: causal self-attention, attention over the encoder
    output, and a feed-forward net, each followed by residual + layer norm.

    Args:
        num_heads: Number of attention heads in both attention sub-layers.
    """

    def __init__(self, num_heads):
        super().__init__()

        # Causal self-attention over the target sequence
        self.masked_multi_head_attention = MultiHeadAttention(
            768, num_heads, tokenizer.pad_token_id, True
        )
        self.layer_norm1 = nn.LayerNorm(768)

        # Attention from the target positions over the encoder output
        self.multi_head_attention = MultiHeadAttention(
            768, num_heads, tokenizer.pad_token_id, False
        )
        self.layer_norm2 = nn.LayerNorm(768)

        expand = nn.Linear(768, 2048)
        activation = nn.ReLU()
        project = nn.Linear(2048, 768)
        self.feed_forward = nn.Sequential(expand, activation, project)
        self.layer_norm3 = nn.LayerNorm(768)

    def forward(self, x, encoder_output, tgt_ids, src_ids):
        """Run the block.

        ``tgt_ids`` is handed to the causal self-attention and ``src_ids`` to the
        encoder-decoder attention as their ``input_ids`` argument.
        """
        # Sub-layer 1: masked self-attention
        self_attended = self.masked_multi_head_attention(x, x, x, tgt_ids)
        hidden = self.layer_norm1(x + self_attended)

        # Sub-layer 2: attention over the encoder output
        cross_attended = self.multi_head_attention(
            hidden, encoder_output, encoder_output, src_ids
        )
        hidden = self.layer_norm2(hidden + cross_attended)

        # Sub-layer 3: position-wise feed-forward
        return self.layer_norm3(hidden + self.feed_forward(hidden))

In [17]:
class Decoder(nn.Module):
    """A stack of ``num_layers`` decoder blocks applied one after another.

    Args:
        num_layers: Number of ``DecoderBlock`` instances to stack.
        num_heads: Number of attention heads in each block.
    """

    def __init__(self, num_layers, num_heads):
        super().__init__()
        stack = [DecoderBlock(num_heads) for _ in range(num_layers)]
        self.decoder_blocks = nn.ModuleList(stack)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every block in order.

        The last two arguments are forwarded positionally and unchanged.
        NOTE: despite the names, ``src_mask`` arrives in each block as ``tgt_ids``
        and ``tgt_mask`` as ``src_ids`` (see ``DecoderBlock.forward``), so callers
        must pass the target-side value third and the source-side value fourth.
        """
        hidden = x
        for layer in self.decoder_blocks:
            hidden = layer(hidden, encoder_output, src_mask, tgt_mask)
        return hidden

In [18]:
class Transformer(nn.Module):
    """Encoder-decoder translation model built on shared token/position embeddings.

    Args:
        num_layers: Number of blocks in both the encoder and the decoder.
        num_heads: Number of attention heads per block.
        pos_embeddings: Position-embedding module wrapped by ``LearnedPositionalEncoding``.
        embedding_layer: Token-embedding module used for both source and target ids.
    """

    def __init__(self, num_layers, num_heads, pos_embeddings, embedding_layer):
        super(Transformer, self).__init__()

        self.positional_encoding = LearnedPositionalEncoding(pos_embeddings)
        self.embedding_layer = embedding_layer

        self.encoder = Encoder(num_layers, num_heads)
        self.decoder = Decoder(num_layers, num_heads)

        # Projects decoder states to vocabulary logits.
        self.linear = nn.Linear(768, tokenizer.vocab_size)
        # Padding positions do not contribute to the loss.
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    def _embed(self, input_ids):
        """Token embedding plus positional encoding for a batch of ids."""
        return self.positional_encoding(self.embedding_layer(input_ids))

    def forward(self, src_input_ids, tgt_input_ids):
        """Return vocabulary logits for every target position.

        Args:
            src_input_ids: Source token ids, [batch, src_len].
            tgt_input_ids: Decoder input token ids, [batch, tgt_len].

        Returns:
            Logits of shape [batch, tgt_len, vocab_size].
        """
        encoder_output = self.encoder(self._embed(src_input_ids), src_input_ids)

        # The decoder takes the target ids third and the source ids fourth; the
        # blocks use them to mask padding in self- and cross-attention.
        decoder_output = self.decoder(
            self._embed(tgt_input_ids), encoder_output, tgt_input_ids, src_input_ids
        )

        return self.linear(decoder_output)

    def calculate_loss(self, output, tgt_input_ids):
        """Cross-entropy between ``output`` logits and ``tgt_input_ids`` labels.

        Both are flattened over batch and sequence; labels equal to the padding
        id are ignored.
        """
        return self.loss_fn(
            output.reshape(-1, output.size(-1)),
            tgt_input_ids.reshape(-1)
        )

    def generate(self, src_input_ids, max_len=128, temperature=0.7, top_k=5):
        """Sample a translation token by token with top-k sampling.

        Args:
            src_input_ids: Source token ids, [batch, src_len].
            max_len: Maximum number of generated tokens, including the start token.
            temperature: Divisor applied to the logits before sampling.
            top_k: Number of highest-scoring tokens to sample from at each step.

        Returns:
            Generated ids, [batch, generated_len]. Each sequence starts with the
            tokenizer's CLS id; once a sequence has produced the SEP id, its
            remaining positions are filled with the padding id.

        Note:
            Switches the module to eval mode and leaves it there.
        """
        self.eval()
        with torch.no_grad():
            # Encode the source once; it is reused at every decoding step.
            encoder_output = self.encoder(self._embed(src_input_ids), src_input_ids)

            batch_size = src_input_ids.size(0)
            run_device = src_input_ids.device

            # Initialize with the start (CLS) token
            tgt_input_ids = torch.full(
                (batch_size, 1), tokenizer.cls_token_id, dtype=torch.long, device=run_device
            )
            # Tracks which sequences have already emitted SEP, so batches larger
            # than one are supported (a single `.item()` check is not).
            finished = torch.zeros(batch_size, dtype=torch.bool, device=run_device)

            for _ in range(max_len - 1):
                # The decoder expects token ids (target third, source fourth),
                # exactly as in forward(); it derives its own masks from them.
                decoder_output = self.decoder(
                    self._embed(tgt_input_ids), encoder_output, tgt_input_ids, src_input_ids
                )
                logits = self.linear(decoder_output[:, -1]) / temperature

                # Top-k sampling
                k = min(top_k, logits.size(-1))
                top_k_logits, top_k_indices = torch.topk(logits, k=k)
                probs = F.softmax(top_k_logits, dim=-1)
                next_token_idx = torch.multinomial(probs, num_samples=1)
                next_token = top_k_indices.gather(-1, next_token_idx)

                # Sequences that already ended only receive padding from now on.
                next_token = next_token.masked_fill(
                    finished.unsqueeze(1), tokenizer.pad_token_id
                )
                tgt_input_ids = torch.cat([tgt_input_ids, next_token], dim=1)

                finished = finished | (next_token.squeeze(1) == tokenizer.sep_token_id)
                if bool(finished.all()):
                    break

            return tgt_input_ids

    def translate(self, src_input_ids):
        """Generate for ``src_input_ids`` and decode the first sequence to text."""
        tgt_input_ids = self.generate(src_input_ids)
        print(tgt_input_ids)
        tgt_text = tokenizer.decode(tgt_input_ids[0], skip_special_tokens=True)
        return tgt_text
    

In [None]:
# Training the model
num_layers = 8
num_heads = 8
model = Transformer(num_layers, num_heads, pos_embeddings, embedding_layer).to(device)
model = model.float()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
total_batches = len(train_loader)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_num, (tokens_en, tokens_ur) in enumerate(train_loader, start=1):
        input_ids_en = tokens_en['input_ids'].to(device)
        input_ids_ur = tokens_ur['input_ids'].to(device)

        # Teacher forcing with a one-token shift: the decoder reads tokens
        # 0..n-2 and must predict tokens 1..n-1. Feeding and scoring the same
        # unshifted sequence lets every position simply copy its own input.
        decoder_input_ids = input_ids_ur[:, :-1]
        labels = input_ids_ur[:, 1:]

        # Clear gradients left over from the previous step; without this they
        # accumulate across batches.
        optimizer.zero_grad()
        output = model(input_ids_en, decoder_input_ids)
        loss = model.calculate_loss(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(f'Epoch {epoch + 1}/{num_epochs} Batch:{batch_num}/{total_batches}  Loss: {loss.item():.4f}')
    # Sum of the per-batch losses for this epoch.
    print(f'Epoch {epoch + 1}/{num_epochs} Loss: {total_loss:.4f}')


In [21]:
# Tokenize the first sentence pair once, show its word pieces, then translate it.
sample_src, sample_tgt = train[0]
print(to_word(sample_src['input_ids'], tokenizer))
print(to_word(sample_tgt['input_ids'], tokenizer))
model.translate(sample_src['input_ids'].unsqueeze(0).to(device))