**BPE tokenizer**

In [1]:
pip install transformers




In [2]:
from transformers import GPT2Tokenizer
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import math
bpe_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [3]:
text = "building a generative pretrained transformer"
encoded_text = bpe_tokenizer.encode(text,return_tensors='pt')
encoded_text

tensor([[16894,   257,  1152,   876,  2181, 13363, 47385]])

**Configurations**

In [4]:
import torch
import torch.nn as nn
import math

config = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

**Token Embedding**

In [5]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embeder = nn.Embedding(vocab_size, emb_size)

    def forward(self, input):
        return self.embeder(input)

**Positional Encoding**

In [25]:
class Positional_Encoding(nn.Module):
    def __init__(self, emb_size, max_seq_length=512):
        super().__init__()

        pe = torch.zeros(max_seq_length, emb_size)
        pos = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_size, 2).float() * -(math.log(10000) / emb_size))

        pe[:, 0::2] = torch.sin(pos * div_term)
        pe[:, 1::2] = torch.cos(pos * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))  # Prevent PyTorch from treating it as a parameter

    def forward(self, input):
        return input + self.pe[:, :input.size(1)]

**Masked Multi-Head Attention**

In [26]:
class MultiHeadAttention(nn.Module):
    def __init__(self, emb_size, head_num, bias=False):
        super().__init__()

        self.emb_size = emb_size
        self.head_num = head_num
        self.head_dim = emb_size // head_num

        self.query = nn.Linear(emb_size, emb_size, bias=bias)
        self.key = nn.Linear(emb_size, emb_size, bias=bias)
        self.value = nn.Linear(emb_size, emb_size, bias=bias)
        self.out = nn.Linear(emb_size, emb_size)

    def forward(self, input, mask=None):
        batch_size, seq_length, embed_dim = input.shape

        Q = self.query(input)
        K = self.key(input)
        V = self.value(input)

        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (embed_dim ** 0.5)

        if mask is not None:
            mask = mask.unsqueeze(1)
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))

        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_output = torch.matmul(attention_probs, V)

        return attention_output


**Layer Normalization**

In [27]:
class LayerNormalization(nn.Module):
    def __init__(self, emb_size, eps=1e-5):
        super().__init__()

        self.eps = eps
        self.scale = nn.Parameter(torch.ones(emb_size))
        self.shift = nn.Parameter(torch.zeros(emb_size))

    def forward(self, input):
        mean = input.mean(dim=-1, keepdim=True)
        variance = input.var(dim=-1, keepdim=True, unbiased=False)

        input_normalization = (input - mean) / torch.sqrt(variance + self.eps)
        return self.scale * input_normalization + self.shift

**Feed-Forward Neural Network**

In [28]:
class FeedForward(nn.Module):
    def __init__(self, emb_size, h_states):
        super().__init__()

        self.first_nn = nn.Linear(emb_size, h_states)
        self.second_nn = nn.Linear(h_states, emb_size)
        self.gelu = nn.GELU()

    def forward(self, input):
        return self.second_nn(self.gelu(self.first_nn(input)))

**Transformer Block**

In [29]:
class TransformerBlock(nn.Module):
    def __init__(self, emb_size, num_head, h_states, drop_out=0.1, bias=False):
        super().__init__()

        self.mha = MultiHeadAttention(emb_size=emb_size, head_num=num_head, bias=bias)
        self.ff = FeedForward(emb_size=emb_size, h_states=h_states)
        self.norm_layer_1 = LayerNormalization(emb_size=emb_size)
        self.norm_layer_2 = LayerNormalization(emb_size=emb_size)
        self.drop_out_layer = nn.Dropout(drop_out)

    def forward(self, input, mask=None):
        attention_scores = self.mha(input, mask)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)

        attention_output = self.norm_layer_1(input + self.drop_out_layer(attention_scores))
        feed_forward_output = self.norm_layer_2(attention_output + self.drop_out_layer(self.ff(attention_output)))

        return feed_forward_output


**GPT-2 Model**

In [46]:
class GPT2(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.vocab_size = config["vocab_size"]
        self.emb_dim = config["emb_dim"]
        self.context_length = config["context_length"]
        self.n_layers = config["n_layers"]
        self.n_heads = config["n_heads"]
        self.ff_hidden_size = self.emb_dim * 4
        self.drop_rate = config["drop_rate"]
        self.qkv_bias = config["qkv_bias"]

        self.embedding = Embedding(self.vocab_size, self.emb_dim)
        self.positional_embedding = Positional_Encoding(self.emb_dim, self.context_length)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(self.emb_dim, self.n_heads, self.ff_hidden_size, self.drop_rate, self.qkv_bias)
            for _ in range(self.n_layers)
        ])

        self.norm = LayerNormalization(self.emb_dim)
        self.l_head = nn.Linear(self.emb_dim, self.vocab_size, bias=False)

    def forward(self, input_ids, mask=None, labels=None):
        x = self.embedding(input_ids)
        x = self.positional_embedding(x)

        for block in self.transformer_blocks:
            x = block(x, mask)

        x = self.norm(x)
        logits = self.l_head(x)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            return loss, logits

        return logits

**Training Dataset**

In [47]:
class TranslationDataset(Dataset):
    def __init__(self, source_texts, target_texts, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.max_length = max_length

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        src = self.tokenizer(self.source_texts[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        tgt = self.tokenizer(self.target_texts[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            'input_ids': src['input_ids'].squeeze(0),
            'attention_mask': src['attention_mask'].squeeze(0),
            'labels': tgt['input_ids'].squeeze(0)
        }

**Training The Model**

In [48]:
def train_model(model, dataloader, tokenizer, device, epochs, lr=5e-5):
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            optimizer.zero_grad()
            loss, _ = model(input_ids, mask=attention_mask, labels=labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader)}")

def translate_text(model, tokenizer, text, device, max_length=50):
    model.eval()
    input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(device)

    with torch.no_grad():
        for _ in range(max_length):
            logits = model(input_ids)
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
            if next_token.item() == tokenizer.eos_token_id:
                break

            input_ids = torch.cat([input_ids, next_token], dim=-1)
    return tokenizer.decode(input_ids.squeeze(), skip_special_tokens=True)


**Testing**

In [50]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2(config).to(device)

source_texts = ["Hello, how are you?", "What is your name?"]
target_texts = ["Bonjour, comment ça va?", "Quel est votre nom?"]
dataset = TranslationDataset(source_texts, target_texts, tokenizer, max_length=50)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

train_model(model, dataloader, tokenizer, device, epochs=30)
print(translate_text(model, tokenizer, "Hello, how are you?", device))

Epoch 1, Loss: 10.840285301208496
Epoch 2, Loss: 5.225130081176758
Epoch 3, Loss: 2.9078779220581055
Epoch 4, Loss: 2.2890539169311523
Epoch 5, Loss: 2.2090260982513428
Epoch 6, Loss: 2.1719634532928467
Epoch 7, Loss: 2.1170291900634766
Epoch 8, Loss: 2.0930538177490234
Epoch 9, Loss: 2.0686275959014893
Epoch 10, Loss: 2.004240036010742
Epoch 11, Loss: 1.966560959815979
Epoch 12, Loss: 1.9218922853469849
Epoch 13, Loss: 1.8913052082061768
Epoch 14, Loss: 1.8570829629898071
Epoch 15, Loss: 1.824256181716919
Epoch 16, Loss: 1.821496605873108
Epoch 17, Loss: 1.7778239250183105
Epoch 18, Loss: 1.754194974899292
Epoch 19, Loss: 1.7058868408203125
Epoch 20, Loss: 1.6945725679397583
Epoch 21, Loss: 1.657596230506897
Epoch 22, Loss: 1.6276458501815796
Epoch 23, Loss: 1.5816651582717896
Epoch 24, Loss: 1.5401997566223145
Epoch 25, Loss: 1.4900282621383667
Epoch 26, Loss: 1.4440747499465942
Epoch 27, Loss: 1.4056957960128784
Epoch 28, Loss: 1.3748407363891602
Epoch 29, Loss: 1.3478165864944458
E

Since GPT-2 is designed as a decoder-only model, it isn’t naturally suited for translation tasks, which typically require both encoding and decoding capabilities. Additionally, training it effectively demands an enormous amount of data, making the process computationally expensive and resource-intensive , So this just a simple implmentaion to GPT 2 architecture