# Text Classification

In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [None]:
def get_device():
  """Pick the name of the best available torch device.

  Returns:
    "cuda" when a CUDA GPU is usable, otherwise "mps" when the Metal
    backend is usable, otherwise "cpu".
  """
  if torch.cuda.is_available():
    return "cuda"
  # torch.backends.mps is not present in every PyTorch build; look it up
  # defensively so such installs fall back to CPU instead of raising
  # AttributeError.
  mps_backend = getattr(torch.backends, "mps", None)
  if mps_backend is not None and mps_backend.is_available():
    return "mps"
  return "cpu"


# Resolve the compute device once; later cells read this module-level `device`
# when moving the model and batches.
device = get_device()
print(device)

mps


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings.

    The table is computed once for ``max_seq_len`` positions and stored as a
    buffer of shape ``(max_seq_len, 1, dim_embedding)`` so that it broadcasts
    over the batch axis of a ``(seq_len, batch, dim_embedding)`` input.

    Args:
        dim_embedding: Size of each embedding vector (even or odd).
        dropout: Dropout probability applied after the encoding is added.
        max_seq_len: Number of positions for which the table is precomputed.
    """
    def __init__(self, dim_embedding, dropout=0.1, max_seq_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        postional_encoding = torch.zeros(max_seq_len, dim_embedding)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        denom_term = torch.exp(torch.arange(0, dim_embedding, 2).float() * (-math.log(10000.0) / dim_embedding))
        # Even columns carry the sine terms, odd columns the cosine terms.
        postional_encoding[:, 0::2] = torch.sin(position * denom_term)
        # With an odd dim_embedding there is one fewer odd column than even
        # columns, so trim the frequency vector to the number of odd columns.
        postional_encoding[:, 1::2] = torch.cos(position * denom_term[: dim_embedding // 2])
        # (max_seq_len, dim) -> (max_seq_len, 1, dim)
        postional_encoding = postional_encoding.unsqueeze(1)
        # The buffer name (including its spelling) is a state_dict key, so it
        # is kept unchanged for compatibility with saved checkpoints.
        self.register_buffer('postional_encoding', postional_encoding)

    def forward(self, x):
        """Return ``dropout(x + encoding)``.

        Args:
            x: Tensor whose first axis is the sequence position and whose last
                axis has size ``dim_embedding``.

        Raises:
            ValueError: If ``x`` has more positions than the precomputed table.
        """
        seq_len = x.size(0)
        max_len = self.postional_encoding.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )
        x = x + self.postional_encoding[:seq_len, :]
        return self.dropout(x)

## Data Preparation: 
Here's our plan for data preparation.
Prepare the IMDb dataset for text classification:

1. Load the IMDb dataset and tokenize the text using an appropriate pre-trained tokenizer.
2. Pad and truncate the tokenized text to ensure a consistent sequence length across different examples.
3. Ensure the data shapes are compatible with the TransformerEncoder requirements:
  * Reshape the tokenized data (input_ids) to have a shape of (seq_length, batch_size).
  * Reshape the attention mask to have a shape of (batch_size, seq_length).
  * One-hot encode the labels for binary-class classification.

4. Create TensorDataset to use the huggingface data on Pytorch.
  * TensorDataset is a utility class in PyTorch (from the torch.utils.data module) that allows you to create a dataset object by wrapping one or more tensors. Each tensor in the dataset represents a different field or attribute of your data samples, such as input data, labels, or attention masks.
5. Define custom Collate_fn
  * The default collate function in DataLoader simply combines the samples into a batch without any additional processing. However, in many cases, you may need to perform custom processing on your samples
  * According to our shape requirements, perform the shape transformation in this function:
  * data: (seq_len, batch_size)
  * atten_mask: (batch_size, seq_len)
  * label: one-hot encoded

Important Point: Understand the Attention Mask:

  * The attention mask is a binary tensor that indicates which tokens in the input sequence should be attended to by the model.
In the attention mask tensor, the value 1 corresponds to an actual token (word or subword), while the value 0 corresponds to a padding token.
  * The model uses this mask to ignore padding tokens during the self-attention mechanism in the Transformer architecture.

Let's Code it!

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import TensorDataset


# Load the dataset and tokenizer
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Every review is padded/truncated to exactly this many tokens.
MAX_SEQ_LEN = 512

# Tokenize the dataset
def tokenize(batch):
    """Tokenize a batch of reviews into fixed-length ids and attention masks.

    ``padding="max_length"`` gives every row the same length no matter how
    ``map`` chunks the split, so the columns can be turned into a single
    tensor below. (``padding=True`` would only pad to the longest row of each
    chunk, which is uniform only if the whole split is one chunk.)

    Args:
        batch: Mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer output for the batch (plain lists; ``map`` stores them
        as dataset columns).
    """
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_SEQ_LEN)

# Keep the tokenized Hugging Face splits under their own names instead of
# temporarily rebinding train_dataset / val_dataset to a different type.
train_tokenized = dataset["train"].map(tokenize, batched=True)
val_tokenized = dataset["test"].map(tokenize, batched=True)

# Extract input_ids and attention_mask from the tokenized dataset
train_data = torch.tensor(train_tokenized["input_ids"])
train_attention_mask = torch.tensor(train_tokenized["attention_mask"])
train_labels = torch.tensor(train_tokenized["label"])

val_data = torch.tensor(val_tokenized["input_ids"])
val_attention_mask = torch.tensor(val_tokenized["attention_mask"])
val_labels = torch.tensor(val_tokenized["label"])

# Create TensorDatasets: each sample is (input_ids, attention_mask, label)
train_dataset = TensorDataset(train_data, train_attention_mask, train_labels)
val_dataset = TensorDataset(val_data, val_attention_mask, val_labels)

# Create DataLoaders

def collate_fn(batch):
    """Assemble a list of ``(input_ids, attention_mask, label)`` samples into a batch.

    Returns:
        input_ids: ``(seq_len, batch_size)`` (transposed to sequence-first).
        attention_mask: ``(batch_size, seq_len)`` (stacked only, NOT transposed).
        labels: ``(batch_size, 2)`` float one-hot labels.

    All returned tensors stay on the device of the incoming samples; the
    training loop is responsible for moving them to the compute device, which
    keeps the function usable from DataLoader worker processes.
    """
    input_ids, attention_mask, labels = zip(*batch)
    input_ids = torch.stack(input_ids).transpose(0, 1)  # (batch, seq) -> (seq, batch)
    attention_mask = torch.stack(attention_mask)  # stays (batch, seq)
    label_ids = torch.stack([torch.as_tensor(label) for label in labels])
    labels = torch.nn.functional.one_hot(label_ids, num_classes=2).float()
    return input_ids, attention_mask, labels

# Batches of 32 samples; only the training loader shuffles. collate_fn turns
# each batch's input_ids into the sequence-first (seq_len, batch_size) layout.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


Found cached dataset imdb (/Users/premtimsina/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/premtimsina/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5ff46743b60fe77a.arrow
Loading cached processed dataset at /Users/premtimsina/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b5dbdc1fc6b34a20.arrow


The src_key_padding_mask in the TransformerEncoder should be a 2D boolean tensor of shape (batch_size, sequence_length). Each element in the tensor should be True if the corresponding token in the input sequence is a padding token, and False otherwise.

The src_key_padding_mask is used to mask out the padding tokens in the input so that they don't contribute to the attention calculation. When the padding tokens are masked out, the transformer will not consider them while computing attention scores for the non-padding tokens.

This is our idea for Text Classification
1. Initialize the following components in the constructor:
  * Embedding layer to convert input tokens into embeddings.
  * Positional encoding to add position information to the embeddings.
  * Transformer encoder consisting of multiple layers, each with multi-head self-attention and feedforward neural networks.
  * Fully connected (linear) layer for classification.
2. Implement the init_weights method to initialize the weights of the model components.

3. Implement the forward method to define the forward pass of the model.
  * Pass the input through the embedding layer and apply positional encoding. We also add multiplication term to the embedding. The purpose of this multiplication is to scale the embeddings. This can help the model learn better and avoid vanishing gradients. The square root of the model's dimension is used as a scaling factor because it is a simple heuristic that works well in practice.
  * Pass the embeddings through the transformer encoder with an optional key_padding_mask.
  * Perform mean pooling over the sequence dimension to obtain a single vector per example.
    1. The output of the encoder has shape (seq_length, batch_size, emb_dim). Performing mean pooling across dimension `0` reduces it to (batch_size, emb_dim).
  * Pass the pooled representation through the fully connected layer for classification.
  * Apply a sigmoid activation function to obtain probabilities.

In [None]:
class TextClassifier(nn.Module):
    """Transformer-encoder classifier over sequence-first token ids.

    Pipeline: embedding (scaled by sqrt(embedding_dim)) -> positional encoding
    -> TransformerEncoder -> mean pooling over the sequence axis -> linear
    layer -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Model/embedding width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        num_classes: Size of the output vector.
    """
    def __init__(self, vocab_size, embedding_dim, nhead, num_layers, num_classes):
        super(TextClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim)
        # Create the transformer encoder layer. It is kept as an attribute so
        # that the module's state_dict keys ("encoder_layer.*") stay the same
        # as in checkpoints saved from this class.
        self.encoder_layer = nn.TransformerEncoderLayer(embedding_dim, nhead)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers)
        self.fc = nn.Linear(embedding_dim, num_classes)
        self.embedding_dim=embedding_dim
        self.init_weights()

    def init_weights(self) -> None:
        """Initialise embedding/output layers uniformly and encoder projections with Xavier."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        for layer in self.encoder.layers:
            nn.init.xavier_uniform_(layer.self_attn.out_proj.weight)
            nn.init.zeros_(layer.self_attn.out_proj.bias)
            nn.init.xavier_uniform_(layer.linear1.weight)
            nn.init.zeros_(layer.linear1.bias)
            nn.init.xavier_uniform_(layer.linear2.weight)
            nn.init.zeros_(layer.linear2.bias)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, key_padding_mask=None):
        """Classify a batch of token-id sequences.

        Args:
            x: Token ids of shape ``(seq_len, batch_size)``.
            key_padding_mask: Optional ``(batch_size, seq_len)`` mask that is
                True (non-zero) at padded positions.

        Returns:
            Tensor of shape ``(batch_size, num_classes)`` with sigmoid outputs.
        """
        x = self.embedding(x)* math.sqrt(self.embedding_dim)
        x = self.positional_encoding(x)
        x = self.encoder(x, src_key_padding_mask=key_padding_mask)

        # Mean-pool over the sequence axis (dim 0): (seq, batch, emb) -> (batch, emb).
        if key_padding_mask is None:
            x = x.mean(dim=0)
        else:
            # Average only over real tokens; otherwise the representation of a
            # short review would be dominated by its padding positions.
            pad = key_padding_mask.to(torch.bool).transpose(0, 1).unsqueeze(-1)  # (seq, batch, 1)
            real_token_count = (~pad).sum(dim=0).clamp(min=1)  # avoid division by zero
            x = x.masked_fill(pad, 0.0).sum(dim=0) / real_token_count

        # Fully connected layer for classification
        x = self.fc(x)
        x=torch.sigmoid(x)
        return x

## Declaring Model

In [None]:
# Hyperparameters of the classifier. The embedding table is sized from the
# tokenizer so every token id the tokenizer can emit has a row.
vocab_size = tokenizer.vocab_size
embedding_dim = 512
nhead = 8
num_layers = 6
num_classes = 2

# Create the model
model = TextClassifier(vocab_size, embedding_dim, nhead, num_layers,  num_classes).to(device)
# BCELoss consumes probabilities, which matches the sigmoid at the end of
# TextClassifier.forward and the one-hot float labels built in collate_fn.
criterion = nn.BCELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training
Here are a few important things to note:

* batch_attention_mask = (batch_attention_mask == 0).to(device): Convert the attention mask to a boolean tensor by checking if the values are equal to 0.
  1. The attention mask that comes from Hugging Face tokenizer is as follows:
    * attention_mask = 1 if it is a real token
    * attention_mask = 0 if it is a pad token
  2. PyTorch's `src_key_padding_mask` expects the opposite convention:
    * mask = False if it is a real token
    * mask = True if it is a pad token
  3. Therefore, we compute `batch_attention_mask == 0`, which is True exactly at the pad positions. This tells the model to ignore the pad tokens.
* To avoid exploding gradients, clip the gradients of the model's parameters using torch.nn.utils.clip_grad_norm_(). This helps maintain the stability of the training process and prevents the gradients from becoming too large.


In [None]:
num_epochs = 1
log_every = 100  # print the current batch loss every this many batches

for epoch in range(num_epochs):
    # Re-enable dropout explicitly: re-running this cell after a cell that
    # called model.eval() would otherwise train with dropout disabled.
    model.train()
    loss = None
    for i, (batch_data, batch_attention_mask, batch_labels) in enumerate(train_dataloader):
        optimizer.zero_grad()

        # Convert attention_mask to boolean tensor (True at padded positions)
        batch_attention_mask = (batch_attention_mask == 0).to(device)

        outputs = model(batch_data.to(device), key_padding_mask=batch_attention_mask)
        loss = criterion(outputs, batch_labels.to(device))
        if i % log_every == 0:
            # .item() logs the plain number instead of the tensor repr
            print("epoch ", epoch, "batch ", i, "loss ", loss.item())

        loss.backward()
        # Clip the global gradient norm to 0.5 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    # `loss` stays None only when the dataloader yielded no batches
    if loss is not None:
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

epoch  0 batch  0 loss  tensor(0.8329, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  100 loss  tensor(0.7013, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  200 loss  tensor(0.6965, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  300 loss  tensor(0.6912, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  400 loss  tensor(0.6935, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  500 loss  tensor(0.6960, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  600 loss  tensor(0.6958, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
epoch  0 batch  700 loss  tensor(0.6929, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)
Epoch: 1, Loss: 0.6728882789611816


In [None]:
# Persist only the weights (state_dict), not the whole module object.
# NOTE(review): absolute, machine-specific path — the load cell in the
# Inference section reads the same file, so change both together.
torch.save(model.state_dict(), "/Users/premtimsina/Documents/bpbbook/chapter3/TextClassificationModel.pth")

## Inference


In [None]:
# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
vocab_size = tokenizer.vocab_size
embedding_dim = 512
nhead = 8
# Must match the architecture that produced the checkpoint: the model in the
# "Declaring Model" section is built with 6 encoder layers, and a different
# depth makes load_state_dict fail on mismatched keys.
num_layers = 6
num_classes = 2

# Create the model
model_loaded = TextClassifier(vocab_size, embedding_dim, nhead, num_layers,  num_classes).to(device)

# Load the trained model weights (replace 'path_to_weights.pth' with the path to your trained model)
# map_location places the tensors on the current device even if the
# checkpoint was written from a different one.
model_loaded.load_state_dict(
    torch.load('/Users/premtimsina/Documents/bpbbook/chapter3/TextClassificationModel.pth', map_location=device)
)
# Other cells still reference the in-memory `model`, so keep it in eval mode too.
model.eval()
# Switch the reloaded network to eval mode (disables dropout); as the last
# expression of the cell its repr is displayed.
model_loaded.eval()

TextClassifier(
  (embedding): Embedding(30522, 512)
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
     

### Printing total parameter of our model

In [None]:
# Count the trainable parameters of `model` (the network created in the
# "Declaring Model" section): sum of element counts over every parameter
# that requires gradients.
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total trainable parameters:", total_params)

Total trainable parameters: 37694978


In [None]:

# Function to perform inference on a given text
def infer(text, classifier=None):
    """Run the classifier on a single text and return its per-class scores.

    Args:
        text: The raw input string.
        classifier: Model to evaluate. Defaults to ``model_loaded``, the
            network restored from the saved checkpoint.

    Returns:
        1-D tensor with one sigmoid score per class.
    """
    if classifier is None:
        classifier = model_loaded
    # Inference must run with dropout disabled, whatever mode the model was left in.
    classifier.eval()

    # Tokenize the input text
    tokens = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    # (1, seq_len) -> (seq_len, 1): the model expects sequence-first input
    input_ids = tokens["input_ids"].to(device).transpose(0,1)

    # True marks padded positions, which is the convention the model's key_padding_mask uses
    attention_mask = (tokens["attention_mask"] == 0).to(device)
    print(input_ids.shape)
    print(attention_mask)

    # Perform inference
    with torch.no_grad():
        output = classifier(input_ids, key_padding_mask=attention_mask)
    # Drop the batch axis: (1, num_classes) -> (num_classes,)
    probabilities = output.squeeze(0)
    return probabilities




In [None]:
# Test with an example text
# The model ends in an element-wise sigmoid (see TextClassifier.forward), so
# the two scores are independent and are not expected to sum to 1.
example_text = "This movie is  good! ."
probabilities = infer(example_text)

print("Probabilities:", probabilities)

torch.Size([8, 1])
tensor([[False, False, False, False, False, False, False, False]],
       device='mps:0')
Probabilities: tensor([0.4503, 0.5445], device='mps:0')


Summary:
The provided code offers a basic implementation of a TransformerEncoder for text classification. The primary goal is to demonstrate how to utilize a TransformerEncoder for this task. In subsequent chapters, we will explore optimized versions of Transformer-based classification. To achieve better results with the current code, consider increasing the number of encoder layers and fine-tuning various parameters.

# Text Generation
## Decoder-Only Layer
1. We will use the Shakespeare dataset to create a decoder-only model that generates text in the style of Shakespeare.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

# Same "distilbert-base-uncased" tokenizer checkpoint as in the classification
# section; this rebinds the module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


What are we doing here?
1. This code defines a custom PyTorch dataset called ShakespeareDataset.
2. It takes a file path to a text file and a tokenizer as input. The dataset reads the file and splits it into examples of size block_size. 
3. Each example is then tokenized using the tokenizer and padded or truncated to a maximum length of block_size. The resulting tokenized examples are stored in a list.

In [None]:
class ShakespeareDataset(Dataset):
    """Text file split into fixed-size character chunks, each tokenized eagerly.

    The file is read once in ``__init__``; every ``block_size``-character
    chunk (including the final, possibly shorter one) is passed to the
    tokenizer with ``padding='max_length'`` and ``max_length=block_size``.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors whose leading axis is a batch of one.
        block_size: Chunk size in characters and tokenizer ``max_length``.
    """
    def __init__(self, file_path, tokenizer, block_size=128):
        self.block_size = block_size
        self.tokenizer = tokenizer

        # Explicit encoding so the result does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data = f.read()

        self.examples = []
        # Walk the whole text. Stopping at len(data) - block_size would drop
        # the last chunk(s), e.g. a text of exactly two blocks yielded one example.
        for start in range(0, len(self.data), self.block_size):
            example = self.data[start:start + self.block_size]
            tokenized = self.tokenizer(example, padding='max_length', truncation=True, max_length=block_size, return_tensors='pt')
            self.examples.append(tokenized)

    def __len__(self):
        """Number of chunks."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for chunk ``idx`` without the batch axis."""
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when block_size == 1.
        input_ids = self.examples[idx]['input_ids'].squeeze(0)
        attention_mask = self.examples[idx]['attention_mask'].squeeze(0)
        return input_ids, attention_mask

  


In [None]:
import os
# `import urllib` alone does not load the `urllib.request` submodule.
import urllib.request
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filename = '/Users/premtimsina/Documents/bpbbook/chapter3/dataset/input.txt'
# Download only once; create the target directory first because
# urlretrieve does not create missing parent directories.
if not os.path.isfile(filename):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    urllib.request.urlretrieve(url, filename)


In [None]:
# Reads and tokenizes the whole file up front (ShakespeareDataset does this
# in __init__). Rebinds `train_dataset` from the classification section.
train_dataset = ShakespeareDataset(filename, tokenizer)

What we are doing here?
1. `inputs = torch.stack(inputs).transpose(0, 1)`: The dimension of input needed is (seq_length, batch_size). Thus, we are transposing.
2. `torch.stack(inputs)` takes a list of tensors and stacks them along a new dimension (the result has one more dimension than the input tensors). 
  * For example, each input has shape (128). If the batch size is 6, the stacking operation produces a tensor of shape (6,128)=> (batch_size, seq_length)
  * transpose will result in (128,6)==> (seq_length, batch_size)

In [None]:
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Batch ``(input_ids, attention_mask)`` samples.

    Returns:
        ``(inputs, masks)`` where ``inputs`` is sequence-first
        ``(seq_len, batch_size)`` and ``masks`` keeps the batch-first
        ``(batch_size, seq_len)`` layout.
    """
    id_rows, mask_rows = zip(*batch)
    stacked_ids = torch.stack(id_rows)      # (batch_size, seq_len)
    stacked_masks = torch.stack(mask_rows)  # (batch_size, seq_len)
    return stacked_ids.transpose(0, 1), stacked_masks
# Shuffled batches of 4 chunks. Rebinds `train_dataloader` from the
# classification section.
train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=collate_fn,shuffle=True)


In [None]:
# Sanity-check the batch layout produced by collate_fn:
# input_ids should be (seq_len, batch_size), attention_masks (batch_size, seq_len).
item=next(iter(train_dataloader))
input_ids,attention_masks=item
print(input_ids.shape, attention_masks.shape)

torch.Size([128, 4]) torch.Size([4, 128])


## Let's Declare Positional Encoding Class.
1. It is same as what we discussed in earlier example

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings.

    The table is computed once for ``max_seq_len`` positions and stored as a
    buffer of shape ``(max_seq_len, 1, dim_embedding)`` so that it broadcasts
    over the batch axis of a ``(seq_len, batch, dim_embedding)`` input.

    Args:
        dim_embedding: Size of each embedding vector (even or odd).
        dropout: Dropout probability applied after the encoding is added.
        max_seq_len: Number of positions for which the table is precomputed.
    """
    def __init__(self, dim_embedding, dropout=0.1, max_seq_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        postional_encoding = torch.zeros(max_seq_len, dim_embedding)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        denom_term = torch.exp(torch.arange(0, dim_embedding, 2).float() * (-math.log(10000.0) / dim_embedding))
        # Even columns carry the sine terms, odd columns the cosine terms.
        postional_encoding[:, 0::2] = torch.sin(position * denom_term)
        # With an odd dim_embedding there is one fewer odd column than even
        # columns, so trim the frequency vector to the number of odd columns.
        postional_encoding[:, 1::2] = torch.cos(position * denom_term[: dim_embedding // 2])
        # (max_seq_len, dim) -> (max_seq_len, 1, dim)
        postional_encoding = postional_encoding.unsqueeze(1)
        # The buffer name (including its spelling) is a state_dict key, so it
        # is kept unchanged for compatibility with saved checkpoints.
        self.register_buffer('postional_encoding', postional_encoding)

    def forward(self, x):
        """Return ``dropout(x + encoding)``.

        Args:
            x: Tensor whose first axis is the sequence position and whose last
                axis has size ``dim_embedding``.

        Raises:
            ValueError: If ``x`` has more positions than the precomputed table.
        """
        seq_len = x.size(0)
        max_len = self.postional_encoding.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )
        x = x + self.postional_encoding[:seq_len, :]
        return self.dropout(x)

## Let's Declare Model
### Model Architecture
1. This model is a Transformer-based decoder-only language model, which takes as input a target sequence (tgt) and a memory sequence (memory) and generates an output sequence of the same length as the target sequence.

2. The input target sequence is first passed through an embedding layer and a positional encoding layer. Similarly, the input memory sequence is passed through an embedding layer and a positional encoding layer.

3. During training:
  * `memory` is train data of shape (seq_len, batch_size)
  * `target`:During model training, the target sequence would be the input sequence shifted by one position.

4. These processed input sequences are then fed into the Transformer decoder, which consists of multiple Transformer decoder layers. Each decoder layer processes the input sequences using multi-head self-attention and a feedforward neural network.

5. Finally, the output of the Transformer decoder is passed through a linear layer (fully-connected neural network) to generate the final output sequence, with each element of the sequence representing the probability distribution over the vocabulary of the target language.

In [None]:
class TransformerDecoder(nn.Module):
    """Stack of ``nn.TransformerDecoderLayer`` with its own embeddings and output head.

    Both the target and the memory sequences are token ids; each goes through
    its own embedding (scaled by sqrt(embedding_dim)) and positional encoding
    before entering the decoder. The decoder output is projected to
    vocabulary-sized logits.

    Args:
        vocab_size: Vocabulary size for both embeddings and the output layer.
        embedding_dim: Model width (``d_model``).
        num_layers: Number of decoder layers.
        dropout: Dropout probability for the positional encodings and decoder layers.
        nhead: Number of attention heads per layer.
        dim_feedforward: Hidden width of each layer's feed-forward block.
    """
    def __init__(self, vocab_size, embedding_dim, num_layers, dropout, nhead=8, dim_feedforward=2048):
        super().__init__()

        self.memory_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.memory_pos_encoder = PositionalEncoding(embedding_dim, dropout)
        self.tgt_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.tgt_pos_encoder = PositionalEncoding(embedding_dim, dropout)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout),
            num_layers=num_layers)

        self.fc = nn.Linear(embedding_dim, vocab_size)
        self.d_model=embedding_dim
        self.init_weights()

    def init_weights(self) -> None:
        """Uniform init for embeddings/output layer, Xavier for decoder weight matrices."""
        initrange = 0.1

        # Initialize the embedding layers
        nn.init.uniform_(self.memory_embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.tgt_embedding.weight, -initrange, initrange)

        # Initialize the decoder layers (only tensors with more than one
        # dimension, i.e. weight matrices; biases and norm vectors are skipped)
        for param in self.decoder.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)

        # Initialize the output layer
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, tgt,  memory=None, tgt_mask=None, memory_mask=None, memory_key_padding_mask=None,tgt_key_padding_mask=None):
        """Compute vocabulary logits for every target position.

        Args:
            tgt: Target token ids, sequence-first.
            memory: Memory token ids, sequence-first. Required; the ``None``
                default only exists for signature compatibility.
            tgt_mask, memory_mask, memory_key_padding_mask, tgt_key_padding_mask:
                Forwarded unchanged to ``nn.TransformerDecoder``.

        Returns:
            Logits with the decoder output's leading axes and a last axis of
            size ``vocab_size``.

        Raises:
            ValueError: If ``memory`` is not provided.
        """
        if memory is None:
            raise ValueError("memory token ids are required: the decoder layers cross-attend to them")
        tgt = self.tgt_embedding(tgt) * self.d_model ** 0.5
        tgt=self.tgt_pos_encoder(tgt)
        memory=self.memory_embedding(memory) * self.d_model ** 0.5
        memory=self.memory_pos_encoder(memory)
        output = self.decoder(tgt=tgt, memory=memory, tgt_mask=tgt_mask, memory_mask=memory_mask, memory_key_padding_mask=memory_key_padding_mask,tgt_key_padding_mask=tgt_key_padding_mask)
        output = self.fc(output)
        return output

Parameters  for decoding Layer

1. tgt: The input sequence to the decoder layer. It is a tensor of shape (seq_len, batch_size, emb_dim) where seq_len is the length of the input sequence and batch_size is the number of sequences in a batch.

2. memory: The output of the last layer of the encoder. It is a tensor of shape (src_seq_len, batch_size, emb_dim) where src_seq_len is the length of the input sequence in the encoder.

3. tgt_mask: An optional tensor of shape (seq_len, seq_len) representing the mask for the input sequence. It is used to prevent the decoder from attending to future tokens.
The format should be: 
```
tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]], device='mps:0')
```
  * in above example, seq_length=3
  * where `-inf` signifies the tokens that need to be masked

4. memory_mask: An optional tensor of shape (seq_len, src_seq_len) representing the mask for the encoder output sequence. It is used to prevent the decoder from attending future tokens in the encoder input sequence.
```
tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]], device='mps:0')
```
  * in above example, seq_length=3
  * where `-inf` signifies the tokens that need to be masked
Usually, you will not mask the memory. Thus, you will pass:
```
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], device='mps:0')
```

5. tgt_key_padding_mask: An optional tensor of shape (batch_size, seq_len) representing the mask for padding tokens in the input sequence.
```
tensor([[False, False, False],
        [False, False, False],
        [False, True, False],
        [True, True, False]], device='mps:0')
```
  * In above example, batch_size=4,  seq_len=3
  * True signifies the particular token is padded token and mask it
  * False signifies that the particular token is a real (non-padded) token and must not be masked

6. memory_key_padding_mask: An optional tensor of shape (batch_size, src_seq_len) representing the mask for padding tokens in the encoder output sequence.
```
tensor([[False, False, False],
        [False, False, False],
        [False, True, False],
        [True, True, False]], device='mps:0')
```
  * In above example, batch_size=4,  seq_len=3
  * True signifies the particular token is padded token and mask it
  * False signifies that the particular token is a real (non-padded) token and must not be masked


In [None]:
def generate_square_subsequent_mask(sz):
    """Build the ``(sz, sz)`` additive causal mask on the global ``device``.

    Entries on and below the diagonal are ``0.0``; entries above the diagonal
    are ``-inf``, so position ``i`` cannot attend to positions after ``i``.
    """
    # True where attention is allowed: the lower triangle including the diagonal.
    visible = torch.tril(torch.ones((sz, sz), device=device)) == 1
    causal = torch.zeros((sz, sz), device=device).masked_fill(~visible, float('-inf'))
    return causal.to(device)


def create_mask(src, tgt,tokenizer_src=tokenizer,tokenizer_tgt=tokenizer):
    """Build the attention and padding masks for a source/target batch.

    Args:
        src: Source token ids, sequence-first ``(src_seq_len, batch_size)``.
        tgt: Target token ids, sequence-first ``(tgt_seq_len, batch_size)``.
        tokenizer_src: Tokenizer whose ``pad_token_id`` marks source padding.
        tokenizer_tgt: Tokenizer whose ``pad_token_id`` marks target padding.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` on the
        global ``device``: an all-False square source mask, the causal target
        mask, and batch-first boolean masks that are True at padding tokens.
    """
    n_src, n_tgt = src.shape[0], tgt.shape[0]

    # Padding masks: compare against the pad id, then flip to (batch, seq).
    pad_src = (src == tokenizer_src.pad_token_id).transpose(0, 1)
    pad_tgt = (tgt == tokenizer_tgt.pad_token_id).transpose(0, 1)

    # Attention masks: nothing hidden in the source, causal for the target.
    open_src = torch.zeros((n_src, n_src),device=device).type(torch.bool)
    causal_tgt = generate_square_subsequent_mask(n_tgt)

    return (
        open_src.to(device),
        causal_tgt.to(device),
        pad_src.to(device),
        pad_tgt.to(device),
    )

In [None]:
import math
# Pick the device with the notebook's helper instead of hard-coding 'mps',
# which fails on machines without the Metal backend.
device = get_device()
model = TransformerDecoder(vocab_size=tokenizer.vocab_size, embedding_dim=768, num_layers=3, dropout=0.1).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Padding positions carry no target, so they are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


## Exercise 
1. Write the Training Loop
2. Write the Inference Loop

# Transformer Layer
## Machine Translation
1. Machine Translation is the task of converting a text from one language to another. In this context, we will focus on English to German (en-de) translation. The task involves processing a sequence of tokens in one language and producing a corresponding sequence of tokens in another language.
2. We will use Encoder-Decoder Layer of Transformer




## Let's Download the Data

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset

# Load the dataset and tokenizer
# NOTE: rebinds `dataset`, which held the IMDb data in the classification section.
dataset = load_dataset("iwslt2017", "iwslt2017-de-en", split="train[:1%]")  # Only use a 1% portion of the dataset
# English is the source and German the target language (see TranslationDataset),
# hence one tokenizer checkpoint per language.
tokenizer_src = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_tgt = AutoTokenizer.from_pretrained("bert-base-german-cased")


Downloading and preparing dataset iwslt2017/iwslt2017-de-en to /Users/premtimsina/.cache/huggingface/datasets/iwslt2017/iwslt2017-de-en/1.0.0/03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49...


Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/206112 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8079 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/888 [00:00<?, ? examples/s]

Dataset iwslt2017 downloaded and prepared to /Users/premtimsina/.cache/huggingface/datasets/iwslt2017/iwslt2017-de-en/1.0.0/03ce9110373117c6f6687719f49f269486a8cd49dcad2527993a316cd4b6ad49. Subsequent calls will reuse this data.


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [None]:
# Peek at one raw record: a dict whose 'translation' entry maps a language
# code ('de', 'en') to the sentence in that language.
dataset[0]

{'translation': {'de': 'Vielen Dank, Chris.',
  'en': 'Thank you so much, Chris.'}}

## Custom Dataset to Prepare Data
1. Let's prepare the data so that src_token and tgt_token have the same length
2. Here, we define the max length of token is 50

In [None]:
class TranslationDataset(Dataset):
    """English -> German sentence pairs tokenized on access.

    Each item of the wrapped dataset must provide
    ``item['translation']['en']`` and ``item['translation']['de']``.

    Args:
        dataset: Indexable collection of translation records.
        tokenizer_src: Tokenizer applied to the English text.
        tokenizer_tgt: Tokenizer applied to the German text.
        max_length: ``max_length`` passed to both tokenizers (with
            ``padding="max_length"`` and truncation enabled).
    """
    def __init__(self, dataset, tokenizer_src, tokenizer_tgt, max_length=50):
        self.dataset = dataset
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.dataset)

    def _encode(self, tokenizer, text):
        """Tokenize ``text`` and return its ``input_ids`` without the batch axis."""
        encoded = tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # squeeze(0) drops only the leading batch axis; a bare squeeze() would
        # also collapse the sequence axis when max_length == 1.
        return encoded["input_ids"].squeeze(0)

    def __getitem__(self, idx):
        """Return ``(src_input_ids, tgt_input_ids)`` for the pair at ``idx``."""
        # Fetch the record once instead of indexing the dataset per language.
        pair = self.dataset[idx]['translation']
        src_ids = self._encode(self.tokenizer_src, pair['en'])
        tgt_ids = self._encode(self.tokenizer_tgt, pair['de'])
        return src_ids, tgt_ids


In [None]:
# Only stores references; tokenization happens per item in __getitem__.
# Rebinds `train_data` from the classification section.
train_data = TranslationDataset(dataset, tokenizer_src, tokenizer_tgt)


In [None]:
# View sample data after creating torch.tensor: the item is a
# (source input_ids, target input_ids) pair, each padded to max_length.
train_data[2]

(tensor([  101,  1045,  2031,  2042, 10676,  2185,  2011,  2023,  3034,  1010,
          1998,  1045,  2215,  2000,  4067,  2035,  1997,  2017,  2005,  1996,
          2116,  3835,  7928,  2055,  2054,  1045,  2018,  2000,  2360,  1996,
          2060,  2305,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([    3,  1671,  4058,  4899, 15227,    88,   534, 13854, 26918,    42,
          1169,  9334, 26897,  9830,  2122,   142,    30,  2709,  2055,  2636,
             7, 18930,    81, 10183,  4468,  2085,  6738,  4253, 26914,     4,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]))

Analysis: If you look at the above sample, `0` represents the padding token

## Create DataLoader:
* The output of the dataloader should be of shape (seq_len, batch_size)
* Thus, we have transposed both src_ids, and tgt_ids

In [None]:
def collate_fn(batch):
    """Batch ``(src_ids, tgt_ids)`` pairs into two sequence-first tensors.

    Returns:
        ``(src_ids, tgt_ids)``, each of shape ``(seq_len, batch_size)``.
    """
    src_rows, tgt_rows = zip(*batch)
    stacked_src = torch.stack(src_rows)  # (batch_size, seq_len)
    stacked_tgt = torch.stack(tgt_rows)  # (batch_size, seq_len)
    return stacked_src.transpose(0, 1), stacked_tgt.transpose(0, 1)
# Shuffled batches of 16 sentence pairs, laid out sequence-first by collate_fn.
dataloader = DataLoader(train_data, batch_size=16, shuffle=True, collate_fn=collate_fn)


In [None]:
# Sanity-check one batch: both tensors should be (seq_len, batch_size).
item=next(iter(dataloader))
src_ids,tgt_ids=item
print('src_ids ',src_ids.shape)
print(' tgt_ids ',tgt_ids.shape)


src_ids  torch.Size([50, 16])
 tgt_ids  torch.Size([50, 16])


It looks perfect:
1. the shape of source and target are [seq_length, batch_size]


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to sequence-first embeddings.

    The table is computed once for ``max_seq_len`` positions and stored as a
    buffer of shape ``(max_seq_len, 1, dim_embedding)`` so that it broadcasts
    over the batch axis of a ``(seq_len, batch, dim_embedding)`` input.

    Args:
        dim_embedding: Size of each embedding vector (even or odd).
        dropout: Dropout probability applied after the encoding is added.
        max_seq_len: Number of positions for which the table is precomputed.
    """
    def __init__(self, dim_embedding, dropout=0.1, max_seq_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        postional_encoding = torch.zeros(max_seq_len, dim_embedding)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        denom_term = torch.exp(torch.arange(0, dim_embedding, 2).float() * (-math.log(10000.0) / dim_embedding))
        # Even columns carry the sine terms, odd columns the cosine terms.
        postional_encoding[:, 0::2] = torch.sin(position * denom_term)
        # With an odd dim_embedding there is one fewer odd column than even
        # columns, so trim the frequency vector to the number of odd columns.
        postional_encoding[:, 1::2] = torch.cos(position * denom_term[: dim_embedding // 2])
        # (max_seq_len, dim) -> (max_seq_len, 1, dim)
        postional_encoding = postional_encoding.unsqueeze(1)
        # The buffer name (including its spelling) is a state_dict key, so it
        # is kept unchanged for compatibility with saved checkpoints.
        self.register_buffer('postional_encoding', postional_encoding)

    def forward(self, x):
        """Return ``dropout(x + encoding)``.

        Args:
            x: Tensor whose first axis is the sequence position and whose last
                axis has size ``dim_embedding``.

        Raises:
            ValueError: If ``x`` has more positions than the precomputed table.
        """
        seq_len = x.size(0)
        max_len = self.postional_encoding.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )
        x = x + self.postional_encoding[:seq_len, :]
        return self.dropout(x)

## Transformer Model
What we are doing?
1. `forward`
  * The source and target sequences are embedded and scaled by the square root of the embedding dimension.
  * The positional encodings are added to the embeddings.
  * The Transformer processes the source and target sequences, with masking
    1. src_mask, trg_mask ==> `trg_mask` prevents information from flowing from future target positions; `src_mask` is all False, so no source position is masked.
    2. src_padding_mask, trg_padding_mask ==> These mask the padded positions, so that the model does not attend to padding tokens.
  * The output of the Transformer is passed through a fully connected layer to get the predicted target sequence.
  * The model predicts the next German token given all the English tokens and the German tokens up to the current step.
  

In [None]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target-vocabulary scores.

    Args:
        num_encoder_layers: Number of encoder layers in ``nn.Transformer``.
        num_decoder_layers: Number of decoder layers in ``nn.Transformer``.
        d_model: Embedding size used throughout the model.
        nhead: Number of attention heads.
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and size
            of the output layer.
        dim_feedforward: Hidden size of the Transformer feed-forward sublayers.
        dropout: Dropout probability used by the positional encoders, the
            Transformer, and before the output layer.
    """

    def __init__(self,num_encoder_layers, num_decoder_layers, d_model, nhead, src_vocab_size=tokenizer_src.vocab_size, tgt_vocab_size=tokenizer_tgt.vocab_size, dim_feedforward=512, dropout=0.1):
        super(TransformerModel, self).__init__()
        # Size the embedding tables from the vocabulary-size arguments so that
        # every token id is a valid row index (they were previously sized from
        # unrelated module-level globals).
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.trg_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.src_pos_encoder = PositionalEncoding(d_model, dropout)
        self.trg_pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, src, trg, src_mask=None, src_padding_mask=None,trg_mask=None, trg_padding_mask=None, memory_key_padding_mask=None):
        """Run the encoder-decoder and project the decoder output to the target vocabulary.

        Args:
            src: Source token ids; the notebook feeds [seq_length, batch_size].
            trg: Target token ids fed to the decoder, laid out like ``src``.
            src_mask: Attention mask over source positions.
            src_padding_mask: Key-padding mask for the source.
            trg_mask: Attention mask over target positions (the causal mask).
            trg_padding_mask: Key-padding mask for the target.
            memory_key_padding_mask: Key-padding mask for the encoder output.

        Returns:
            Tensor of scores whose last dimension has size ``tgt_vocab_size``.
        """
        # Scale the embeddings by sqrt(d_model) before adding positional encodings.
        scale = self.d_model ** 0.5
        src = self.src_pos_encoder(self.src_embedding(src) * scale)
        trg = self.trg_pos_encoder(self.trg_embedding(trg) * scale)
        # Keyword arguments keep each mask bound to the right nn.Transformer
        # parameter regardless of positional order.
        output = self.transformer(
            src,
            trg,
            src_mask=src_mask,
            tgt_mask=trg_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=trg_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        output = self.fc(self.dropout(output))
        return output

## Let's Create the Masks
1. `generate_square_subsequent_mask` creates the following matrix for `tgt_mask`. Here, we assume the target sequence length is 3.

```
tensor([[ 0., -inf, -inf],
        [ 0.,  0., -inf],
        [ 0.,  0.,  0.]])
```


2. This means that each token is only allowed to attend to the tokens that have already been generated during decoding.
3. `src_mask` is a square matrix filled with `False`. This means we are not masking any position of the source sequence.
4. `src_padding_mask` and `tgt_padding_mask` mark the positions in `src` and `tgt` that hold the padding token, so that all padded tokens are masked.
5. We transpose `src_padding_mask` and `tgt_padding_mask` because the Transformer requires them to have shape [batch_size, seq_len]. This is the opposite of its requirement for `src` and `tgt`, which must be [seq_length, batch_size].


In [None]:
def generate_square_subsequent_mask(sz):
    """Return the (sz, sz) additive causal mask, placed on the global ``device``.

    Entries on and below the main diagonal are 0.0 and entries above it are
    ``-inf``, so row ``i`` leaves columns ``0..i`` open and blocks later ones.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    causal_mask = torch.triu(blocked, diagonal=1)
    return causal_mask.to(device)


def create_mask(src, tgt):
    """Build the four masks passed to the model for one batch.

    Args:
        src: Source token ids; the first axis is taken as the sequence length.
        tgt: Target token ids; the first axis is taken as the sequence length.

    Returns:
        Tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``,
        all moved to the global ``device``.
    """
    # Causal mask sized by the target sequence length.
    tgt_mask = generate_square_subsequent_mask(tgt.shape[0])

    # All-False boolean square matrix: no source position is hidden.
    n_src = src.shape[0]
    src_mask = torch.zeros((n_src, n_src), device=device).type(torch.bool)

    # True where the token equals the pad id; the two axes are swapped.
    src_padding_mask = (src == tokenizer_src.pad_token_id).transpose(0, 1)
    tgt_padding_mask = (tgt == tokenizer_tgt.pad_token_id).transpose(0, 1)

    masks = (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
    return tuple(m.to(device) for m in masks)

In [None]:
# Display the mask produced for sz=3 (the bare expression is the cell's output).
generate_square_subsequent_mask(3)

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]], device='mps:0')

## Let's Instantiate the Model and Train It
1. `nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.pad_token_id)`
  * We ask the loss function to ignore positions where the target is a padding token.
2. `tgt_out = tgt[1:, :]`

  * We remove the first token of the target sequence, since it corresponds to the special start-of-sentence token `<sos>`. This gives a new tensor `tgt_out` containing the remaining tokens, which is the sequence the model is expected to predict. The decoder input is `tgt_input = tgt[:-1, :]`, i.e. the target without its last token, so the two tensors are shifted by one position: at every step the model sees the target tokens up to the current position and is trained to predict the next one.

3. `loss=criterion(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))`
  * The output tensor has shape [tgt_seq_len - 1, batch_size, tgt_vocab_size], which means it has 3 dimensions. To calculate the loss, we need to reshape it to a 2D tensor of shape [(tgt_seq_len - 1) * batch_size, tgt_vocab_size].

  * Similarly, tgt_out tensor has shape [tgt_seq_len - 1, batch_size], but to calculate the loss, we need to flatten it into a 1D tensor of shape [(tgt_seq_len - 1) * batch_size].

In [None]:
# Instantiate the model
# The embedding-table sizes must cover every token id the tokenizers can
# produce, so they are taken from the vocabularies rather than hard-coded.
input_dim = tokenizer_src.vocab_size
output_dim = tokenizer_tgt.vocab_size
emb_size = 512
nhead = 8
num_encoder_layers = num_decoder_layers = 3

model = TransformerModel(num_encoder_layers, num_decoder_layers, emb_size, nhead, src_vocab_size=tokenizer_src.vocab_size, tgt_vocab_size=tokenizer_tgt.vocab_size, dim_feedforward=512).to(device)

# Training loop
num_epochs = 1
# Padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.pad_token_id)
optimizer = optim.Adam(model.parameters())
model.train()
losses = 0
for epoch in range(num_epochs):
    # Running sum of batch losses and batch count for this epoch only.
    losses = 0
    num_batches = 0
    for i, (src_ids, tgt_ids) in enumerate(dataloader):
        src = src_ids.to(device)
        tgt = tgt_ids.to(device)
        # Decoder input drops the last token; the expected output drops the
        # first, so position t is trained to predict token t + 1.
        tgt_input = tgt[:-1, :]
        tgt_out = tgt[1:, :]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        optimizer.zero_grad()
        # (self, src, trg, src_mask=None, src_padding_mask=None,trg_mask=None, trg_padding_mask=None, memory_key_padding_mask=None)
        output = model(src, tgt_input, src_mask, src_padding_mask, tgt_mask, tgt_padding_mask, src_padding_mask)
        # Flatten to [(tgt_seq_len - 1) * batch_size, vocab] vs. [(tgt_seq_len - 1) * batch_size].
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('epoch ', epoch, 'batch ', i, ' loss ', loss)
        # Accumulate (not overwrite) so the epoch average below is meaningful.
        losses += loss.item()
        num_batches += 1
    # Mean batch loss for the epoch; counting batches in the loop avoids a
    # second full pass over the dataloader just to get its length.
    print(losses / max(num_batches, 1))



        



epoch  0 batch  0  loss  tensor(10.5033, device='mps:0', grad_fn=<NllLossBackward0>)
epoch  0 batch  100  loss  tensor(7.1293, device='mps:0', grad_fn=<NllLossBackward0>)
0.05702095623164214


### Analysis:
The above code is a simplified version of a machine translation model. In future chapters, we will explore more advanced models.