<a href="https://colab.research.google.com/github/aquibjaved/Bits_and_Pieces_DL/blob/tinytransformer/Basic_trfs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# source: https://github.com/hkproj/pytorch-transformer
# This notebook is just for different use case but codes are mostly used from above source

In [2]:
!pip install transformers datasets tokenizers

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
import torch
import torch.nn as nn
import math

from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

In [4]:
class InputEmbeddings(nn.Module):
  def __init__(self, model_dim:int=512, vocab_size:int=30000):
    super().__init__()

    self.model_dim = model_dim
    self.vocab_size = vocab_size

    self.embedding = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.model_dim)

  def forward(self, x):
    return self.embedding(x) * math.sqrt(self.model_dim)


In [5]:
class PositionalEncoding(nn.Module):
  def __init__(self, model_dim:int, seq_len:int, dropout:float):
    super().__init__()

    self.model_dim = model_dim
    self.seq_len= seq_len # max len of doc
    self.dropout = nn.Dropout(dropout)

    # create matrix of shape (seq, model_dim) => (512, 512)
    pe = torch.zeros(seq_len, model_dim)

    # create a vector for seq_len
    position = torch.arange(0, self.seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-math.log(10000.0)/self.model_dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)

    pe = pe.unsqueeze(0)

    self.register_buffer('pe', pe)

  def forward(self, x):
    x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
    return self.dropout(x)

In [6]:
PE = PositionalEncoding(model_dim=512, seq_len=512, dropout=0.0)

In [7]:
class LayerNormalization(nn.Module):
  def __init__(self, eps: float=10**-6):
    super().__init__()
    self.eps = eps
    self.alpha = nn.Parameter(torch.ones(1))
    self.bias = nn.Parameter(torch.zeros(1))

  def forward(self, x):
    mean = x.mean(dim = -1, keepdim=True)
    std = x.std(dim = -1, keepdim=True)

    return self.alpha * (x-mean)/(std+self.eps) + self.bias


In [8]:
torch.zeros(1)

tensor([0.])

In [9]:
class FeedForwardBlock(nn.Module):
  def __init__(self, model_dim:int, d_ff:int, dropout:float):
    super().__init__()

    self.d_ff = d_ff
    self.linear_one = nn.Linear(model_dim, self.d_ff)
    self.dropout = nn.Dropout(dropout)
    self.linear_out = nn.Linear(self.d_ff, model_dim)


  def forward(self, x):
    # (batch_size, max_seq_len, model_dim)

    return self.linear_out(self.dropout(torch.relu(self.linear_one(x))))


In [10]:
class ResidualConnection(nn.Module):

        def __init__(self, features: int, dropout: float) -> None:
            super().__init__()
            self.dropout = nn.Dropout(dropout)
            self.norm = LayerNormalization(features)

        def forward(self, x, sublayer):
            return x + self.dropout(sublayer(self.norm(x)))

In [11]:
class MultiHeadAttentionBlock(nn.Module):
  def __init__(self, h:int=8, model_dim:int=512, dropout:float=0.1):
    super().__init__()

    self.model_dim = model_dim
    self.h = h
    self.dropout = nn.Dropout(dropout)

    assert model_dim % h == 0
    self.d_k = model_dim//h

    # now get the weights of query, key, value
    self.w_q = nn.Linear(model_dim, model_dim)
    self.w_k = nn.Linear(model_dim, model_dim)
    self.w_v = nn.Linear(model_dim, model_dim)

    self.w_o = nn.Linear(model_dim, model_dim)

    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(query, key, value, mask, dropout: nn.Dropout):
    d_k = query.shape[-1]
    # Just apply the formula from the paper
    # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
    attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Write a very low value (indicating -inf) to the positions where mask == 0
        attention_scores.masked_fill_(mask == 0, -1e9)

    attention_scores = attention_scores.softmax(dim=-1) # (batch, h, seq_len, seq_len) # Apply softmax

    if dropout is not None:
        attention_scores = dropout(attention_scores)
    # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
    # return attention scores which can be used for visualization
    return (attention_scores @ value), attention_scores

  def forward(self, q, k, v, mask):

    query = self.w_q(q)
    key = self.w_k(k)
    value = self.w_v(v)

    query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1,2)
    key = value.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1,2)
    value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1,2)

    # calculate attention
    x, self.attention_score = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

    # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, model_dim)
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

    # Multiply by Wo
    # (batch, seq_len, model_dim) --> (batch, seq_len, model_dim)
    return self.w_o(x)

In [12]:
class EncoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

In [13]:
class Encoder(nn.Module):
  def __init__(self, features:int, layers:nn.ModuleList):
    super().__init__()
    self.layers = layers
    self.norm = LayerNormalization(features)

  def forward(self, x, mask):
    for layer in self.layers:
      x = layer(x, mask)
    return self.norm(x)

In [14]:
class DecoderBlock(nn.Module):

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [15]:
class Decoder(nn.Module):

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

In [16]:
class ProjectionLayer(nn.Module):
  def __init__(self, model_dims:int, vocab_size:int):
    super().__init__()

    self.model_dims = model_dims
    self.vocab_size = vocab_size

    self.proj = nn.Linear(self.model_dims, self.vocab_size)

  def forward(self, x):
    return torch.log_softmax(self.proj(x), dim =-1)

In [17]:
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, model_dim)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, model_dim)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.projection_layer(x)




In [18]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, model_dim: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    src_embed = InputEmbeddings(model_dim, src_vocab_size)
    tgt_embed = InputEmbeddings(model_dim, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(model_dim, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(model_dim, tgt_seq_len, dropout)

    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(model_dim, h, dropout)
        feed_forward_block = FeedForwardBlock(model_dim, d_ff, dropout)
        encoder_block = EncoderBlock(model_dim, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(model_dim, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(model_dim, h, dropout)
        feed_forward_block = FeedForwardBlock(model_dim, d_ff, dropout)
        decoder_block = DecoderBlock(model_dim, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # Create the encoder and decoder
    encoder = Encoder(model_dim, nn.ModuleList(encoder_blocks))
    decoder = Decoder(model_dim, nn.ModuleList(decoder_blocks))

    # Create the projection layer
    projection_layer = ProjectionLayer(model_dim, tgt_vocab_size)

    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer

In [19]:
model_dim = 512
src_vocab_size = 30522
tgt_vocab_size = 30522
src_seq_len = 512
tgt_seq_len = 512
dropout = 0.1
h = 8
N = 6
d_ff = 2048

In [20]:


src_embed = InputEmbeddings(model_dim, src_vocab_size)
tgt_embed = InputEmbeddings(model_dim, tgt_vocab_size)

# Create the positional encoding layers
src_pos = PositionalEncoding(model_dim, src_seq_len, dropout)
tgt_pos = PositionalEncoding(model_dim, tgt_seq_len, dropout)


# Create the encoder blocks
encoder_blocks = []
for _ in range(N):
    encoder_self_attention_block = MultiHeadAttentionBlock(h, model_dim, dropout)
    feed_forward_block = FeedForwardBlock(model_dim, d_ff, dropout)
    encoder_block = EncoderBlock(model_dim, encoder_self_attention_block, feed_forward_block, dropout)
    encoder_blocks.append(encoder_block)


# Create the decoder blocks
decoder_blocks = []
for _ in range(N):
    decoder_self_attention_block = MultiHeadAttentionBlock(h, model_dim, dropout)
    decoder_cross_attention_block = MultiHeadAttentionBlock(h, model_dim, dropout)
    feed_forward_block = FeedForwardBlock(model_dim, d_ff, dropout)
    decoder_block = DecoderBlock(model_dim, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
    decoder_blocks.append(decoder_block)


# Create the encoder and decoder
encoder = Encoder(model_dim, nn.ModuleList(encoder_blocks))
decoder = Decoder(model_dim, nn.ModuleList(decoder_blocks))



# Create the projection layer
projection_layer = ProjectionLayer(model_dim, tgt_vocab_size)

# Create the transformer
transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)


In [21]:
from datasets import load_dataset

dataset = load_dataset("findnitai/english-to-hinglish", split="train")

Downloading readme:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/27.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [50]:
train = dataset['translation'][:int(len(dataset['translation'])*.75)]
print("Train size: ", len(train))
test = dataset['translation'][int(len(dataset['translation'])*.75):]
print("Test size: ", len(test))

Train size:  141826
Test size:  47276


In [51]:
test[0]

{'en': 'What is the best way for me to get home the shortest distance',
 'hi_ng': 'ghar pahuchne ka sabse chota rasta kaunsa hai',
 'source': 0}

In [52]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class BilingualDataset(Dataset):

    def __init__(self, ds, model_name_src, model_name_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        # Load the tokenizers
        self.tokenizer_src = AutoTokenizer.from_pretrained(model_name_src)
        self.tokenizer_tgt = AutoTokenizer.from_pretrained(model_name_tgt)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Assuming that [SOS] and [EOS] tokens are already in the tokenizer's vocabulary
        self.sos_token_id = self.tokenizer_tgt.convert_tokens_to_ids('[SOS]')
        self.eos_token_id = self.tokenizer_tgt.convert_tokens_to_ids('[EOS]')
        self.pad_token_id = self.tokenizer_tgt.pad_token_id

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Encode the texts
        encoder_input = self.tokenizer_src.encode(src_text, max_length=self.seq_len, padding='max_length', truncation=True)

        # Truncate decoder_input if necessary and then prepend SOS token
        decoder_input = self.tokenizer_tgt.encode(tgt_text, max_length=self.seq_len - 1, truncation=True)
        decoder_input = [self.sos_token_id] + decoder_input

        # Truncate decoder_input if necessary and then append EOS token
        decoder_input_truncated = decoder_input[:self.seq_len - 1]
        label = decoder_input_truncated[1:] + [self.eos_token_id]

        # Ensure the length is seq_len
        decoder_input_padded = decoder_input_truncated + [self.pad_token_id] * (self.seq_len - len(decoder_input_truncated))
        label_padded = label + [self.pad_token_id] * (self.seq_len - len(label))

        encoder_input = torch.tensor(encoder_input, dtype=torch.long)
        decoder_input = torch.tensor(decoder_input_padded, dtype=torch.long)
        label = torch.tensor(label_padded, dtype=torch.long)

        return {
            "encoder_input": encoder_input,
            "encoder_mask": (encoder_input != self.pad_token_id).unsqueeze(0).unsqueeze(0).int(),

            "decoder_input": decoder_input,
            "decoder_mask": (decoder_input != self.pad_token_id).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),

            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text,
        }


# Function to create the causal mask for the decoder
def causal_mask(size):
    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).bool()
    return ~mask


In [53]:
from transformers import AutoTokenizer

# Load the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [54]:
hinglish_ds = BilingualDataset(ds=dataset, model_name_src="bert-base-uncased",
                               model_name_tgt="bert-base-uncased", src_lang="en",
                               tgt_lang="hi_ng", seq_len=512)



In [55]:
hinglish_dataloader = DataLoader(hinglish_ds, batch_size=4)

In [56]:
data = {}
for batch in hinglish_dataloader:
  data = batch
  break

In [66]:
data['label'].shape

torch.Size([4, 512])

In [45]:
# data.get('encoder_mask')[0]

In [99]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
optimizer = torch.optim.Adam(transformer.parameters(), lr =0.0001, eps=1e-9)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.convert_tokens_to_ids(['[PAD]'])[0], label_smoothing=0.1)

transformer.train()
for batch in hinglish_dataloader:


  encoder_input = batch['encoder_input']
  encoder_mask = batch['encoder_mask']

  decoder_input = batch['decoder_input']
  decoder_mask = batch['decoder_mask']
  label = batch['label']
  assert decoder_output.shape == enc_output.shape

  enc_output = transformer.encode(src=encoder_input, src_mask = encoder_mask)
  decoder_output = transformer.decode(encoder_output= enc_output, src_mask=encoder_mask,
                                      tgt = decoder_input, tgt_mask= decoder_mask)
  proj_output = transformer.project(decoder_output)


  # Compute the loss using a simple cross entropy
  loss = loss_fn(proj_output.view(-1, tokenizer.vocab_size), label.view(-1))
  print("loss: ", loss.item())

  loss.backward()
  optimizer.step()


loss:  10.336156845092773
loss:  10.333630561828613
loss:  10.331705093383789
loss:  10.326706886291504
loss:  10.325390815734863
loss:  10.324607849121094
loss:  10.325847625732422
loss:  10.322803497314453
loss:  10.32236099243164
loss:  10.325940132141113
loss:  10.318849563598633
loss:  10.321937561035156
loss:  10.319647789001465
loss:  10.316468238830566
loss:  10.3215970993042
loss:  10.319546699523926
loss:  10.318574905395508
loss:  10.309822082519531
loss:  10.31270980834961
loss:  10.308786392211914
loss:  10.319567680358887
loss:  10.31602954864502
loss:  10.319635391235352
loss:  10.316715240478516
loss:  10.295827865600586
loss:  10.30947494506836
loss:  10.309414863586426
loss:  10.316606521606445
loss:  10.30629825592041
loss:  10.312324523925781
loss:  10.295146942138672
loss:  10.295072555541992
loss:  10.30660629272461
loss:  10.298874855041504
loss:  10.302706718444824
loss:  10.299957275390625
loss:  10.30364990234375
loss:  10.288223266601562
loss:  10.29701137542

KeyboardInterrupt: ignored