<a href="https://colab.research.google.com/github/boi-doingthings/my-transfomers/blob/main/transformers-book/Encoder_HandsOn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-specific setup: turn on Colab's custom widget manager.
# NOTE(review): no widget usage is visible in the cells below - confirm this
# cell is still needed before keeping it.
from google.colab import output
output.enable_custom_widget_manager()

In [2]:
## Install Transformers and datasets

!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 57.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 50.4

In [3]:
import torch
import numpy

In [67]:
# Two short example sentences; the word "orange" appears in both.
# NOTE(review): the name `input` shadows Python's built-in input(). A later
# cell reads this variable, so renaming it needs a coordinated change there.
input = ["The cat is quick and orange",
         "The orange is tangy and juicy"]

In [68]:
# Load the pretrained tokenizer that belongs to the checkpoint named below.

from transformers import AutoTokenizer
model_ckpt = "distilbert-base-uncased"  # checkpoint name; reused later to load the matching config
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [70]:
# Tokenize both sentences as one batch of PyTorch tensors. padding=True pads the
# shorter sentence (id 0, attention_mask 0) so both rows have the same length.
input_tokens = tokenizer(input,return_tensors="pt",padding=True,truncation=True)
input_tokens  # display input_ids and attention_mask

{'input_ids': tensor([[  101,  1996,  4937,  2003,  4248,  1998,  4589,   102,     0],
        [  101,  1996,  4589,  2003,  9745,  2100,  1998, 28900,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## Implement Self-Attention

In [71]:
from torch import nn
from transformers import AutoConfig  # loads the configuration object of a pretrained checkpoint

# Fetch the configuration for the same checkpoint as the tokenizer. Its fields
# (dim, n_heads, hidden_dim, n_layers, ...) size every layer built below.
config = AutoConfig.from_pretrained(model_ckpt)
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}

In [72]:
# Freshly initialised token-embedding table (no pretrained weights are loaded):
# one vector of size config.dim per vocabulary id.
embeds = nn.Embedding(config.vocab_size,config.dim)
embeds

Embedding(30522, 768)

In [73]:
# Look up an embedding vector for every token id in the batch;
# the result has shape (batch, seq_len, dim).
input_embeds = embeds(input_tokens['input_ids'])
input_embeds.shape

torch.Size([2, 9, 768])

In [74]:
#  1. Query, key and value. No learned projection is applied in this cell:
#     all three names are bound to the same raw token-embedding tensor.
q=k=v=input_embeds
#  2. Compute attention scores: batched dot products of every query with every
#     key, divided by the square root of the embedding size.
#     Note: despite its name, dim_k holds sqrt(k.shape[-1]), not the dimension itself.
dim_k = numpy.sqrt(k.shape[-1])
scores = torch.bmm(q,k.transpose(2,1))/dim_k

In [75]:
# Display the scaled scores. Because q and k are the same tensor, each token's
# score with itself (the diagonal) is far larger than the off-diagonal entries.
scores

tensor([[[ 2.6978e+01,  2.2458e+00,  5.3122e-01,  8.3515e-01, -7.3148e-01,
          -1.8162e-01,  1.6261e-01, -1.4513e+00, -7.2368e-02],
         [ 2.2458e+00,  2.8367e+01,  2.9603e-01, -4.6457e-01, -3.5759e-01,
           5.5322e-01, -7.1558e-01,  9.4747e-01, -3.3803e-01],
         [ 5.3122e-01,  2.9603e-01,  2.6179e+01,  2.2540e-01,  1.3389e+00,
           9.8135e-01,  1.3713e+00,  1.8472e+00, -5.1659e-01],
         [ 8.3515e-01, -4.6457e-01,  2.2540e-01,  2.9390e+01, -1.6786e+00,
           1.4807e+00, -1.6285e+00, -8.5464e-01,  1.7378e+00],
         [-7.3148e-01, -3.5759e-01,  1.3389e+00, -1.6786e+00,  2.7546e+01,
          -1.5153e+00,  1.1849e+00,  1.4900e+00, -1.3961e+00],
         [-1.8162e-01,  5.5322e-01,  9.8135e-01,  1.4807e+00, -1.5153e+00,
           2.5874e+01,  1.0427e+00,  8.1405e-01,  6.8843e-02],
         [ 1.6261e-01, -7.1558e-01,  1.3713e+00, -1.6285e+00,  1.1849e+00,
           1.0427e+00,  2.9313e+01,  2.7255e+00,  6.1797e-01],
         [-1.4513e+00,  9.4747e-01

In [76]:
# The values are the embeddings themselves, so they keep the (batch, seq_len, dim) shape.
v.shape

torch.Size([2, 9, 768])

In [77]:
import torch.nn.functional as F
# 3. Compute attention weights: softmax over the last (key) axis, so the
#    weights of each query position sum to 1.
weights = F.softmax(scores,dim=-1)
# 4. Update the token embeddings: each output row is the weighted sum of the
#    value vectors, using that row's attention weights.
attn_op = torch.bmm(weights,v)
attn_op

tensor([[[ 1.8862, -0.7888,  1.1448,  ..., -0.2929,  1.2159,  1.0291],
         [ 1.3269, -1.5454,  0.3856,  ..., -0.1901, -0.3826, -1.9612],
         [ 0.1785,  2.1488,  1.0524,  ..., -1.1788, -0.6794,  1.5221],
         ...,
         [ 0.4172,  2.1974,  0.5010,  ...,  0.6061, -1.5345,  1.0498],
         [-0.4665, -0.4760, -0.8432,  ..., -1.5794,  0.7461, -0.7338],
         [-1.7447,  0.1426,  1.6844,  ..., -0.3214,  1.2646, -0.8988]],

        [[ 1.8862, -0.7888,  1.1448,  ..., -0.2929,  1.2159,  1.0291],
         [ 1.3269, -1.5454,  0.3856,  ..., -0.1901, -0.3826, -1.9612],
         [ 0.4172,  2.1974,  0.5010,  ...,  0.6061, -1.5345,  1.0498],
         ...,
         [ 1.1877,  1.4300,  0.0981,  ..., -1.2678,  1.5755, -2.7351],
         [ 0.2579,  0.4515, -1.1815,  ...,  0.0404,  0.6257, -0.5362],
         [-0.4665, -0.4760, -0.8432,  ..., -1.5794,  0.7461, -0.7338]]],
       grad_fn=<BmmBackward0>)

In [78]:
def scaled_dot_product_attention(query, key, value, mask=None):
  """Compute scaled dot-product attention for a batch of sequences.

  Args:
    query: 3-D float tensor, (batch, q_len, dim_k).
    key: 3-D float tensor, (batch, k_len, dim_k).
    value: 3-D float tensor, (batch, k_len, dim_v).
    mask: optional tensor marking which key positions may be attended to.
      Entries equal to 0 / False are excluded. Accepts either a
      (batch, k_len) padding mask such as a tokenizer's ``attention_mask``
      or anything broadcastable to (batch, q_len, k_len). ``None`` (the
      default) attends to every position, which is the original behaviour.

  Returns:
    Tensor of shape (batch, q_len, dim_v): for every query position, the
    attention-weighted sum of the value vectors.

  Note:
    A query row whose keys are all masked out yields NaN, because softmax
    over a row of -inf is undefined.
  """
  import math  # local import: only a scalar square root is needed

  dim_k = query.size(-1)
  scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(dim_k)
  if mask is not None:
    if mask.dim() == 2:
      # (batch, k_len) padding mask -> (batch, 1, k_len) so it broadcasts
      # across all query positions.
      mask = mask.unsqueeze(1)
    # -inf scores become exactly 0 after the softmax.
    scores = scores.masked_fill(mask == 0, float("-inf"))
  weights = F.softmax(scores, dim=-1)
  return torch.bmm(weights, value)

In [79]:
class AttentionHead(nn.Module):
  """A single self-attention head.

  Projects the incoming hidden states into query, key and value spaces of
  width ``head_dim`` with three independent linear layers, then applies
  scaled dot-product attention to the projections.
  """

  def __init__(self,embed_dim,head_dim):
    super().__init__()
    # The three projections are created in query/key/value order.
    self.query = nn.Linear(in_features=embed_dim, out_features=head_dim)
    self.key = nn.Linear(in_features=embed_dim, out_features=head_dim)
    self.value = nn.Linear(in_features=embed_dim, out_features=head_dim)

  def forward(self,hidden_state):
    """Return the attended representation, last dimension ``head_dim``."""
    projected_q = self.query(hidden_state)
    projected_k = self.key(hidden_state)
    projected_v = self.value(hidden_state)
    return scaled_dot_product_attention(projected_q, projected_k, projected_v)



In [80]:
# Smoke test: build one head whose width is dim // n_heads.
# NOTE(review): `c` is rebound to other module types in later cells; a
# descriptive name per object would avoid hidden-state confusion.
c = AttentionHead(config.dim,config.dim//config.n_heads)

In [81]:
# A single head maps (batch, seq_len, dim) to (batch, seq_len, head_dim).
c(input_embeds).shape

torch.Size([2, 9, 64])

In [82]:
class MultiHeadAttention(nn.Module):
  """Multi-head self-attention built from independent ``AttentionHead`` modules.

  ``config.n_heads`` heads of width ``config.dim // config.n_heads`` run on the
  same input; their outputs are concatenated along the feature axis and mixed
  by a final linear layer.

  Args:
    config: object exposing integer attributes ``dim`` and ``n_heads``.

  Raises:
    ValueError: if ``config.dim`` is not divisible by ``config.n_heads``. The
      concatenated heads would then not add up to ``dim`` and the output
      projection would fail with an opaque shape error at call time.
  """

  def __init__(self,config):
    super().__init__()
    if config.dim % config.n_heads != 0:
      raise ValueError(
          f"config.dim ({config.dim}) must be divisible by "
          f"config.n_heads ({config.n_heads})")
    self.head_dim = config.dim//config.n_heads
    self.heads = nn.ModuleList([AttentionHead(config.dim,self.head_dim) for _ in range(config.n_heads)])
    self.output_linear = nn.Linear(config.dim,config.dim)

  def forward(self,hidden_state):
    """Return the attended hidden states, same shape as ``hidden_state``."""
    # Keep the activation in a local variable: storing it on the module would
    # hold on to the autograd graph between calls and make forward() stateful.
    concatenated = torch.cat([h(hidden_state) for h in self.heads],dim=-1)
    return self.output_linear(concatenated)
  

In [83]:
# Smoke test: build the full multi-head attention module from the config.
# NOTE(review): this rebinds `c`, which held an AttentionHead in an earlier cell.
c = MultiHeadAttention(config)

In [84]:
# Multi-head attention preserves the input shape (batch, seq_len, dim).
c(input_embeds).shape

torch.Size([2, 9, 768])

In [85]:
class FeedForward(nn.Module):
  """Position-wise feed-forward block: Linear -> GELU -> Linear -> Dropout.

  Args:
    config: object exposing ``dim`` (model width) and ``hidden_dim`` (inner
      width). The dropout probability is read from ``config.dropout``, the
      hidden-state dropout. ``config.attention_dropout`` is meant for the
      attention weights and is only used as a fallback when the config has no
      ``dropout`` attribute, so such configs keep working as before.
  """

  def __init__(self, config):
    super().__init__()
    self.linear_1 = nn.Linear(config.dim, config.hidden_dim)
    self.linear_2 = nn.Linear(config.hidden_dim, config.dim)
    self.gelu = nn.GELU()
    drop_p = getattr(config, "dropout", None)
    if drop_p is None:
      drop_p = config.attention_dropout
    self.dropout = nn.Dropout(drop_p)

  def forward(self, x):
    """Apply the block to the last dimension of ``x``; the shape is preserved."""
    x = self.linear_1(x)
    x = self.gelu(x)
    x = self.linear_2(x)
    x = self.dropout(x)
    return x

In [86]:
# Smoke test: the feed-forward block keeps the (batch, seq_len, dim) shape.
# NOTE(review): `c` is rebound once more, now to a FeedForward module.
c = FeedForward(config)
c(input_embeds).shape

torch.Size([2, 9, 768])

In [87]:
class TransformerEncoderLayer(nn.Module):
  """One encoder block with layer normalisation applied before each sub-layer.

  The input is normalised, passed through multi-head attention and added back
  to the input; the result is normalised again, passed through the
  feed-forward block and added back once more.
  """

  def __init__(self,config):
    super().__init__()
    self.layer_norm1 = nn.LayerNorm(normalized_shape=config.dim)
    self.layer_norm2 = nn.LayerNorm(normalized_shape=config.dim)
    self.Attention = MultiHeadAttention(config)
    self.feed_forward = FeedForward(config)

  def forward(self,x):
    """Return the transformed hidden states, same shape as ``x``."""
    # Attention sub-layer with its skip connection.
    attended = x + self.Attention(self.layer_norm1(x))
    # Feed-forward sub-layer with its skip connection.
    return attended + self.feed_forward(self.layer_norm2(attended))



In [88]:
# Build a single encoder block. Despite the variable name this is one
# TransformerEncoderLayer, not a full stack of layers.
transformer_encoder = TransformerEncoderLayer(config)

In [118]:
# The encoder block preserves the (batch, seq_len, dim) shape.
transformer_encoder(input_embeds).size()

torch.Size([2, 9, 768])

### Incorporate Position Embeddings

In [136]:
class Embeddings(nn.Module):
  """Token plus learned absolute-position embeddings, then LayerNorm and dropout.

  Args:
    config: object exposing ``vocab_size``, ``max_position_embeddings`` and
      ``dim``. The dropout probability is read from ``config.dropout``; when
      the config has no such attribute it falls back to 0.5, which is what a
      bare ``nn.Dropout()`` uses.
  """

  def __init__(self, config):
    super().__init__()
    self.token_embeddings = nn.Embedding(config.vocab_size,
                                         config.dim)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.dim)
    self.layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
    # Use the configured hidden-state dropout rather than the 0.5 default.
    self.dropout = nn.Dropout(getattr(config, "dropout", 0.5))

  def forward(self, input_ids):
    """Embed a batch of token ids.

    Args:
      input_ids: integer tensor of shape (batch, seq_len). ``seq_len`` must
        not exceed ``config.max_position_embeddings``, otherwise the position
        lookup indexes past the end of its table.

    Returns:
      Tensor of shape (batch, seq_len, dim).
    """
    # Create position IDs 0..seq_len-1 on the same device as the input, so the
    # module also works once it and its inputs have been moved off the CPU.
    seq_length = input_ids.size(1)
    position_ids = torch.arange(
        seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
    # Create token and position embeddings
    token_embeddings = self.token_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)
    # Combine: the (1, seq_len, dim) positions broadcast over the batch axis
    embeddings = token_embeddings + position_embeddings
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings

In [137]:
# Smoke test: build the combined token + position embedding module.
e =Embeddings(config)

In [139]:
# Token ids of shape (batch, seq_len) become embeddings of shape (batch, seq_len, dim).
e(input_tokens.input_ids).size()

torch.Size([2, 9, 768])

In [140]:
class TransformerEncoder(nn.Module):
  """Full encoder: an embedding layer followed by ``config.n_layers`` encoder blocks."""

  def __init__(self, config):
    super().__init__()
    self.embeddings = Embeddings(config)
    encoder_blocks = [TransformerEncoderLayer(config) for _ in range(config.n_layers)]
    self.layers = nn.ModuleList(encoder_blocks)

  def forward(self, x):
    """Embed the token ids in ``x`` and pass them through every block in order."""
    hidden = self.embeddings(x)
    for block in self.layers:
      hidden = block(hidden)
    return hidden


In [141]:
# Build the complete encoder stack; no pretrained weights are loaded here.
trx = TransformerEncoder(config)

In [142]:
# Full forward pass: token ids -> embeddings -> every encoder block.
# NOTE(review): no .eval() call is visible, so dropout is presumably active and
# the printed values will differ from run to run - confirm if that matters.
trx(input_tokens.input_ids)

tensor([[[-1.6523, -1.1900, -4.7558,  ..., -0.2136, -0.4529,  0.2156],
         [ 2.5483, -0.2638,  0.1308,  ...,  1.9346, -4.1408,  2.4937],
         [-0.1966, -4.2770, -0.1808,  ...,  0.2456,  0.8316, -1.1160],
         ...,
         [-4.2977, -1.7213,  0.6988,  ..., -0.2712, -0.8062,  0.1393],
         [-1.3997, -0.0491,  0.6676,  ...,  0.7598,  0.2390,  0.2701],
         [-0.8013, -1.2029, -0.4760,  ...,  0.8523, -2.0018, -1.4756]],

        [[-3.8127, -1.3154, -5.0701,  ..., -0.6410, -0.6948, -0.6807],
         [-0.5961, -0.0663, -0.0165,  ...,  1.8785, -0.6121,  0.1930],
         [-3.2779, -0.7309, -0.8722,  ...,  0.2890, -0.2289, -0.4353],
         ...,
         [-0.5368, -0.8194,  0.5627,  ..., -0.3873, -0.6835, -0.0738],
         [-0.1412, -0.4684, -0.5600,  ..., -0.5834,  0.7925,  0.1550],
         [-0.1805,  0.5588,  0.1084,  ...,  1.7335, -2.1849, -0.4564]]],
       grad_fn=<AddBackward0>)