<a href="https://colab.research.google.com/github/boi-doingthings/my-transfomers/blob/main/transfomers-book/Encoder_HandsOn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import output
output.enable_custom_widget_manager()

In [2]:
## Install Transformers and datasets

!pip install transformers datasets

Installing collected packages: urllib3, xxhash, tokenizers, responses, multiprocess, huggingface-hub, transformers, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed datasets-2.4.0 huggingface-hub-0.8.1 multiprocess-0.70.13 responses-0.18.0 tokenizers-0.12.1 transformers-4.21.1 urllib3-1.25.11 xxhash-3.0.0


In [24]:
import torch
import numpy

In [4]:
input = "The cat is quick and orange"

In [5]:
# Tokenizer to create tokens

from transformers import AutoTokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
input_tokens = tokenizer(input,return_tensors="pt")
input_tokens

{'input_ids': tensor([[ 101, 1996, 4937, 2003, 4248, 1998, 4589,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

## Implement Self - Attention

In [13]:
from torch import nn
from transformers import AutoConfig # gives the config file of the model used.

config = AutoConfig.from_pretrained(model_ckpt)
config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.21.1",
  "vocab_size": 30522
}

In [17]:
embeds = nn.Embedding(config.vocab_size,config.dim)
embeds

Embedding(30522, 768)

In [22]:
input_embeds = embeds(input_tokens['input_ids'])
input_embeds.shape

torch.Size([1, 8, 768])

In [32]:
#  1. Project each token embedding into three vectors called query, key, and value.
q=k=v=input_embeds
#  2. Compute attention scores. 
dim_k = numpy.sqrt(k.shape[-1])
scores = torch.bmm(q,k.transpose(2,1))/dim_k

In [33]:
scores

tensor([[[27.3107,  0.4923, -0.3205, -1.4599,  0.8392,  0.9175, -1.7061,
           0.9743],
         [ 0.4923, 26.9458, -2.5731,  0.6720, -0.4026, -1.5090, -0.0412,
          -3.8114],
         [-0.3205, -2.5731, 27.8759, -0.5187,  0.8185, -0.9948,  1.1783,
           1.1427],
         [-1.4599,  0.6720, -0.5187, 27.5788,  2.3593,  0.1219,  0.2802,
           0.5671],
         [ 0.8392, -0.4026,  0.8185,  2.3593, 30.3954,  0.8647, -0.5827,
           0.9450],
         [ 0.9175, -1.5090, -0.9948,  0.1219,  0.8647, 27.5563,  0.3520,
          -0.1167],
         [-1.7061, -0.0412,  1.1783,  0.2802, -0.5827,  0.3520, 27.1091,
          -1.5181],
         [ 0.9743, -3.8114,  1.1427,  0.5671,  0.9450, -0.1167, -1.5181,
          27.1199]]], grad_fn=<DivBackward0>)

In [34]:
v.shape

torch.Size([1, 8, 768])

In [39]:
import torch.nn.functional as F
# 3. Compute attention weights.
weights = F.softmax(scores,dim=-1)
# 4. Update the token embeddings.
attn_op = torch.bmm(weights,v)
attn_op

tensor([[[-1.2443, -1.0827, -0.4263,  ..., -0.2538,  1.1041, -0.7977],
         [-0.3195,  1.5146, -0.6712,  ...,  0.4072, -1.0146,  0.7862],
         [-0.4574,  0.1039, -0.7137,  ...,  0.5607,  0.5315, -0.5152],
         ...,
         [-1.3553, -1.7808, -0.7417,  ...,  1.2805, -0.7524, -0.1875],
         [-0.8253, -0.3365, -1.6557,  ...,  1.9635, -0.4545, -0.0279],
         [ 1.6413, -0.5059, -1.7393,  ..., -0.1644,  1.6524,  0.4118]]],
       grad_fn=<BmmBackward0>)

In [41]:
def scaled_dot_product_attention(query, key, value):
 dim_k = query.size(-1)
 scores = torch.bmm(query, key.transpose(1, 2)) / numpy.sqrt(dim_k)
 weights = F.softmax(scores, dim=-1)
 return torch.bmm(weights, value)

In [51]:
class AttentionHead(nn.Module):
  def __init__(self,embed_dim,head_dim):
    super().__init__()
    self.query = nn.Linear(embed_dim,head_dim)
    self.key = nn.Linear(embed_dim,head_dim)
    self.value = nn.Linear(embed_dim,head_dim)

  def forward(self,hidden_state):
    attn_outputs = scaled_dot_product_attention(
    self.query(hidden_state), self.key(hidden_state), self.value(hidden_state))
    return attn_outputs



In [52]:
c = AttentionHead(config.dim,config.n_heads)

In [53]:
c(input_embeds)

tensor([[[ 0.1498, -0.2782, -0.0147,  0.0480, -0.0598, -0.1702,  0.2614,
          -0.2178,  0.3996,  0.3681,  0.1106,  0.1237],
         [ 0.1802, -0.1953,  0.1317,  0.0102,  0.0399, -0.2700,  0.2750,
          -0.2173,  0.5817,  0.4931,  0.0446,  0.1465],
         [ 0.0897, -0.1188,  0.0236,  0.1105,  0.0013, -0.2470,  0.2708,
          -0.1487,  0.5133,  0.3285,  0.0811,  0.1254],
         [ 0.0995, -0.1235,  0.0318,  0.1240,  0.0044, -0.2535,  0.3257,
          -0.1906,  0.5248,  0.3582,  0.0851,  0.1452],
         [ 0.1721, -0.0886,  0.0508,  0.1566, -0.0911, -0.2672,  0.4333,
          -0.2377,  0.4427,  0.1600,  0.0540,  0.2205],
         [ 0.1711, -0.2459,  0.0384,  0.0127, -0.0092, -0.2029,  0.2939,
          -0.2587,  0.4704,  0.3527,  0.1522,  0.1398],
         [ 0.0534, -0.1588, -0.0270,  0.0868,  0.0204, -0.2398,  0.3352,
          -0.2887,  0.4674,  0.3014,  0.1771,  0.1101],
         [ 0.2523, -0.2960,  0.0411,  0.1757, -0.0376, -0.2021,  0.2108,
          -0.0133,  0.53