<a href="https://colab.research.google.com/github/animesh-kishore/scratch-space/blob/master/transformer_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenization

In [14]:
import pandas as pd
from transformers import AutoTokenizer # Auto identify tokenizer based on model passed to AutoTokenizer.from_pretrained('model_name')

# `model` holds the checkpoint *name* (a string), not a model object;
# AutoTokenizer uses it to pick and load the matching tokenizer.
model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model)

In [15]:
# Example usage: inspect the tokenizer and the tokens it produces for one sentence
text = 'Tokenize text is a core concept in NLP'

print('Tokenizer vacabulary size: ', tokenizer.vocab_size)
print('Max length of input sequence that model can handle: ', tokenizer.model_max_length, 'tokens')

# Tokenize once and reuse the ids for both rows of the table
example_input_ids = tokenizer(text)['input_ids']

tokenized_text = {}
tokenized_text['Numerical Token'] = example_input_ids # integer ids, including the special [CLS]/[SEP] tokens
tokenized_text['Token'] = tokenizer.convert_ids_to_tokens(example_input_ids) # the word pieces those ids stand for
pd.DataFrame(tokenized_text).T

Tokenizer vacabulary size:  30522
Max length of input sequence that model can handle:  512 tokens


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Numerical Token,101,19204,4697,3793,2003,1037,4563,4145,1999,17953,2361,102
Token,[CLS],token,##ize,text,is,a,core,concept,in,nl,##p,[SEP]


In [16]:
# Pad (or truncate) to exactly 100 tokens and return the ids as a PyTorch tensor of shape [1, 100].
# The tokenizer is called directly: `encode_plus` is deprecated in favour of `__call__`,
# which returns the same `input_ids` for a single sentence.
# (The variable name is kept as-is because later cells reference `input_squence`.)
input_squence = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=100)['input_ids']
input_squence

tensor([[  101, 19204,  4697,  3793,  2003,  1037,  4563,  4145,  1999, 17953,
          2361,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

# Embedding

In [17]:
# Create a config class for easy attribute-style access (config.key instead of config['key'])
class Config:
  """Attribute-style view over a plain configuration mapping."""

  def __init__(self, config_dict):
    # vars(self) is the instance's own attribute dictionary (self.__dict__),
    # so merging config_dict into it exposes every key as an attribute.
    vars(self).update(config_dict)

# Encoder hyperparameters, wrapped in Config so later cells can read them as attributes (config.vocab_size, ...)
config = Config({
    'vocab_size': tokenizer.vocab_size,
    'embedding_dimensions': 128,
    'max_tokens': 100, # Upper bound on the number of tokens per input sequence
    'num_attention_heads': 8,
    'hidden_dropout_prob': 0.3, # Dropout probability for the feed forward network (FFN)
    'intermediate_size': 128 * 4, # Width of the hidden layer of the feed forward network (FFN)
    'num_encoder_layers': 2,
})

In [18]:
import torch.nn as nn

class TokenEmbedding(nn.Module):
  """Learned look-up table that maps token ids to dense embedding vectors."""

  def __init__(self, config):
    super().__init__()
    # One row per vocabulary entry (config.vocab_size rows), each row a vector
    # of size config.embedding_dimensions.
    self.token_embedding = nn.Embedding(config.vocab_size, config.embedding_dimensions)

  def forward(self, tokenized_sentence):
    """Return the embedding vector of every token id in `tokenized_sentence`."""
    embedded = self.token_embedding(tokenized_sentence)
    return embedded

# Build the embedding layer and look up a vector for every (padded) token id from the tokenization section
token_embedding = TokenEmbedding(config)
embedding_output = token_embedding(input_squence)

print(embedding_output, 'Shape: ', embedding_output.shape) # Embedding layer output of shape [batch_size, seq_length, embedding_dims]

tensor([[[ 0.7954,  0.0402,  0.1097,  ..., -3.1627, -0.8433, -1.0843],
         [-1.2236, -0.2956,  1.8014,  ..., -2.8588, -1.3785,  0.7190],
         [ 0.9850, -0.9777, -0.5681,  ...,  0.1499, -0.3708, -1.4400],
         ...,
         [-1.5691,  0.5431,  0.1510,  ...,  0.6697,  0.0867, -1.5468],
         [-1.5691,  0.5431,  0.1510,  ...,  0.6697,  0.0867, -1.5468],
         [-1.5691,  0.5431,  0.1510,  ...,  0.6697,  0.0867, -1.5468]]],
       grad_fn=<EmbeddingBackward0>) Shape:  torch.Size([1, 100, 128])


# Positional Encodings

In [19]:
import torch

'''
PE(position, i) = sin(position/10000^(i/d_model)) for even i
PE(position, i) = cos(position/10000^((i-1)/d_model)) for odd i

where:
d_model = config.embedding_dimensions
position = index in seq_len
i = index in embedding vector
'''

class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal positional encodings to token embeddings.

  PE(position, i) = sin(position / 10000^(i / d_model))        for even i
  PE(position, i) = cos(position / 10000^((i - 1) / d_model))  for odd i

  The table is precomputed for config.max_tokens positions and stored as a
  non-persistent buffer: it follows the module across .to(device) calls, is
  not a trainable parameter, and is not written to the state_dict.
  """

  def __init__(self, config):
    super().__init__()
    max_tokens = config.max_tokens
    embed_dim = config.embedding_dimensions

    pe = torch.zeros(max_tokens, embed_dim) # shape [max_tokens, embed_dim]
    position = torch.arange(0, max_tokens, dtype=torch.float).unsqueeze(1) # shape [max_tokens, 1]
    # One frequency per even column index: 1 / 10000^(i / d_model) for i = 0, 2, 4, ...
    div_term = 1 / (10000 ** (torch.arange(0, embed_dim, 2).float()/embed_dim))

    pe[:, 0::2] = torch.sin(position * div_term)
    # With an odd embed_dim there is one fewer odd column than even columns,
    # so trim the frequencies to match (a no-op when embed_dim is even).
    pe[:, 1::2] = torch.cos(position * div_term[: embed_dim // 2])

    # Registered as a buffer (instead of a plain attribute) so it moves with the module.
    self.register_buffer('pe', pe.unsqueeze(0), persistent=False) # shape [1, max_tokens, embed_dim]

  def forward(self, x):
    """Return x plus the encodings of its first seq_len positions.

    Args:
      x: tensor of shape [batch_size, seq_len, embed_dim], seq_len <= config.max_tokens.

    Raises:
      ValueError: if seq_len is larger than the precomputed table.
    """
    seq_len = x.size(1)
    if seq_len > self.pe.size(1):
      raise ValueError(
          f'sequence length {seq_len} exceeds max_tokens {self.pe.size(1)}'
      )
    # Slice so that sequences shorter than max_tokens get exactly their own positions
    # instead of failing (or silently broadcasting when seq_len == 1).
    return x + self.pe[:, :seq_len]

# Add the positional encodings to the token embeddings produced by the embedding section
positional_encoding = PositionalEncoding(config)
pos_enc_output = positional_encoding(embedding_output)

# Attention

In [20]:
import numpy as np
import torch.nn.functional as F

class AttentionHead(nn.Module):
  """A single self-attention head: projects the input to Q, K and V, then applies scaled dot-product attention."""

  def __init__(self, embed_dim, head_dim):
    super().__init__()
    # Learned projections from the embedding space (embed_dim) to this head's subspace (head_dim)
    self.q = nn.Linear(embed_dim, head_dim) # query projection
    self.k = nn.Linear(embed_dim, head_dim) # key projection
    self.v = nn.Linear(embed_dim, head_dim) # value projection

  def scaled_dot_product_attention(self, query, key, value):
    """Compute softmax(Q @ K^T / sqrt(head_dim)) @ V for 3-D [batch, seq_len, head_dim] inputs."""
    scale = np.sqrt(query.size(-1))
    similarity = torch.bmm(query, key.transpose(1, 2)) / scale # [batch, seq_len, seq_len]
    attention = F.softmax(similarity, dim=-1) # every row sums to 1 over the key positions
    return torch.bmm(attention, value) # [batch, seq_len, head_dim]

  def forward(self, hidden_state):
    """Self-attention: queries, keys and values are all derived from `hidden_state`."""
    query = self.q(hidden_state)
    key = self.k(hidden_state)
    value = self.v(hidden_state)
    return self.scaled_dot_product_attention(query, key, value)

class MultiHeadAttention(nn.Module):
  """Multi-head self-attention: independent heads, concatenated and linearly mixed.

  `config` must provide `embedding_dimensions` and `num_attention_heads`, with
  `embedding_dimensions` divisible by `num_attention_heads` so that the
  concatenated head outputs are exactly `embedding_dimensions` wide.

  Raises:
    ValueError: if `embedding_dimensions` is not a multiple of `num_attention_heads`.
  """

  def __init__(self, config):
    super().__init__()
    embed_dim = config.embedding_dimensions
    num_heads = config.num_attention_heads
    # Fail fast: otherwise head_dim * num_heads < embed_dim and output_linear
    # rejects the concatenated heads with a shape error only at forward time.
    if embed_dim % num_heads != 0:
      raise ValueError(
          f'embedding_dimensions ({embed_dim}) must be divisible by '
          f'num_attention_heads ({num_heads})'
      )
    head_dim = embed_dim // num_heads

    self.heads = nn.ModuleList(
        [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
    )
    # Mixes information across heads after concatenation
    self.output_linear = nn.Linear(embed_dim, embed_dim)

  def forward(self, hidden_state):
    """Run every head on `hidden_state`, concatenate on the last axis and apply the output projection."""
    # Each head yields head_dim features; num_heads of them side by side restore embed_dim.
    x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
    return self.output_linear(x)

# Run multi-head self-attention over the position-encoded embeddings
multihead_attn = MultiHeadAttention(config)
atn_output = multihead_attn(pos_enc_output)

# Displayed to check that attention keeps the input shape
atn_output.shape

torch.Size([1, 100, 128])

# Residual connection and Layer Normalization

In [21]:
# Add & Norm: add the attention output back onto its input (residual connection),
# then normalise over the last (embedding) dimension.
layer_norm = nn.LayerNorm(config.embedding_dimensions)

add_norm_output = layer_norm(pos_enc_output + atn_output)

add_norm_output.shape

torch.Size([1, 100, 128])

# Feed Forward Network

In [22]:
class FeedForward(nn.Module):
  """Feed-forward block applied to the last dimension: Linear -> GELU -> Linear -> Dropout."""

  def __init__(self, config):
    super().__init__()
    embed_dim, hidden_dim = config.embedding_dimensions, config.intermediate_size
    self.linear_1 = nn.Linear(embed_dim, hidden_dim) # expand to the hidden width (config.intermediate_size)
    self.linear_2 = nn.Linear(hidden_dim, embed_dim) # project back to the embedding width
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

  def forward(self, x):
    """Return a tensor with the same shape as `x`."""
    hidden = self.gelu(self.linear_1(x))
    return self.dropout(self.linear_2(hidden))

# Apply the feed forward network to the output of the first Add & Norm step.
# NOTE(review): the module is never switched to eval() here, so dropout is presumably
# active and this output varies between runs — confirm that is intended.
feed_forward = FeedForward(config)
fnn_output = feed_forward(add_norm_output)

fnn_output.shape

torch.Size([1, 100, 128])

# Second Add and Norm

In [23]:
# Second Add & Norm: residual connection around the feed forward network, then layer normalisation.
# Note: this rebinds `layer_norm` to a new LayerNorm instance, replacing the one
# created for the first Add & Norm step.
layer_norm = nn.LayerNorm(config.embedding_dimensions)

add_norm_output2 = layer_norm(add_norm_output + fnn_output)

add_norm_output2.shape

torch.Size([1, 100, 128])