<a href="https://colab.research.google.com/github/ahujaavi13/play-with-transformers/blob/master/BERT_GPT2_Play.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch

# HuggingFace Transformers


In [2]:
%%bash
pip install transformers



In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


# BERT

In [4]:
from transformers import *
# Load the tokenizer that ships with the uncased BERT-base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split a sentence that is already wrapped in [CLS] ... [SEP] into tokens.
demo_sentence = "[CLS] This is a simple tutorial to understand BERT [SEP]"
tokenized_text = tokenizer.tokenize(demo_sentence)
print(tokenized_text)

['[CLS]', 'this', 'is', 'a', 'simple', 'tutor', '##ial', 'to', 'understand', 'bert', '[SEP]']


In [5]:
# Look up the vocabulary id of the special [CLS] token. The result is stored as
# `cls_token_id` rather than `id`, which would shadow Python's built-in id().
cls_token_id = tokenizer.convert_tokens_to_ids('[CLS]')
print(cls_token_id)

# Size of the pre-defined vocabulary of this checkpoint. tokenizer.vocab is a
# mapping keyed by token, so its length can be taken directly without first
# copying the keys into a list.
print(len(tokenizer.vocab))

# Peek at an arbitrary slice of ten vocabulary entries (iteration yields the keys).
print(list(tokenizer.vocab)[4242:4252])

101
30522
['unknown', 'parties', '##les', 'generation', '##ff', 'continues', 'quick', 'fields', 'brigade', 'quiet']


BERT to encode sentence representations


In [6]:
# Two example sentences; the second one negates the first.
sent_1 = 'Life is like a box of chocolates.'
sent_2 = 'Life is not like a box of chocolates.'

# Wrap the first sentence in the [CLS] ... [SEP] markers before tokenizing it.
marked_sent_1 = ' '.join(['[CLS]', sent_1, '[SEP]'])
print(marked_sent_1)

[CLS] Life is like a box of chocolates. [SEP]


In [7]:
# Tokenize the marked-up sentence, then show the tokens and how many there are.
tokenized_sent_1 = tokenizer.tokenize(marked_sent_1)
print(tokenized_sent_1, len(tokenized_sent_1), sep='\n')

['[CLS]', 'life', 'is', 'like', 'a', 'box', 'of', 'chocolate', '##s', '.', '[SEP]']
11


In [8]:
# Map every token to its vocabulary id, then show the ids and their count.
tokens_ids = tokenizer.convert_tokens_to_ids(tokenized_sent_1)
print(tokens_ids, len(tokens_ids), sep='\n')

[101, 2166, 2003, 2066, 1037, 3482, 1997, 7967, 2015, 1012, 102]
11


In [9]:
# Show each token next to the vocabulary id it was mapped to.
for token, token_id in zip(tokenized_sent_1, tokens_ids):
    print((token, token_id))

('[CLS]', 101)
('life', 2166)
('is', 2003)
('like', 2066)
('a', 1037)
('box', 3482)
('of', 1997)
('chocolate', 7967)
('##s', 2015)
('.', 1012)
('[SEP]', 102)


Self-Note: BERT is trained with **sentence** pairs. So, besides the token ids, it also requires segment ids, which indicate whether a token belongs to the first or the second sentence.

Here we are only processing one sentence, so we will just use the same segment id (1) for all tokens.

In [10]:
# One segment id per token; every token gets the same id (1) because there is
# only a single sentence.
# NOTE(review): Hugging Face's convention assigns token type 0 to the first (or
# only) sentence -- confirm that 1 is really what is intended here.
segments_ids = [1 for _ in tokens_ids]
print(segments_ids, len(segments_ids), sep='\n')

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
11


In [0]:
# Turn the python lists into PyTorch tensors. Each list is nested one level
# deeper first, so the resulting tensors have a leading dimension of size 1.
tokens_tensor, segments_tensors = (
    torch.tensor([ids]) for ids in (tokens_ids, segments_ids)
)

In [12]:
# Load the pretrained weights of the uncased BERT-base model.
model = BertModel.from_pretrained('bert-base-uncased')

# Place the model on the device selected earlier (Module.to returns the module itself).
model = model.to(device)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the model architecture.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
# Extract the hidden-state representation of the original sentence.
#
# NOTE(review): in the `transformers` package the positional order of
# BertModel.forward is (input_ids, attention_mask, token_type_ids, ...), so the
# all-ones `segments_tensors` passed as the second argument is consumed as the
# attention mask, not as segment ids. It is passed by keyword here to make that
# explicit without changing the result. If the segment ids are really meant to
# reach the model, pass `token_type_ids=segments_tensors.to(device)` instead --
# confirm against the installed transformers version.
with torch.no_grad():
    model_outputs = model(
        tokens_tensor.to(device),
        attention_mask=segments_tensors.to(device),
    )

# Index instead of tuple-unpacking: element 0 is the first output (treated below
# as the last-layer hidden states) both when the model returns a plain tuple and
# when a newer transformers release returns a dict-like output object, whose
# unpacking would yield key names rather than tensors.
encoded_hidden_states = model_outputs[0]
print(encoded_hidden_states)
print(encoded_hidden_states.size())

tensor([[[-0.0048,  0.1704,  0.1082,  ..., -0.5456,  0.1983,  0.7477],
         [ 0.4264,  0.2739, -0.1103,  ..., -0.1719,  0.8716,  0.6505],
         [ 0.3570, -0.0347,  0.7900,  ..., -0.2911,  0.1469,  0.9281],
         ...,
         [-0.3826,  0.2574,  1.0077,  ...,  0.0121, -0.0270,  0.3057],
         [ 0.1922, -0.3401, -0.2274,  ...,  0.1532,  0.0502, -0.5300],
         [ 0.9307,  0.3238, -0.1628,  ...,  0.1840, -0.5337, -0.2759]]],
       device='cuda:0')
torch.Size([1, 11, 768])


Do you recognize the dimensions specified above?

In [14]:
# Walk the three axes of the hidden-state tensor, printing the length of each:
# the whole tensor, then its first entry, then the first entry of that.
batch_i = 0
token_i = 0

print ("Number of batches:", encoded_hidden_states.size(0))
print ("Number of tokens:", encoded_hidden_states[batch_i].size(0))
print ("Number of hidden units:", encoded_hidden_states[batch_i][token_i].size(0))


Number of batches: 1
Number of tokens: 11
Number of hidden units: 768


# BERT to encode sentence representation

In [15]:
# The last-layer hidden states in `encoded_hidden_states` will serve as the basis
# of the sentence representation; recall their shape before pooling them.
print(encoded_hidden_states.shape)

torch.Size([1, 11, 768])


In [16]:
# Average over dimension 1 (the token axis), collapsing the per-token vectors
# into a single vector per sentence.
sent_1_rep = encoded_hidden_states.mean(dim=1)
print(sent_1_rep.shape)

torch.Size([1, 768])


In [17]:
# sent_1 ('Life is like a box of chocolates.') was encoded step by step above;
# sent_2 is encoded through the helper defined below.
sent_2 = 'Life is not like a box of chocolates.'


def encode_sentence(sentence, model):
    """Encode a sentence as the mean of the model's first-output token vectors.

    Uses the notebook-level `tokenizer` and `device`.

    Args:
        sentence: Plain text without special tokens; the [CLS] and [SEP]
            markers are added here.
        model: Callable BERT encoder whose first output holds one hidden
            vector per token (assumed to already live on `device`).

    Returns:
        Tensor with a leading dimension of size 1: the token vectors averaged
        over dimension 1.
    """
    marked_sent = '[CLS] ' + sentence + ' [SEP]'
    tokens_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(marked_sent))
    tokens_tensor = torch.tensor([tokens_ids])

    # NOTE(review): the second positional parameter of BertModel.forward is
    # `attention_mask`, so an all-ones tensor passed second acts as the mask
    # rather than as segment ids. It is therefore named and passed as the mask
    # by keyword; segment (token type) ids are left at the model default. This
    # keeps the numbers identical to the positional call -- confirm against the
    # installed transformers version.
    attention_mask = torch.ones_like(tokens_tensor)

    with torch.no_grad():
        model_outputs = model(
            tokens_tensor.to(device),
            attention_mask=attention_mask.to(device),
        )

    # Index instead of tuple-unpacking so this works whether the model returns a
    # plain tuple or a dict-like output object (newer transformers releases).
    return torch.mean(model_outputs[0], dim=1)


sent_2_rep = encode_sentence(sent_2, model)
print(sent_2_rep.size())


torch.Size([1, 768])


In [18]:
# How similar are the two sentences? Score them with the (unnormalised) dot
# product of their pooled representations.
dot_product = sent_1_rep[0].dot(sent_2_rep[0])
print(dot_product)


tensor(77.6483, device='cuda:0')


In [19]:
# Encode a third sentence and score it against each of the first two.
sent_3 = 'Life is like a box of cookies.'
sent_3_rep = encode_sentence(sent_3, model)
for other_rep in (sent_1_rep, sent_2_rep):
    print(torch.dot(other_rep[0], sent_3_rep[0]))

tensor(83.1716, device='cuda:0')
tensor(74.5247, device='cuda:0')


# Play with GPT2

In [20]:
# Drop the references to the BERT tokenizer and model, then force a garbage
# collection pass. gc.collect() is the cell's last expression, so the number it
# returns is displayed.
import gc

del tokenizer
del model
gc.collect()


221

In [21]:
# Load the GPT-2 tokenizer and language-model head from the same checkpoint.
gpt2_checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
model = GPT2LMHeadModel.from_pretrained(gpt2_checkpoint)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the model architecture.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [22]:

# Encode a prompt into vocabulary ids.
text = "I went to see some animals at the"
indexed_tokens = tokenizer.encode(text)
print(len(indexed_tokens))

# Convert the ids to a PyTorch tensor (1-D: one id per prompt token).
tokens_tensor = torch.tensor(indexed_tokens)

# Inference only, so skip autograd bookkeeping. The result is indexed rather
# than tuple-unpacked: element 0 is the first output both for a plain tuple and
# for the dict-like output object of newer transformers releases (unpacking
# that object would yield key names instead of tensors).
with torch.no_grad():
    output = model(tokens_tensor)[0]
print(output.size())

# The last row scores the token that follows the prompt.
# NOTE(review): despite the variable name these look like raw scores rather than
# probabilities (the decoding cells apply softmax to the same model output);
# argmax picks the same token either way.
probability_over_vocab = output[-1, :]
print(probability_over_vocab.size())
predicted_index = torch.argmax(probability_over_vocab).item()
predicted_token = tokenizer.decode([predicted_index])
print(predicted_token)

8
torch.Size([8, 50257])
torch.Size([50257])
 zoo


# Decoding from GPT2

In [23]:
# Prepare a prompt: encode it to ids, then build a long tensor with a leading
# dimension of size 1 around them.
context_str = "On a Sunday morning"
context_tokens = tokenizer.encode(context_str)
print(context_tokens)
context = torch.tensor([context_tokens], dtype=torch.long)
print(context.size())

[2202, 257, 3502, 3329]
torch.Size([1, 4])


In [24]:
# Sampling-based decoding: extend the prompt one token at a time by sampling
# from the model's distribution over the vocabulary.
generated = context
length = 20  # number of tokens to append to the prompt
import torch.nn.functional as F

with torch.no_grad():
    for _ in range(length):
        # Run the model on everything generated so far. Index the result
        # instead of tuple-unpacking it: element 0 is the first output both for
        # a plain tuple and for the dict-like output object of newer
        # transformers releases.
        outputs = model(generated)[0]

        # get the logits for the next word prediction (first sequence, last position)
        next_token_logits = outputs[0, -1, :]

        # sample from the distribution over vocab
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)

        # concatenate the sampled token to generated
        generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
text = tokenizer.decode(generated[0].tolist())
print(text)

On a Sunday morning program, it drew widespread skepticism from both Democrats and Republicans when support for expand Medicaid went very far with


In [25]:
# TopK decoding: like plain sampling, but only the top_k highest-scoring tokens
# may be sampled at each step.
generated = context
length = 20  # number of tokens to append to the prompt
top_k = 40   # number of candidate tokens kept at each step
import torch.nn.functional as F

with torch.no_grad():
    for _ in range(length):
        # Run the model on everything generated so far. Index the result
        # instead of tuple-unpacking it: element 0 is the first output both for
        # a plain tuple and for the dict-like output object of newer
        # transformers releases.
        outputs = model(generated)[0]

        # get the logits for the next word prediction (first sequence, last position)
        next_token_logits = outputs[0, -1, :]

        # filter next_token_logits to keep the top k: everything scoring below
        # the k-th largest logit is set to -inf so softmax gives it probability 0.
        # k is clamped to the vocabulary size so torch.topk cannot raise.
        k = min(top_k, next_token_logits.size(-1))
        kth_largest_logit = torch.topk(next_token_logits, k)[0][-1]
        indices_to_remove = next_token_logits < kth_largest_logit
        next_token_logits = next_token_logits.masked_fill(indices_to_remove, -float('inf'))

        # sample from the distribution over vocab
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)

        # concatenate the sampled token to generated
        generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
text = tokenizer.decode(generated[0].tolist())
print(text)

On a Sunday morning on Sunday, October 17, the former governor and governor of Nevada, R. Kelly is to lead
