# Quick start

Note:

- The `transformers` library is not a toolbox for building neural nets from scratch. To extend it, inherit from the Hugging Face base classes and use regular PyTorch modules to reuse existing functionality
- but it still exposes the models' internals
    - gives access to the full hidden states and **attention weights**
- easy to fine-tune and investigate models
    - add new tokens/embeddings
    - mask and prune transformer heads

In [1]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [2]:
# OPTIONAL: enable INFO-level logging if you want more information on
# what is happening under the hood
import logging
logging.basicConfig(level=logging.INFO)

# Load tokenizer and preprocess our example text

In [3]:
# Load the pre-trained model tokenizer (vocabulary) for 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 

# Tokenize the input. The special tokens are written out by hand:
# '[CLS]' starts the input and each sentence is closed by '[SEP]'.
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [4]:
tokenized_text

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 'henson',
 'was',
 'a',
 'puppet',
 '##eer',
 '[SEP]']

In [5]:
# Replace the token at position 8 ('henson' in the second sentence) with
# '[MASK]'; `BertForMaskedLM` will later be asked to predict it back
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
# Sanity check: the full token sequence after masking
assert tokenized_text == (
    '[CLS] who was jim henson ? [SEP] jim [MASK] was a puppet ##eer [SEP]'.split()
)

In [6]:
len(tokenized_text)

14

In [7]:
# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper).
# Sentence A (id 0) runs up to and including the first '[SEP]'; everything
# after it is sentence B (id 1). Deriving the split from the tokens keeps the
# ids the same length as the input instead of relying on a hand-counted list.
first_sep = tokenized_text.index('[SEP]')
segments_ids = [0] * (first_sep + 1) + [1] * (len(tokenized_text) - first_sep - 1)

In [8]:
# Turn the id lists into PyTorch tensors and add a leading batch dimension
# of size 1, giving shape (1, sequence length)
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

# Load model for inference

## BERT model with no head

In [9]:
# Load the pre-trained weights into a bare BertModel (no task-specific head)
model = BertModel.from_pretrained('bert-base-uncased')
# 12 layer 110M params BERT trained on lower-cased English text


In [10]:
# Set the model in evaluation mode to deactivate the Dropout modules.
# This is IMPORTANT to have reproducible results during evaluation!
# `eval()` is the last expression of the cell, so the notebook echoes its
# return value: the module tree printed below.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
type(model)

transformers.models.bert.modeling_bert.BertModel

In [12]:
# Put everything on the GPU when one is available; otherwise stay on the CPU
# so the notebook still runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
# Trailing ';' suppresses the notebook echo of the module tree
model.to(device);

In [13]:
tokens_tensor.shape,segments_tensors.shape

(torch.Size([1, 14]), torch.Size([1, 14]))

In [14]:
# Compute the last-layer hidden state of every input token.
# `torch.no_grad()` disables gradient tracking: this is inference only.
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # The output can be indexed like a tuple.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]


In [15]:
# The input sequence is now encoded as a FloatTensor laid out as
# (batch size, sequence length, model hidden dimension)
assert encoded_layers.shape == torch.Size(
    (1, len(indexed_tokens), model.config.hidden_size)
)

In [16]:
len(outputs)

2

In [32]:
# The first element is the hidden state of the last layer of the Bert model,
# with one vector per input token: (batch size, sequence length, hidden size).
# The 14 is simply the number of input tokens; BERT is not recurrent, so
# there is no "bptt" dimension here.
outputs[0].shape

torch.Size([1, 14, 768])

In [35]:
# The second element has one vector per sequence: (batch size, hidden size).
# Per the BertModel documentation this is the pooled output, i.e. the
# last-layer hidden state of the first token ([CLS]) passed through a
# dense + tanh layer. It is NOT the hidden state of the last token.
outputs[1].shape

torch.Size([1, 768])

## BERT model with head to predict mask token

In [17]:
# Load the pre-trained weights into BERT with a masked-language-modelling head
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Evaluation mode: deactivates dropout so the predictions are reproducible
model.eval()

# Put everything on the GPU when one is available; otherwise stay on the CPU
# so the notebook still runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
# Trailing ';' suppresses the notebook echo of the module tree
model.to(device);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Predict all tokens (inference only, so gradient tracking is disabled)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # First (and here only) output element: the prediction scores over the
    # vocabulary for every position of the input
    predictions = outputs[0]

In [19]:
len(outputs)

1

In [39]:
# For BERT masked-LM prediction, the model outputs a score for every
# vocabulary entry at every token position of the input, not only at the
# masked one: (batch size, sequence length, vocabulary size).
predictions.shape

torch.Size([1, 14, 30522])

In [50]:
import pandas as pd

In [51]:
pd.Series(predictions[0, masked_index].cpu().numpy()).describe()

count    30522.000000
mean        -4.330632
std          2.341220
min        -16.808929
25%         -5.943692
50%         -4.442482
75%         -2.870903
max         18.189161
dtype: float64

In [52]:
# Check that the highest-scoring vocabulary entry at the masked position
# maps back to 'henson'
predicted_index = predictions[0, masked_index].argmax().item()
[predicted_token] = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)

henson


In [53]:
# Print the most likely token at every position of the input (not only the
# masked one). The argmax runs over the vocabulary axis for all positions at
# once, so the sequence length comes from `predictions` rather than a
# hard-coded 14.
predicted_ids = torch.argmax(predictions[0], dim=-1).tolist()
for predicted_token in tokenizer.convert_ids_to_tokens(predicted_ids):
    print(predicted_token)

.
who
was
jim
henson
?
.
jim
henson
was
a
puppet
##eer
.


### Multiple masks

In [62]:
# Start again from the original, unmasked sentence pair
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [63]:
# Mask several tokens at once that we will try to predict back with `BertForMaskedLM`
# (`masked_index` is now a list of positions rather than a single index)
masked_index = [2,8,11]
for i in masked_index:
    tokenized_text[i] = '[MASK]'
print(tokenized_text)

['[CLS]', 'who', '[MASK]', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', '[MASK]', '##eer', '[SEP]']


In [64]:
# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper).
# Sentence A (id 0) runs up to and including the first '[SEP]'; everything
# after it is sentence B (id 1). Deriving the split from the tokens keeps the
# ids the same length as the input instead of relying on a hand-counted list.
first_sep = tokenized_text.index('[SEP]')
segments_ids = [0] * (first_sep + 1) + [1] * (len(tokenized_text) - first_sep - 1)

# Convert inputs to PyTorch tensors with a leading batch dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [65]:
# Load the pre-trained weights into BERT with a masked-language-modelling head
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Evaluation mode: deactivates dropout so the predictions are reproducible
model.eval();

# Put everything on the GPU when one is available; otherwise stay on the CPU
# so the notebook still runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device);

# Predict ALL tokens (inference only, so gradient tracking is disabled)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # First output element: prediction scores over the vocabulary per position
    predictions = outputs[0]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
# Print the most likely token at every position of the input, masked or not.
# The argmax runs over the vocabulary axis for all positions at once, so the
# sequence length comes from `predictions` rather than a hard-coded 14.
predicted_ids = torch.argmax(predictions[0], dim=-1).tolist()
for predicted_token in tokenizer.convert_ids_to_tokens(predicted_ids):
    print(predicted_token)

.
who
was
jim
henson
?
.
jim
henson
was
a
puppet
##eer
.


### Different example

In [98]:
# A second, longer sentence pair, again with the special tokens written by hand
text = "[CLS] I am working at a coding school . [SEP] My main job is to teach students how to code a program [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['[CLS]', 'i', 'am', 'working', 'at', 'a', 'coding', 'school', '.', '[SEP]', 'my', 'main', 'job', 'is', 'to', 'teach', 'students', 'how', 'to', 'code', 'a', 'program', '[SEP]']


In [99]:
# Mask several tokens at once that we will try to predict back with `BertForMaskedLM`
# (`masked_index` is a list of positions to replace with '[MASK]')
masked_index = [3,7,16, 19]
for i in masked_index:
    tokenized_text[i] = '[MASK]'
print(tokenized_text)

['[CLS]', 'i', 'am', '[MASK]', 'at', 'a', 'coding', '[MASK]', '.', '[SEP]', 'my', 'main', 'job', 'is', 'to', 'teach', '[MASK]', 'how', 'to', '[MASK]', 'a', 'program', '[SEP]']


In [100]:
# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper).
# Sentence A (id 0) runs up to and including the first '[SEP]'; everything
# after it is sentence B (id 1). The split is derived from the tokens rather
# than from a hand-counted constant.
first_sep = tokenized_text.index('[SEP]')
segments_ids = [0] * (first_sep + 1) + [1] * (len(indexed_tokens) - first_sep - 1)


In [101]:
assert len(segments_ids) == len(indexed_tokens)

In [102]:
# Convert inputs to PyTorch tensors with a leading batch dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load the pre-trained weights into BERT with a masked-language-modelling head
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Evaluation mode: deactivates dropout so the predictions are reproducible
model.eval();

# Put everything on the GPU when one is available; otherwise stay on the CPU
# so the notebook still runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokens_tensor = tokens_tensor.to(device)
segments_tensors = segments_tensors.to(device)
model.to(device);

# Predict ALL tokens (inference only, so gradient tracking is disabled)
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # First output element: prediction scores over the vocabulary per position
    predictions = outputs[0]

# Print the most likely token at every position of the input, masked or not.
# The argmax runs over the vocabulary axis for all positions at once.
predicted_ids = torch.argmax(predictions[0], dim=-1).tolist()
for predicted_token in tokenizer.convert_ids_to_tokens(predicted_ids):
    print(predicted_token)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


.
i
am
working
at
a
coding
school
.
.
my
main
job
is
to
teach
you
how
to
run
a
program
.


TODO: do https://huggingface.co/transformers/quicktour.html