In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

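Loading the model prints a warning that some checkpoint weights were not used. The `bert-base-uncased` checkpoint also stores weights for parts of BERT that Masked Language Modeling does not need — the pooler and the next-sentence classification head (`cls.seq_relationship`) — so they are skipped when the masked-LM model is built. The warning is expected and can be ignored here.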

In [None]:
# Pick the best available accelerator: CUDA GPU first, then Apple-silicon MPS,
# and fall back to the CPU.
# `torch.backends.mps.is_available()` is the documented availability check;
# `torch.mps.is_available()` is not present in every PyTorch release.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

# Checkpoint shared by the tokenizer and the masked-LM model.
MODEL_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT).to(DEVICE)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


For `bert-base-uncased` the mask token is `'[MASK]'` with token id `103`:

In [22]:
# Show the tokenizer's mask token and the vocabulary id it maps to.
print(
    f"Mask token: {tokenizer.mask_token}",
    f"Mask token id: {tokenizer.mask_token_id}",
    sep="\n",
)

Mask token: [MASK]
Mask token id: 103


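Here's a test string with a mask token. We will
1. Tokenize it
1. Note the mask token's position in the tokenized sequence
1. Run the model and take the highest-scoring token at that position
1. Replace the mask token with the predicted token and decode the result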

In [32]:
# Example sentence in which one word has been replaced by the `[MASK]` token.
sequence = "It was the best of [MASK], it was the worst of times"

In [57]:
# Tokenize the text sequence and move the tensors to the same device as the model
inputs = tokenizer(sequence, return_tensors="pt").to(DEVICE)
print(f"Inputs shape: {inputs.input_ids.shape}")

# Get the mask token's index.
# `.item()` requires exactly one match, so the sequence must contain a single mask token.
mask_index = torch.where(inputs.input_ids[0] == tokenizer.mask_token_id)[0].item()

# Get the model's logits outputs; this is inference only, so skip gradient tracking
with torch.no_grad():
    logits = model(**inputs).logits
print(f"Logits shape: {logits.shape}")

# Get the predicted token for the mask
predicted_token_id = torch.argmax(logits[0, mask_index])

# Fill the mask in a copy of the ids so `inputs` keeps the original masked
# sequence; this keeps the cell re-runnable and leaves `inputs` intact for later cells.
filled_ids = inputs.input_ids[0].clone()
filled_ids[mask_index] = predicted_token_id
output = tokenizer.decode(filled_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(f"Reconstructed output: {output}")

Inputs shape: torch.Size([1, 15])
Logits shape: torch.Size([1, 15, 30522])
Reconstructed output: it was the best of times, it was the worst of times


In [49]:
# Locate the mask from a fresh encoding of `sequence` instead of from `inputs`,
# whose ids may already have had the mask position overwritten by an earlier cell
# (in that case there is no match and `.item()` would raise).
masked_ids = tokenizer(sequence, return_tensors="pt").input_ids[0]
mask_index = torch.where(masked_ids == tokenizer.mask_token_id)[0].item()
print(mask_index)

# Id of the highest-scoring entry in the logits at the mask position
print(torch.argmax(logits[0, mask_index]))


6
tensor(2335, device='mps:0')


In [50]:
# 2335 is the token id that the argmax at the mask position printed above;
# look up the token string it corresponds to.
tokenizer.convert_ids_to_tokens([2335])

['times']

In [16]:
# Masked sentences to run through the model one at a time.
sequences = [
    "All you [MASK] is love",
    "You can't [MASK] get what you want"
]

for s in sequences:
    inputs = tokenizer(s, return_tensors="pt").to(DEVICE)
    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    # argmax over the last axis yields one predicted id for every input position,
    # not only for the mask position.
    predicted_token_ids = torch.argmax(logits, dim=-1)
    # Convert to a plain list of ints before looking up the token strings.
    predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids[0].tolist())
    print(f"Predicted tokens: {predicted_tokens}")

Predicted tokens: ['.', 'all', 'you', 'need', 'is', '.', '.']
Predicted tokens: ['.', 'you', 'can', '.', 't', 'always', 'get', 'what', 'you', 'want', '.']


In [21]:
# `inputs` is whatever the loop in the previous cell assigned last, i.e. the
# encoding of the final entry in `sequences`.
inputs.input_ids

tensor([[ 101, 2017, 2064, 1005, 1056,  103, 2131, 2054, 2017, 2215,  102]],
       device='mps:0')

## Simplified code with a fill-mask pipeline



In [None]:
from transformers import pipeline

# Build a fill-mask pipeline around the model and tokenizer loaded earlier,
# running on the same device.
fill_mask = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
    device=DEVICE,
)