In [44]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# Load the pretrained uncased BERT vocabulary/tokenizer and the bare encoder
# (no task-specific head) from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Pick the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [45]:
# Read the text to analyse from the user.
# NOTE(review): the recorded output below contains '[CLS]' and '[SEP]', so the
# example input presumably had those markers typed in by hand -- TODO confirm
# that tokenize() does not add them on its own.
prompt_text = input("Please input something: ")
# Tokenize the text so the model can understand it
tokenized_text = tokenizer.tokenize(prompt_text)
# Print the tokenized text so I can read it.
# tokenize() returns a list of tokens, and `str + list` raises TypeError, so the
# list is converted to its string form before being appended to the message.
print("Your input was: " + str(tokenized_text))

['[CLS]',
 'where',
 'is',
 'the',
 'remote',
 '?',
 '[SEP]',
 'it',
 'is',
 'under',
 'the',
 'couch',
 '.',
 '[SEP]']

In [46]:
# Mask one of the words (so that BERT has to guess what it is from context).
# NOTE(review): index 9 is hard-coded for the example sentence pair shown in
# the previous output, where position 9 is the token 'under'; a different
# input needs a different index.
masked_index = 9
# The assignment overwrites the token in place, so the original word is no
# longer present in tokenized_text after this cell runs.
tokenized_text[masked_index] = "[MASK]"
# Bare last expression: the notebook displays the updated token list.
tokenized_text

['[CLS]',
 'where',
 'is',
 'the',
 'remote',
 '?',
 '[SEP]',
 'it',
 'is',
 '[MASK]',
 'the',
 'couch',
 '.',
 '[SEP]']

In [47]:
# Convert tokens to vocab ids (the integer inputs the model consumes)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment ids mark which sentence of the pair each token belongs to:
# 0 for everything up to and including the first '[SEP]', 1 for the rest.
# They are derived from the tokens instead of being typed out by hand, so the
# list always has the same length as tokenized_text whatever the user entered.
# For the example pair this yields [0]*7 + [1]*7, as before.
# Only the values 0 and 1 are used: the model has just two token-type
# embeddings (see `token_type_embeddings: Embedding(2, 768)` in its repr).
segments_ids = []
current_segment = 0
for token in tokenized_text:
    segments_ids.append(current_segment)
    if token == "[SEP]":
        # The separator itself belongs to the sentence it closes; every token
        # after the first separator is part of the second sentence.
        current_segment = 1

In [48]:
# Turn both id lists into tensors and add a leading batch dimension of size 1,
# since the model is called on a batch of sequences.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensor = torch.tensor(segments_ids).unsqueeze(0)

In [49]:
# Use the BERT model to convert tokens to "hidden-states"

# Quote: "Set the model in evaluation mode to deactivate the DropOut modules
#           This is IMPORTANT to have reproducible results during evaluation!"
#   --> Why: the model contains Dropout(p=0.1) layers (visible in the repr
#       printed below). In training mode dropout randomly zeroes activations,
#       so the same input would give different outputs on every run; eval()
#       switches the modules to evaluation behaviour so inference is repeatable.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [50]:
# Run the encoder over the token ids to obtain the "hidden states": the
# model's internal vector representation of each input token.
# torch.no_grad() disables gradient tracking inside the block. Gradients are
# only needed when training, so skipping them here saves memory and time.
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)

# Quote: "In our case, the first element is the hidden state of the last layer of the Bert model"
# i.e. outputs[0] is what the final encoder layer produced for the input
# tokens -- the fully contextualised representation that downstream heads use.
encoded_layers = outputs[0]


In [51]:
# Everything so far only showed how to tokenize and get hidden states back.
# To actually fill in the [MASK] token, load BERT with its masked-language-model
# head on top.
# NOTE: this rebinds `model`, replacing the plain BertModel loaded earlier.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# Same forward pass as before, again without gradient tracking.
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)

# First element of the output: the prediction scores that the next cell
# indexes by [batch, position] and ranks with topk().
predictions = outputs[0]

In [53]:
# Get the predictions for the mask: take the 10 highest-scoring vocab ids at
# the masked position (argmax() would return only the single best one).
top_ids = predictions[0, masked_index].topk(10).indices
# Convert those vocab ids back to tokens, in one call, so we can read them
predicted_outputs = tokenizer.convert_ids_to_tokens(top_ids.tolist())
# Show the candidate tokens
predicted_outputs

['on',
 'under',
 'behind',
 'underneath',
 'beneath',
 'beside',
 'in',
 'near',
 'above',
 'by']