In [1]:
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [13]:
example_en = "It's International Dog Day!"
example_ru = "Это Международный День Собаки!"

encoded_examples = [torch.tensor(tokenizer.encode(ex)).unsqueeze(0) 
                    for ex in [example_en, example_ru]]

In [14]:
# Decoding works similarly - word pieces are concatenated automatically
tokenizer.decode([int(tok) for tok in encoded_examples[0][0]])

"It's International Dog Day!"

In [15]:
# Set model to eval to deactivate dropout
model.eval()

example_hidden_states = [model(ex)[0][0] for ex in encoded_examples]

# 7 WordPieces, 768 weights
example_hidden_states[0].shape

torch.Size([7, 768])

In [16]:
mlm_model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

In [17]:
# This model includes the prediction layer 
mlm_model.cls

BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): BertLayerNorm()
    )
    (decoder): Linear(in_features=768, out_features=119547, bias=False)
  )
)

In [19]:
tokenized_text = tokenizer.tokenize('I love my pets!')
masked_index = 3
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_tensors = torch.tensor([0] * len(tokenized_text))
tokens_tensor = torch.tensor([indexed_tokens])

# Convert inputs to PyTorch tensors
model.eval()

# Encode inputs to hidden states using regular BERT
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

# Attempt to predict masked token using BERT for MLM prediction
mlm_model.eval()

with torch.no_grad():
    predictions = mlm_model(tokens_tensor, segments_tensors)
    
predicted_index = torch.argmax(predictions[0][0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

In [20]:
predicted_token

'heart'