# BERT masked words in English and Mandarin Chinese

Code from: https://stackoverflow.com/questions/54978443/predicting-missing-words-in-a-sentence-natural-language-processing-model

This file uses a pre-trained BERT from HuggingFace to predict the word missing from a sentence in Mandarin Chinese. 

In [1]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 18.4MB/s eta 0:00:01[K     |█████▎                          | 20kB 25.3MB/s eta 0:00:01[K     |████████                        | 30kB 20.9MB/s eta 0:00:01[K     |██████████▋                     | 40kB 22.9MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 13.6MB/s eta 0:00:01[K     |███████████████▉                | 61kB 14.9MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 12.8MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 13.4MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 14.7MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 12.1MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 12.1MB/s eta 0:00:01[K     |████████████

In [2]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [3]:
# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
# import logging
# logging.basicConfig(level=logging.INFO)

In [4]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

text =  '[CLS]你是中[MASK]人吗?[SEP]'
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
masked_index = tokenized_text.index('[MASK]')

# Create the segments tensors.
segments_ids = [0] * len(tokenized_text)

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-chinese')
model.eval()

100%|██████████| 109540/109540 [00:00<00:00, 412416.24B/s]
100%|██████████| 382072689/382072689 [00:15<00:00, 25164601.51B/s]


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
   

In [5]:
# Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)

In [6]:
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

print(predicted_token)

国


In [8]:
print(text.replace("[MASK]",predicted_token).replace("[CLS]","").replace("[SEP]","")) # Not the most effeicent way of doing this...

你是中国人吗?
