In [4]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [5]:
# Checkpoint shared by the tokenizer and the masked-language-model head.
model_name = "bert-base-multilingual-cased"

# Load the vocabulary/tokenizer and the model weights from the same checkpoint.
# The checkpoint also carries `bert.pooler.*` and `cls.seq_relationship.*`
# weights that BertForMaskedLM does not use, so the "weights ... were not used"
# notice printed on load is expected.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = BertForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Text file with one input sentence (or sentence prefix) per line.
input_path = "../example/input.txt"

with open(input_path, 'r', encoding='utf-8') as source_file:
    sentences = source_file.readlines()

# Trim surrounding whitespace from every line, drop the lines that end up
# empty, then spell each remaining line out character by character
# (space-separated) and append a trailing [MASK] placeholder.
stripped_lines = (raw_line.strip() for raw_line in sentences)
char_tokens = [
    f"{' '.join(text)} [MASK]"
    for text in stripped_lines
    if text
]

char_tokens

['H a p p [MASK]',
 'H a p p y   N e [MASK]',
 'H a p p y   N e w   Y e a [MASK]',
 'T h a t ’ s   o n e   s m a l l   s t e [MASK]',
 'T h a t ’ s   o n e   s m [MASK]',
 'T h a t ’ [MASK]',
 'T h [MASK]',
 'o n e   g i a n t   l e a p   f o r   m a n k i n [MASK]',
 'o n e   g i a n t   l e a p   f o [MASK]',
 'o n e   g i a n t   l e a [MASK]',
 'o n e   g i a n t   l [MASK]',
 'o n e   g i a [MASK]',
 'o n [MASK]']

In [13]:
# Number of highest-scoring candidate tokens to keep for each masked position.
TOP_K = 3

# For every masked character sequence, run one forward pass through the model
# and keep the TOP_K best-scoring vocabulary entries at the [MASK] position.
predictions = []
with torch.no_grad():  # inference only, so gradient tracking is switched off
    for char_token in char_tokens:
        # Named `encoded` (not `input`) so the built-in input() is not shadowed
        # in the notebook's global namespace for the rest of the session.
        encoded = tokenizer(char_token, return_tensors='pt')

        # torch.where yields one index tensor per dimension; element [1] is the
        # position inside the sequence. .item() raises unless exactly one
        # [MASK] token is present in the encoded input.
        mask_token_index = torch.where(
            encoded["input_ids"] == tokenizer.mask_token_id
        )[1].item()

        output = model(**encoded)

        # Scores over the vocabulary at the masked position of the only
        # sequence in the batch.
        top_k = torch.topk(output.logits[0, mask_token_index], k=TOP_K)

        # Convert the index tensor to plain ints before decoding each id.
        predicted_chars = [
            tokenizer.decode([token_id]) for token_id in top_k.indices.tolist()
        ]
        predictions.append(predicted_chars)


# Report each masked input next to its candidate completions.
for original, pred in zip(char_tokens, predictions):
    print(f"Original: {original}")
    print(f"Predicted next characters: {pred}")
    print("-" * 50)

Original: H a p p [MASK]
Predicted next characters: ['p', '.', 'H']
--------------------------------------------------
Original: H a p p y   N e [MASK]
Predicted next characters: ['n', '.', 'E']
--------------------------------------------------
Original: H a p p y   N e w   Y e a [MASK]
Predicted next characters: ['e', 'u', 'f']
--------------------------------------------------
Original: T h a t ’ s   o n e   s m a l l   s t e [MASK]
Predicted next characters: ['e', 'u', 't']
--------------------------------------------------
Original: T h a t ’ s   o n e   s m [MASK]
Predicted next characters: ['n', 'm', '[UNK]']
--------------------------------------------------
Original: T h a t ’ [MASK]
Predicted next characters: ['[UNK]', '.', ':']
--------------------------------------------------
Original: T h [MASK]
Predicted next characters: ['.', ':', '=']
--------------------------------------------------
Original: o n e   g i a n t   l e a p   f o r   m a n k i n [MASK]
Predicted next cha