In [8]:
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer

import random
import torch

In [3]:
local_model = AutoModelForMaskedLM.from_pretrained("./my-bert-model")
local_tokenizer = AutoTokenizer.from_pretrained("./my-bert-model")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [4]:
train_data = dataset['train']

In [22]:
example = train_data[100]['text']
print('Example:', example)

tokens = local_tokenizer.tokenize(example)
print('Tokens count:', len(tokens))

mask_idx = random.randint(1, len(tokens)-2)
print('Mask index:', mask_idx)
masked_token = tokens[mask_idx]
tokens[mask_idx] = local_tokenizer.mask_token
masked_sentence = local_tokenizer.convert_tokens_to_string(tokens)
print('Masked sentence:', masked_sentence)

inputs = local_tokenizer(masked_sentence, return_tensors="pt")
with torch.no_grad():
    outputs = local_model(**inputs)
    logits = outputs.logits

mask_token_index = (inputs.input_ids == local_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0].item()
mask_logits = logits[0, mask_token_index, :]

top_token_id = torch.argmax(mask_logits).item()
predicted_token = local_tokenizer.decode([top_token_id])
print(f"Predicted token: {predicted_token}")

completed_tokens = tokens.copy()
completed_tokens[mask_idx] = predicted_token
completed_sentence = local_tokenizer.convert_tokens_to_string(completed_tokens)
print(f"Completed sentence: {completed_sentence}")

Example:  96 ammunition packing boxes 

Tokens count: 4
Mask index: 2
Masked sentence: 96 ammunition [MASK] boxes
Predicted token: storage
Completed sentence: 96 ammunition storage boxes


In [27]:
print(masked_token, predicted_token)
is_correct = (predicted_token.strip() == masked_token.strip())
print(is_correct)

packing storage
False


In [30]:
correct = 0
total = 0

for example in train_data:
    tokens = local_tokenizer.tokenize(example['text'])
    if len(tokens) < 3:
        continue
    mask_idx = random.randint(1, len(tokens)-2)
    masked_token = tokens[mask_idx]
    tokens[mask_idx] = local_tokenizer.mask_token
    masked_sentence = local_tokenizer.convert_tokens_to_string(tokens)
    
    inputs = local_tokenizer(masked_sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = local_model(**inputs)
        logits = outputs.logits
    mask_token_index = (inputs.input_ids == local_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0].item()
    mask_logits = logits[0, mask_token_index, :]
    predicted_token = local_tokenizer.decode([torch.argmax(mask_logits).item()])
    
    print(f"Masked sentence: {masked_sentence}")
    print(f"True token: {masked_token} | Predicted token: {predicted_token}")
    print(f"Match: {predicted_token.strip() == masked_token.strip()}")
    print("-" * 40)
    
    if predicted_token.strip() == masked_token.strip():
        correct += 1
    total += 1

    if total == 100:
        break

accuracy = correct / total
print(f"Masked LM accuracy over {total} examples: {accuracy:.2%}")

Masked sentence: = [MASK]kyria chronicles iii =
True token: val | Predicted token: =
Match: False
----------------------------------------
Masked sentence: senjo no valkyria 3 : un [MASK]corded chronicles ( japanese : 戦 場 のウァルキュリア3, lit. valkyria of the battlefield 3 ), commonly referred to as valkyria chronicles iii outside japan, is a tactical role @ - @ playing video game developed by sega and media. vision for the playstation portable. released in january 2011 in japan, it is the third game in the valkyria series. employing the same fusion of tactical and real @ - @ time gameplay as its predecessors, the story runs parallel to the first game and follows the " nameless ", a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit " calamaty raven ".
True token: ##re | Predicted token: -
Match: False
----------------------------------------
Masked sentence: the game began development in