In [2]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
import random
import numpy as np

# Set random seeds for reproducibility
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)


Prepare a Toy Dataset

In [3]:
# Create a small corpus of sentences
sentences = [
    "The cat sat on the mat.",
    "The dog barked loudly.",
    "The bird sang a sweet song.",
    "The fish swam in the tank.",
    "The cow jumped over the moon."
]

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the sentences
inputs = tokenizer(sentences, return_tensors='pt', max_length=10, padding=True, truncation=True)
print("Tokenized Inputs:", inputs.input_ids)


Tokenized Inputs: tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,  1012,   102],
        [  101,  1996,  3899, 17554,  9928,  1012,   102,     0,     0],
        [  101,  1996,  4743,  6369,  1037,  4086,  2299,  1012,   102],
        [  101,  1996,  3869, 16849,  1999,  1996,  4951,  1012,   102],
        [  101,  1996, 11190,  5598,  2058,  1996,  4231,  1012,   102]])




Create Masked Language Modeling Data

In [4]:
def mask_tokens(inputs, tokenizer):
    labels = inputs.clone()
    # Replace 15% of tokens with [MASK]
    mask_arr = torch.full(labels.shape, 0.15) < torch.rand(labels.shape)
    labels[~mask_arr] = -100  # We only compute loss on masked tokens

    # Replace 80% of the masked tokens with [MASK]
    inputs[mask_arr] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Replace 10% of the masked tokens with a random word
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    random_replace = torch.full(labels.shape, 0.10) < torch.rand(labels.shape)
    inputs[mask_arr & random_replace] = random_words[mask_arr & random_replace]

    # Leave 10% of the masked tokens unchanged

    return inputs, labels

masked_inputs, labels = mask_tokens(inputs.input_ids, tokenizer)
print("Masked Inputs:", masked_inputs)
print("Labels:", labels)

Masked Inputs: tensor([[ 1595, 11477, 27706, 25821, 26699, 20856, 14835, 15736, 28743],
        [  101, 12145, 26251, 15902, 11186,  9732, 28964,   103, 20150],
        [  103, 26604, 18509, 12795, 27394,   103,  2299, 10067, 21141],
        [  103, 22141,  3869, 21629,  1999, 12355, 29966, 26279, 26430],
        [29299, 26198, 23535,  1624, 11300, 28357,   103, 23447, 22387]])
Labels: tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,  1012,   102],
        [ -100,  1996,  3899, 17554,  9928,  1012,   102,     0,     0],
        [  101,  1996,  4743,  6369,  1037,  4086,  -100,  1012,   102],
        [  101,  1996,  -100, 16849,  -100,  1996,  4951,  1012,   102],
        [  101,  1996, 11190,  5598,  2058,  1996,  4231,  1012,   102]])


Load Pre-trained BERT Model and Fine-Tune

In [5]:
# Load pre-trained BERT model for Masked Language Modeling
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tune the model
model.train()
epochs = 2  # For demonstration, use 2 epochs

for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(masked_inputs, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Loss: 9.458168983459473
Epoch 2, Loss: 8.453481674194336


Test the Model with Masked Input

In [6]:
# Select a sentence to test
test_sentence = "The barked loudly."
inputs = tokenizer(test_sentence, return_tensors='pt')
masked_input = inputs.input_ids.clone()

# Mask a word in the sentence
masked_input[0, 2] = tokenizer.mask_token_id  # Mask "barked"

# Predict the masked word
model.eval()
with torch.no_grad():
    outputs = model(masked_input)
    predictions = outputs.logits

# Get the predicted word
predicted_index = torch.argmax(predictions[0, 2]).item()
predicted_word = tokenizer.decode([predicted_index])

print(f"Original Sentence: {test_sentence}")
print(f"Masked Sentence: {' '.join(tokenizer.convert_ids_to_tokens(masked_input[0]))}")
print(f"Predicted Word: {predicted_word}")


Original Sentence: The barked loudly.
Masked Sentence: [CLS] the [MASK] loudly . [SEP]
Predicted Word: door
