In [2]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset

# Define the masked language model
class MaskedLanguageModel(nn.Module):
    def __init__(self, bert_model, tokenizer, hidden_size, vocab_size):
        super(MaskedLanguageModel, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.tokenizer = tokenizer

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = self.fc(hidden_states)
        return logits

# Dummy dataset class for masked language modeling
class MaskedLanguageModelingDataset(Dataset):
    def __init__(self, input_text, tokenizer):
        self.input_text = input_text
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.input_text)

    def __getitem__(self, index):
        encoded_input = self.tokenizer.encode_plus(
            self.input_text[index],
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        input_ids = encoded_input['input_ids']
        attention_mask = encoded_input['attention_mask']
        masked_input_ids = input_ids.clone()

        # Randomly mask 15% of the tokens
        mask_indices = torch.bernoulli(torch.full(input_ids.shape, 0.15)).bool()
        masked_input_ids[mask_indices] = self.tokenizer.mask_token_id

        return masked_input_ids, attention_mask, input_ids

# Dummy samples for illustration
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
input_text_masked_lm = ["I love [MASK].", "She hates [MASK]."]

# Create dataset and dataloader for masked language modeling
dataset_masked_lm = MaskedLanguageModelingDataset(input_text_masked_lm, tokenizer)
dataloader_masked_lm = DataLoader(dataset_masked_lm, batch_size=2, shuffle=True)

# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')
hidden_size = bert_model.config.hidden_size
vocab_size = bert_model.config.vocab_size

# Create masked language model instance
mlm_model = MaskedLanguageModel(bert_model, tokenizer, hidden_size, vocab_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlm_model.parameters(), lr=1e-4)
# Training loop
epochs = 10

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    print("----------------")

    total_loss = 0.0

    # Training phase for Masked Language Modeling
    mlm_model.train()
    for batch in dataloader_masked_lm:
        masked_input_ids, attention_mask, input_ids = batch
        optimizer.zero_grad()

        logits = mlm_model(masked_input_ids, attention_mask)
        loss = criterion(logits.view(-1, vocab_size), input_ids.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader_masked_lm)
    print(f"Masked LM Loss: {avg_loss:.4f}")

    print()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/10
----------------


ValueError: too many values to unpack (expected 2)