<a href="https://colab.research.google.com/github/arishp/veltech_genai/blob/main/Bert_predicts_mlm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import required libraries
import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Name of the pre-trained checkpoint used throughout the notebook: BERT-base-cased
model_name = "bert-base-cased"

# Load the tokenizer first, then the masked-language-model, both from that checkpoint
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForMaskedLM.from_pretrained(model_name),
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Get the mask token from the tokenizer.
# This is the placeholder string that gets embedded in the input sentence at the
# position the model should fill in; it is printed so the literal is visible.
mask = tokenizer.mask_token
print(mask)

[MASK]


In [None]:
# Create a sentence with a mask token to be filled in by the model.
# The mask string is interpolated in place of the word we want predicted.
sentence = f"I want to {mask} pizza for tonight."
# Tokenize the sentence into a list of token strings.
# Note: tokenize() returns only the tokens of the text itself; the printed list
# has no extra tokens at the start or end.
tokens = tokenizer.tokenize(sentence)
print(tokens)

['I', 'want', 'to', '[MASK]', 'pizza', 'for', 'tonight', '.']


In [None]:
# Encode the sentence using the tokenizer and return the input tensors.
# return_tensors='pt' asks for PyTorch tensors; the printed result holds
# 'input_ids', 'token_type_ids' and 'attention_mask', each with a leading
# batch dimension of 1. The id sequence is two entries longer than the list
# produced by tokenizer.tokenize(): one extra id at each end.
encoded_inputs = tokenizer(sentence, return_tensors='pt')
print(encoded_inputs)

{'input_ids': tensor([[  101,   146,  1328,  1106,   103, 13473,  1111,  3568,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Get the model's output for the input tensors.
# This is inference only, so the forward pass runs under torch.no_grad():
# autograd does not record a graph for the logits, which saves memory and time.
# The returned object still exposes `.logits` exactly as before.
with torch.no_grad():
    outputs = model(**encoded_inputs)
print(outputs)

MaskedLMOutput(loss=None, logits=tensor([[[ -7.3723,  -7.2489,  -7.4421,  ...,  -6.3119,  -5.9369,  -6.4257],
         [ -7.9311,  -8.2282,  -8.0326,  ...,  -6.7387,  -6.4877,  -6.9525],
         [-12.0500, -11.7972, -12.5776,  ...,  -8.4518,  -6.7310,  -8.2586],
         ...,
         [-10.2204, -10.4315,  -9.9993,  ...,  -7.9570,  -6.7194,  -9.3618],
         [-12.4471, -12.5367, -12.5614,  ...,  -9.9086,  -9.4219, -11.1770],
         [-14.3657, -14.5227, -15.0017,  ..., -11.9715, -11.6569, -13.4498]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


In [None]:
# Detach the logits from the model's output and convert them to numpy arrays
logits = outputs.logits.detach().numpy()[0]
logits.shape

(10, 28996)

In [None]:
# Number of tokens returned by tokenizer.tokenize(). Compare this with the first
# dimension of logits.shape: the encoded input carries one extra id at each end,
# so the logits have two more rows than this count.
len(tokens)

8

In [None]:
# Extract the logits for the mask token.
# The row is located by searching the encoded input ids for the tokenizer's
# mask token id, instead of taking the position in `tokens` and adding a fixed
# offset of 1. The encoded sequence contains extra ids that tokenize() does not
# produce, so the two index spaces only line up by assumption; looking the
# position up in the ids the model actually received removes that assumption.
mask_positions = (
    encoded_inputs["input_ids"][0] == tokenizer.mask_token_id
).nonzero(as_tuple=True)[0]
if mask_positions.numel() == 0:
    # Same exception type list.index() raised when the mask was missing
    raise ValueError(f"No {mask} token found in the encoded input")
# Use the first occurrence, matching what list.index() returned before
mask_index = int(mask_positions[0])
mask_logits = logits[mask_index]
print(mask_logits)

[-6.7146263 -6.3791075 -6.1184874 ... -5.651307  -3.657276  -4.994728 ]


In [None]:
# Calculate the confidence scores for each possible token using softmax.
# softmax turns the raw logits for the masked position into non-negative values
# that sum to 1, one value per vocabulary entry, so each can be read as the
# probability the model assigns to that token.
confidence_scores = softmax(mask_logits)
print(confidence_scores)

[2.9159986e-10 4.0785036e-10 5.2928162e-10 ... 8.4446450e-10 6.2026548e-09
 1.6282821e-09]


In [None]:
# Sanity check: the softmax scores should add up to (approximately) 1
np.sum(confidence_scores)

1.0000001

In [None]:
# Show the five highest-scoring candidate tokens, most confident first.
# argsort is ascending, so the last five indices are the best; reverse them.
top5_ids = np.argsort(confidence_scores)[-5:][::-1]
for i in top5_ids:
    pred_token, score = tokenizer.decode(i), confidence_scores[i]

    # Sentence with the mask replaced by the candidate, followed by its score
    print(sentence.replace(mask, pred_token), score)

I want to have pizza for tonight. 0.25729004
I want to get pizza for tonight. 0.17849584
I want to eat pizza for tonight. 0.1555555
I want to make pizza for tonight. 0.11422437
I want to order pizza for tonight. 0.09823046
