In [2]:
import re
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load the pre-trained BERT tokenizer and masked-LM head from one checkpoint
# name so the two can never drift apart.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# .eval() returns the module itself, so the model is bound already switched
# to inference mode (dropout disabled).
model = BertForMaskedLM.from_pretrained(BERT_CHECKPOINT).eval()

# Load and clean movie line dataset
def load_movie_lines(path):
    """Extract the utterance text from a ``+++$+++``-delimited movie-lines file.

    A record is kept only when splitting it on ``+++$+++`` yields exactly
    five fields; the fifth field, trimmed of surrounding whitespace, is the
    utterance. Records with any other field count, and records whose
    utterance is empty after trimming, are skipped.

    Args:
        path: Location of the text file to read.

    Returns:
        list[str]: The non-empty utterances, in file order.
    """
    separator = "+++$+++"
    # ISO-8859-1 maps every byte to a character, so decoding never fails.
    with open(path, encoding='ISO-8859-1') as handle:
        records = (raw.strip().split(separator) for raw in handle)
        # The generator is consumed here, while the file is still open.
        utterances = [fields[4].strip() for fields in records if len(fields) == 5]
    return [text for text in utterances if text]

# dataset file
file_path = "/content/movie_lines.txt"
corpus = load_movie_lines(file_path)

# Preview up to 3 cleaned lines. Slicing (rather than indexing 0..2) keeps
# this cell from raising IndexError when fewer than 3 lines were parsed.
print("Sample cleaned lines:")
for sample_line in corpus[:3]:
    print("-", sample_line)

# Function to predict next words using BERT
def predict_next_words(seed_text, top_k=5):
    """Suggest words to follow ``seed_text`` by filling a trailing ``[MASK]``.

    The seed is trimmed, ``" [MASK]"`` is appended, and the module-level
    ``tokenizer`` / ``model`` score every vocabulary entry at that appended
    mask position. Candidates are visited from highest to lowest logit and
    only purely alphabetic decoded tokens are kept, which skips punctuation,
    special tokens and ``##`` word-piece fragments.

    Args:
        seed_text: The phrase to continue. Surrounding whitespace is ignored.
        top_k: Maximum number of suggestions to return. Values <= 0 yield an
            empty list.

    Returns:
        list[str]: Up to ``top_k`` candidate words, best first. May be shorter
        than ``top_k`` if the vocabulary has fewer alphabetic entries.
    """
    # Nothing requested: skip the forward pass entirely. (Previously a
    # non-positive top_k never hit the stop condition and returned every
    # alphabetic token in the vocabulary.)
    if top_k <= 0:
        return []

    input_text = seed_text.strip() + " [MASK]"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Use the LAST mask: that is the one appended above. If the user's own
    # text contains a literal "[MASK]", the first match would be theirs.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    mask_position = int(mask_positions[-1])

    with torch.no_grad():
        logits = model(input_ids).logits

    ranked_ids = torch.argsort(logits[0, mask_position, :], descending=True)

    predictions = []
    for token_id in ranked_ids.tolist():
        word = tokenizer.decode([token_id]).strip()
        # Skip punctuation, special tokens and sub-word pieces
        if word.isalpha():
            predictions.append(word)
            if len(predictions) == top_k:
                break

    return predictions


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Sample cleaned lines:
- They do not!
- They do to!
- I hope so.


In [None]:
# 6. Interactive demo: keep prompting until the user types 'exit'
while True:
    user_input = input("\nEnter a 3+ word phrase (or 'exit'): ").strip()
    if user_input.lower() == "exit":
        break
    if len(user_input.split()) >= 3:
        preds = predict_next_words(user_input)
        top_prediction = preds[0]
        print("🔮 Top prediction:", top_prediction)
        print("🧠 Top 5 predictions:", preds)
    else:
        # Fewer than three whitespace-separated words: ask again
        print("⚠️ Please enter at least 3 words.")


Enter a 3+ word phrase (or 'exit'): my name is choco 
🔮 Top prediction: and
🧠 Top 5 predictions: ['and', 'so', 'here', 'because', 'you']

Enter a 3+ word phrase (or 'exit'): i hate
⚠️ Please enter at least 3 words.

Enter a 3+ word phrase (or 'exit'): i don't like
🔮 Top prediction: it
🧠 Top 5 predictions: ['it', 'you', 'and', 'that', 'me']

Enter a 3+ word phrase (or 'exit'): i love you but not 
🔮 Top prediction: you
🧠 Top 5 predictions: ['you', 'me', 'because', 'god', 'so']

Enter a 3+ word phrase (or 'exit'): i am currently in BSAI
🔮 Top prediction: and
🧠 Top 5 predictions: ['and', 'because', 'today', 'so', 'where']

Enter a 3+ word phrase (or 'exit'): my CGPA is 2.89 but i will
🔮 Top prediction: take
🧠 Top 5 predictions: ['take', 'be', 'work', 'go', 'do']
