In [1]:
import nltk

In [1]:
import re
import math
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
# Initialize NLTK tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [3]:
# Load SQuAD sample
squad = load_dataset("squad", split="train[:20]")



In [4]:
# Load T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:
# --- NLTK based text preprocessing ---
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # keep only alphabets and spaces
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

# --- Compute Term Frequency (TF) ---

In [22]:
# --- Compute Term Frequency (TF) ---
def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

In [23]:
# --- Compute Inverse Document Frequency (IDF) ---
def compute_idf(documents):
    N = len(documents)
    idf = {}
    all_words = set(word for doc in documents for word in doc)
    for word in all_words:
        containing_docs = sum(1 for doc in documents if word in doc)
        idf[word] = math.log(N / (1 + containing_docs))
    return idf

In [24]:
# --- Compute TF-IDF ---
def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}

In [25]:
# --- Cosine similarity ---
def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

In [26]:
# --- Generate distractors ---
def generate_distractors(answer, context, idf):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)

    distractors = []
    used_words = set(answer_tokens)
    for word in set(context_tokens):
        if word in used_words:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        if 0.2 < sim < 0.8:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])[:3]
    return [w for w, _ in distractors]

In [27]:
# --- Generate question using T5 ---
def generate_question_t5(context, answer):
    input_text = f"generate question: {context} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# --- Prepare IDF from all contexts ---
all_contexts = [preprocess(example["context"]) for example in squad]
idf = compute_idf(all_contexts)

In [28]:
# --- Prepare IDF from all contexts ---
all_contexts = [preprocess(example["context"]) for example in squad]
idf = compute_idf(all_contexts)

In [29]:

# --- Generate MCQs ---
mcqs = []
for example in squad:
    context = example["context"]
    answer = example["answers"]["text"][0]
    if not answer.strip():
        continue

    distractors = generate_distractors(answer, context, idf)
    if len(distractors) < 3:
        continue

    question = generate_question_t5(context, answer)
    options = distractors + [answer]
    random.shuffle(options)

    mcqs.append({
        "question": question,
        "options": options,
        "answer": answer
    })

    if len(mcqs) >= 5:
        break

# --- Print sample MCQs ---
for i, mcq in enumerate(mcqs):
    print(f"\nQuestion {i+1}: {mcq['question']}")
    for j, opt in enumerate(mcq['options']):
        label = chr(65 + j)
        print(f"  {label}. {opt}")
    print(f"Answer: {mcq['answer']}")

In [47]:
import re
import math
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize, ne_chunk, sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Download NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load fine-tuned T5 model and tokenizer for question generation
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

def extract_named_entities(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged, binary=False)
    entities = set()

    for subtree in tree:
        if hasattr(subtree, 'label') and subtree.label() in ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY', 'GSP']:
            entity = ' '.join([token for token, pos in subtree.leaves()])
            if len(entity) > 1:
                entities.add(entity)
    return list(entities)

def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}

def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

def generate_distractors(answer, context, idf):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)

    distractors = []
    used_words = set([t.lower() for t in answer_tokens])
    for word in set(context_tokens):
        if word in used_words:
            continue
        if word in answer.lower() or answer.lower() in word:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        if 0.2 < sim < 0.8:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])[:3]
    return [w for w, _ in distractors]

def get_relevant_sentence(text, answer):
    sentences = sent_tokenize(text)
    for sent in sentences:
        if answer.lower() in sent.lower():
            return sent
    return text

def generate_question_t5(context, answer):
    sentence = get_relevant_sentence(context, answer)
    # Highlight the answer with <hl> tags (case-insensitive)
    pattern = re.compile(re.escape(answer), re.IGNORECASE)
    highlighted_sentence = pattern.sub(f"<hl>{answer}</hl>", sentence, count=1)
    input_text = f"generate question: {highlighted_sentence} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    question = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return question

def generate_mcqs_from_text(text, desired_mcq_count=5):
    candidate_answers = extract_named_entities(text)

    if not candidate_answers:
        # fallback to most frequent words if no named entities
        candidate_answers = preprocess(text)[:20]

    idf = {word: 1.0 for word in preprocess(text)}  # dummy IDF for single doc

    mcqs = []
    used_answers = set()

    for answer in candidate_answers:
        if len(mcqs) >= desired_mcq_count:
            break
        if len(answer.split()) > 5 or len(answer) < 2:
            continue
        if answer.lower() in used_answers:
            continue

        distractors = generate_distractors(answer, text, idf)

        # Add random distractors if not enough
        if len(distractors) < 3:
            context_tokens = set(preprocess(text))
            random_distractors = list(context_tokens - set([answer.lower()]))
            random.shuffle(random_distractors)
            for w in random_distractors:
                if w not in distractors and w.lower() != answer.lower():
                    distractors.append(w)
                if len(distractors) >= 3:
                    break

        if len(distractors) < 3:
            continue

        question = generate_question_t5(text, answer)
        options = distractors[:3] + [answer]
        random.shuffle(options)

        mcqs.append({
            "question": question,
            "options": options,
            "answer": answer
        })
        used_answers.add(answer.lower())

    return mcqs


# ======= Example usage =======
input_text = """
In the heart of the quaint village of Eldermere, a mysterious tree stood tall in the town square.
Its gnarled branches bore fruits that resembled pears, but with an unusual twist: they seemed to
shimmer with a golden hue. The villagers affectionately named it the "Shakespear" tree, believing it
held magical properties.

Legend had it that anyone who tasted a Shakespear would gain a glimpse into their future.
Curiosity spread like wildfire, and soon, villagers flocked to the tree, eager for a taste of destiny.
Young Emma, a spirited girl with dreams of becoming a writer, felt an undeniable pull toward the
shimmering fruit.

One crisp autumn morning, she approached the tree, heart racing. With a deep breath, she plucked a
Shakespear and took a bite. Instantly, a whirlwind of visions enveloped her. She saw herself standing
on a grand stage, the applause of a thousand voices echoing in her ears. In another glimpse, she wandered
through enchanted forests, her stories coming to life.

Determined to fulfill these dreams, Emma spent every spare moment writing. The villagers, inspired by
her passion, began sharing their own tales. The square buzzed with creativity, and soon, Eldermere
became a hub of storytelling.

As the seasons changed, Emma’s words took flight. She published her first book, a collection of
enchanting stories, and it captured the hearts of many beyond Eldermere. The Shakespear tree
continued to stand, its golden pears glimmering, a reminder that dreams, when nurtured, could blossom
into reality.

And so, in the embrace of magic and creativity, the legacy of the Shakespear lived on, inspiring
generations to reach for their dreams.
"""

mcqs = generate_mcqs_from_text(input_text, desired_mcq_count=5)

for i, mcq in enumerate(mcqs):
    print(f"\nQuestion {i+1}: {mcq['question']}")
    for j, opt in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {opt}")
    print(f"Answer: {mcq['answer']}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-


Question 1: What is the name of the village in which a mysterious tree stood in the square?
  A. soon
  B. Eldermere
  C. quaint
  D. saw
Answer: Eldermere

Question 2: Who felt an undeniable pull toward the shimmering fruit?
  A. eager
  B. Young Emma
  C. believing
  D. held
Answer: Young Emma

Question 3: What did the villagers call the tree?
  A. gnarled
  B. crisp
  C. Shakespear
  D. glimpse
Answer: Shakespear

Question 4: Who felt an undeniable pull toward the shimmering fruit?
  A. Emma
  B. life
  C. reality
  D. autumn
Answer: Emma


In [44]:
import re
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import T5Tokenizer, T5ForConditionalGeneration

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 1.0) for word in tf}

def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

def generate_distractors(answer, context, idf):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)

    distractors = []
    used = set(answer_tokens)
    for word in set(context_tokens):
        if word in used or len(word) < 3:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        # Relax thresholds for more distractors
        if 0.0 < sim < 0.7:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])
    # Return top 3 or less if not enough found
    return [w for w, _ in distractors[:3]]

def generate_question_t5(context, answer):
    # Simplify prompt for better question generation or fallback
    try:
        input_text = f"generate question: answer: {answer} context: {context}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)
        output_ids = model.generate(input_ids, max_length=64)
        question = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        if len(question.strip()) < 10:  # fallback if output is nonsense
            raise ValueError
        return question
    except:
        return f"What is {answer}?"

def generate_mcqs(text, top_k=10):
    tokens = preprocess(text)
    tf = compute_tf(tokens)
    idf = {word: 1.0 for word in tf}  # single doc
    tfidf = compute_tf_idf(tf, idf)
    sorted_keywords = sorted(tfidf.items(), key=lambda x: -x[1])
    candidate_answers = [word for word, score in sorted_keywords[:top_k]]

    mcqs = []
    for answer in candidate_answers:
        distractors = generate_distractors(answer, text, idf)
        if len(distractors) < 2:  # accept 2 distractors for more MCQs
            continue
        question = generate_question_t5(text, answer)
        options = distractors + [answer]
        random.shuffle(options)
        mcqs.append({
            "question": question,
            "options": options,
            "answer": answer
        })
        if len(mcqs) >= 3:  # generate at least 3 MCQs
            break
    return mcqs

# Sample Text (same as yours)
sample_text = """
In the heart of the quaint village of Eldermere, a mysterious tree stood tall in the town square. 
Its gnarled branches bore fruits that resembled pears, but with an unusual twist: they seemed to 
shimmer with a golden hue. The villagers affectionately named it the "Shakespear" tree, believing 
it held magical properties. Legend had it that anyone who tasted a Shakespear would gain a glimpse 
into their future. Curiosity spread like wildfire, and soon, villagers flocked to the tree, eager 
for a taste of destiny. Young Emma, a spirited girl with dreams of becoming a writer, felt an 
undeniable pull toward the shimmering fruit. One crisp autumn morning, she approached the tree, 
heart racing. With a deep breath, she plucked a Shakespear and took a bite. Instantly, a whirlwind 
of visions enveloped her. She saw herself standing on a grand stage, the applause of a thousand 
voices echoing in her ears. In another glimpse, she wandered through enchanted forests, her stories 
coming to life. Determined to fulfill these dreams, Emma spent every spare moment writing. The 
villagers, inspired by her passion, began sharing their own tales. The square buzzed with 
creativity, and soon, Eldermere became a hub of storytelling. As the seasons changed, Emma’s words 
took flight. She published her first book, a collection of enchanting stories, and it captured the 
hearts of many beyond Eldermere. The Shakespear tree continued to stand, its golden pears 
glimmering, a reminder that dreams, when nurtured, could blossom into reality.
"""

mcqs = generate_mcqs(sample_text)

for i, mcq in enumerate(mcqs):
    print(f"\nQuestion {i+1}: {mcq['question']}")
    for j, opt in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {opt}")
    print(f"Answer: {mcq['answer']}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
import re
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize, ne_chunk, sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Download required NLTK data (only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the fine-tuned T5 model and tokenizer for SQuAD-style QG
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

def extract_named_entities(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged, binary=False)
    entities = set()

    for subtree in tree:
        if hasattr(subtree, 'label') and subtree.label() in ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY']:
            entity = ' '.join([token for token, pos in subtree.leaves()])
            if len(entity) > 1:
                entities.add(entity)
    return list(entities)

def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}

def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

def generate_distractors(answer, context, idf):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)

    distractors = []
    used_words = set([t.lower() for t in answer_tokens])
    for word in set(context_tokens):
        if word in used_words:
            continue
        if word in answer.lower() or answer.lower() in word:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        if 0.2 < sim < 0.8:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])[:3]
    return [w for w, _ in distractors]

def get_relevant_sentence(text, answer):
    sentences = sent_tokenize(text)
    for sent in sentences:
        if answer.lower() in sent.lower():
            return sent
    return text

def generate_question_squad_style(context, answer):
    sentence = get_relevant_sentence(context, answer)
    # Highlight answer with <hl> tags (case-insensitive)
    pattern = re.compile(re.escape(answer), re.IGNORECASE)
    highlighted_sentence = pattern.sub(f"<hl>{answer}</hl>", sentence, count=1)
    input_text = f"generate question: {highlighted_sentence} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    question = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return question

def generate_mcqs_from_text(text, desired_mcq_count=5):
    candidate_answers = extract_named_entities(text)

    # Fallback: if no named entities found, use frequent keywords
    if not candidate_answers:
        candidate_answers = preprocess(text)[:20]

    idf = {word: 1.0 for word in preprocess(text)}  # dummy IDF for single doc

    mcqs = []
    used_answers = set()

    for answer in candidate_answers:
        if len(mcqs) >= desired_mcq_count:
            break
        if len(answer.split()) > 5 or len(answer) < 2:
            continue
        if answer.lower() in used_answers:
            continue

        distractors = generate_distractors(answer, text, idf)

        # If less than 3 distractors found, add random words from context
        if len(distractors) < 3:
            context_tokens = set(preprocess(text))
            random_distractors = list(context_tokens - set([answer.lower()]))
            random.shuffle(random_distractors)
            for w in random_distractors:
                if w not in distractors and w.lower() != answer.lower():
                    distractors.append(w)
                if len(distractors) >= 3:
                    break

        if len(distractors) < 3:
            continue

        question = generate_question_squad_style(text, answer)
        options = distractors[:3] + [answer]
        random.shuffle(options)

        mcqs.append({
            "question": question,
            "options": options,
            "answer": answer
        })
        used_answers.add(answer.lower())

    return mcqs


# ======== Example usage ========

input_text = """
In the heart of the quaint village of Eldermere, a mysterious tree stood tall in the town square.
Its gnarled branches bore fruits that resembled pears, but with an unusual twist: they seemed to
shimmer with a golden hue. The villagers affectionately named it the "Shakespear" tree, believing it
held magical properties.

Legend had it that anyone who tasted a Shakespear would gain a glimpse into their future.
Curiosity spread like wildfire, and soon, villagers flocked to the tree, eager for a taste of destiny.
Young Emma, a spirited girl with dreams of becoming a writer, felt an undeniable pull toward the
shimmering fruit.

One crisp autumn morning, she approached the tree, heart racing. With a deep breath, she plucked a
Shakespear and took a bite. Instantly, a whirlwind of visions enveloped her. She saw herself standing
on a grand stage, the applause of a thousand voices echoing in her ears. In another glimpse, she wandered
through enchanted forests, her stories coming to life.

Determined to fulfill these dreams, Emma spent every spare moment writing. The villagers, inspired by
her passion, began sharing their own tales. The square buzzed with creativity, and soon, Eldermere
became a hub of storytelling.

As the seasons changed, Emma’s words took flight. She published her first book, a collection of
enchanting stories, and it captured the hearts of many beyond Eldermere. The Shakespear tree
continued to stand, its golden pears glimmering, a reminder that dreams, when nurtured, could blossom
into reality.

And so, in the embrace of magic and creativity, the legacy of the Shakespear lived on, inspiring
generations to reach for their dreams.
"""

mcqs = generate_mcqs_from_text(input_text, desired_mcq_count=3)

for i, mcq in enumerate(mcqs):
    print(f"\nQuestion {i+1}: {mcq['question']}")
    for j, opt in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {opt}")
    print(f"Answer: {mcq['answer']}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-


Question 1: What is the name of the village in which a mysterious tree stood in the square?
  A. Eldermere
  B. resembled
  C. blossom
  D. twist
Answer: Eldermere

Question 2: Who felt an undeniable pull toward the shimmering fruit?
  A. Young Emma
  B. enchanting
  C. legend
  D. every
Answer: Young Emma

Question 3: What did the villagers call the tree?
  A. many
  B. Shakespear
  C. glimmering
  D. whirlwind
Answer: Shakespear


In [49]:
#main

In [53]:
import re
import random
import numpy as np
import nltk
import fitz  # PyMuPDF for PDF reading
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize, ne_chunk, sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Download required NLTK data (only once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the fine-tuned T5 model and tokenizer for SQuAD-style QG
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")

# 📄 Utility to read PDF and return extracted text
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

def extract_named_entities(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged, binary=False)
    entities = set()

    for subtree in tree:
        if hasattr(subtree, 'label') and subtree.label() in ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY']:
            entity = ' '.join([token for token, pos in subtree.leaves()])
            if len(entity) > 1:
                entities.add(entity)
    return list(entities)

def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}

def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

def generate_distractors(answer, context, idf):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)

    distractors = []
    used_words = set([t.lower() for t in answer_tokens])
    for word in set(context_tokens):
        if word in used_words:
            continue
        if word in answer.lower() or answer.lower() in word:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        if 0.2 < sim < 0.8:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])[:3]
    return [w for w, _ in distractors]

def get_relevant_sentence(text, answer):
    sentences = sent_tokenize(text)
    for sent in sentences:
        if answer.lower() in sent.lower():
            return sent
    return text

def generate_question_squad_style(context, answer):
    sentence = get_relevant_sentence(context, answer)
    pattern = re.compile(re.escape(answer), re.IGNORECASE)
    highlighted_sentence = pattern.sub(f"<hl>{answer}</hl>", sentence, count=1)
    input_text = f"generate question: {highlighted_sentence} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    question = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return question

def generate_mcqs_from_text(text, desired_mcq_count=5):
    candidate_answers = extract_named_entities(text)

    if not candidate_answers:
        candidate_answers = preprocess(text)[:20]

    idf = {word: 1.0 for word in preprocess(text)}  # dummy IDF for single doc

    mcqs = []
    used_answers = set()

    for answer in candidate_answers:
        if len(mcqs) >= desired_mcq_count:
            break
        if len(answer.split()) > 5 or len(answer) < 2:
            continue
        if answer.lower() in used_answers:
            continue

        distractors = generate_distractors(answer, text, idf)

        if len(distractors) < 3:
            context_tokens = set(preprocess(text))
            random_distractors = list(context_tokens - set([answer.lower()]))
            random.shuffle(random_distractors)
            for w in random_distractors:
                if w not in distractors and w.lower() != answer.lower():
                    distractors.append(w)
                if len(distractors) >= 3:
                    break

        if len(distractors) < 3:
            continue

        question = generate_question_squad_style(text, answer)
        options = distractors[:3] + [answer]
        random.shuffle(options)

        mcqs.append({
            "question": question,
            "options": options,
            "answer": answer
        })
        used_answers.add(answer.lower())

    return mcqs

# ======= MAIN EXECUTION =======

def run_mcq_generator(input_source, desired_mcq_count=3):
    if input_source.lower().endswith(".pdf"):
        text = extract_text_from_pdf(input_source)
    else:
        text = input_source  # assume plain text

    mcqs = generate_mcqs_from_text(text, desired_mcq_count)

    for i, mcq in enumerate(mcqs):
        print(f"\nQuestion {i+1}: {mcq['question']}")
        for j, opt in enumerate(mcq['options']):
            print(f"  {chr(65+j)}. {opt}")
        print(f"Answer: {mcq['answer']}")

# Example usage for text:

# input_text = """
# In the heart of the quaint village of Eldermere, a mysterious tree stood tall in the town square.
# Its gnarled branches bore fruits that resembled pears, but with an unusual twist: they seemed to
# shimmer with a golden hue. The villagers affectionately named it the "Shakespear" tree, believing it
# held magical properties.

# Legend had it that anyone who tasted a Shakespear would gain a glimpse into their future.
# Curiosity spread like wildfire, and soon, villagers flocked to the tree, eager for a taste of destiny.
# Young Emma, a spirited girl with dreams of becoming a writer, felt an undeniable pull toward the
# shimmering fruit.

# One crisp autumn morning, she approached the tree, heart racing. With a deep breath, she plucked a
# Shakespear and took a bite. Instantly, a whirlwind of visions enveloped her. She saw herself standing
# on a grand stage, the applause of a thousand voices echoing in her ears. In another glimpse, she wandered
# through enchanted forests, her stories coming to life.

# Determined to fulfill these dreams, Emma spent every spare moment writing. The villagers, inspired by
# her passion, began sharing their own tales. The square buzzed with creativity, and soon, Eldermere
# became a hub of storytelling.

# As the seasons changed, Emma’s words took flight. She published her first book, a collection of
# enchanting stories, and it captured the hearts of many beyond Eldermere. The Shakespear tree
# continued to stand, its golden pears glimmering, a reminder that dreams, when nurtured, could blossom
# into reality.

# And so, in the embrace of magic and creativity, the legacy of the Shakespear lived on, inspiring
# generations to reach for their dreams.
# """
# run_mcq_generator(input_text)

# Example usage for PDF:
# C:\Users\atulm\Desktop
run_mcq_generator(r"C:\Users\atulm\Desktop\sample.pdf")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-


Question 1: Who felt an undeniable pull toward the shimmering fruit?
  A. lived
  B. captured
  C. Young Emma
  D. writing
Answer: Young Emma

Question 2: What village is Eldermere located in?
  A. spread
  B. glimpse
  C. published
  D. Eldermere
Answer: Eldermere

Question 3: What was the name of the tree that stood tall in the town square?
  A. Shakespear Enigma
  B. taste
  C. coming
  D. unusual
Answer: Shakespear Enigma


In [None]:
#rule based also

In [66]:
import re
import random
import numpy as np
import nltk
import os
import fitz  # PyMuPDF for PDF reading
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize, ne_chunk, sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration
from collections import Counter
from difflib import SequenceMatcher

# Download required NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")

def read_pdf_text(file_path):
    """Extracts all text from a PDF file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"PDF file not found: {file_path}")
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

def extract_named_entities(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged, binary=False)
    entities = set()
    for subtree in tree:
        if hasattr(subtree, 'label') and subtree.label() in ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY']:
            entity = ' '.join([token for token, pos in subtree.leaves()])
            if len(entity) > 1:
                entities.add(entity)
    return list(entities)

def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}

def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

def generate_distractors(answer, context, idf, required=3):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)
    distractors = []
    used_words = set([t.lower() for t in answer_tokens])

    for word in set(context_tokens):
        if word in used_words or word in answer.lower() or answer.lower() in word:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        if 0.1 < sim < 0.9:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])
    distractor_words = [w for w, _ in distractors]

    if len(distractor_words) < required:
        extras = list(set(context_tokens) - set(distractor_words) - set(answer_tokens))
        random.shuffle(extras)
        distractor_words += extras[:required - len(distractor_words)]

    return distractor_words[:required]

def get_relevant_sentence(text, answer):
    sentences = sent_tokenize(text)
    for sent in sentences:
        if answer.lower() in sent.lower():
            return sent
    return text

def generate_question_rule_based(sentence, answer):
    if answer.istitle():
        return f"Who is {answer}?"
    elif any(word in answer.lower() for word in ['village', 'city', 'place', 'school', 'forest']):
        return f"Where is {answer} located?"
    else:
        return f"What is {answer}?"

def generate_question_with_fallback(context, answer):
    sentence = get_relevant_sentence(context, answer)
    pattern = re.compile(re.escape(answer), re.IGNORECASE)
    highlighted_sentence = pattern.sub(f"<hl>{answer}</hl>", sentence, count=1)
    input_text = f"generate question: {highlighted_sentence} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    question = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    if not question or answer.lower() in question.lower():
        question = generate_question_rule_based(sentence, answer)
        print(f"Rule-based fallback triggered for answer: '{answer}'")
    return question

def is_similar(q1, q2, threshold=0.85):
    return SequenceMatcher(None, q1.lower(), q2.lower()).ratio() >= threshold

def generate_mcqs_from_text(text, desired_mcq_count=5):
    candidate_answers = extract_named_entities(text)
    if len(candidate_answers) < desired_mcq_count:
        freq_words = Counter(preprocess(text)).most_common(50)
        for word, _ in freq_words:
            if word not in candidate_answers and word.isalpha() and len(word) > 2:
                candidate_answers.append(word)

    idf = {word: 1.0 for word in preprocess(text)}
    mcqs = []
    used_questions = []

    for answer in candidate_answers:
        if len(mcqs) >= desired_mcq_count:
            break
        if len(answer.split()) > 5 or len(answer) < 2:
            continue

        distractors = generate_distractors(answer, text, idf, required=3)
        if len(set(distractors)) < 3:
            continue

        question = generate_question_with_fallback(text, answer)
        if any(is_similar(question, existing_q) for existing_q in used_questions):
            continue

        options = distractors[:3] + [answer]
        random.shuffle(options)

        mcqs.append({
            "question": question,
            "options": options,
            "answer": answer
        })
        used_questions.append(question)

    return mcqs

# -------- Switch here --------
use_pdf = False  # Set to True to read from PDF, False for plain text

if use_pdf:
    pdf_path = "your_pdf_file.pdf"  # <-- Set your PDF file path here
    input_text = read_pdf_text(pdf_path)
else:
    input_text = """
    In the heart of the quaint village of Eldermere, a mysterious tree stood tall in the town square.
    Its gnarled branches bore fruits that resembled pears, but with an unusual twist: they seemed to
    shimmer with a golden hue. The villagers affectionately named it the "Shakespear" tree, believing it
    held magical properties.

    Legend had it that anyone who tasted a Shakespear would gain a glimpse into their future.
    Curiosity spread like wildfire, and soon, villagers flocked to the tree, eager for a taste of destiny.
    Young Emma, a spirited girl with dreams of becoming a writer, felt an undeniable pull toward the
    shimmering fruit.

    One crisp autumn morning, she approached the tree, heart racing. With a deep breath, she plucked a
    Shakespear and took a bite. Instantly, a whirlwind of visions enveloped her. She saw herself standing
    on a grand stage, the applause of a thousand voices echoing in her ears. In another glimpse, she wandered
    through enchanted forests, her stories coming to life.

    Determined to fulfill these dreams, Emma spent every spare moment writing. The villagers, inspired by
    her passion, began sharing their own tales. The square buzzed with creativity, and soon, Eldermere
    became a hub of storytelling.

    As the seasons changed, Emma’s words took flight. She published her first book, a collection of
    enchanting stories, and it captured the hearts of many beyond Eldermere. The Shakespear tree
    continued to stand, its golden pears glimmering, a reminder that dreams, when nurtured, could blossom
    into reality.
    """

print(f"\nGenerating MCQs from {'PDF file' if use_pdf else 'plain text'}...\n")
mcqs = generate_mcqs_from_text(input_text, desired_mcq_count=5)

for i, mcq in enumerate(mcqs):
    print(f"\nQuestion {i+1}: {mcq['question']}")
    for j, opt in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {opt}")
    print(f"Answer: {mcq['answer']}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-


Generating MCQs from plain text...


Question 1: What is the name of the village in which a mysterious tree stood in the square?
  A. sharing
  B. village
  C. Eldermere
  D. branch
Answer: Eldermere

Question 2: Who felt an undeniable pull toward the shimmering fruit?
  A. racing
  B. took
  C. unusual
  D. Young Emma
Answer: Young Emma

Question 3: What did the villagers call the tree?
  A. Shakespear
  B. bite
  C. changed
  D. writer
Answer: Shakespear

Question 4: What mysterious object stood tall in the town square?
  A. coming
  B. tree
  C. village
  D. first
Answer: tree

Question 5: What is the name of the tree that stood tall in the town square?
  A. village
  B. heart
  C. held
  D. glimmering
Answer: heart


In [None]:
#if t5 fails

In [64]:
import re
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize, ne_chunk, sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration
from collections import Counter
from difflib import SequenceMatcher

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Setup
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl")
model = T5ForConditionalGeneration.from_pretrained("valhalla/t5-base-qg-hl")

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words and len(t) > 2]
    return tokens

def extract_named_entities(text):
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)
    tree = ne_chunk(tagged, binary=False)
    entities = set()
    for subtree in tree:
        if hasattr(subtree, 'label') and subtree.label() in ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY']:
            entity = ' '.join([token for token, pos in subtree.leaves()])
            if len(entity) > 1:
                entities.add(entity)
    return list(entities)

def compute_tf(tokens):
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_tf_idf(tf, idf):
    return {word: tf[word] * idf.get(word, 0.0) for word in tf}

def cosine_similarity(vec1, vec2):
    words = set(vec1.keys()).union(set(vec2.keys()))
    v1 = np.array([vec1.get(w, 0.0) for w in words])
    v2 = np.array([vec2.get(w, 0.0) for w in words])
    dot = np.dot(v1, v2)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    return dot / (norm1 * norm2) if norm1 and norm2 else 0.0

def generate_distractors(answer, context, idf, required=3):
    answer_tokens = preprocess(answer)
    context_tokens = preprocess(context)
    answer_vec = compute_tf_idf(compute_tf(answer_tokens), idf)
    distractors = []
    used_words = set([t.lower() for t in answer_tokens])

    for word in set(context_tokens):
        if word in used_words or word in answer.lower() or answer.lower() in word:
            continue
        word_vec = compute_tf_idf(compute_tf([word]), idf)
        sim = cosine_similarity(answer_vec, word_vec)
        if 0.1 < sim < 0.9:
            distractors.append((word, sim))

    distractors = sorted(distractors, key=lambda x: -x[1])
    distractor_words = [w for w, _ in distractors]

    if len(distractor_words) < required:
        extras = list(set(context_tokens) - set(distractor_words) - set(answer_tokens))
        random.shuffle(extras)
        distractor_words += extras[:required - len(distractor_words)]

    return distractor_words[:required]

def get_relevant_sentence(text, answer):
    sentences = sent_tokenize(text)
    for sent in sentences:
        if answer.lower() in sent.lower():
            return sent
    return text

def generate_question_rule_based(sentence, answer):
    if answer.istitle():
        return f"Who is {answer}?"
    elif any(word in answer.lower() for word in ['village', 'city', 'place', 'school', 'forest']):
        return f"Where is {answer} located?"
    else:
        return f"What is {answer}?"

def generate_question_with_fallback(context, answer):
    sentence = get_relevant_sentence(context, answer)
    pattern = re.compile(re.escape(answer), re.IGNORECASE)
    highlighted_sentence = pattern.sub(f"<hl>{answer}</hl>", sentence, count=1)
    input_text = f"generate question: {highlighted_sentence} answer: {answer}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=64, num_beams=4, early_stopping=True)
    question = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # === Simulate failure of T5 for demonstration ===
    if answer.lower() == "shakespear":
        question = ""  # Force empty output to trigger fallback

    if not question or answer.lower() in question.lower():
        question = generate_question_rule_based(sentence, answer)
        print(f"Rule-based fallback triggered for answer: '{answer}'")  # Debug output

    return question

def is_similar(q1, q2, threshold=0.85):
    return SequenceMatcher(None, q1.lower(), q2.lower()).ratio() >= threshold

def generate_mcqs_from_text(text, desired_mcq_count=5):
    candidate_answers = extract_named_entities(text)
    if len(candidate_answers) < desired_mcq_count:
        freq_words = Counter(preprocess(text)).most_common(50)
        for word, _ in freq_words:
            if word not in candidate_answers and word.isalpha() and len(word) > 2:
                candidate_answers.append(word)

    idf = {word: 1.0 for word in preprocess(text)}
    mcqs = []
    used_questions = []

    for answer in candidate_answers:
        if len(mcqs) >= desired_mcq_count:
            break
        if len(answer.split()) > 5 or len(answer) < 2:
            continue

        distractors = generate_distractors(answer, text, idf, required=3)
        if len(set(distractors)) < 3:
            continue

        question = generate_question_with_fallback(text, answer)
        if any(is_similar(question, existing_q) for existing_q in used_questions):
            continue

        options = distractors[:3] + [answer]
        random.shuffle(options)

        mcqs.append({
            "question": question,
            "options": options,
            "answer": answer
        })
        used_questions.append(question)

    return mcqs

# ======= SAMPLE TEXT =======
input_text = """
In the heart of the quaint village of Eldermere, a mysterious tree stood tall in the town square.
Its gnarled branches bore fruits that resembled pears, but with an unusual twist: they seemed to
shimmer with a golden hue. The villagers affectionately named it the "Shakespear" tree, believing it
held magical properties.

Legend had it that anyone who tasted a Shakespear would gain a glimpse into their future.
Curiosity spread like wildfire, and soon, villagers flocked to the tree, eager for a taste of destiny.
Young Emma, a spirited girl with dreams of becoming a writer, felt an undeniable pull toward the
shimmering fruit.

One crisp autumn morning, she approached the tree, heart racing. With a deep breath, she plucked a
Shakespear and took a bite. Instantly, a whirlwind of visions enveloped her. She saw herself standing
on a grand stage, the applause of a thousand voices echoing in her ears. In another glimpse, she wandered
through enchanted forests, her stories coming to life.

Determined to fulfill these dreams, Emma spent every spare moment writing. The villagers, inspired by
her passion, began sharing their own tales. The square buzzed with creativity, and soon, Eldermere
became a hub of storytelling.

As the seasons changed, Emma’s words took flight. She published her first book, a collection of
enchanting stories, and it captured the hearts of many beyond Eldermere. The Shakespear tree
continued to stand, its golden pears glimmering, a reminder that dreams, when nurtured, could blossom
into reality.
"""

# ========== RUN ==========
mcqs = generate_mcqs_from_text(input_text, desired_mcq_count=5)
for i, mcq in enumerate(mcqs):
    print(f"\nQuestion {i+1}: {mcq['question']}")
    for j, opt in enumerate(mcq['options']):
        print(f"  {chr(65+j)}. {opt}")
    print(f"Answer: {mcq['answer']}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\atulm\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

Rule-based fallback triggered for answer: 'Shakespear'
Rule-based fallback triggered for answer: 'shakespear'

Question 1: What is the name of the village in which a mysterious tree stood in the square?
  A. Eldermere
  B. named
  C. heart
  D. fruit
Answer: Eldermere

Question 2: Who felt an undeniable pull toward the shimmering fruit?
  A. morning
  B. instantly
  C. Young Emma
  D. legend
Answer: Young Emma

Question 3: Who is Shakespear?
  A. spread
  B. Shakespear
  C. felt
  D. spent
Answer: Shakespear

Question 4: What mysterious object stood tall in the town square?
  A. fulfill
  B. tree
  C. inspired
  D. tale
Answer: tree

Question 5: What is the name of the tree that stood tall in the town square?
  A. wildfire
  B. shakespear
  C. heart
  D. resembled
Answer: heart
