In [1]:
#Please use python 3.9 environment
# Working version link (in case this notebook is not running properly): https://www.kaggle.com/code/aathanush/cs787-question-generator/edit 
#Install these libraries if you haven't already (uncomment the line below to install them)
# !pip install -U transformers accelerate datasets huggingface_hub -q

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments





In [None]:
#Use this in case of out of memory errors. Else, no need to run this code
#!pip install GPUtil
#For clearing GPU cache
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             
    torch.cuda.empty_cache()
    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()                           


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


## Data Preprocessing

In [2]:
print("\n--- Step 1: Loading and Preprocessing Data ---")
# Requires internet connection as we are downloading data form huggingface
# Load the dataset from the Hugging Face Hub
try:
    df = pd.read_csv("hf://datasets/KadamParth/Ncert_dataset/NCERT_Dataset.csv")
    print(f"Successfully loaded dataset. Initial shape: {df.shape}")
except Exception as e:
    print(f"Error loading dataset: {e}")


# Filter for grades 10, 11, and 12
df = df[df['grade'].isin([10, 11, 12])]
print(f"Shape after filtering for grades [10, 11, 12]: {df.shape}")

# Filter for the specified subjects
allowed_subjects = ["Biology", "Chemistry", "Physics", "Science"]
df = df[df['subject'].isin(allowed_subjects)]
print(f"Shape after filtering for subjects {allowed_subjects}: {df.shape}")

# Drop rows with any null values (especially in 'Answer')
df.dropna(inplace=True)
print(f"Shape after dropping null values: {df.shape}")

# Reset index after filtering
df.reset_index(drop=True, inplace=True)


df['qg_input_text'] = df.apply(
    lambda row: f"Generate a {row['Difficulty']} question for a grade {row['grade']} {row['subject']} student using this context: {row['Explanation']}",
    axis=1
)

df['ag_input_text'] = df.apply(
    lambda row: f"Answer the following {row['Difficulty']} grade {row['grade']} {row['subject']} question. Explanation: {row['Explanation']} Question: {row['Question']}",
    axis=1
)




--- Step 1: Loading and Preprocessing Data ---
Successfully loaded dataset. Initial shape: (120406, 12)
Shape after filtering for grades [10, 11, 12]: (98199, 12)
Shape after filtering for subjects ['Biology', 'Chemistry', 'Physics', 'Science']: (30706, 12)
Shape after dropping null values: (30706, 12)


# MODEL TRAINING 


## T5 Transformer

In [3]:
def train_model(df, input_col, target_col, model_output_dir):
    """
    Trains a T5 model on the provided dataframe.

    Args:
        df (pd.DataFrame): The preprocessed dataframe.
        input_col (str): The name of the column containing the input text.
        target_col (str): The name of the column containing the target text.
        model_output_dir (str): The directory to save the trained model.
    """
    print(f"\n--- Starting Training for: {model_output_dir} ---")

    model_df = df[[input_col, target_col]].copy()
    model_df.rename(columns={input_col: 'input_text', target_col: 'target_text'}, inplace=True)

    dataset = Dataset.from_pandas(model_df)

    train_test_split_dataset = dataset.train_test_split(test_size=0.1)
    dataset_dict = DatasetDict({
        'train': train_test_split_dataset['train'],
        'validation': train_test_split_dataset['test']
    })
    print(f"Dataset prepared and split:\n{dataset_dict}")

    # MODEL_NAME = 't5-small'
    MODEL_NAME = 't5-base'
    #MODEL_NAME = "t5-large" t5-large was not working due to large size and lower compute hence its results were not published in report
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

    MAX_INPUT_LENGTH = 512  
    MAX_TARGET_LENGTH = 128 

    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples['input_text'],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding='max_length'
        )
        labels = tokenizer(
            text_target=examples['target_text'],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding='max_length'
        )
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
    print("Tokenization complete.")

    model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

    training_args = TrainingArguments(
    output_dir=model_output_dir,
    num_train_epochs=3,
    save_strategy='steps',
    eval_strategy='steps',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'./logs/{model_output_dir}',
    logging_steps=500, 
    save_steps=500,
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer
    )

    print(f"Starting training for {model_output_dir}. This might take a while...")
    trainer.train()

    print(f"Training finished. Saving model to {model_output_dir}")
    trainer.save_model(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)



## Training models

In [8]:

# --- Train Model 1: Question Generator ---
train_model(
    df=df,
    input_col='qg_input_text',
    target_col='Question',
    model_output_dir='./question_generator_model_t5base'
)

# --- Train Model 2: Answer Generator ---
train_model(
    df=df, 
    input_col='ag_input_text',
    target_col='Answer',
    model_output_dir='./answer_generator_model_t5base'
)


--- Starting Training for: ./question_generator_model_t5base ---
Dataset prepared and split:
DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 27635
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 3071
    })
})


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/27635 [00:00<?, ? examples/s]

Map:   0%|          | 0/3071 [00:00<?, ? examples/s]

Tokenization complete.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


Starting training for ./question_generator_model_t5base. This might take a while...




Step,Training Loss,Validation Loss
500,2.5902,0.217149
1000,0.2149,0.19987
1500,0.2074,0.186916
2000,0.1954,0.181906
2500,0.1908,0.176095
3000,0.1893,0.172204
3500,0.185,0.170081
4000,0.1676,0.167327
4500,0.1667,0.165953
5000,0.1685,0.162993


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training finished. Saving model to ./question_generator_model_t5base

--- Starting Training for: ./answer_generator_model_t5base ---
Dataset prepared and split:
DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 27635
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 3071
    })
})


Map:   0%|          | 0/27635 [00:00<?, ? examples/s]

Map:   0%|          | 0/3071 [00:00<?, ? examples/s]

Tokenization complete.


  trainer = Trainer(


Starting training for ./answer_generator_model_t5base. This might take a while...




Step,Training Loss,Validation Loss
500,2.6148,0.577911
1000,0.6226,0.542167
1500,0.602,0.521803
2000,0.5739,0.509029
2500,0.5567,0.499482
3000,0.5503,0.491342
3500,0.5459,0.486965
4000,0.5253,0.482715
4500,0.5153,0.478218
5000,0.5146,0.474124


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Training finished. Saving model to ./answer_generator_model_t5base


## Inference

In [11]:

print("\n\n--- Step 4: Inference Example ---")
qg_model_path = './question_generator_model_t5base'
ag_model_path = './answer_generator_model_t5base'

qg_tokenizer = T5Tokenizer.from_pretrained(qg_model_path)
qg_model = T5ForConditionalGeneration.from_pretrained(qg_model_path).to(device)

ag_tokenizer = T5Tokenizer.from_pretrained(ag_model_path)
ag_model = T5ForConditionalGeneration.from_pretrained(ag_model_path).to(device)

print("Models loaded successfully.")

# --- Create generation functions ---
def generate_question(explanation, grade, subject, difficulty, complexity):
    """Generates a question using the fine-tuned QG model."""
    input_text = f"Generate a {difficulty} question for a grade {grade} {subject} student using this context: {explanation}"
    
    inputs = qg_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    outputs = qg_model.generate(
        inputs['input_ids'],
        max_length=128,
        num_beams=5,
        early_stopping=True
    )
    
    generated_question = qg_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_question

def generate_answer(explanation, grade, subject, difficulty, question):
    """Generates an answer using the fine-tuned AG model."""
    input_text = f"Answer the following {difficulty} grade {grade} {subject} question. Explanation: {explanation} Question: {question}"
    
    inputs = ag_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    outputs = ag_model.generate(
        inputs['input_ids'],
        max_length=256, # Answers can be longer
        num_beams=5,
        early_stopping=True
    )
    
    generated_answer = ag_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_answer


# --- Let's test it with a sample from our dataset ---
sample_row = df.iloc[15090] # Take a random sample to test

explanation = sample_row['Explanation']
grade = sample_row['grade']
subject = sample_row['subject']
difficulty = sample_row['Difficulty']
complexity = sample_row['QuestionComplexity']

print("\n--- Generating a new question based on this context: ---")
print(f"Subject: {subject}, Grade: {grade}, Difficulty: {difficulty}")
print(f"Explanation: {explanation[:500]}...") # Print first 200 chars

# 1. Generate the question
generated_q = generate_question(explanation, grade, subject, difficulty, complexity)
print("\nGENERATED QUESTION:")
print(generated_q)
print("\nORIGINAL QUESTION (for comparison):")
print(sample_row['Question'])

# 2. Generate the answer for the newly generated question
generated_a = generate_answer(explanation, grade, subject, difficulty, generated_q)
print("\nGENERATED ANSWER (for the new question):")
print(generated_a)
print("\nORIGINAL ANSWER (for comparison):")
print(sample_row['Answer'])




--- Step 4: Inference Example ---
Models loaded successfully.

--- Generating a new question based on this context: ---
Subject: Chemistry, Grade: 11, Difficulty: Medium
Explanation: In the periodic table, the electronic configuration of elements determines various properties, including atomic radii. For Group 1 elements (alkali metals like Na, K, Rb, Cs), the atomic radius increases as you move down the group. This is because each subsequent element has an additional electron shell, which increases the distance from the nucleus to the outermost electrons. In contrast, for Group 17 elements (halogens like F, Cl, Br, I), the atomic radius decreases as you move down the group....

GENERATED QUESTION:
Describe the relationship between the number of electron shells and the atomic radius for Group 1 elements.

ORIGINAL QUESTION (for comparison):
Describe the general outer electronic configuration of p-block elements.

GENERATED ANSWER (for the new question):
The atomic radius for Group 1 

## Evaluation

In [12]:
print("\n\n--- Step 5: Evaluating Model Performance ---")

#!pip install evaluate sacrebleu bert-score tqdm -q

import evaluate
from tqdm.auto import tqdm
import numpy as np

# Load the metric calculators
try:
    bleu_metric = evaluate.load('sacrebleu')
    bertscore_metric = evaluate.load('bertscore')
    print("Evaluation metrics loaded successfully.")
except Exception as e:
    print(f"Could not load metrics: {e}")

# Let's evaluate on a random sample of 100 items from our original filtered dataframe
# to get a good statistical measure without taking too long.
EVAL_SAMPLE_SIZE = 100
if len(df) > EVAL_SAMPLE_SIZE:
    eval_df = df.sample(n=EVAL_SAMPLE_SIZE, random_state=42)
else:
    eval_df = df

print(f"Running evaluation on {len(eval_df)} samples...")

# Store the generated texts and the reference texts
generated_questions = []
reference_questions = []
generated_answers = []
reference_answers = []

# Loop through the evaluation sample and generate predictions
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Generating for Evaluation"):
    # --- Part 1: Evaluate the Question Generator ---
    # Get the reference (original) question
    ref_q = row['Question']
    reference_questions.append(ref_q)
    
    # Generate a new question from the context
    gen_q = generate_question(
        explanation=row['Explanation'],
        grade=row['grade'],
        subject=row['subject'],
        difficulty=row['Difficulty'],
        complexity=row['QuestionComplexity']
    )
    generated_questions.append(gen_q)

    # --- Part 2: Evaluate the Answer Generator ---
    # Get the reference (original) answer
    ref_a = row['Answer']
    reference_answers.append(ref_a)
    
    # Generate an answer for the *newly generated question*
    # This tests the full pipeline
    gen_a = generate_answer(
        explanation=row['Explanation'],
        grade=row['grade'],
        subject=row['subject'],
        difficulty=row['Difficulty'],
        question=gen_q # Use the generated question as input
    )
    generated_answers.append(gen_a)

# --- Now, calculate the scores ---

# For BLEU, the references need to be in a list of lists.
bleu_references_q = [[q] for q in reference_questions]
bleu_references_a = [[a] for a in reference_answers]

print("\n--- Calculating Scores for Question Generation Model ---")
try:
    # Calculate BLEU Score for Questions
    bleu_score_q = bleu_metric.compute(predictions=generated_questions, references=bleu_references_q)
    print(f"BLEU Score: {bleu_score_q['score']:.2f}")

    # Calculate BERTScore for Questions
    bert_score_q = bertscore_metric.compute(predictions=generated_questions, references=reference_questions, lang="en")
    # We take the average F1 score
    avg_f1_q = np.mean(bert_score_q['f1'])
    print(f"BERTScore (F1): {avg_f1_q:.4f}")

except Exception as e:
    print(f"An error occurred during question scoring: {e}")

print("\n--- Calculating Scores for Answer Generation Model ---")
try:
    # Calculate BLEU Score for Answers
    bleu_score_a = bleu_metric.compute(predictions=generated_answers, references=bleu_references_a)
    print(f"BLEU Score: {bleu_score_a['score']:.2f}")

    # Calculate BERTScore for Answers
    bert_score_a = bertscore_metric.compute(predictions=generated_answers, references=reference_answers, lang="en")
    avg_f1_a = np.mean(bert_score_a['f1'])
    print(f"BERTScore (F1): {avg_f1_a:.4f}")

except Exception as e:
    print(f"An error occurred during answer scoring: {e}")


# --- Explanation of Scores ---
print("\n--- What do these scores mean? ---")
print("BLEU Score: Measures word/phrase overlap. A score of 0-10 is poor, 10-20 is okay, 20-30 is good, and >30 is considered high quality. It's scaled from 0 to 100.")
print("BERTScore (F1): Measures semantic similarity (meaning). A higher score is better (ranges from 0 to 1). It is generally a more reliable indicator of quality than BLEU for tasks like this.")



--- Step 5: Evaluating Model Performance ---
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics loaded successfully.
Running evaluation on 100 samples...


Generating for Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]


--- Calculating Scores for Question Generation Model ---
BLEU Score: 28.06


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore (F1): 0.9217

--- Calculating Scores for Answer Generation Model ---
BLEU Score: 15.88
BERTScore (F1): 0.8832

--- What do these scores mean? ---
BLEU Score: Measures word/phrase overlap. A score of 0-10 is poor, 10-20 is okay, 20-30 is good, and >30 is considered high quality. It's scaled from 0 to 100.
BERTScore (F1): Measures semantic similarity (meaning). A higher score is better (ranges from 0 to 1). It is generally a more reliable indicator of quality than BLEU for tasks like this.


# BART

In [4]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

torch.set_num_threads(4) 
import gc
gc.collect()
torch.cuda.empty_cache()
from transformers import BartTokenizer, BartForConditionalGeneration

def train_model_bart(df, input_col, target_col, model_output_dir):
    """
    Trains a BART model on the provided dataframe.

    Args:
        df (pd.DataFrame): The preprocessed dataframe.
        input_col (str): The name of the column containing the input text.
        target_col (str): The name of the column containing the target text.
        model_output_dir (str): The directory to save the trained model.
    """
    print(f"\n--- Starting Training for: {model_output_dir} ---")

    model_df = df[[input_col, target_col]].copy()
    model_df.rename(columns={input_col: 'input_text', target_col: 'target_text'}, inplace=True)

    dataset = Dataset.from_pandas(model_df)

    train_test_split_dataset = dataset.train_test_split(test_size=0.1)
    dataset_dict = DatasetDict({
        'train': train_test_split_dataset['train'],
        'validation': train_test_split_dataset['test']
    })
    print(f"Dataset prepared and split:\n{dataset_dict}")


    MODEL_NAME = 'facebook/bart-base'

    tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)

    MAX_INPUT_LENGTH = 512  
    MAX_TARGET_LENGTH = 512 

    def tokenize_function(examples):
        model_inputs = tokenizer(
            examples['input_text'],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding='max_length'
        )
        labels = tokenizer(
            text_target=examples['target_text'],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding='max_length'
        )
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
    print("Tokenization complete.")

    model = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=3,
        save_strategy='steps',
        eval_strategy='steps',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/{model_output_dir}',
        logging_steps=500,
        save_steps=500,
        load_best_model_at_end=True,
        save_total_limit=2,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        tokenizer=tokenizer
    )

    print(f"Starting training for {model_output_dir}. This might take a while...")
    trainer.train()

    print(f"Training finished. Saving model to {model_output_dir}")
    trainer.save_model(model_output_dir)
    tokenizer.save_pretrained(model_output_dir)

In [5]:
train_model_bart(
    df=df, 
    input_col='qg_input_text',
    target_col='Question',
    model_output_dir='./question_generator_model_bart'
)

train_model_bart(
    df=df, 
    input_col='ag_input_text',
    target_col='Answer',
    model_output_dir='./answer_generator_model_bart'
)

Map (num_proc=4):   0%|          | 0/27635 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3071 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training... (This will still be slower than GPU, but optimized)




Step,Training Loss
50,4.6048
100,0.8307
150,0.5091
200,0.4618
250,0.4412
300,0.443
350,0.4208
400,0.4326
450,0.4062
500,0.4165


Saving model to ./question_generator_model_bart




Map (num_proc=4):   0%|          | 0/27635 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3071 [00:00<?, ? examples/s]

  trainer = Trainer(


Starting training... (This will still be slower than GPU, but optimized)




Step,Training Loss
50,3.7844
100,1.6354
150,1.3491
200,1.2875
250,1.2775
300,1.2279
350,1.2179
400,1.1685
450,1.1703
500,1.1601


Saving model to ./answer_generator_model_bart




## Inference

In [8]:
print("\n\n--- Step 4: Inference Example (BART) ---")

from transformers import BartTokenizer, BartForConditionalGeneration

qg_model_path = './question_generator_model_bart'
ag_model_path = './answer_generator_model_bart'

qg_tokenizer = BartTokenizer.from_pretrained(qg_model_path)
qg_model = BartForConditionalGeneration.from_pretrained(qg_model_path).to(device)

ag_tokenizer = BartTokenizer.from_pretrained(ag_model_path)
ag_model = BartForConditionalGeneration.from_pretrained(ag_model_path).to(device)

print("BART models loaded successfully.")

def generate_question(explanation, grade, subject, difficulty, complexity):
    """Generates a question using the fine-tuned BART QG model."""
    input_text = f"Generate a {difficulty} question with complexity {complexity:.2f} for a grade {grade} {subject} student. Explanation: {explanation}"
    
    inputs = qg_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    outputs = qg_model.generate(
        inputs['input_ids'],
        max_length=128,
        num_beams=5,
        early_stopping=True
    )
    
    generated_question = qg_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_question

def generate_answer(explanation, grade, subject, difficulty, question):
    """Generates an answer using the fine-tuned BART AG model."""
    input_text = f"Answer the following {difficulty} grade {grade} {subject} question. Explanation: {explanation} Question: {question}"
    
    inputs = ag_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    outputs = ag_model.generate(
        inputs['input_ids'],
        max_length=256, # Answers can be longer
        num_beams=5,
        early_stopping=True
    )
    
    generated_answer = ag_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_answer


sample_row = df.iloc[100] # Take a random sample to test

explanation = sample_row['Explanation']
grade = sample_row['grade']
subject = sample_row['subject']
difficulty = sample_row['Difficulty']
complexity = sample_row['QuestionComplexity']

print("\n--- Generating a new question based on this context (BART): ---")
print(f"Subject: {subject}, Grade: {grade}, Difficulty: {difficulty}")
print(f"Explanation: {explanation[:200]}...") # Print first 200 chars

# 1. Generate the question
generated_q = generate_question(explanation, grade, subject, difficulty, complexity)
print("\nGENERATED QUESTION (BART):")
print(generated_q)
print("\nORIGINAL QUESTION (for comparison):")
print(sample_row['Question'])

# 2. Generate the answer for the newly generated question
generated_a = generate_answer(explanation, grade, subject, difficulty, generated_q)
print("\nGENERATED ANSWER (for the new question) (BART):")
print(generated_a)
print("\nORIGINAL ANSWER (for comparison):")
print(sample_row['Answer'])





--- Step 4: Inference Example (BART) ---
BART models loaded successfully.

--- Generating a new question based on this context (BART): ---
Subject: Physics, Grade: 12, Difficulty: Medium
Explanation: Electrons carry a fundamental unit of charge, which is measured in Coulombs (C). The charge of one electron is approximately 1.6 × 10^-19 C. To calculate the time required to accumulate a certain amou...

GENERATED QUESTION (BART):
Describe the relationship between current, charge, and time in an electric circuit.

ORIGINAL QUESTION (for comparison):
If 10^10 electrons move from one body to another every second, how long will it take to transfer 1 C of charge?

GENERATED ANSWER (for the new question) (BART):
To calculate the time required to accumulate a certain amount of electric charge, you need to understand the rate at which electrons are transferred. This rate is often given as current (I).

ORIGINAL ANSWER (for comparison):
The charge transferred per second is 1.6 Ã— 10^-19 C Ã— 10

## Evaluation

In [9]:

print("\n\n--- Step 5: Evaluating BART Model Performance ---")

# Make sure the libraries are installed
!pip install evaluate sacrebleu bert-score tqdm -q

import evaluate
from tqdm.auto import tqdm
import numpy as np

# Load the metric calculators
try:
    bleu_metric = evaluate.load('sacrebleu')
    bertscore_metric = evaluate.load('bertscore')
    print("Evaluation metrics loaded successfully.")
except Exception as e:
    print(f"Could not load metrics: {e}")

# Let's evaluate on a random sample of 100 items
EVAL_SAMPLE_SIZE = 100
if len(df) > EVAL_SAMPLE_SIZE:
    eval_df = df.sample(n=EVAL_SAMPLE_SIZE, random_state=42)
else:
    eval_df = df

print(f"Running evaluation on {len(eval_df)} samples...")

# Store the generated texts and the reference texts
generated_questions = []
reference_questions = []
generated_answers = []
reference_answers = []

# Loop through the evaluation sample and generate predictions
# This loop is IDENTICAL to the T5 one. It works because our
# `generate_question` and `generate_answer` functions are now
# powered by the BART models you just loaded.
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Generating for Evaluation (BART)"):
    # --- Part 1: Evaluate the Question Generator ---
    ref_q = row['Question']
    reference_questions.append(ref_q)
    
    gen_q = generate_question(
        explanation=row['Explanation'],
        grade=row['grade'],
        subject=row['subject'],
        difficulty=row['Difficulty'],
        complexity=row['QuestionComplexity']
    )
    generated_questions.append(gen_q)

    # --- Part 2: Evaluate the Answer Generator ---
    ref_a = row['Answer']
    reference_answers.append(ref_a)
    
    gen_a = generate_answer(
        explanation=row['Explanation'],
        grade=row['grade'],
        subject=row['subject'],
        difficulty=row['Difficulty'],
        question=gen_q # Use the generated question
    )
    generated_answers.append(gen_a)

# --- Now, calculate the scores ---

bleu_references_q = [[q] for q in reference_questions]
bleu_references_a = [[a] for a in reference_answers]

print("\n--- Calculating Scores for Question Generation Model (BART) ---")
try:
    bleu_score_q = bleu_metric.compute(predictions=generated_questions, references=bleu_references_q)
    print(f"BLEU Score: {bleu_score_q['score']:.2f}")

    bert_score_q = bertscore_metric.compute(predictions=generated_questions, references=reference_questions, lang="en")
    avg_f1_q = np.mean(bert_score_q['f1'])
    print(f"BERTScore (F1): {avg_f1_q:.4f}")

except Exception as e:
    print(f"An error occurred during question scoring: {e}")

print("\n--- Calculating Scores for Answer Generation Model (BART) ---")
try:
    bleu_score_a = bleu_metric.compute(predictions=generated_answers, references=bleu_references_a)
    print(f"BLEU Score: {bleu_score_a['score']:.2f}")

    bert_score_a = bertscore_metric.compute(predictions=generated_answers, references=reference_answers, lang="en")
    avg_f1_a = np.mean(bert_score_a['f1'])
    print(f"BERTScore (F1): {avg_f1_a:.4f}")

except Exception as e:
    print(f"An error occurred during answer scoring: {e}")



--- Step 5: Evaluating BART Model Performance ---


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluation metrics loaded successfully.
Running evaluation on 100 samples...


Generating for Evaluation (BART):   0%|          | 0/100 [00:00<?, ?it/s]


--- Calculating Scores for Question Generation Model (BART) ---
BLEU Score: 25.79


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore (F1): 0.9193

--- Calculating Scores for Answer Generation Model (BART) ---
BLEU Score: 12.71
BERTScore (F1): 0.8796


# BERT-SQG Implementation for Question Generation

## Model definition and Training setup

In [None]:
#!pip install pandas torch transformers datasets scikit-learn sentencepiece accelerate tqdm -q


class BERT_SQG(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased'):
        super().__init__()
        # Load the BERT configuration
        self.config = BertConfig.from_pretrained(bert_model_name)
        
        # Load the base BERT model (the Encoder)
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        # Add the classification head on top
        # This head predicts a word from the vocab for a given hidden state
        self.cls = BertOnlyMLMHead(self.config)
        
    def forward(self, input_ids, attention_mask):
        # Pass the input through the base BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Get the hidden state of all tokens
        sequence_output = outputs[0]
        
        # Pass the hidden states to the MLM head to get logits for each token
        prediction_scores = self.cls(sequence_output)
        
        return prediction_scores

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Instantiate the model and resize token embeddings for the new [HL] token
model = BERT_SQG()
model.bert.resize_token_embeddings(len(tokenizer))
model.to(device)

print("Model defined and loaded.")


def create_training_examples(row):
    context = row['Explanation']
    answer = row['Answer']
    question = row['Question']
    
        
    prompt = f"[CLS] {context} [SEP] {answer} [SEP]"
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    
    question_tokens = tokenizer.encode(question, add_special_tokens=False)
    question_tokens.append(tokenizer.sep_token_id) # Add [SEP] as end token

    examples = []
    
    for i in range(len(question_tokens)):
        generated_part = question_tokens[:i] 
        input_tokens = prompt_tokens + generated_part + [tokenizer.mask_token_id]
        
        # 2. Create the label
        # The label is the token we are trying to predict
        label_token = question_tokens[i]
        
        # Truncate to BERT's max length
        input_tokens = input_tokens[:511] # Leave one spot for the [MASK]
        
        # If we truncated the prompt, we can't create examples
        if len(prompt_tokens) >= 511:
            break
            
        # Ensure [MASK] is at the end if truncated
        if input_tokens[-1] != tokenizer.mask_token_id:
            input_tokens[-1] = tokenizer.mask_token_id
            
        examples.append({
            'input_ids': input_tokens,
            'label_id': label_token
        })
        
        # Stop if we've already predicted the [SEP] token
        if label_token == tokenizer.sep_token_id:
            break
            
    return examples

all_training_examples = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    all_training_examples.extend(create_training_examples(row))

print(f"Created {len(all_training_examples)} training examples from {len(df)} rows.")

# Create a custom PyTorch Dataset
class SQGDataset(Dataset):
    def __init__(self, examples, tokenizer, max_len=512):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        input_ids = example['input_ids']
        label_id = example['label_id']
        
        # Pad the input_ids
        padding_length = self.max_len - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * padding_length
        input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
        
        # Find the index of the [MASK] token
        # This is where we will get the logits from
        try:
            mask_index = input_ids.index(self.tokenizer.mask_token_id)
        except ValueError:
            # Should not happen, but as a fallback
            mask_index = len(example['input_ids']) - 1 

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'mask_index': torch.tensor(mask_index, dtype=torch.long),
            'label_id': torch.tensor(label_id, dtype=torch.long)
        }

train_dataset = SQGDataset(all_training_examples, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True) # Adjust batch_size based on VRAM


## Training

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-5) # [cite: 285]
criterion = nn.CrossEntropyLoss()
NUM_EPOCHS = 1 

model.train()
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    loop = tqdm(train_loader, leave=True)
    
    for batch in loop:
        # Send data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        mask_indices = batch['mask_index'].to(device)
        labels = batch['label_id'].to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        logits = model(input_ids, attention_mask)

        # Get the logits for *only* the [MASK] token
        # This is the core logic from the paper [cite: 141-144]
        # We need to gather the logits from the correct index for each item in the batch
        mask_logits = logits.gather(1, mask_indices.view(-1, 1, 1).expand(-1, -1, logits.size(-1))).squeeze(1)
        
        # Calculate loss
        loss = criterion(mask_logits, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

print("Training finished.")

# Save the model
model.bert.save_pretrained("./bert_sqg_model")
tokenizer.save_pretrained("./bert_sqg_model")
torch.save(model.cls.state_dict(), "./bert_sqg_model/cls_head.pt")
print("Model saved to ./bert_sqg_model")



## Inference

In [None]:

print("\n--- Running Inference with BERT-SQG ---")

# Load the model back (for demonstration)
model = BERT_SQG()
model.bert = BertModel.from_pretrained("./bert_sqg_model")
model.cls.load_state_dict(torch.load("./bert_sqg_model/cls_head.pt"))
model.to(device)
model.eval()
tokenizer = BertTokenizer.from_pretrained("./bert_sqg_model")

def generate_question_bert(context, answer, max_gen_len=20):
    """
    Generates a question token-by-token, as shown in
    Table 1 and 2 of the paper[cite: 158, 259].
    """
    model.eval()
    
    # Format the prompt
        
    prompt = f"[CLS] {context} [SEP] {answer} [SEP]"
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    
    # Start with the [MASK] token
    generated_tokens = []
    input_tokens = prompt_tokens + [tokenizer.mask_token_id]
    
    for _ in range(max_gen_len):
        # Convert to tensor
        input_ids = torch.tensor([input_tokens]).to(device)
        # Create a simple attention mask
        attention_mask = torch.ones_like(input_ids).to(device)
        
        with torch.no_grad():
            logits = model(input_ids, attention_mask)
        
        # Get the logits for the last token (the [MASK] token)
        # This is the core of the sequential generation [cite: 141]
        next_token_logits = logits[0, -1, :]
        
        # Get the predicted token ID
        predicted_token_id = torch.argmax(next_token_logits).item()
        
        # If it's the [SEP] token, we're done
        if predicted_token_id == tokenizer.sep_token_id:
            break
            
        # Add the new token to our generated list
        generated_tokens.append(predicted_token_id)
        
        # Prepare the input for the next loop:
        # [CLS]...[SEP]...[SEP] + generated_tokens + [MASK]
        input_tokens = prompt_tokens + generated_tokens + [tokenizer.mask_token_id]
        
        # Stop if we exceed max length
        if len(input_tokens) >= 512:
            break
            
    # Decode the generated tokens
    return tokenizer.decode(generated_tokens)


# --- Let's test it with a sample ---
sample_row = df.iloc[5]
explanation = sample_row['Explanation']
answer = sample_row['Answer']
original_question = sample_row['Question']

print("\n--- Generating a new question (BERT-SQG): ---")
print(f"CONTEXT: {explanation[:200]}...")
print(f"ANSWER: {answer}")

generated_q_bert = generate_question_bert(explanation, answer)

print("\nGENERATED QUESTION (BERT-SQG):")
print(generated_q_bert)
print("\nORIGINAL QUESTION (for comparison):")
print(original_question)

## Evaluation

In [None]:
print("\n\n--- Step 6: Evaluating BERT-SQG Model Performance ---")

!pip install evaluate sacrebleu bert-score tqdm -q

import evaluate
from tqdm.auto import tqdm
import numpy as np

# Load the metric calculators
try:
    bleu_metric = evaluate.load('sacrebleu')
    bertscore_metric = evaluate.load('bertscore')
    print("Evaluation metrics loaded successfully.")
except Exception as e:
    print(f"Could not load metrics: {e}")

# Let's evaluate on a random sample of 100 items from our original filtered dataframe.
# This keeps the comparison fair, as we used 100 for T5 and BART.
EVAL_SAMPLE_SIZE = 100
if len(df) > EVAL_SAMPLE_SIZE:
    # Use the main 'df' to get a good evaluation sample
    eval_df = df.sample(n=EVAL_SAMPLE_SIZE, random_state=42)
else:
    eval_df = df

print(f"Running evaluation on {len(eval_df)} samples...")

# Store the generated texts and the reference texts
generated_questions = []
reference_questions = []

# Loop through the evaluation sample and generate predictions
# We will use our custom generate_question_bert function
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Generating for Evaluation (BERT-SQG)"):
    # Get the reference (original) question
    ref_q = row['Question']
    reference_questions.append(ref_q)
    
    # Generate a new question from the context using our custom function
    gen_q = generate_question_bert(
        context=row['Explanation'],
        answer=row['Answer']
    )
    generated_questions.append(gen_q)


# --- Now, calculate the scores ---

# For BLEU, the references need to be in a list of lists.
bleu_references_q = [[q] for q in reference_questions]

print("\n--- Calculating Scores for Question Generation Model (BERT-SQG) ---")
try:
    # Calculate BLEU Score for Questions
    bleu_score_q = bleu_metric.compute(predictions=generated_questions, references=bleu_references_q)
    print(f"BLEU Score: {bleu_score_q['score']:.2f}")

    # Calculate BERTScore for Questions
    bert_score_q = bertscore_metric.compute(predictions=generated_questions, references=reference_questions, lang="en")
    # We take the average F1 score
    avg_f1_q = np.mean(bert_score_q['f1'])
    print(f"BERTScore (F1): {avg_f1_q:.4f}")

except Exception as e:
    print(f"An error occurred during question scoring: {e}")

print("\n--- Model Comparison ---")
print("You now have the BLEU and BERTScore for your BERT-SQG baseline.")
print("Compare these scores to the ones you got for T5 and BART.")
print(f"As the paper found [cite: D19-5821.pdf], this 'BERT-SQG' should be much better than a naive BERT,")
print("but you can now see how it compares to more modern architectures like T5 and BART!")