# Process generated MCQs and score each MCQ with automated scoring techniques (BLEU, ROUGE, BERTScore)

In [11]:
from datetime import date
import json
import time
import pandas as pd

# Root folder (relative to the notebook) under which the data files live.
base_folder_path = "../data"

# The run date is hard-coded rather than read from the clock; it is rendered
# as a compact YYYYMMDD stamp.
today_date = date(year=2025, month=8, day=23)
today_date_string = today_date.strftime("%Y%m%d")

# Folder holding the generated MCQ files.
todays_generated_ca_mcqs_folder = f"{base_folder_path}/generated_ca_mcqs"
print(todays_generated_ca_mcqs_folder)

../data/generated_ca_mcqs


## BLEU SCORE FUNCTION

In [40]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Use smoothing for better results with short sentences.
# `method1` is one of the smoothing variants offered by NLTK's
# SmoothingFunction; `bleu_score` passes it to `sentence_bleu`.
smooth_fn = SmoothingFunction().method1

def bleu_score(row):
    """Return the smoothed sentence-level BLEU of ``row["output"]`` vs ``row["input"]``.

    Parameters
    ----------
    row : mapping (for example a ``pandas.Series`` passed by ``DataFrame.apply``)
        Must provide ``"input"`` (used as the single reference) and
        ``"output"`` (used as the candidate).

    Returns
    -------
    number
        The value returned by ``sentence_bleu`` rounded to 6 decimal places.

    Notes
    -----
    Texts are tokenized by whitespace. A value that is not a string (``None``
    or ``NaN`` coming from a JSON null) is treated as empty text instead of
    raising ``AttributeError`` on ``.split()``.
    """
    def _as_text(value):
        # Anything that is not a string is scored as an empty text.
        return value if isinstance(value, str) else ""

    reference_tokens = _as_text(row["input"]).split()
    candidate_tokens = _as_text(row["output"]).split()

    # sentence_bleu expects a list of references, hence the extra list level.
    raw_score = sentence_bleu(
        [reference_tokens], candidate_tokens, smoothing_function=smooth_fn
    )
    return round(raw_score, 6)

## ROUGE SCORE FUNCTION

In [54]:
from rouge_score import rouge_scorer
# Shared ROUGE scorer, built once and reused by `rouge_scores`; configured
# for the rouge1, rouge2 and rougeL variants with stemming enabled.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

def rouge_scores(row):
    """Return the mean ROUGE F-measure of ``row["output"]`` against ``row["input"]``.

    Parameters
    ----------
    row : mapping (for example a ``pandas.Series`` passed by ``DataFrame.apply``)
        Must provide ``"input"`` (passed as the target) and ``"output"``
        (passed as the prediction).

    Returns
    -------
    float
        Arithmetic mean of the rouge1, rouge2 and rougeL F-measures, rounded
        to 6 decimal places.

    Notes
    -----
    A value that is not a string (``None`` or ``NaN`` coming from a JSON null)
    is scored as empty text instead of being handed to the scorer as-is.
    """
    def _as_text(value):
        # Anything that is not a string is scored as an empty text.
        return value if isinstance(value, str) else ""

    # Uses the module-level `scorer`.
    per_variant = scorer.score(_as_text(row["input"]), _as_text(row["output"]))

    mean_f_measure = (
        per_variant["rouge1"].fmeasure
        + per_variant["rouge2"].fmeasure
        + per_variant["rougeL"].fmeasure
    ) / 3
    return round(mean_f_measure, 6)

## BERTScore FUNCTION

In [67]:
from bert_score import score

# One BERTScorer per language, created lazily. The module-level
# `bert_score.score` helper builds a fresh model on every call, which is very
# expensive when this function is applied row by row over a DataFrame.
_BERT_SCORERS = {}


def _get_bert_scorer(lang):
    """Return the cached ``BERTScorer`` for ``lang``, creating it on first use."""
    if lang not in _BERT_SCORERS:
        # Imported here because the file only imports `score` from bert_score.
        from bert_score import BERTScorer

        _BERT_SCORERS[lang] = BERTScorer(lang=lang)
    return _BERT_SCORERS[lang]


def get_bertscore(row, lang="en"):
    """Return the mean of BERTScore precision, recall and F1 for one row.

    Parameters
    ----------
    row : mapping (for example a ``pandas.Series`` passed by ``DataFrame.apply``)
        Must provide ``"output"`` (scored as the candidate) and ``"input"``
        (scored as the reference).
    lang : str, default "en"
        Language code used to select the scoring model.

    Returns
    -------
    float
        ``(P + R + F1) / 3`` rounded to 6 decimal places.

    Notes
    -----
    The underlying model is loaded once per language and reused across calls.
    NOTE(review): averaging P, R and F1 is a custom aggregate; confirm it is
    intended rather than reporting F1 alone.
    """
    bert_scorer = _get_bert_scorer(lang)
    P, R, F1 = bert_scorer.score([row["output"]], [row["input"]], verbose=False)

    # Each result holds a single value because exactly one pair is scored.
    mean_score = (float(P) + float(R) + float(F1)) / 3
    return round(mean_score, 6)
    

## Review and score SLM (GPT-2 based fine-tuned model) generated MCQs using automated scoring techniques

In [65]:
# JSON file with the SLM (gpt2-large) generated MCQs for the configured date;
# it is read with pd.read_json in the scoring cell.
slm_genertaed_mcqs_json_file = f"{todays_generated_ca_mcqs_folder}/__{today_date_string}_ca_mcqs_by_slm_gpt2-large.json"

# Excel workbook with the same stem, written when the scored frame is saved.
slm_genertaed_mcqs_excel_file = f"{todays_generated_ca_mcqs_folder}/__{today_date_string}_ca_mcqs_by_slm_gpt2-large.xlsx"

print(slm_genertaed_mcqs_excel_file)

../data/generated_ca_mcqs/__20250823_ca_mcqs_by_slm_gpt2-large.xlsx


In [None]:
# Load the SLM-generated MCQs and attach one column per automated metric.
# Each metric function receives a full row and reads its "input" and
# "output" fields.
slm_mcqs_df = pd.read_json(slm_genertaed_mcqs_json_file).assign(
    bleu_score=lambda frame: frame.apply(bleu_score, axis=1),
    rouge_score=lambda frame: frame.apply(rouge_scores, axis=1),
    bert_score=lambda frame: frame.apply(get_bertscore, axis=1),
)

# Show the generated text next to its three scores.
slm_mcqs_df[["output", "bleu_score", "rouge_score", "bert_score"]]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

Unnamed: 0,output,bleu_score,rouge_score,bert_score
0,"Question:Altaf Hussain Wagay, who was arrested...",0.000000,0.047758,0.781115
1,,0.000000,0.000000,0.000000
2,Question:Which political party has declared it...,0.000169,0.109416,0.798443
3,Question:Which ministry has issued the stateme...,0.000132,0.093527,0.779961
4,### Input:\nIndia has become the first country...,0.000561,0.101079,0.782893
...,...,...,...,...
143,,0.000000,0.000000,0.000000
144,Question:Which former head of a major global c...,0.000000,0.052623,0.779976
145,Correct Answer: B [Earth Sciences Ministry],0.000000,0.010264,0.780978
146,.,0.000000,0.000000,0.755848


## Review and score LLM (Google Gemini 2.5 Flash) generated MCQs using automated scoring techniques

In [19]:
# JSON file with the LLM (gemini-2.5-flash) generated MCQs for the configured
# date; it is read with pd.read_json in the scoring cell.
llm_genertaed_mcqs_json_file = f"{todays_generated_ca_mcqs_folder}/__{today_date_string}_ca_mcqs_by_llm_gemini-2.5-flash.json"

print(llm_genertaed_mcqs_json_file)

# Excel workbook with the same stem, written when the scored frame is saved.
llm_genertaed_mcqs_excel_file = f"{todays_generated_ca_mcqs_folder}/__{today_date_string}_ca_mcqs_by_llm_gemini-2.5-flash.xlsx"

../data/generated_ca_mcqs/__20250823_ca_mcqs_by_llm_gemini-2.5-flash.json


In [None]:
# Load the LLM-generated MCQs and attach one column per automated metric.
# Each metric function receives a full row and reads its "input" and
# "output" fields.
llm_mcqs_df = pd.read_json(llm_genertaed_mcqs_json_file).assign(
    bleu_score=lambda frame: frame.apply(bleu_score, axis=1),
    rouge_score=lambda frame: frame.apply(rouge_scores, axis=1),
    bert_score=lambda frame: frame.apply(get_bertscore, axis=1),
)

# Show the generated text next to its three scores.
llm_mcqs_df[["output", "bleu_score", "rouge_score", "bert_score"]]


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You sho

Unnamed: 0,output,bleu_score,rouge_score,bert_score
0,### Multiple Choice Question:\n\nAltaf Hussain...,0.002481,0.150922,0.821688
1,### Multiple Choice Question:\n\nWhat is the p...,0.002055,0.157219,0.827457
2,**Question:** What term did the All India Trin...,0.000013,0.068491,0.796635
3,**Question:** According to External Affairs Mi...,0.016770,0.188765,0.831243
4,### Multiple Choice Question:\n\nAccording to ...,0.004296,0.152623,0.826086
...,...,...,...,...
69,**Question:** Which of the following specifica...,0.001146,0.130053,0.810490
70,"**Question:** According to the provided text, ...",0.001869,0.131082,0.821401
71,Here's a Multiple Choice Question based on the...,0.000234,0.093720,0.804049
72,Here is a Multiple Choice Question based on th...,0.000242,0.117072,0.817740


### Manual SME-reviewed scores
Only the rubric columns are added below; the actual scores are to be entered in the Excel workbook under the corresponding columns.

In [76]:
# Rubric columns for the manual SME review. They are created empty here so
# that they appear in the frame when it is displayed and saved.
sme_rubric_columns = [
    "relevance_score",
    "clarity",
    "correctness",
    "distractor_quality",
    "cognitive_level",
]
slm_mcqs_df[sme_rubric_columns] = None


slm_mcqs_df[sme_rubric_columns]



Unnamed: 0,relevance_score,clarity,correctness,distractor_quality,cognitive_level
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
143,,,,,
144,,,,,
145,,,,,
146,,,,,


In [77]:
# Rubric columns for the manual SME review. They are created empty here so
# that they appear in the frame when it is displayed and saved.
sme_rubric_columns = [
    "relevance_score",
    "clarity",
    "correctness",
    "distractor_quality",
    "cognitive_level",
]
llm_mcqs_df[sme_rubric_columns] = None

llm_mcqs_df[sme_rubric_columns]

Unnamed: 0,relevance_score,clarity,correctness,distractor_quality,cognitive_level
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
69,,,,,
70,,,,,
71,,,,,
72,,,,,


## Save the MCQs along with their scores in Excel files

In [80]:
# Write the scored SLM frame to its workbook (row index omitted), then confirm.
slm_mcqs_df.to_excel(excel_writer=slm_genertaed_mcqs_excel_file, index=False)
print(f"Saved SLM generated MCQ and automated scores (bleu, rouge, bertscore) in {slm_genertaed_mcqs_excel_file}")

# Same for the scored LLM frame.
llm_mcqs_df.to_excel(excel_writer=llm_genertaed_mcqs_excel_file, index=False)
print(f"Saved LLM (Google Gemini) generated MCQ and automated scores (bleu, rouge, bertscore) in {llm_genertaed_mcqs_excel_file}")

Saved SLM generated MCQ and automated scores (bleu, rouge, bertscore) in ../data/generated_ca_mcqs/__20250823_ca_mcqs_by_slm_gpt2-large.xlsx
Saved LLM (Google Gemini) generated MCQ and automated scores (bleu, rouge, bertscore) in ../data/generated_ca_mcqs/__20250823_ca_mcqs_by_llm_gemini-2.5-flash.xlsx
