# 🧠 JEE Question Generation Model Benchmark
Compare multiple LLMs (GPT, Mistral, etc.) for generating JEE-style questions and evaluate their quality.

In [1]:
# Install dependencies if needed
# !pip install openai transformers pandas matplotlib sympy

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sympy import sympify, simplify
from openai import OpenAI
from transformers import pipeline

# Set up the OpenAI API client (only needed when benchmarking OpenAI models).
# The client below is constructed without arguments, so credentials are not
# passed in code; the SDK is expected to pick the key up from the
# OPENAI_API_KEY environment variable (confirm against the installed version).
# To set it from inside the notebook, uncomment the next line. Never commit a
# real key.
# os.environ['OPENAI_API_KEY'] = 'your_api_key_here'
client = OpenAI()

## 📂 Load Small Dataset

In [None]:
# Seed questions that the models are asked to produce variants of. Each record
# holds the question metadata, the answer options and a worked solution; the
# 'question' and 'solution' texts are inserted verbatim into the prompt, so
# they must contain correctly encoded characters (², ×) rather than mojibake.
data = [
    {
        "subject": "Physics",
        "topic": "Kinematics",
        "difficulty": "Medium",
        "question": "A car accelerates from rest at 2 m/s². What is its velocity after 5 seconds?",
        "options": ["5 m/s", "10 m/s", "15 m/s", "20 m/s"],
        "correct_answer": "10 m/s",
        "solution": "v = u + at = 0 + 2×5 = 10 m/s",
    },
    {
        "subject": "Maths",
        "topic": "Quadratic Equations",
        "difficulty": "Easy",
        "question": "Find the roots of the equation x² - 5x + 6 = 0.",
        "options": ["1,6", "2,3", "3,5", "1,5"],
        "correct_answer": "2,3",
        "solution": "Roots are (x-2)(x-3)=0",
    },
]

# One row per seed question; the trailing bare name displays the table in a notebook.
df = pd.DataFrame(data)
df

## ⚙️ Define Prompt Template

In [None]:
def build_prompt(question, solution):
    """Build the instruction prompt asking a model for one new JEE-style question.

    Args:
        question: Text of the seed question the new question should mirror.
        solution: Worked solution of the seed question.

    Returns:
        A multi-line prompt string that starts and ends with a newline and
        asks for JSON output with the keys question, options, correct_answer
        and explanation.
    """
    prompt_lines = [
        "",  # leading empty entry yields the opening newline
        "You are a question generator for JEE exams.",
        "Generate ONE new JEE-style question that tests the same concept as below.",
        "Rephrase or change the numbers/context but keep difficulty same.",
        "Also generate 4 options and a correct answer with explanation.",
        "",
        f"Question: {question}",
        f"Solution: {solution}",
        "",
        "Output JSON with keys: question, options, correct_answer, explanation.",
        "",  # trailing empty entry yields the closing newline
    ]
    return "\n".join(prompt_lines)

## 🤖 Model Query Functions

In [None]:
def generate_with_gpt(prompt, model="gpt-4o-mini", temperature=0.8):
    """Send ``prompt`` to an OpenAI chat model and return the reply text.

    The model name and sampling temperature are parameters so that several
    OpenAI models can be benchmarked with the same function; the defaults
    reproduce the previous hard-coded behaviour.

    Args:
        prompt: Text sent as the single message, with role "user".
        model: Chat model identifier. Defaults to "gpt-4o-mini".
        temperature: Sampling temperature. Defaults to 0.8.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    # Uses the module-level ``client`` created in the setup cell.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    return response.choices[0].message.content

# Pipelines already constructed, keyed by model name. Building a pipeline
# loads the model, so it is done once per model instead of once per prompt.
_HF_PIPELINES = {}


def generate_with_hf(model_name, prompt):
    """Generate text for ``prompt`` with a Hugging Face text-generation model.

    Args:
        model_name: Model identifier passed to ``pipeline(model=...)``.
        prompt: Input text to continue.

    Returns:
        The ``generated_text`` of the first result. Only the newly generated
        continuation is returned (the prompt is not echoed back), so the
        output is comparable with what ``generate_with_gpt`` returns.
    """
    generator = _HF_PIPELINES.get(model_name)
    if generator is None:
        generator = pipeline("text-generation", model=model_name, max_new_tokens=500)
        _HF_PIPELINES[model_name] = generator
    # return_full_text=False drops the prompt prefix that text-generation
    # pipelines otherwise include in 'generated_text'.
    result = generator(prompt, do_sample=True, temperature=0.8, return_full_text=False)
    return result[0]['generated_text']

## 🧩 Generate Variants from Models

In [None]:
# Models under comparison: display name -> callable taking a prompt string and
# returning the raw model output. A lambda is only needed when extra arguments
# (such as a Hugging Face model name) have to be bound.
models = {
    "GPT-4o-mini": generate_with_gpt,
    # "Mistral-7B": lambda p: generate_with_hf("mistralai/Mistral-7B-Instruct-v0.2", p),
}

# Query every model once per seed question and collect the raw outputs.
results = []
for _, row in df.iterrows():
    prompt = build_prompt(row['question'], row['solution'])
    for model_name, func in models.items():
        print(f"\n🔹 Generating with {model_name} for topic: {row['topic']}")
        output = func(prompt)
        results.append({
            'model': model_name,
            'subject': row['subject'],
            'topic': row['topic'],
            'base_question': row['question'],
            'generated_output': output,
        })

# One row per (seed question, model) pair; the bare expression previews it in a notebook.
results_df = pd.DataFrame(results)
results_df.head()

## ✅ Validation Example

In [None]:
def validate_math_expression(expr1, expr2):
    """Check whether two expressions are mathematically equal.

    Both inputs are parsed with ``sympify`` and their difference is
    simplified; the expressions count as equal when that difference is 0.

    Args:
        expr1: First expression (e.g. the string ``'2*5'``).
        expr2: Second expression (e.g. the string ``'10'``).

    Returns:
        True if the simplified difference equals 0; False if it does not or
        if either input cannot be parsed or subtracted.
    """
    # SECURITY NOTE(review): sympify evaluates strings with eval under the
    # hood. Model-generated text is untrusted input, so consider a restricted
    # parser before running this on raw LLM output.
    try:
        difference = simplify(sympify(expr1) - sympify(expr2))
    except Exception:
        # Parsing failures surface as several exception types (SympifyError,
        # SyntaxError, TypeError, ...). Catching Exception treats them all as
        # "not equal" while still letting KeyboardInterrupt and SystemExit
        # propagate, which the previous bare ``except:`` swallowed.
        return False
    return difference == 0

# Example usage:
# validate_math_expression('2*5', '10')

## 🧮 Human Evaluation Template

In [None]:
# Build the sheet that human reviewers fill in: a copy of the generation
# results plus one empty column per scoring criterion.
eval_df = results_df.copy()
for score_column in ('conceptual_accuracy', 'clarity', 'creativity', 'answer_validity', 'formatting'):
    eval_df[score_column] = None

# NOTE: this writes the file unconditionally, so re-running the cell replaces
# any scores already entered in evaluation_template.csv.
eval_df.to_csv('evaluation_template.csv', index=False)
print('📄 Saved evaluation_template.csv — fill scores (1–5) manually.')

## 📊 Visualization and Final Decision

In [None]:
# Load the manually scored sheet and compare models by their mean score per criterion.
score_columns = ['conceptual_accuracy', 'clarity', 'creativity', 'answer_validity', 'formatting']
scored = pd.read_csv('evaluation_template.csv')

# Scores are typed in by hand: coerce them to numbers so blanks or stray text
# become NaN instead of breaking the mean.
scored[score_columns] = scored[score_columns].apply(pd.to_numeric, errors='coerce')
avg_scores = scored.groupby('model')[score_columns].mean()

if avg_scores.isna().all().all():
    # Template not filled in yet (or no rows): there is nothing to plot or rank.
    final_choice = None
    print('No scores found in evaluation_template.csv — fill in the 1–5 scores before running this cell.')
else:
    avg_scores.plot(kind='bar', figsize=(10,5), title='Model Comparison (Mean Scores)')
    plt.show()

    # Overall winner: highest mean across all criteria (NaN entries are skipped).
    final_choice = avg_scores.mean(axis=1).idxmax()
    print(f'🏆 Best Performing Model: {final_choice}')