In [1]:
import pandas as pd
import itertools
import os
import random
from tqdm import tqdm
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from openai import OpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
level_quality = 2  # Threshold for good/bad MCQs
data_dir = "raiforukraine-MARIA/data/data_for_finetuning/Training_mcqs/"
datasets = ['gemma_9b_distractor_quality', 'Llama8B_0.1_distractor_quality', 'Llama1b_distractor_quality']  # Add 'phi3.5' when available
OPENAI_KEY = os.getenv('OPENAI_KEY')
max_gpt4o_attempts = 3

In [4]:
client = OpenAI(api_key=OPENAI_KEY)

In [5]:
class MCQQuestion(BaseModel):
    question: str = Field(description="The multiple-choice question")
    option_a: str = Field(description="The first answer option labeled 'A'")
    option_b: str = Field(description="The second answer option labeled 'B'")
    option_c: str = Field(description="The third answer option labeled 'C'")
    option_d: str = Field(description="The fourth answer option labeled 'D'")
    correct_option: str = Field(description="This consists only a letter of the correct option")

mcq_parser = JsonOutputParser(pydantic_object=MCQQuestion)

In [6]:
DEFAULT_PROMPT = """You are an expert in creating high-quality medical multiple-choice questions.
Create a challenging medical multiple-choice question with exactly one correct answer and three plausible distractors.

The distractors (incorrect options) quality is extremely important and should meet these criteria:
- They should be plausible to most test-takers
- They should represent common misconceptions
- They should require deep understanding to eliminate
- They should not be obviously incorrect or unrelated to the question
- They should not be easy to eliminate for knowledgeable test-takers

The question should test medical knowledge and critical thinking skills.
"""

In [7]:
DISTRACTORS_QUALITY_PROMPT = """You are tasked to evaluate the quality of the distractors of a multiple-choice question (incorrect options) on a scale of 1-5, where:
1 = POOR: Implausible, obviously incorrect, or unrelated to the question
2 = BELOW AVERAGE: Easy to eliminate, lacks plausibility for knowledgeable test-takers  
3 = AVERAGE: Somewhat plausible but contains minor flaws that make it distinguishable
4 = GOOD: Plausible to most test-takers, represents common misconceptions
5 = EXCELLENT: Highly plausible, represents sophisticated misconceptions, requires deep understanding to eliminate
Provide only a numerical score from 1 to 5 the best represents the level of distractor quality"""


In [8]:
def create_prompt_chain(prompt_text=DEFAULT_PROMPT):
    prompt_template = PromptTemplate(
        template="{prompt}.\n{format_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={
            "prompt": prompt_text,
            "format_instructions": mcq_parser.get_format_instructions(),
        },
    )
    model = ChatOpenAI(model="gpt-4o", temperature=0.5, api_key=OPENAI_KEY)
    
    chain = prompt_template | model | mcq_parser
    return chain

In [9]:
def call_openai_api(client, system_prompt, user_prompt, temp=0.5, max_completion_tokens=1):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            temperature=temp,
            max_tokens=max_completion_tokens,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [10]:
def generate_prompt_for_question(row,
                                question_col='question',
                                option_a_col='option_a',
                                option_b_col='option_b',
                                option_c_col='option_c',
                                option_d_col='option_d',
                                correct_option='correct_option',
                                include_options=True,
                                include_correct_option=True,
                                context_col=None):
    question_text = row[question_col]
    options = f"a) {row[option_a_col]}\nb) {row[option_b_col]}\nc) {row[option_c_col]}\nd) {row[option_d_col]}"
    correct_opt = row[correct_option]
    
    user_prompt_delimiter = "-----\n"
    user_prompt_question = f"Question:\n{question_text}\n"
    user_prompt_options = f"Options:\n{options}\n"
    user_prompt = user_prompt_delimiter + user_prompt_question
    if include_options:
        user_prompt += user_prompt_options
    if include_correct_option:
        correct_option_text = f"Correct option: {correct_opt}\n"
        user_prompt += correct_option_text
    user_prompt += user_prompt_delimiter
    
    if context_col is not None and context_col in row:
        mcq_context = f"""Context:\n-----\n{row[context_col]}\n-----\n"""
        user_prompt = mcq_context + user_prompt
    return user_prompt

In [11]:
def evaluate_distractor_quality(mcq_dict):
    # Convert dictionary to a pandas Series for use with generate_prompt_for_question
    row = pd.Series(mcq_dict)
    
    # Generate prompt for evaluation
    user_prompt = generate_prompt_for_question(
        row,
        question_col='question',
        option_a_col='option_a',
        option_b_col='option_b',
        option_c_col='option_c',
        option_d_col='option_d',
        correct_option='correct_option',
        include_options=True,
        include_correct_option=True
    )
    
    # Call API to evaluate
    quality_score = call_openai_api(
        client,
        DISTRACTORS_QUALITY_PROMPT,
        user_prompt,
        temp=0.5,
        max_completion_tokens=1
    )
    
    try:
        return int(quality_score.strip())
    except (ValueError, TypeError):
        print(f"Error parsing quality score: {quality_score}")
        return 0

In [12]:
def generate_good_mcq_with_gpt4o(question_text):
    try:
        # Create the chain
        chain = create_prompt_chain()
        
        # Generate MCQ using the DEFAULT_PROMPT
        # Just pass the question topic without additional instructions
        query = question_text
        generated_mcq = chain.invoke({"query": query})
        
        # Evaluate distractor quality
        quality = evaluate_distractor_quality(generated_mcq)
        
        return generated_mcq, quality
        
    except Exception as e:
        print(f"Error generating MCQ with GPT-4o: {e}")
        return None, 0

In [13]:
def create_mcq_text(mcq_dict):
    return (
        f"Question: {mcq_dict['question']}\n"
        f"a) {mcq_dict['option_a']}\n"
        f"b) {mcq_dict['option_b']}\n"
        f"c) {mcq_dict['option_c']}\n"
        f"d) {mcq_dict['option_d']}"
    )

# Function to create a pair dictionary
def create_pair(lisa_id, good_mcq, bad_mcq, good_source="", bad_source=""):
    good_text = create_mcq_text(good_mcq)
    bad_text = create_mcq_text(bad_mcq)
    
    return {
        "id": lisa_id,
        "chosen": good_text,
        "rejected": bad_text,
        "chosen_source": good_source,
        "rejected_source": bad_source
    }

In [14]:
print("Processing MCQ datasets...")

id2mcqs = dict()  # Key: dataset, Value: {id: mcq_dict}
ids = set()       # Set of all common IDs across datasets
good_mcqs = dict() # Key: id, Value: list of datasets with good MCQs
bad_mcqs = dict()  # Key: id, Value: list of datasets with bad MCQs

Processing MCQ datasets...


In [15]:
for dataset in datasets:
    try:
        file_path = os.path.join(data_dir, f"{dataset}.csv")
        df = pd.read_csv(file_path)
        # df = df.iloc[:50]
        
        # Extract MCQ content and ensure correct_option is included
        temp = df[['id', 'question', 'option_a', 'option_b', 'option_c', 'option_d', 'correct_option']].set_index('id').T.to_dict()
        
        # Update set of common IDs
        if len(ids) == 0:
            ids = set(temp.keys())
        else:
            ids = ids.intersection(set(temp.keys()))
        
        # Store MCQs by dataset
        id2mcqs[dataset] = temp.copy()
        
        # Categorize MCQs by quality
        for idx, row in df[['id', 'distractor_quality']].iterrows():
            lisa_id, quality = row
            
            # Good MCQ
            if quality > level_quality:
                if lisa_id in good_mcqs:
                    good_mcqs[lisa_id].append(dataset)
                else:
                    good_mcqs[lisa_id] = [dataset]
            else:
                if lisa_id in bad_mcqs:
                    bad_mcqs[lisa_id].append(dataset)
                else:
                    bad_mcqs[lisa_id] = [dataset]
                    
    except Exception as e:
        print(f"Error processing dataset {dataset}: {e}")

print(f"Found {len(ids)} common IDs across all datasets")

Found 3163 common IDs across all datasets


In [19]:
for dataset in datasets:
    file_path = os.path.join(data_dir, f"{dataset}.csv")
    df = pd.read_csv(file_path)
    print(f"{dataset} quality distribution:")
    print(df['distractor_quality'].value_counts())

# Check how many IDs have all good or all bad MCQs
print(f"IDs with all good MCQs: {nb_good_only}")
print(f"IDs with all bad MCQs: {nb_bad_only}")
print(f"IDs with mixed quality MCQs: {len(ids) - nb_good_only - nb_bad_only}")

gemma_9b_distractor_quality quality distribution:
distractor_quality
4    1488
3    1177
2     257
1     150
5      97
Name: count, dtype: int64
Llama8B_0.1_distractor_quality quality distribution:
distractor_quality
1    1176
3     939
2     827
4     219
5       2
Name: count, dtype: int64
Llama1b_distractor_quality quality distribution:
distractor_quality
3    1246
4     957
2     560
1     388
5      18
Name: count, dtype: int64
IDs with all good MCQs: 746
IDs with all bad MCQs: 95
IDs with mixed quality MCQs: 2322


In [18]:
nb_good_only = 0
nb_bad_only = 0
nb_mixed = 0
nb_gpt4o_generated = 0
all_pairs = []

# Process each ID to create pairs
for idx in tqdm(ids, desc="Creating pairs"):
    bad_mcqs_list = []
    good_mcqs_list = []
    
    if idx in bad_mcqs:
        bad_datasets = bad_mcqs[idx]
        for dataset in bad_datasets:
            if idx in id2mcqs[dataset]:
                bad_mcqs_list.append((id2mcqs[dataset][idx], dataset))
    else:
        # All MCQs are good
        nb_good_only += 1
        continue
        
    if idx in good_mcqs:
        good_datasets = good_mcqs[idx]
        for dataset in good_datasets:
            if idx in id2mcqs[dataset]:
                good_mcqs_list.append((id2mcqs[dataset][idx], dataset))
        nb_mixed += 1
    else:
        # All MCQs are bad - try to generate a good one with GPT-4o
        nb_bad_only += 1
        
        # Get sample question from one of the bad MCQs
        sample_question = bad_mcqs_list[0][0]["question"]
        
        # Try to generate a good MCQ with GPT-4o (up to max_attempts times)
        for attempt in range(max_gpt4o_attempts):
            generated_mcq, quality = generate_good_mcq_with_gpt4o(sample_question)
            
            if generated_mcq and quality >= level_quality:  # Aim for quality 3 or higher
                good_mcqs_list.append((generated_mcq, "gpt-4o"))
                nb_gpt4o_generated += 1
                print(f"Successfully generated MCQ with quality score {quality} for ID {idx}")
                break
            else:
                print(f"Attempt {attempt+1}: Generated MCQ quality ({quality}) below threshold of {level_quality} for ID {idx}")
        
        # If we couldn't generate a good MCQ, skip this ID
        if not good_mcqs_list:
            print(f"Could not generate good MCQ for ID {idx} after {max_gpt4o_attempts} attempts. Skipping.")
            continue
    
    # Create all possible (good, bad) pairs
    for good_item, bad_item in itertools.product(good_mcqs_list, bad_mcqs_list):
        good_mcq, good_source = good_item
        bad_mcq, bad_source = bad_item
        
        pair = create_pair(idx, good_mcq, bad_mcq, good_source, bad_source)
        all_pairs.append(pair)

# Convert to DataFrame
pairs_df = pd.DataFrame(all_pairs)

Creating pairs:   4%|██▉                                                                     | 128/3163 [00:02<00:59, 50.74it/s]

Successfully generated MCQ with quality score 4 for ID OIC-182-03-B


Creating pairs:   5%|███▉                                                                    | 171/3163 [00:05<01:43, 29.02it/s]

Successfully generated MCQ with quality score 3 for ID OIC-177-02-A


Creating pairs:   7%|████▊                                                                   | 209/3163 [00:07<02:09, 22.76it/s]

Successfully generated MCQ with quality score 4 for ID OIC-154-04-A


Creating pairs:   8%|█████▍                                                                  | 241/3163 [00:11<03:08, 15.47it/s]

Successfully generated MCQ with quality score 4 for ID OIC-012-09-A


Creating pairs:   8%|█████▉                                                                  | 259/3163 [00:17<05:07,  9.43it/s]

Successfully generated MCQ with quality score 4 for ID OIC-189-07-B


Creating pairs:   8%|██████                                                                  | 266/3163 [00:19<06:03,  7.98it/s]

Successfully generated MCQ with quality score 4 for ID OIC-231-14-A


Creating pairs:  11%|███████▌                                                                | 333/3163 [00:21<03:25, 13.78it/s]

Successfully generated MCQ with quality score 4 for ID OIC-292-01-A


Creating pairs:  11%|████████                                                                | 354/3163 [00:24<04:03, 11.54it/s]

Successfully generated MCQ with quality score 4 for ID OIC-066-35-B


Creating pairs:  12%|████████▉                                                               | 395/3163 [00:26<03:30, 13.18it/s]

Successfully generated MCQ with quality score 4 for ID OIC-213-10-A


Creating pairs:  13%|█████████▍                                                              | 413/3163 [00:29<03:58, 11.51it/s]

Successfully generated MCQ with quality score 3 for ID OIC-031-03-A


Creating pairs:  17%|████████████▎                                                           | 539/3163 [00:31<01:51, 23.61it/s]

Successfully generated MCQ with quality score 4 for ID OIC-175-05-A


Creating pairs:  19%|█████████████▌                                                          | 598/3163 [00:34<01:50, 23.28it/s]

Successfully generated MCQ with quality score 4 for ID OIC-172-02-A


Creating pairs:  19%|█████████████▊                                                          | 608/3163 [00:36<02:23, 17.78it/s]

Successfully generated MCQ with quality score 4 for ID OIC-220-02-A


Creating pairs:  19%|█████████████▉                                                          | 613/3163 [00:39<03:23, 12.51it/s]

Successfully generated MCQ with quality score 4 for ID OIC-023-11-A


Creating pairs:  20%|██████████████▌                                                         | 641/3163 [00:42<03:40, 11.42it/s]

Successfully generated MCQ with quality score 4 for ID OIC-337-08-B


Creating pairs:  21%|███████████████                                                         | 660/3163 [00:44<03:59, 10.44it/s]

Successfully generated MCQ with quality score 4 for ID OIC-088-03-A


Creating pairs:  21%|███████████████▎                                                        | 673/3163 [00:48<05:30,  7.53it/s]

Successfully generated MCQ with quality score 3 for ID OIC-335-08-B


Creating pairs:  28%|███████████████████▉                                                    | 877/3163 [00:51<01:29, 25.66it/s]

Successfully generated MCQ with quality score 4 for ID OIC-003-10-B


Creating pairs:  28%|████████████████████                                                    | 883/3163 [00:55<02:07, 17.89it/s]

Successfully generated MCQ with quality score 5 for ID OIC-062-04-B


Creating pairs:  28%|████████████████████▏                                                   | 889/3163 [00:57<02:41, 14.04it/s]

Successfully generated MCQ with quality score 4 for ID OIC-163-04-A


Creating pairs:  30%|█████████████████████▊                                                  | 956/3163 [01:00<02:15, 16.31it/s]

Successfully generated MCQ with quality score 4 for ID OIC-123-04-A


Creating pairs:  31%|██████████████████████▏                                                 | 973/3163 [01:04<03:00, 12.12it/s]

Successfully generated MCQ with quality score 4 for ID OIC-007-03-B


Creating pairs:  33%|███████████████████████                                                | 1029/3163 [01:09<02:53, 12.31it/s]

Successfully generated MCQ with quality score 4 for ID OIC-331-06-A


Creating pairs:  33%|███████████████████████▏                                               | 1031/3163 [01:11<03:39,  9.70it/s]

Successfully generated MCQ with quality score 4 for ID OIC-128-06-A


Creating pairs:  33%|███████████████████████▌                                               | 1050/3163 [01:14<04:09,  8.46it/s]

Successfully generated MCQ with quality score 4 for ID OIC-295-07-B


Creating pairs:  35%|█████████████████████████▏                                             | 1120/3163 [01:21<03:41,  9.22it/s]

Successfully generated MCQ with quality score 4 for ID OIC-337-10-A


Creating pairs:  36%|█████████████████████████▍                                             | 1132/3163 [01:24<04:07,  8.20it/s]

Successfully generated MCQ with quality score 4 for ID OIC-054-03-A


Creating pairs:  37%|██████████████████████████▏                                            | 1164/3163 [01:30<04:40,  7.12it/s]

Successfully generated MCQ with quality score 4 for ID OIC-004-24-B


Creating pairs:  40%|████████████████████████████▎                                          | 1264/3163 [01:33<02:29, 12.73it/s]

Successfully generated MCQ with quality score 4 for ID OIC-023-05-B


Creating pairs:  40%|████████████████████████████▍                                          | 1266/3163 [01:35<02:58, 10.62it/s]

Successfully generated MCQ with quality score 4 for ID OIC-367-04-B


Creating pairs:  40%|████████████████████████████▌                                          | 1271/3163 [01:38<03:51,  8.17it/s]

Successfully generated MCQ with quality score 4 for ID OIC-289-06-A
Attempt 1: Generated MCQ quality (1) below threshold of 2 for ID OIC-031-01-A


Creating pairs:  41%|█████████████████████████████                                          | 1293/3163 [01:44<04:53,  6.38it/s]

Successfully generated MCQ with quality score 2 for ID OIC-031-01-A


Creating pairs:  42%|█████████████████████████████▌                                         | 1316/3163 [01:47<04:28,  6.87it/s]

Successfully generated MCQ with quality score 4 for ID OIC-275-01-A


Creating pairs:  42%|█████████████████████████████▌                                         | 1319/3163 [01:50<05:58,  5.15it/s]

Successfully generated MCQ with quality score 4 for ID OIC-341-10-A


Creating pairs:  42%|██████████████████████████████                                         | 1340/3163 [01:53<05:24,  5.61it/s]

Successfully generated MCQ with quality score 4 for ID OIC-277-08-A


Creating pairs:  42%|██████████████████████████████▏                                        | 1343/3163 [01:56<06:53,  4.40it/s]

Successfully generated MCQ with quality score 4 for ID OIC-091-01-A


Creating pairs:  44%|███████████████████████████████▎                                       | 1396/3163 [01:58<03:15,  9.06it/s]

Successfully generated MCQ with quality score 4 for ID OIC-163-10-B


Creating pairs:  45%|███████████████████████████████▊                                       | 1415/3163 [02:01<03:44,  7.80it/s]

Successfully generated MCQ with quality score 4 for ID OIC-366-07-B


Creating pairs:  45%|████████████████████████████████                                       | 1427/3163 [02:04<04:03,  7.14it/s]

Successfully generated MCQ with quality score 4 for ID OIC-264-04-B


Creating pairs:  47%|█████████████████████████████████▎                                     | 1482/3163 [02:06<02:29, 11.27it/s]

Successfully generated MCQ with quality score 4 for ID OIC-155-40-B


Creating pairs:  49%|██████████████████████████████████▌                                    | 1540/3163 [02:09<01:51, 14.53it/s]

Successfully generated MCQ with quality score 4 for ID OIC-122-03-A


Creating pairs:  49%|██████████████████████████████████▋                                    | 1547/3163 [02:12<02:31, 10.64it/s]

Successfully generated MCQ with quality score 4 for ID OIC-069-02-B


Creating pairs:  50%|███████████████████████████████████▏                                   | 1567/3163 [02:14<02:29, 10.69it/s]

Successfully generated MCQ with quality score 3 for ID OIC-323-08-A


Creating pairs:  52%|████████████████████████████████████▉                                  | 1646/3163 [02:16<01:31, 16.54it/s]

Successfully generated MCQ with quality score 4 for ID OIC-150-05-B


Creating pairs:  52%|█████████████████████████████████████▏                                 | 1656/3163 [02:19<01:54, 13.21it/s]

Successfully generated MCQ with quality score 4 for ID OIC-003-09-B


Creating pairs:  53%|█████████████████████████████████████▍                                 | 1666/3163 [02:21<02:22, 10.48it/s]

Successfully generated MCQ with quality score 4 for ID OIC-023-18-A


Creating pairs:  54%|██████████████████████████████████████▌                                | 1719/3163 [02:23<01:38, 14.72it/s]

Successfully generated MCQ with quality score 4 for ID OIC-286-13-A


Creating pairs:  55%|███████████████████████████████████████                                | 1741/3163 [02:25<01:46, 13.39it/s]

Successfully generated MCQ with quality score 4 for ID OIC-300-09-B


Creating pairs:  55%|███████████████████████████████████████▏                               | 1746/3163 [02:28<02:39,  8.86it/s]

Successfully generated MCQ with quality score 4 for ID OIC-337-21-B


Creating pairs:  56%|███████████████████████████████████████▋                               | 1769/3163 [02:31<02:36,  8.93it/s]

Successfully generated MCQ with quality score 4 for ID OIC-308-02-B


Creating pairs:  57%|████████████████████████████████████████▍                              | 1803/3163 [02:34<02:19,  9.75it/s]

Successfully generated MCQ with quality score 3 for ID OIC-231-20-A


Creating pairs:  57%|████████████████████████████████████████▋                              | 1810/3163 [02:38<03:15,  6.91it/s]

Successfully generated MCQ with quality score 4 for ID OIC-248-11-B


Creating pairs:  60%|██████████████████████████████████████████▎                            | 1887/3163 [02:41<01:40, 12.67it/s]

Successfully generated MCQ with quality score 4 for ID OIC-231-08-B


Creating pairs:  60%|██████████████████████████████████████████▊                            | 1910/3163 [02:43<01:47, 11.67it/s]

Successfully generated MCQ with quality score 4 for ID OIC-023-15-A


Creating pairs:  66%|███████████████████████████████████████████████▏                       | 2101/3163 [02:45<00:35, 29.92it/s]

Successfully generated MCQ with quality score 4 for ID OIC-366-05-A


Creating pairs:  67%|███████████████████████████████████████████████▋                       | 2126/3163 [02:48<00:41, 25.24it/s]

Successfully generated MCQ with quality score 4 for ID OIC-355-16-B


Creating pairs:  68%|████████████████████████████████████████████████▍                      | 2157/3163 [02:50<00:44, 22.44it/s]

Successfully generated MCQ with quality score 4 for ID OIC-185-09-B
Attempt 1: Generated MCQ quality (1) below threshold of 2 for ID OIC-118-07-A


Creating pairs:  68%|████████████████████████████████████████████████▌                      | 2165/3163 [02:54<01:14, 13.46it/s]

Successfully generated MCQ with quality score 4 for ID OIC-118-07-A


Creating pairs:  69%|█████████████████████████████████████████████████▎                     | 2196/3163 [02:59<01:30, 10.70it/s]

Successfully generated MCQ with quality score 3 for ID OIC-289-01-A


Creating pairs:  71%|██████████████████████████████████████████████████▏                    | 2238/3163 [03:01<01:13, 12.57it/s]

Successfully generated MCQ with quality score 4 for ID OIC-337-06-B
Successfully generated MCQ with quality score 4 for ID OIC-366-01-A


Creating pairs:  71%|██████████████████████████████████████████████████▎                    | 2240/3163 [03:08<02:22,  6.46it/s]

Successfully generated MCQ with quality score 4 for ID OIC-155-03-A


Creating pairs:  71%|██████████████████████████████████████████████████▎                    | 2242/3163 [03:11<02:53,  5.29it/s]

Successfully generated MCQ with quality score 4 for ID OIC-152-01-A


Creating pairs:  74%|████████████████████████████████████████████████████▎                  | 2329/3163 [03:13<01:07, 12.38it/s]

Successfully generated MCQ with quality score 4 for ID OIC-118-13-B


Creating pairs:  75%|████████████████████████████████████████████████████▉                  | 2360/3163 [03:16<01:11, 11.25it/s]

Successfully generated MCQ with quality score 4 for ID OIC-250-04-A


Creating pairs:  77%|██████████████████████████████████████████████████████▍                | 2423/3163 [03:18<00:49, 14.84it/s]

Successfully generated MCQ with quality score 4 for ID OIC-211-10-B


Creating pairs:  77%|██████████████████████████████████████████████████████▌                | 2429/3163 [03:22<01:08, 10.71it/s]

Successfully generated MCQ with quality score 4 for ID OIC-057-27-A


Creating pairs:  78%|███████████████████████████████████████████████████████▏               | 2460/3163 [03:25<01:09, 10.13it/s]

Successfully generated MCQ with quality score 4 for ID OIC-334-08-B


Creating pairs:  78%|███████████████████████████████████████████████████████▍               | 2471/3163 [03:28<01:21,  8.47it/s]

Successfully generated MCQ with quality score 4 for ID OIC-195-16-B


Creating pairs:  79%|███████████████████████████████████████████████████████▋               | 2483/3163 [03:31<01:34,  7.22it/s]

Successfully generated MCQ with quality score 4 for ID OIC-324-01-A


Creating pairs:  79%|███████████████████████████████████████████████████████▉               | 2491/3163 [03:34<01:58,  5.66it/s]

Successfully generated MCQ with quality score 4 for ID OIC-149-23-B


Creating pairs:  79%|███████████████████████████████████████████████████████▉               | 2494/3163 [03:37<02:28,  4.51it/s]

Successfully generated MCQ with quality score 4 for ID OIC-036-11-B


Creating pairs:  79%|████████████████████████████████████████████████████████               | 2500/3163 [03:41<03:13,  3.43it/s]

Successfully generated MCQ with quality score 3 for ID OIC-276-02-B


Creating pairs:  79%|████████████████████████████████████████████████████████▎              | 2510/3163 [03:44<03:15,  3.34it/s]

Successfully generated MCQ with quality score 4 for ID OIC-334-35-A


Creating pairs:  79%|████████████████████████████████████████████████████████▍              | 2512/3163 [03:46<04:00,  2.70it/s]

Successfully generated MCQ with quality score 4 for ID OIC-079-05-A


Creating pairs:  80%|████████████████████████████████████████████████████████▌              | 2520/3163 [03:50<04:09,  2.58it/s]

Successfully generated MCQ with quality score 4 for ID OIC-122-12-B


Creating pairs:  80%|████████████████████████████████████████████████████████▋              | 2525/3163 [03:54<04:58,  2.14it/s]

Successfully generated MCQ with quality score 3 for ID OIC-139-07-B


Creating pairs:  80%|████████████████████████████████████████████████████████▉              | 2539/3163 [03:56<03:23,  3.07it/s]

Successfully generated MCQ with quality score 4 for ID OIC-196-04-A


Creating pairs:  81%|█████████████████████████████████████████████████████████▎             | 2552/3163 [03:58<02:45,  3.69it/s]

Successfully generated MCQ with quality score 4 for ID OIC-335-23-A


Creating pairs:  85%|████████████████████████████████████████████████████████████▎          | 2689/3163 [04:02<00:30, 15.67it/s]

Successfully generated MCQ with quality score 3 for ID OIC-120-02-A


Creating pairs:  85%|████████████████████████████████████████████████████████████▍          | 2693/3163 [04:04<00:37, 12.40it/s]

Successfully generated MCQ with quality score 3 for ID OIC-113-05-B


Creating pairs:  85%|████████████████████████████████████████████████████████████▍          | 2695/3163 [04:07<00:53,  8.82it/s]

Successfully generated MCQ with quality score 4 for ID OIC-061-05-A


Creating pairs:  86%|████████████████████████████████████████████████████████████▉          | 2717/3163 [04:13<01:09,  6.41it/s]

Successfully generated MCQ with quality score 3 for ID OIC-066-14-A


Creating pairs:  88%|██████████████████████████████████████████████████████████████▎        | 2774/3163 [04:16<00:40,  9.71it/s]

Successfully generated MCQ with quality score 4 for ID OIC-129-02-B


Creating pairs:  88%|██████████████████████████████████████████████████████████████▎        | 2778/3163 [04:20<00:55,  6.92it/s]

Successfully generated MCQ with quality score 5 for ID OIC-064-01-A


Creating pairs:  88%|██████████████████████████████████████████████████████████████▊        | 2796/3163 [04:23<00:55,  6.60it/s]

Successfully generated MCQ with quality score 4 for ID OIC-177-31-A


Creating pairs:  89%|███████████████████████████████████████████████████████████████▌       | 2830/3163 [04:26<00:42,  7.82it/s]

Successfully generated MCQ with quality score 4 for ID OIC-080-09-B


Creating pairs:  95%|███████████████████████████████████████████████████████████████████▏   | 2991/3163 [04:31<00:09, 17.54it/s]

Successfully generated MCQ with quality score 4 for ID OIC-149-17-B


Creating pairs:  95%|███████████████████████████████████████████████████████████████████▏   | 2995/3163 [04:33<00:11, 14.19it/s]

Successfully generated MCQ with quality score 4 for ID OIC-155-26-B


Creating pairs:  96%|███████████████████████████████████████████████████████████████████▉   | 3028/3163 [04:36<00:09, 14.07it/s]

Successfully generated MCQ with quality score 4 for ID OIC-012-01-A


Creating pairs:  97%|█████████████████████████████████████████████████████████████████████  | 3074/3163 [04:38<00:05, 15.44it/s]

Successfully generated MCQ with quality score 4 for ID OIC-111-08-B


Creating pairs:  97%|█████████████████████████████████████████████████████████████████████▏ | 3082/3163 [04:41<00:06, 11.96it/s]

Successfully generated MCQ with quality score 4 for ID OIC-166-14-A


Creating pairs:  98%|█████████████████████████████████████████████████████████████████████▌ | 3100/3163 [04:43<00:05, 11.00it/s]

Successfully generated MCQ with quality score 5 for ID OIC-287-05-A


Creating pairs:  99%|██████████████████████████████████████████████████████████████████████▎| 3130/3163 [04:47<00:03,  9.56it/s]

Successfully generated MCQ with quality score 3 for ID OIC-060-01-A


Creating pairs: 100%|███████████████████████████████████████████████████████████████████████| 3163/3163 [04:50<00:00, 10.90it/s]

Successfully generated MCQ with quality score 4 for ID OIC-040-01-A





In [21]:
pairs_df

Unnamed: 0,id,chosen,rejected,chosen_source,rejected_source
0,OIC-131-17-A,Question: Which of the following is NOT a pote...,Question: What is a priority etiological diagn...,gemma_9b_distractor_quality,Llama8B_0.1_distractor_quality
1,OIC-131-17-A,Question: Which of the following is NOT a pote...,Question: What is the primary etiological fact...,gemma_9b_distractor_quality,Llama1b_distractor_quality
2,OIC-340-11-A,Question: What is the main purpose of confirmi...,Question: Which imaging technique is most effe...,Llama1b_distractor_quality,gemma_9b_distractor_quality
3,OIC-340-11-A,Question: What is the main purpose of confirmi...,Question: What is the purpose of additional in...,Llama1b_distractor_quality,Llama8B_0.1_distractor_quality
4,OIC-091-12-A,Question: Which of the following is a characte...,Question: What are the three main causes of re...,gemma_9b_distractor_quality,Llama8B_0.1_distractor_quality
...,...,...,...,...,...
4924,OIC-040-01-A,Question: A 45-year-old woman presents with su...,Question: What is the main difference between ...,gpt-4o,Llama1b_distractor_quality
4925,OIC-152-06-B,Question: Which of the following heart conditi...,Question: What is the main difference between ...,gemma_9b_distractor_quality,Llama1b_distractor_quality
4926,OIC-152-06-B,Question: Which group of heart diseases is con...,Question: What is the main difference between ...,Llama8B_0.1_distractor_quality,Llama1b_distractor_quality
4927,OIC-249-03-B,Question: What is the main purpose of psychoso...,Question: Which of the following is NOT a psyc...,Llama1b_distractor_quality,gemma_9b_distractor_quality


In [125]:
# Statistics
print(f"Statistics:")
print(f"  IDs with all good MCQs: {nb_good_only}")
print(f"  IDs with all bad MCQs: {nb_bad_only}")
print(f"  IDs with mixed quality MCQs: {nb_mixed}")
print(f"  MCQs generated with GPT-4o: {nb_gpt4o_generated}")
print(f"  Total pairs created: {len(pairs_df)}")

# Split into training and evaluation datasets
eval_size = min(len(pairs_df) // 10, 200)  # 10% or max 200 samples for evaluation
eval_pairs = pairs_df.sample(n=eval_size, random_state=42)
train_pairs = pairs_df.drop(eval_pairs.index)

# Save to CSV
pairs_df.to_csv("all_mcq_pairs.csv", index=False)
train_pairs.to_csv("train_mcq_pairs.csv", index=False)
eval_pairs.to_csv("eval_mcq_pairs.csv", index=False)

print(f"Created {len(train_pairs)} training pairs and {len(eval_pairs)} evaluation pairs")
print("Files saved as all_mcq_pairs.csv, train_mcq_pairs.csv, and eval_mcq_pairs.csv")

In [128]:
print(f'{nb_good_only} of Lisa sheets ({nb_good_only/len(ids)*100}%) gives only good MCQs across all LLMs')
print(f'{nb_bad_only} of Lisa sheets ({nb_bad_only/len(ids)*100}%) gives only bad MCQs across all LLMs')

746 of Lisa sheets (23.5852039203288%) gives only good MCQs across all LLMs
95 of Lisa sheets (3.0034777110338284%) gives only bad MCQs across all LLMs
