In [1]:
from openai import OpenAI
import os
import pandas as pd
import json

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv("../.env")


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


True

In [6]:
# Read the OpenAI API key from the process environment (None when it is not set).
openai_key = os.environ.get("OPENAI_API_KEY")

In [9]:
# Raw OpenAI SDK client built from the key read above.
# NOTE(review): no later cell in the visible notebook references `client`;
# generation goes through the LangChain `model` instead — confirm it is still needed.
client = OpenAI(api_key = openai_key)

## Prompt templates preparation

In [102]:
# Instruction text for the "negative" (high-ambiguity) condition: the three
# distractors must be near-misses of the correct answer.
# NOTE(review): unlike positive_prompt's example, this example has no trailing
# "Correct option: A" line — confirm whether that asymmetry is intended.
negative_prompt = """You are tasked with generating multiple-choice questions for a medical institution exam. In this case, the correct answer should be highly ambiguous, meaning it should be very difficult to distinguish from the incorrect answers.

Your task is to create one question with four answer choices:

One correct answer (factually accurate)
Three incorrect answers that are logically connected to the correct answer but introduce subtle misconceptions.
The incorrect answers should be plausible distractors, meaning a student with basic medical knowledge might mistakenly select them.
The correct answer must be randomly assigned to one of A, B, C, or D.

Example:
Question: What is a pneumothorax?
A. Gas effusion in the pleural cavity (Correct Option)
B. Fluid buildup in the pleural cavity (Incorrect but plausible – confused with pleural effusion)
C. A collapsed alveolus (Incorrect but plausible – confused with atelectasis)
D. An inflammation of the pleura (Incorrect but plausible – confused with pleuritis)

Now generate a question following these guidelines.
"""

In [103]:
# Instruction text for the "positive" (low-ambiguity) condition: the three
# distractors must be clearly unrelated to the correct answer while staying
# inside the medical domain.
positive_prompt = """You are tasked with generating multiple-choice questions for a medical institution exam. In this case, the correct answer should be obviously different from the incorrect answers, leading to minimal or no ambiguity.

Your task is to create one question with four answer choices:

One correct answer (factually accurate)
Three incorrect answers that are unrelated but still within the medical domain (e.g., different medical fields or completely different concepts).
The incorrect answers must not be plausible distractors for someone with basic medical knowledge.
The correct answer must be randomly assigned to one of A, B, C, or D.

Example:
Question: What is the primary function of hemoglobin?
A. Transporting oxygen in the blood (Correct Option)
B. Digesting carbohydrates (Clearly unrelated – digestive system)
C. Conducting nerve impulses (Clearly unrelated – nervous system)
D. Filtering toxins from the blood (Clearly unrelated – renal system)
Correct option: A

Now generate a question following these guidelines.
"""

In [104]:
# Chat model used by both the positive and the negative generation chain
# (gpt-4o at temperature 0.4, authenticated with the key read from the environment).
model = ChatOpenAI(model="gpt-4o", temperature = 0.4, api_key = openai_key)

In [105]:
# Output schema given to the JSON parser: the question text, four labelled
# answer texts, and the letter of the correct answer.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably surface in the model's JSON schema and therefore in
# the prompt's format instructions — confirm before adding one.
class MCQQuestion(BaseModel):
    question: str = Field(description="The multiple-choice question")
    option_a: str = Field(description="The first answer option labeled 'A'")
    option_b: str = Field(description="The second answer option labeled 'B'")
    option_c: str = Field(description="The third answer option labeled 'C'")
    option_d: str = Field(description="The fourth answer option labeled 'D'")
    # Plain `str`: nothing here restricts the value to one of A/B/C/D.
    correct_option: str = Field(description="This consists only a letter of correct option")

In [106]:
# JSON parser driven by the MCQQuestion schema; it also supplies the format
# instructions that are embedded in every prompt.
mcq_parser = JsonOutputParser(pydantic_object=MCQQuestion)

# Prompt for the low-ambiguity condition. Only `query` is supplied per call;
# the instruction text and the format instructions are bound up front.
positive_prompt_template = PromptTemplate(
    input_variables=["query"],
    partial_variables={
        "format_instructions": mcq_parser.get_format_instructions(),
        "prompt": positive_prompt,
    },
    template="{prompt}.\n{format_instructions}\n{query}\n",
)

In [107]:
# Pipeline for the low-ambiguity condition: fill the prompt, call the chat
# model, then parse the reply with the JSON parser.
positive_chain = positive_prompt_template | model | mcq_parser

In [108]:
# Prompt for the high-ambiguity condition. Same layout as the positive
# template; only the bound instruction text differs.
negative_prompt_template = PromptTemplate(
    input_variables=["query"],
    partial_variables={
        "format_instructions": mcq_parser.get_format_instructions(),
        "prompt": negative_prompt,
    },
    template="{prompt}.\n{format_instructions}\n{query}\n",
)

In [109]:
# Pipeline for the high-ambiguity condition: fill the prompt, call the chat
# model, then parse the reply with the JSON parser.
negative_chain = negative_prompt_template | model | mcq_parser

## Data

In [22]:
# Source sheets; later cells read the `folder`, `id` and `content_gpt` columns from this frame.
df_lisa = pd.read_csv("../data/lisa_sheets.csv")

In [25]:
import json

# Load the train/test folder assignments. The encoding is pinned to UTF-8 (the
# JSON interchange encoding) so the result does not depend on the platform's
# locale default.
# NOTE(review): presumably each file holds a list of folder names — verify.
with open("../data/train_test_split/train_folders.json", "r", encoding="utf-8") as train_file:
    train_folders = json.load(train_file)

# Reading the test folders
with open("../data/train_test_split/test_folders.json", "r", encoding="utf-8") as test_file:
    test_folders = json.load(test_file)

In [None]:
def classify_folder(folder):
    """Return the split label for ``folder``.

    Membership is checked against the module-level ``train_folders`` first and
    ``test_folders`` second.

    Returns:
        'train', 'test', or 'unknown' when the folder is in neither collection.
    """
    if folder in train_folders:
        return 'train'
    if folder in test_folders:
        return 'test'
    return 'unknown'

# Label every row with the split its folder belongs to.
folder_split_labels = df_lisa['folder'].apply(classify_folder)
df_lisa['dataset_split'] = folder_split_labels

In [29]:
# Sanity check: how many rows landed in each split.
df_lisa['dataset_split'].value_counts()

dataset_split
train    3169
test     1524
Name: count, dtype: int64

## Call the OpenAI API

In [75]:

import concurrent.futures

def generate_question_parallel_gpt(content, chain):
    """Run ``chain`` on a single content item.

    Args:
        content: Value passed to the chain under the ``"query"`` key.
        chain: Object exposing ``invoke(dict)``.

    Returns:
        Whatever ``chain.invoke`` returns, or ``None`` if it raised. The
        exception is printed rather than propagated, so one bad item does not
        abort a batch.
    """
    payload = {"query": content}
    try:
        return chain.invoke(payload)
    except Exception as exc:
        print(f"Error occurred for content: {exc}")
    return None

def run_in_parallel_df(df, chain, max_workers=10):
    """Generate one question per row of ``df['content_gpt']`` using a thread pool.

    Args:
        df: DataFrame with a ``content_gpt`` column.
        chain: Chain forwarded to ``generate_question_parallel_gpt``.
        max_workers: Size of the thread pool.

    Returns:
        A list as long as ``df``, aligned with row position. Entries stay
        ``None`` for rows whose generation failed or returned ``None``.
    """
    questions = [None for _ in range(len(df))]

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which row position each submitted task belongs to, because
        # tasks complete in arbitrary order.
        position_of = {}
        for position, content in enumerate(df['content_gpt']):
            task = pool.submit(generate_question_parallel_gpt, content, chain)
            position_of[task] = position

        finished = concurrent.futures.as_completed(position_of)
        for done, task in enumerate(finished, start=1):
            position = position_of[task]
            try:
                outcome = task.result()
            except Exception as exc:
                print(f"Unhandled exception in processing item at index {position}: {exc}")
            else:
                if outcome is not None:
                    questions[position] = outcome

            # Progress report every 100 completed items.
            if done % 100 == 0:
                print(f"{done} samples processed...")

    return questions

In [None]:
%%time
# Generate one low-ambiguity question per row (one chain call per row of df_lisa).
# Rows whose generation failed are left as None by run_in_parallel_df.
df_lisa['positive_generated_question'] = run_in_parallel_df(df_lisa, positive_chain)  

In [110]:
%%time
# Smoke test of the negative chain on the first 10 rows only.
# `results` is not referenced again in the visible notebook.
results = run_in_parallel_df(df_lisa.iloc[:10], negative_chain)  

CPU times: user 176 ms, sys: 0 ns, total: 176 ms
Wall time: 3.05 s


In [None]:
%%time
# Generate one high-ambiguity question per row (one chain call per row of df_lisa).
# Rows whose generation failed are left as None by run_in_parallel_df.
df_lisa['negative_generated_question'] = run_in_parallel_df(df_lisa, negative_chain)  

In [99]:
import random

def shuffle_options_preserve_order(question_dict):
    """Randomly reassign the four answer texts to the labels A-D.

    The returned dict always has the keys ``question``, ``option_a`` ...
    ``option_d``, ``correct_option`` in that order; only the mapping between
    answer text and label changes. The input dict is not modified.

    The new correct label is found by following the *key* of the correct
    option through the permutation, not by comparing answer texts. A text
    comparison picks the wrong entry when two options share the same text or
    when the question text equals the correct answer.

    Args:
        question_dict: Dict with the six keys listed above, where
            ``correct_option`` is a letter (case and surrounding whitespace
            are ignored). Any non-dict value, e.g. ``None`` left behind by a
            failed generation, is passed through as ``None``.

    Returns:
        A new dict with shuffled options and an upper-case ``correct_option``,
        or ``None`` when ``question_dict`` is not a dict.

    Raises:
        KeyError: If a required key is missing or ``correct_option`` does not
            name one of the four options.
    """
    # Failed generations are stored as None; skip them instead of crashing
    # the whole DataFrame.apply.
    if not isinstance(question_dict, dict):
        return None

    options = ['option_a', 'option_b', 'option_c', 'option_d']

    # Normalise the letter so ' b ' and 'B' both resolve to 'option_b'.
    correct_letter = str(question_dict['correct_option']).strip().lower()
    correct_key = f'option_{correct_letter}'
    if correct_key not in options:
        raise KeyError(correct_key)

    # Shuffle the destination labels; source labels stay in A-D order.
    shuffled_options = options.copy()
    random.shuffle(shuffled_options)

    # Pre-populate the keys so the output order is fixed regardless of the shuffle.
    shuffled_dict = {'question': question_dict['question']}
    shuffled_dict.update(dict.fromkeys(options))
    shuffled_dict['correct_option'] = None

    for new_key, old_key in zip(shuffled_options, options):
        shuffled_dict[new_key] = question_dict[old_key]
        if old_key == correct_key:
            # Convert 'option_x' to 'X'.
            shuffled_dict['correct_option'] = new_key[-1].upper()

    return shuffled_dict

# Seed the module-level RNG so re-running this cell reproduces the same label
# shuffle; otherwise the exported dataset cannot be regenerated.
random.seed(42)
df_lisa['shuffled_positive_generated_question'] = df_lisa['positive_generated_question'].apply(shuffle_options_preserve_order)

In [122]:
# Seed the module-level RNG so re-running this cell reproduces the same label
# shuffle. The seed differs from 42 so the negative set does not reuse a
# permutation sequence started from that value.
random.seed(43)
df_lisa['shuffled_negative_generated_question'] = df_lisa['negative_generated_question'].apply(shuffle_options_preserve_order)

In [127]:
# Drop the unshuffled question columns now that shuffled copies exist.
# NOTE(review): the rename in the following cell gives the shuffled columns these
# same two names, so re-running this cell after that rename would drop the
# shuffled data instead. Run the two cells exactly once, in order.
df_lisa = df_lisa.drop(columns=['positive_generated_question', 'negative_generated_question'], errors='ignore')

In [128]:
# Give the shuffled columns the plain column names used by the export cells.
shuffled_to_final = {
    'shuffled_negative_generated_question': 'negative_generated_question',
    'shuffled_positive_generated_question': 'positive_generated_question',
}
df_lisa = df_lisa.rename(columns=shuffled_to_final, errors='ignore')

In [130]:
# Build the positive dataset: keep the identifying columns, then flatten the
# question dict into one column per key and discard the dict column.
positive_columns = ['folder', 'id', 'content_gpt', 'dataset_split', 'positive_generated_question']
df_positive = df_lisa[positive_columns].copy()

positive_fields = df_positive['positive_generated_question'].apply(pd.Series)
df_positive = df_positive.join(positive_fields).drop(columns=['positive_generated_question'], errors='ignore')

In [132]:
# Check that the shuffled correct answers are spread roughly evenly over A-D.
df_positive['correct_option'].value_counts()

correct_option
D    1214
C    1177
A    1167
B    1135
Name: count, dtype: int64

In [133]:
# Build the negative dataset: keep the identifying columns, then flatten the
# question dict into one column per key and discard the dict column.
negative_columns = ['folder', 'id', 'content_gpt', 'dataset_split', 'negative_generated_question']
df_negative = df_lisa[negative_columns].copy()

negative_fields = df_negative['negative_generated_question'].apply(pd.Series)
df_negative = df_negative.join(negative_fields).drop(columns=['negative_generated_question'], errors='ignore')

In [140]:
# Persist the low-ambiguity question set without the row index.
# Assumes the target directory already exists.
df_positive.to_csv("../data/data_for_finetuning/ambiguity/positive_prompt_ambiguity.csv", index=False)

In [141]:
# Persist the high-ambiguity question set without the row index.
# Assumes the target directory already exists.
df_negative.to_csv("../data/data_for_finetuning/ambiguity/negative_prompt_ambiguity.csv", index=False)