# Load Data

In [1]:
!pip install openai==1.55.3
!pip install langchain_openai

!pip install datasets

Collecting openai==1.55.3
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Collecting jiter<1,>=0.4.0 (from openai==1.55.3)
  Downloading jiter-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Downloading openai-1.55.3-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiter-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (343 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m343.6/343.6 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jiter, openai
Successfully installed jiter-0.8.0 openai-1.55.3
Collecting langchain_openai
  Downloading langchain_openai-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain_openai)
  Downloading langchain_core-0.3.22-py3-none-any.whl.metadata (6.3 kB)
Collecting tiktoke

In [2]:
from openai import OpenAI
import os
import pandas as pd
import json
import ast

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
df = pd.read_csv('/kaggle/input/guess-data/lisa_sheets1.csv')
len(df)

3282

In [4]:
with open('/kaggle/input/guess-data/test_folders.json', 'r') as file:
    test_folders = json.load(file)

df = df[-df['folder'].isin(test_folders)]
df = df.sample(frac=1).reset_index(drop=True)

In [5]:
midpoint = len(df) // 2

df_positive = df.iloc[:midpoint]
df_negative = df.iloc[midpoint:]

In [6]:
os.makedirs('/kaggle/working/batches_positive', exist_ok=True)
os.makedirs('/kaggle/working/batches_negative', exist_ok=True)

# Generation functions

In [7]:
OPENAI_KEY = "sk-wuniLI4FhpnCoM-4H7usEfIktpDoy0YOiCE-EIHzmOT3BlbkFJWg0Ky_60c07TCiGODeOa8_6-HqYOPu4YwbHSGAsH0A"

In [8]:
model = ChatOpenAI(model="gpt-4o", temperature = 0.7, api_key = OPENAI_KEY)

In [9]:
class MCQQuestion(BaseModel):
    question: str = Field(description="The multiple-choice question")
    option_a: str = Field(description="The first answer option labeled 'A'")
    option_b: str = Field(description="The second answer option labeled 'B'")
    option_c: str = Field(description="The third answer option labeled 'C'")
    option_d: str = Field(description="The fourth answer option labeled 'D'")
    correct_option: str = Field(description="This consists only a letter of correct option")

mcq_parser = JsonOutputParser(pydantic_object=MCQQuestion)

formatting_prompt_template = PromptTemplate(
    template="{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": mcq_parser.get_format_instructions()},
)

In [10]:
def get_generation_chain(system_prompt):
    generation_prompt_template = PromptTemplate(
        template="{system_instructions}\n{query}\n",
        input_variables=["query"],
        partial_variables={"system_instructions": system_prompt},
    )

    return generation_prompt_template | model | formatting_prompt_template | model | mcq_parser

In [11]:
from tqdm import tqdm
import numpy as np

tqdm.pandas()

In [12]:
import json
def parse_mcq(mcq_json):
    return mcq_json['question'], mcq_json['option_a'], mcq_json['option_b'], mcq_json['option_c'], mcq_json['option_d'], mcq_json['correct_option']

In [13]:
def generate_mcqs(df, system_prompt, save_path):
    chain = get_generation_chain(system_prompt)
    def generate_mcq(row):
        return chain.invoke({"query": row['lisa_sheet']})

    batches = np.array_split(df, 5)

    # Process each batch and save results
    for batch_idx, batch in enumerate(batches, 1):
        print(f"Processing batch {batch_idx}/{len(batches)}")
        try:
            # Process the current batch
            batch['mcqs'] = batch.progress_apply(generate_mcq, axis=1)

            # Save the processed batch to a file
            batch.to_csv(f'/{save_path}/batch-{batch_idx}.csv', index=False)
        except Exception as e:
            print(f"Error occurred in batch {batch_idx}: {e}")

    results_df = pd.concat([batches[0], batches[1], batches[2], batches[3], batches[4]], ignore_index=True)
    return results_df

# Generation Process

In [14]:
system_prompt_positive = """
Based on the following educational content, generate a multiple-choice question with four answer options where only one is correct.
The question should assess understanding of the main ideas, and the options should be clear, informative, and relevant.
Ensure that the distractors (incorrect options) follow a logical but incorrect interpretation, based on common misconceptions or misunderstandings of the topic.
Ensure that the way the question is constructed does not allow a test taker with no relevant medical knowledge to identify the correct answer through clues in the phrasing, structure, answer choice formatting, or other linguistic hints.
Answer options must be as short as possible.
"""

path_positive = '/kaggle/working/batches_positive'

df_pos_mcqs = generate_mcqs(df_positive, system_prompt_positive, path_positive)

system_prompt_negative = """
Based on the following educational content, generate a multiple-choice question with four answer options where only one is correct.
The question should assess understanding of the main ideas, and the options should be clear, informative, and relevant.
Ensure that the distractors (incorrect options) follow a logical but incorrect interpretation, based on common misconceptions or misunderstandings of the topic.
It is supposed to be a trick-question. Ensure that the way the question is constructed allows a test taker with no relevant medical knowledge to identify the correct answer through clues in the phrasing, structure, answer choice formatting, or other linguistic hints.
Answer options must be as short as possible.
"""

path_negative = '/kaggle/working/batches_negative'

df_neg_mcqs = generate_mcqs(df_positive, system_prompt_negative, path_negative)

results_df = pd.concat([df_pos_mcqs, df_neg_mcqs], ignore_index=True)
results_df[['question', 'option_a', 'option_b', 'option_c', 'option_d', 'correct_option']] = df['mcqs'].apply(
    lambda x: pd.Series(parse_mcq(x))
)

  return bound(*args, **kwds)


Processing batch 1/5


100%|██████████| 329/329 [16:33<00:00,  3.02s/it]


Processing batch 2/5


100%|██████████| 328/328 [16:20<00:00,  2.99s/it]


Processing batch 3/5


100%|██████████| 328/328 [16:11<00:00,  2.96s/it]


Processing batch 4/5


100%|██████████| 328/328 [16:07<00:00,  2.95s/it]


Processing batch 5/5


100%|██████████| 328/328 [16:07<00:00,  2.95s/it]
  return bound(*args, **kwds)


Processing batch 1/5


100%|██████████| 329/329 [19:32<00:00,  3.56s/it]


Processing batch 2/5


100%|██████████| 328/328 [19:07<00:00,  3.50s/it]


Processing batch 3/5


100%|██████████| 328/328 [20:30<00:00,  3.75s/it]


Processing batch 4/5


100%|██████████| 328/328 [20:11<00:00,  3.69s/it]


Processing batch 5/5


100%|██████████| 328/328 [21:46<00:00,  3.98s/it]


KeyError: 'mcqs'

In [None]:
estimator_prompt = """You are tasked with evaluating a multiple-choice question (which will be provided after this prompt) intended for use in a medical institution exam.
Determine if the way the question is constructed would allow a test taker with no relevant medical knowledge to identify the correct answer through clues in the phrasing, structure, answer choice formatting, or other linguistic hints.
If there are any such clues that would help an uninformed test taker guess the correct answer, respond with "True".
If not, respond with "False".
Do not explain your reasoning. Provide no additional text besides either "True" or "False".
"""

In [None]:
client = OpenAI(api_key = OPENAI_KEY)
def call_openai_api(system_prompt, user_prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error occurred: {e}")
        return None

In [None]:
def estimate_question(row):
    question_text = row['question']
    options = f"a) {row['option_a']}\nb) {row['option_b']}\nc) {row['option_c']}\nd) {row['option_d']}"

    user_prompt = f"""Question:\n\n{question_text}\n\nOptions:\n{options}\n\nCorrect Option: {row['correct_option']}"""
    try:
        return call_openai_api(estimator_prompt, user_prompt)
    except Exception as e:
        print(f"Error processing question at index {row.name}: {e}")
        return None

In [None]:
results_df['can_be_guessed'] = results_df.progress_apply(estimate_question, axis=1)

In [None]:
results_df.to_csv('/kaggle/working/results.csv')