In [None]:
!pip install openai

In [None]:
!pip install langchain_openai

In [None]:
from openai import OpenAI
import os
import pandas as pd
import json

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv()

In [None]:
# Read the OpenAI API key from the OPENAI_KEY environment variable
# (load_dotenv() above has already been called to pick up a .env file).
openai_key = os.getenv("OPENAI_KEY")

# Later cells refer to the key as OPENAI_KEY; bind that name too so they do
# not fail with a NameError on a fresh kernel.
OPENAI_KEY = openai_key

In [None]:
# OpenAI client authenticated with the key read from the environment in the
# previous cell (the variable defined there is `openai_key`).
client = OpenAI(api_key=openai_key)

def call_openai_api(text_features, prompt, model="gpt-4o"):
    """Send one system/user message pair to the chat-completions endpoint.

    Args:
        text_features: Text sent as the user message.
        prompt: Instructions sent as the system message.
        model: Name of the chat model to query. Defaults to "gpt-4o", the
            value that was previously hard-coded.

    Returns:
        The message content of the first choice in the response, or None if
        the request raised an exception (the error is printed, not re-raised).
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text_features},
    ]
    try:
        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort by design: report the failure and let the caller carry on.
        print(f"Error occurred: {e}")
        return None

In [None]:
# Prompt variant: generic MCQ generation with misconception-based distractors
# and short answer options.
# NOTE: later cells assign `prompt` again, so on a top-to-bottom run this value
# is replaced before the prompt template is built.
prompt = """
Based on the following educational content, generate a multiple-choice question with four answer 
options where only one is correct. The question should assess understanding of the main ideas, 
and the options should be clear, informative, and relevant. Ensure that the distractors (incorrect options) 
follow a logical but incorrect interpretation, based on common misconceptions or misunderstandings of the topic.
Answer options must be as short as possible.
"""

In [None]:
# Prompt variant: MCQ generation that must satisfy three explicit guidelines
# (clarity, relevance of options, suitability for medical exams).
# NOTE: later cells assign `prompt` again, so on a top-to-bottom run this value
# is replaced before the prompt template is built.
prompt = """
Based on the following educational content, generate a multiple-choice question with four answer 
options where only one is correct. The question should adhere to the following guidelines:

1. **Clarity and Readability**: The question must be easy to understand, free from ambiguous phrasing, and appropriately structured.

2. **Relevance of Options**: The answer options should logically relate to the question, ensuring they align with the main idea and avoid irrelevant or confusing distractors.

3. **Suitability for Medical Exams**: The question must meet the professional standards expected in a medical exam, focusing on medical accuracy and relevance.

The generated question must be excellently formulated, clear, and precise, with answer options that are informative, relevant, and make sense in the context of the question. Distractors (incorrect options) should follow logical but incorrect interpretations, based on common misconceptions or misunderstandings of the topic. Keep all answer options as concise as possible.
"""

In [None]:
# Prompt variant: asks for a question that violates at least one of the three
# guidelines, intended as a negative sample for DPO training.
# NOTE(review): the closing paragraph still demands an "excellently formulated,
# clear, and precise" question, which contradicts the "violate" instruction —
# confirm the intended wording.
# NOTE: a later cell assigns `prompt` again, so on a top-to-bottom run this
# value is replaced before the prompt template is built.
prompt ="""Based on the following educational content, generate a multiple-choice question with four answer
options where only one is correct. The question should violate at least one of the following guidelines:
1. **Clarity and Readability**: The question must be easy to understand, free from ambiguous phrasing, and appropriately structured.
2. **Relevance of Options**: The answer options should logically relate to the question, ensuring they align with the main idea and avoid irrelevant or confusing distractors.
3. **Suitability for Medical Exams**: The question must meet the professional standards expected in a medical exam, focusing on medical accuracy and relevance.
The generated question must be excellently formulated, clear, and precise, with answer options that are informative, relevant, and make sense in the context of the question. Distractors (incorrect options) should follow logical but incorrect interpretations, based on common misconceptions or misunderstandings of the topic. Keep all answer options as concise as possible.
This question must be negative sample in dpo training.
"""

In [None]:
# Prompt variant: plausible, misconception-based distractors ("positive"
# ambiguity). On a top-to-bottom run this is the last active assignment to
# `prompt`, so it is the text bound into the prompt template further down.
prompt = """Based on the following educational content, generate a multiple-choice question with four answer options where only one is correct. The question and its options must adhere to the following rule:

1. **Ambiguity Between Correct and Incorrect Options**: The incorrect options (distractors) should be plausible and logically related to the question, creating ambiguity for someone who may not have complete knowledge of the topic. Distractors should reflect common misconceptions or misunderstandings that could reasonably confuse the respondent.
"""

In [None]:
# prompt = """Based on the following educational content, generate a multiple-choice question with four answer options where only one is correct. The question and its options must adhere to the following rule:

# 1. **Ambiguity Between Correct and Incorrect Options**: The incorrect options (distractors) should NOT be plausible or logically related to the question. Distractors should be irrelevant, nonsensical, or obviously incorrect, making the correct answer stand out immediately.
# """

In [None]:
# Chat model used by the LangChain pipeline. The key comes from `openai_key`,
# the variable actually defined when the environment was read.
model = ChatOpenAI(model="gpt-4o", temperature=0.7, api_key=openai_key)

In [None]:
# Schema for one generated multiple-choice question: the question text, four
# answer options (A-D) and the letter of the correct option. It is handed to
# JsonOutputParser as its `pydantic_object`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — a docstring on a pydantic model would presumably be surfaced in the
# generated schema and so in the format instructions sent to the model; confirm
# before adding one.
class MCQQuestion(BaseModel):
    question: str = Field(description="The multiple-choice question")
    option_a: str = Field(description="The first answer option labeled 'A'")
    option_b: str = Field(description="The second answer option labeled 'B'")
    option_c: str = Field(description="The third answer option labeled 'C'")
    option_d: str = Field(description="The fourth answer option labeled 'D'")
    correct_option: str = Field(description="This consists only a letter of correct option")

In [None]:
# Output parser configured with MCQQuestion as its pydantic schema.
mcq_parser = JsonOutputParser(pydantic_object=MCQQuestion)

# Prompt layout: the task instructions, then the parser's format instructions,
# then the source text. Only {query} is left open for invoke time; the other
# two placeholders are filled in here.
prompt_template = PromptTemplate(
    input_variables=["query"],
    partial_variables=dict(
        prompt=prompt,
        format_instructions=mcq_parser.get_format_instructions(),
    ),
    template="{prompt}.\n{format_instructions}\n{query}\n",
)

In [None]:
# Compose the pipeline: the prompt template feeds the chat model, whose output
# is passed to mcq_parser.
chain = prompt_template | model | mcq_parser

In [None]:
def get_all_txt_contents_as_list(directory_path):
    """Read every ``.txt`` file located directly inside a directory.

    Files are visited in sorted filename order so the returned list is the
    same on every run. ``os.listdir`` on its own yields entries in arbitrary
    order, which would make positional slices of the result unreproducible.

    Args:
        directory_path: Directory to scan (sub-directories are not descended).

    Returns:
        A list holding the UTF-8 decoded text of each readable ``.txt`` file.
        A file that cannot be read is reported with ``print`` and skipped.
    """
    all_contents = []

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                all_contents.append(file.read())
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole scan.
            print(f"Error reading {file_path}: {str(e)}")

    return all_contents

In [None]:
def get_all_txt_contents_from_folders(parent_directory):
    """Collect the text of all ``.txt`` files found one level below a directory.

    Sub-folders are visited in sorted name order so the result order does not
    depend on the arbitrary order returned by ``os.listdir``; later cells
    slice the result by position.

    Args:
        parent_directory: Directory whose immediate sub-folders are scanned.
            Entries that are not directories are ignored.

    Returns:
        A list of ``{"folder": <sub-folder name>, "content": <file text>}``
        dicts, one per file returned by ``get_all_txt_contents_as_list``.
    """
    all_txt_contents = []

    for folder_name in sorted(os.listdir(parent_directory)):
        folder_path = os.path.join(parent_directory, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # Tag every document with the folder it came from.
        all_txt_contents.extend(
            {"folder": folder_name, "content": content}
            for content in get_all_txt_contents_as_list(folder_path)
        )

    return all_txt_contents

In [None]:
# Root of the input dataset; its immediate sub-folders are scanned for .txt files.
parent_directory = "/kaggle/input/lisa-sheets/lisa_sheets_translated"
# List of {"folder": ..., "content": ...} dicts, one per .txt file that was read.
all_txt_contents = get_all_txt_contents_from_folders(parent_directory)

In [None]:
# import json
# from sklearn.model_selection import train_test_split

# folders = list(set(item['folder'] for item in all_txt_contents))

# train_folders, test_folders = train_test_split(folders, test_size=0.3, random_state=42)
# train_folders = sorted(train_folders)
# test_folders = sorted(test_folders)

In [None]:
# with open("train_folders.json", "w") as train_file:
#     json.dump(train_folders, train_file)

# with open("test_folders.json", "w") as test_file:
#     json.dump(test_folders, test_file)

In [None]:
import json

# Load the stored folder split: first the folder names assigned to training...
with open("/kaggle/input/train-test-lisa-sheets/train_folders.json", "r") as train_file:
    train_folders = json.loads(train_file.read())

# ...then the folder names assigned to testing.
with open("/kaggle/input/train-test-lisa-sheets/test_folders.json", "r") as test_file:
    test_folders = json.loads(test_file.read())

In [None]:
# Split the loaded documents by the folder they came from. Membership is tested
# against sets so each lookup is constant-time instead of a linear scan of the
# folder-name list for every document.
_train_folder_names = set(train_folders)
_test_folder_names = set(test_folders)

train_set = [item for item in all_txt_contents if item['folder'] in _train_folder_names]
test_set = [item for item in all_txt_contents if item['folder'] in _test_folder_names]

In [None]:
%%time
import concurrent.futures


def generate_question_parallel(item):
    """Generate a question for one document through the module-level ``chain``.

    Args:
        item: Mapping with a ``'folder'`` and a ``'content'`` entry.

    Returns:
        A dict with the item's folder, its content and the generated question,
        or None if generation raised (the error is printed, not re-raised).
    """
    try:
        mcq = chain.invoke({"query": item['content']})
        record = dict(folder=item['folder'], content=item['content'], question=mcq)
    except Exception as exc:
        print(f"Error occurred for item in folder {item['folder']}: {exc}")
        return None
    else:
        return record

def run_in_parallel(train_set, max_workers=10):
    """Generate questions for many documents using a thread pool.

    Args:
        train_set: Sequence of items passed one by one to
            ``generate_question_parallel``.
        max_workers: Number of worker threads in the pool.

    Returns:
        The non-None results, in the same relative order as their items
        appear in ``train_set``. A progress line is printed after every 100
        completed items.
    """
    results_by_position = [None] * len(train_set)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember each future's input position so completion order does not
        # affect the order of the output.
        position_of = {}
        for position, sample in enumerate(train_set):
            position_of[pool.submit(generate_question_parallel, sample)] = position

        completed = concurrent.futures.as_completed(position_of)
        for done_count, finished in enumerate(completed, start=1):
            outcome = finished.result()
            if outcome is not None:
                results_by_position[position_of[finished]] = outcome

            if done_count % 100 == 0:
                print(f"{done_count} samples processed...")

    return [entry for entry in results_by_position if entry is not None]

# Generate questions for training items at positions 200..1199 using 6 worker
# threads.
# NOTE(review): the empty-list assignment is immediately overwritten by the
# call below; it only matters if that call raises.
questions = []
questions = run_in_parallel(train_set[200:1200], max_workers=6)

In [None]:
# One row per successful generation, with the keys folder / content / question
# as columns.
df = pd.DataFrame(questions)

# NOTE(review): `question` holds the chain's parsed output (presumably a dict),
# which is written to the CSV in its string form — confirm that downstream
# readers expect that.
df.to_csv("questions_positive_ambiguity.csv", index=False)