In [1]:
import pandas as pd
import re

# Read the raw multiple-choice question export into a DataFrame
source_csv = "Multiple-choice Question with a Single Correct Answer.csv"
df = pd.read_csv(source_csv)

# Function to split main question and choices A–E from the "question" field
def split_question_and_choices(qtext) -> pd.Series:
    """Split a raw question string into its stem and answer choices A-E.

    The stem is everything before the first line that begins with ``A.``.
    The remainder is parsed as lettered options, each one running until the
    next line that begins with a letter A-E followed by a period (or until
    the end of the text).

    Args:
        qtext: The raw question text. Non-string values (for example the NaN
            pandas uses for an empty cell) are treated as an empty question.

    Returns:
        A ``pd.Series`` of six strings: the stem followed by the text of
        options A, B, C, D and E, each stripped of surrounding whitespace.
        Options that do not appear in ``qtext`` are ``''``.
    """
    labels = ['A', 'B', 'C', 'D', 'E']

    # Empty cells reach us as float NaN; re.split would raise TypeError on them.
    if not isinstance(qtext, str):
        return pd.Series([''] * (len(labels) + 1))

    # Split the question body from the options at the first "A." line.
    parts = re.split(r'\nA\.', qtext, maxsplit=1)
    question_main = parts[0].strip()

    # Restore the "A." consumed by the split so option A parses like the rest.
    options_text = 'A.' + parts[1] if len(parts) > 1 else ''

    # Only horizontal whitespace may follow the label: consuming a newline
    # there would let an empty option swallow the one on the next line.
    pattern = r'([A-E])\.[ \t]*(.*?)(?=\n[A-E]\.|$)'

    # Strip each option so CRLF line endings do not leave a trailing "\r".
    options = {
        label: text.strip()
        for label, text in re.findall(pattern, options_text, re.DOTALL)
    }

    # Always emit all five option slots, in order, even when some are missing.
    return pd.Series([question_main] + [options.get(k, '') for k in labels])

# Expand every raw question into its stem plus one column per answer choice
choice_columns = ['A', 'B', 'C', 'D', 'E']
split_columns = ['question_main'] + choice_columns
df[split_columns] = df['question'].apply(split_question_and_choices)

# Keep only the output columns; the parsed stem is exported under the
# name "question" in place of the raw combined field
output_columns = [
    'qid', 'video_id', 'youtube_url', 'duration',
    'question_type', 'capability',
    'question_main', 'question_prompt',
] + choice_columns
final_df = df[output_columns].rename(columns={"question_main": "question"})

# Write the cleaned table to a new CSV file without the index column
final_df.to_csv("mcq_cleaned.csv", index=False)

print("✅ Cleaning completed! Saved as 'mcq_cleaned.csv'")

✅ Cleaning completed! Saved as 'mcq_cleaned.csv'
