In [1]:
import pandas as pd
import re

# Load the raw question table; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('mcq.csv')

In [4]:
# Work on an independent copy limited to the columns this notebook uses, so the
# column assignments below never write back into the raw frame.
df_clean = df.loc[:, ['qid', 'youtube_url', 'question', 'question_prompt']].copy()

# Extract A, B, C, D choices from 'question'
def extract_abcd_from_question(q_text):
    """Pull the four answer choices labelled ``A.`` to ``D.`` out of a question.

    Parameters
    ----------
    q_text : object
        Question text with the choices appended in order, e.g.
        ``"... A. foo B. bar C. baz D. qux"``, optionally followed by an
        ``E.`` choice. Non-string values (such as NaN from an empty cell)
        are accepted.

    Returns
    -------
    list
        ``[a, b, c, d]`` with surrounding whitespace removed from each choice,
        or ``[None, None, None, None]`` when ``q_text`` is not a string or the
        four labels cannot be found in order.
    """
    no_choices = [None] * 4
    if not isinstance(q_text, str):
        return no_choices

    # Each label is anchored with \b so it cannot match the tail of a longer
    # token: without it, a stem ending in "...NBA." makes the match start
    # inside "NBA." and the real "A." label ends up inside the first choice.
    # remove_abcd_from_question anchors its "A." the same way.
    pattern = (
        r"\bA\.\s*(.*?)\s*"
        r"\bB\.\s*(.*?)\s*"
        r"\bC\.\s*(.*?)\s*"
        r"\bD\.\s*(.*?)\s*"
        r"(?=\bE\.|$)"  # choice D ends at an "E." label or at the end of the text
    )
    # DOTALL lets a single choice span several lines.
    match = re.search(pattern, q_text, re.DOTALL)
    return list(match.groups()) if match else no_choices

# Map every question to its [A, B, C, D] list, then expand all rows with a
# single DataFrame constructor. This avoids building one pd.Series per row
# inside apply (slow on large files) and still yields four columns when the
# frame has no rows, where apply would hand back an empty Series instead.
df_clean[['A', 'B', 'C', 'D']] = pd.DataFrame(
    df_clean['question'].map(extract_abcd_from_question).tolist(),
    index=df_clean.index,  # keep row alignment with df_clean
    columns=['A', 'B', 'C', 'D'],
)

# Extract E from 'question_prompt'
def extract_e_from_prompt(prompt):
    """Return the text of choice E when the prompt offers "None of the above".

    Only the literal option ``E. None of the above`` is recognised (any amount
    of whitespace may follow ``E.``). For every other prompt, and for
    non-string input, the result is ``None``.
    """
    if isinstance(prompt, str):
        found = re.search(r"E\.\s*(None of the above)", prompt)
        if found is not None:
            return found.group(1).strip()
    return None

# Element-wise lookup of the optional fifth choice in each row's prompt.
df_clean['E'] = df_clean['question_prompt'].map(extract_e_from_prompt)

# Remove A–D part from question
def remove_abcd_from_question(text):
    """Return the question stem: everything before the first ``A.`` label.

    The stem is stripped of surrounding whitespace. When no ``A.`` label is
    present the whole (stripped) text is returned; non-string values are
    passed through unchanged.
    """
    if isinstance(text, str):
        first_label = re.search(r'\bA\.\s*', text)
        stem = text if first_label is None else text[:first_label.start()]
        return stem.strip()
    return text

# Overwrite each question with its stem now that the choices live in their own columns.
df_clean['question'] = df_clean['question'].map(remove_abcd_from_question)

# Drop the 'question_prompt' column
# errors='ignore' turns the drop into a no-op when the column is already gone,
# so no explicit membership check is needed.
df_clean = df_clean.drop(columns=['question_prompt'], errors='ignore')

In [5]:
# Write the cleaned table next to the notebook; index=False keeps the row
# index out of the file.
df_clean.to_csv('mcq_cleaned.csv', index=False)
# Confirmation message for the notebook output.
print("Data has been saved as mcq_cleaned.csv")

Data has been saved as mcq_cleaned.csv
