# Crafting the MCQA dataset


## 0. Data format
We want the final data format to look like this:

`{"subject": "question_subject", "question": "Question: our_question\n\nOptions:\nA. opt_A\nB. opt_B\nC. opt_C\nD. opt_D\n\nAnswer:", "answer": "good_answer"}`

We will use the following open-source datasets to collect STEM-related MCQA questions:
- MMLU (supercategory STEM)
- ARC (only Challenge)


In [1]:
import json
import os
import re
import sys

import pandas as pd
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

sys.path.append(os.path.abspath('../'))

from model.utils import *

## 1. Provided questions

In [2]:
# Load the hand-provided MCQA examples; they are already in the target format.
example_path = "desired format examples/mcqa_example.jsonl"
data = read_jsonl(example_path)
example_df = pd.DataFrame(data=data)
example_df.head(5)

Unnamed: 0,subject,question,answer
0,machine_learning,Question: Statement 1| Linear regression estim...,D
1,machine_learning,Question: Statement 1| RoBERTa pretrains on a ...,C
2,machine_learning,Question: Statement 1| Support vector machines...,B
3,machine_learning,Question: A machine learning problem involves ...,D
4,machine_learning,"Question: As of 2020, which architecture is be...",A


Expected question format:

In [60]:
# Show the raw prompt string of the first example (reference for the target format).
example_df['question'].iloc[0]

'Question: Statement 1| Linear regression estimator has the smallest variance among all unbiased estimators. Statement 2| The coefficients α assigned to the classifiers assembled by AdaBoost are always non-negative.\n\nOptions:\nA. True, True\nB. False, False\nC. True, False\nD. False, True\n\nAnswer:'

### Train/val split

In [61]:
# Hold out 10% of the provided examples for evaluation; the seed is fixed so
# the split is reproducible.
train_df, eval_df = train_test_split(example_df, test_size=0.1, random_state=42, shuffle=True)

# Persist BOTH halves (previously the eval split was written twice and the
# train split was never saved). They are written next to the source file
# because the "All train" / "All val" cells read them back from this directory.
train_df.to_json('desired format examples/mcqa_example_train.jsonl', orient='records', lines=True)
eval_df.to_json('desired format examples/mcqa_example_eval.jsonl', orient='records', lines=True)

## 2. MMLU dataset

### Getting the STEM categories
The lists of categories and subcategories are taken from the MMLU GitHub repo.

In [39]:
# Maps every MMLU subject name to a one-element list holding its field.
# The subject names are the config names later passed to
# load_dataset("cais/mmlu", ...); only the first list element is read when
# the STEM subjects are selected.
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

# Groups the fields used as values in `subcategories` into four
# supercategories. Only the "STEM" entry is read in this notebook.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

# Keep the MMLU subjects whose (first) field belongs to the STEM supercategory,
# in the order they appear in `subcategories`.
stem_fields = categories["STEM"]
stem_categories = [
    name
    for name, fields in subcategories.items()
    if fields[0] in stem_fields
]
print(stem_categories)

['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning']


In [35]:
def ds_to_jsonl(category):
    """Download one MMLU subject and dump all of its splits to a JSONL file.

    Every split returned by ``load_dataset("cais/mmlu", category)`` is
    concatenated, in the order the splits are listed, and written as one JSON
    object per line to ``raw data/<category>_raw.jsonl``.

    Args:
        category: MMLU config name, e.g. "abstract_algebra".
    """
    splits = load_dataset("cais/mmlu", category)

    # Materialise every row before opening the file, so nothing is written
    # if reading any split fails.
    rows = [
        dict(record)
        for split_name in splits.keys()
        for record in splits[split_name]
    ]

    out_path = 'raw data/' + category + "_raw.jsonl"
    with open(out_path, 'w') as sink:
        sink.writelines(json.dumps(row) + '\n' for row in rows)

In [None]:
# Fetch every STEM subject and store it under 'raw data/' (one file per subject).
for stem_subject in stem_categories:
    ds_to_jsonl(category=stem_subject)

### Aggregating in a single df

In [46]:
# One raw JSONL file per STEM subject, as written by ds_to_jsonl.
filenames = [f"raw data/{category}_raw.jsonl" for category in stem_categories]

# Flatten the per-file records into a single list, preserving file order.
combined_data = [record for filename in filenames for record in read_jsonl(filename)]

df = pd.DataFrame(data=combined_data)
df.head(5)

Unnamed: 0,question,subject,choices,answer
0,Find the degree for the given field extension ...,abstract_algebra,"[0, 4, 2, 6]",1
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...",abstract_algebra,"[8, 2, 24, 120]",2
2,Find all zeros in the indicated finite field o...,abstract_algebra,"[0, 1, 0,1, 0,4]",3
3,Statement 1 | A factor group of a non-Abelian ...,abstract_algebra,"[True, True, False, False, True, False, False,...",1
4,Find the product of the given polynomials in t...,abstract_algebra,"[2x^2 + 5, 6x^2 + 4x + 6, 0, x^2 + 1]",1


In [47]:
# Number of aggregated MMLU STEM questions.
df.shape[0]

3429

We get 3429 STEM questions from the MMLU dataset; let's convert them to the desired format.

### Processing

Reminder of the expected format:

`{"subject": "question_subject", "question": "Question: our_question\n\nOptions:\nA. opt_A\nB. opt_B\nC. opt_C\nD. opt_D\n\nAnswer:", "answer": "good_answer"}`

In [65]:
def format_question(row):
    """Render one MMLU row as the MCQA prompt string.

    Args:
        row: mapping-like object with a 'question' entry and a 'choices'
            sequence; exactly the first four choices are used.

    Returns:
        "Question: <question>\n\nOptions:\nA. ..\nB. ..\nC. ..\nD. ..\n\nAnswer:"

    Raises:
        IndexError: if 'choices' holds fewer than four entries.
    """
    options = row['choices']
    # One "<letter>. <choice>" line per option, each terminated by a newline.
    option_block = "".join(
        f"{letter}. {options[position]}\n"
        for position, letter in enumerate("ABCD")
    )
    return f"Question: {row['question']}\n\nOptions:\n{option_block}\nAnswer:"

# Replace the bare question text with the fully formatted prompt.
# NOTE: this overwrites 'question' in place, so re-running the cell would
# prepend "Question: " a second time.
formatted_questions = df.apply(format_question, axis='columns')
df['question'] = formatted_questions
df.head(5)

Unnamed: 0,question,subject,choices,answer,formatted_question,mapped_answer
0,Question: Find the degree for the given field ...,abstract_algebra,"[0, 4, 2, 6]",B,Question: Find the degree for the given field ...,B
1,"Question: Let p = (1, 2, 5, 4)(2, 3) in S_5 . ...",abstract_algebra,"[8, 2, 24, 120]",C,"Question: Let p = (1, 2, 5, 4)(2, 3) in S_5 . ...",C
2,Question: Find all zeros in the indicated fini...,abstract_algebra,"[0, 1, 0,1, 0,4]",D,Question: Find all zeros in the indicated fini...,D
3,Question: Statement 1 | A factor group of a no...,abstract_algebra,"[True, True, False, False, True, False, False,...",B,Question: Statement 1 | A factor group of a no...,B
4,Question: Find the product of the given polyno...,abstract_algebra,"[2x^2 + 5, 6x^2 + 4x + 6, 0, x^2 + 1]",B,Question: Find the product of the given polyno...,B


In [63]:
# MMLU stores the correct option as an index 0-3; the target format uses letters.
answer_mapping = dict(enumerate("ABCD"))
df['answer'] = df['answer'].map(answer_mapping)

# Keep only the three columns of the target format, in the target order.
target_columns = ['subject', 'question', 'answer']
df = df[target_columns]

In [68]:
# Sanity check: the frame should now hold subject / question / answer only.
df.head(n=5)

Unnamed: 0,subject,question,answer
0,abstract_algebra,Question: Find the degree for the given field ...,B
1,abstract_algebra,"Question: Let p = (1, 2, 5, 4)(2, 3) in S_5 . ...",C
2,abstract_algebra,Question: Find all zeros in the indicated fini...,D
3,abstract_algebra,Question: Statement 1 | A factor group of a no...,B
4,abstract_algebra,Question: Find the product of the given polyno...,B


### Saving as jsonl

In [71]:
# Write the cleaned MMLU questions to the directory the following cells read
# from ('MMLU clean/stem_mmlu.jsonl'), creating it on a fresh checkout so the
# notebook runs top to bottom without moving files by hand.
os.makedirs('MMLU clean', exist_ok=True)
df.to_json('MMLU clean/stem_mmlu.jsonl', orient='records', lines=True)

### train/val split

In [56]:
# Reload the cleaned MMLU questions from disk so this section can be run on its own.
mmlu_records = read_jsonl('MMLU clean/stem_mmlu.jsonl')
df = pd.DataFrame(data=mmlu_records)

# 90/10 split with a fixed seed for reproducibility.
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

In [58]:
# Save both MMLU splits where the "All train" / "All val" cells read them
# ('MMLU clean/'), instead of the working directory.
os.makedirs('MMLU clean', exist_ok=True)
train_df.to_json('MMLU clean/mmlu_train.jsonl', orient='records', lines=True)
eval_df.to_json('MMLU clean/mmlu_eval.jsonl', orient='records', lines=True)

## 3. ARC Challenge dataset

### Loading data

In [3]:
# The three ARC-Challenge CSV files differ only in their split suffix.
# The suffix list has its own name so that it does not overwrite the MMLU
# `categories` mapping defined earlier in the notebook.
path_template = 'raw data/ARC-Challenge-{}.csv'
arc_splits = ['Dev', 'Train', 'Test']

dataframes = [pd.read_csv(path_template.format(split)) for split in arc_splits]

# Stack the splits into one frame and keep only the columns used below.
df = pd.concat(dataframes, ignore_index=True)
df = df[['category', 'subject', 'question', 'AnswerKey']]
df.head()

Unnamed: 0,category,subject,question,AnswerKey
0,Dev,,Juan and LaKeisha roll a few objects down a ra...,D
1,Dev,,High-pressure systems stop air from rising int...,C
2,Dev,,Students visited the Morris W. Offit telescope...,D
3,Dev,,Which topic area would be the best to research...,A
4,Dev,,"One year, the oak trees in a park began produc...",B


### Formatting questions
Recap of desired format:

`"question": "Question: our_question\n\nOptions:\nA. opt_A\nB. opt_B\nC. opt_C\nD. opt_D\n\nAnswer:"`

In [4]:
# Inline option markers such as "(A)" or "(3)". The capturing group makes
# re.split return the labels between the text pieces.
_ARC_OPTION_MARKER = re.compile(r'\(([A-E1-5])\)')


def format_question(row):
    """Render one ARC row as the MCQA prompt string.

    The ARC 'question' field holds the stem followed by the options inline,
    e.g. "Stem? (A) foo (B) bar (C) baz (D) qux". The text is split at every
    option marker and re-assembled as

        "Question: <stem>\n\nOptions:\nA. foo\nB. bar\n...\n\nAnswer:"

    with surrounding whitespace stripped from the stem and from each option,
    so no line ends with a stray space.

    Option labels are kept verbatim. Besides (A)-(D), a fifth option (E) and
    numeric labels (1)-(5) are recognised and emitted as "E." / "1." etc.
    NOTE(review): numeric labels are not converted to letters here because
    the answer column presumably uses the same labels - confirm against the
    CSV before normalising both.

    This definition shadows the MMLU formatter of the same name defined
    earlier in the notebook.

    Args:
        row: mapping-like object with a string 'question' entry.

    Returns:
        The formatted prompt. If no option marker is found, the "Options:"
        section is omitted.
    """
    pieces = _ARC_OPTION_MARKER.split(row['question'])
    stem = pieces[0].strip()

    # After the stem, re.split alternates label, text, label, text, ...
    labels, texts = pieces[1::2], pieces[2::2]
    option_lines = [f"{label}. {text.strip()}" for label, text in zip(labels, texts)]

    if not option_lines:
        return f"Question: {stem}\n\nAnswer:"
    return f"Question: {stem}\n\nOptions:\n" + "\n".join(option_lines) + "\n\nAnswer:"

# Format every prompt, tag all rows with the constant subject 'ARC', and
# rename the answer column to the name used by the target format.
df = (
    df.assign(
        question=df.apply(format_question, axis='columns'),
        subject='ARC',
    )
    .rename(columns={'AnswerKey': 'answer'})
)

In [8]:
# Number of ARC-Challenge questions across the three CSV files.
df.shape[0]

2590

### train/val split

In [10]:
# Split only the target-format columns; 90/10 with a fixed seed.
arc_columns = ['subject', 'question', 'answer']
train_df, eval_df = train_test_split(
    df[arc_columns],
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

### Save as jsonl

In [48]:
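# Unused alternative: split along the 'category' column (rows marked 'Train'
# vs. all other rows) instead of the random 90/10 split above.
# df[df['category'] == 'Train'][['subject', 'question', 'answer']].to_json('arc_train.jsonl', orient='records', lines=True)
# df[df['category'] != 'Train'][['subject', 'question', 'answer']].to_json('arc_eval.jsonl', orient='records', lines=True)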

In [49]:
# Save both ARC splits where the combining cells read them ('ARC-C clean/'),
# creating the directory if needed, instead of the working directory.
os.makedirs('ARC-C clean', exist_ok=True)
train_df.to_json('ARC-C clean/arc_train.jsonl', orient='records', lines=True)
eval_df.to_json('ARC-C clean/arc_eval.jsonl', orient='records', lines=True)

## 4. Combine all MCQA datasets

### All datasets

In [51]:
# Read every cleaned source; the concatenation order below defines the
# record order of the combined file.
provided = read_jsonl('desired format examples/mcqa_example.jsonl')
mmlu = read_jsonl('MMLU clean/stem_mmlu.jsonl')
arc_train = read_jsonl('ARC-C clean/arc_train.jsonl')
arc_eval = read_jsonl('ARC-C clean/arc_eval.jsonl')

all_records = provided + mmlu + arc_train + arc_eval
write_jsonl(all_records, 'MCQA_data.jsonl')

### All train

In [3]:
# Training halves of the three sources, concatenated as provided -> MMLU -> ARC.
provided_train = read_jsonl('desired format examples/mcqa_example_train.jsonl')
mmlu_train = read_jsonl('MMLU clean/mmlu_train.jsonl')
arc_train = read_jsonl('ARC-C clean/arc_train.jsonl')

train_records = provided_train + mmlu_train + arc_train
write_jsonl(train_records, 'MCQA_train.jsonl')

### All val

In [None]:
# Held-out halves of the three sources, concatenated as provided -> MMLU -> ARC.
provided_eval = read_jsonl('desired format examples/mcqa_example_eval.jsonl')
mmlu_eval = read_jsonl('MMLU clean/mmlu_eval.jsonl')
arc_eval = read_jsonl('ARC-C clean/arc_eval.jsonl')

eval_records = provided_eval + mmlu_eval + arc_eval
write_jsonl(eval_records, 'MCQA_eval.jsonl')