# Parse SAT Data

In [1]:
import json
from pydantic import BaseModel
from tqdm import tqdm

## Load raw data

In [2]:
# Load the lookup file; the validation cell further down reads its
# "mathLiveItems" and "readingLiveItems" entries.
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which is not UTF-8 on every system.
with open('../data/raw/lookup.json', encoding='utf-8') as fin:
    lookup = json.load(fin)

In [3]:
# Load the raw English questions (iterated as a list of question dicts below).
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which is not UTF-8 on every system.
with open('../data/raw/english_questions.json', encoding='utf-8') as fin:
    english_questions = json.load(fin)

In [4]:
# Load the raw math questions (iterated as a list of question dicts below).
# Decode explicitly as UTF-8 rather than relying on the platform's locale
# default, which is not UTF-8 on every system.
with open('../data/raw/math_questions.json', encoding='utf-8') as fin:
    math_questions = json.load(fin)

## Data Validation

In [5]:
# Sanity check: whenever a question's metadata has a truthy "ibn", its
# "question" entry must contain exactly one element. IBNParser only ever reads
# element 0, so anything longer would be silently dropped.
for q in english_questions + math_questions:
    if not q.get("metadata").get("ibn"):
        continue  # non-IBN questions are not subject to this check
    assert len(q["question"]) == 1

In [6]:
# Confirm that every restricted ("live") item id listed in the lookup file can
# be found among the external ids of the loaded questions.
# Questions whose external_id is falsy are left out of the id sets.
math_live_item_ids = set(lookup["mathLiveItems"])
math_ids = {q["metadata"]["external_id"] for q in math_questions if q["metadata"]["external_id"]}
# A direct subset test states the intent and lets the failure name the culprits.
assert math_live_item_ids <= math_ids, (
    f"math live items missing from data: {math_live_item_ids - math_ids}"
)

reading_live_item_ids = set(lookup["readingLiveItems"])
reading_ids = {q["metadata"]["external_id"] for q in english_questions if q["metadata"]["external_id"]}
assert reading_live_item_ids <= reading_ids, (
    f"reading live items missing from data: {reading_live_item_ids - reading_ids}"
)

## Parse Data

In [7]:
class Parser:
    """Base class for the raw-question parsers defined in this notebook.

    Subclasses override ``parse_question`` to turn one raw question dict into a
    ``Question``.
    """

    def parse_question(self: 'Parser', question: dict) -> 'Question':
        """Parse one raw question dict into a ``Question``.

        Raises:
            NotImplementedError: always; subclasses must supply the implementation.
        """
        # ``NotImplemented`` is the sentinel returned by binary-operator methods,
        # not an exception class; raising it fails with a TypeError instead.
        raise NotImplementedError

class IBNParser(Parser):
    """Parser for raw questions whose metadata carries an ``ibn`` value.

    In this layout ``question["question"]`` is a list of which only the first
    element is read; the answer details live under that element's ``"answer"``
    key.
    """

    def parse_question(self: 'IBNParser', question: dict) -> 'Question':
        """Convert one raw IBN-style question dict into a ``Question``.

        Args:
            question: raw question dict with ``"metadata"``, ``"subject"`` and
                ``"question"`` entries.

        Returns:
            The populated ``Question``. ``choices`` and ``answers`` are ``None``
            for ``'spr'`` questions; ``answers`` is also ``None`` for ``'mcq'``
            questions that have no truthy ``"correct_choice"``.

        Raises:
            ValueError: if the normalised style is neither ``'mcq'`` nor ``'spr'``.
        """
        item = question["question"][0]
        answer = item["answer"]

        # Map the raw style label onto the short codes used downstream; labels
        # that are not in the map pass through unchanged and are rejected below.
        raw_style = answer["style"]
        style = {"Multiple Choice": "mcq", "SPR": "spr"}.get(raw_style, raw_style)

        if style not in ('mcq', 'spr'):
            raise ValueError(f"Unknown question style: {style}")

        choices = None
        answers = None
        if style == 'mcq':
            choices = self.parse_multiple_choice(answer["choices"])
            correct_choice = answer.get("correct_choice")
            if correct_choice:
                answers = self.parse_answers(correct_choice, answer["choices"])

        meta = question["metadata"]
        return Question(
            uId=meta["uId"],
            external_id=meta["external_id"],
            ibn=meta["ibn"],
            questionId=meta["questionId"],
            subject=question["subject"],
            skill=meta["skill_desc"],
            domain=meta["primary_class_cd_desc"],
            difficulty=meta["difficulty"],
            body=item.get("body"),
            prompt=item.get("prompt"),
            style=style,
            choices=choices,
            answers=answers,
            rationale=answer["rationale"],
        )

    @staticmethod
    def parse_multiple_choice(options: dict) -> list[str]:
        """Return the ``"body"`` of every option, in the mapping's iteration order."""
        bodies = []
        for option in options.values():
            bodies.append(option["body"])
        return bodies

    @staticmethod
    def parse_answers(answer: str, options: dict) -> list[int]:
        """Return the positions of the option keys that are equal to ``answer``."""
        return [position for position, key in enumerate(options.keys()) if key == answer]


class EIDParser(Parser):
    """Parser for raw questions that have no ``ibn`` value in their metadata.

    In this layout ``question["question"]`` is a single dict providing
    ``"type"``, ``"stem"``, ``"keys"``, ``"rationale"``, optionally
    ``"stimulus"`` and, for multiple choice, ``"answerOptions"``.
    """

    def parse_question(self: 'EIDParser', question: dict) -> 'Question':
        """Convert one raw EID-style question dict into a ``Question``.

        Args:
            question: raw question dict with ``"metadata"``, ``"subject"`` and
                ``"question"`` entries.

        Returns:
            The populated ``Question``. For ``'mcq'`` questions ``answers`` holds
            the positions of the correct options; for ``'spr'`` questions it is
            the raw ``"keys"`` value and ``choices`` is ``None``.

        Raises:
            ValueError: if ``"type"`` is neither ``'mcq'`` nor ``'spr'``.
        """
        item = question["question"]
        style = item["type"]

        if style not in ('mcq', 'spr'):
            raise ValueError(f"Unknown question style: {style}")

        if style == 'spr':
            choices = None
            answers = item["keys"]
        else:
            options = item["answerOptions"]
            choices = self.parse_multiple_choice(options)
            answers = self.parse_answers(item["keys"], options)

        meta = question["metadata"]
        return Question(
            uId=meta["uId"],
            external_id=meta["external_id"],
            ibn=meta["ibn"],
            questionId=meta["questionId"],
            subject=question["subject"],
            skill=meta["skill_desc"],
            domain=meta["primary_class_cd_desc"],
            difficulty=meta["difficulty"],
            body=item.get("stimulus"),
            prompt=item["stem"],
            style=style,
            choices=choices,
            answers=answers,
            rationale=item["rationale"],
        )

    @staticmethod
    def parse_multiple_choice(options: list) -> list[str]:
        """Return the ``"content"`` of every option, in list order."""
        contents = []
        for option in options:
            contents.append(option["content"])
        return contents

    @staticmethod
    def parse_answers(answers: list, options: list) -> list[int]:
        """Return the positions of the options whose ``"id"`` appears in ``answers``."""
        # Collect all ids first so a malformed option fails before any matching.
        option_ids = []
        for option in options:
            option_ids.append(option["id"])
        return [position for position, option_id in enumerate(option_ids) if option_id in answers]


class Question(BaseModel):
    """Standardized record for one SAT question, shared by both raw layouts."""
    # Identifiers copied from the raw question's "metadata" entry.
    uId: str
    external_id: str | None
    ibn: str | None
    questionId: str
    # Classification: subject is attached by the notebook; skill, domain and
    # difficulty come from the raw metadata.
    subject: str
    skill: str
    domain: str
    difficulty: str
    # Question text.
    body: str | None
    prompt: str | None
    # 'mcq' or 'spr' (the parsers reject anything else).
    style: str
    # Option texts for 'mcq' questions; None for 'spr'.
    choices: list[str] | None
    # Positions of the correct options for 'mcq'; for 'spr' either None or the
    # raw keys, depending on the parser that produced the record.
    answers: list[int] | list[str] | None
    rationale: str

    @classmethod
    def parse_question(cls: 'Question', question: dict) -> 'Question':
        """Build a ``Question`` from a raw question dict.

        Questions whose metadata has a truthy ``"ibn"`` are handled by
        ``IBNParser``; all others by ``EIDParser``.
        """
        has_ibn = question.get("metadata").get("ibn")
        parser = IBNParser() if has_ibn else EIDParser()
        return parser.parse_question(question)

def add_subject(question: dict, subject: str) -> dict:
    """Return a copy of ``question`` with its ``"subject"`` key set to ``subject``.

    The input dict is left untouched, so the raw data loaded earlier in the
    notebook keeps exactly the content that was read from disk. The copy is
    shallow: nested values are shared with the input.

    Args:
        question: raw question dict.
        subject: subject label to attach (an existing ``"subject"`` is replaced).

    Returns:
        A new dict with the same keys, in the same order, plus ``"subject"``.
    """
    return {**question, "subject": subject}

# Tag every raw question with its subject: all math questions first, then all
# English questions.
questions_w_subject = [
    add_subject(raw_question, subject)
    for subject, raw_questions in (("math", math_questions), ("english", english_questions))
    for raw_question in raw_questions
]

# Parse each tagged question into a Question, with a progress bar.
questions = list(map(Question.parse_question, tqdm(questions_w_subject)))

100%|██████████| 2017/2017 [00:00<00:00, 156020.71it/s]


## Export data

In [8]:
# Turn each parsed Question into a plain dict so the list can be dumped as JSON.
data = [parsed_question.dict() for parsed_question in questions]

In [9]:
# Serialise the parsed questions and write them out as one JSON document.
with open('../data/parsed/questions.json', 'w') as fout:
    fout.write(json.dumps(data))

In [10]:
# Print the notebook's working directory; the ../data paths used in this
# notebook are relative to it.
!pwd

/home/kevin/IdeaProjects/unclesat/analytics/notebooks


In [11]:
# Recursively copy the local ../data directory to s3://unclesat/data/.
!aws s3 cp ../data s3://unclesat/data/ --recursive

upload: ../data/raw/lookup.json to s3://unclesat/data/raw/lookup.json
upload: ../data/raw/english_question_list.json to s3://unclesat/data/raw/english_question_list.json
upload: ../data/raw/math_question_list.json to s3://unclesat/data/raw/math_question_list.json
upload: ../data/raw/english_questions.json to s3://unclesat/data/raw/english_questions.json
upload: ../data/parsed/questions.json to s3://unclesat/data/parsed/questions.json
upload: ../data/raw/math_questions.json to s3://unclesat/data/raw/math_questions.json
