In [1]:
import json
import openai
import marvin

In [2]:
import json
import uuid

def parse_practice_problems_to_json(content):
    guid = str(uuid.uuid4())
    prompt = f"""
Parse the following unstructured questions into json and return the json object with the following structure:
{{
    "id": {guid},
    "question": "QUESTION EXTRACTED FROM CONTENT",
    "options": {{
        "A": "OPTION A",
        "B": "OPTION B",
        "C": "OPTION C"
    }},
    "data": {{}}
}}

EXAMPLE 1:
CONTENT: 1. Published ratings on stocks ranging from 1 (strong sell) to 5 (strong buy) are 
examples of which measurement scale?
A. Ordinal
B. Continuous
C. Nominal
JSON:
{{
    "id": "d64adf0e-f821-4f9f-af97-e28d98d0317a",
    "question": "Published ratings on stocks ranging from 1 (strong sell) to 5 (strong buy) are examples of which measurement scale?",
    "options": {{
        "A": "Ordinal", 
        "B": "Continuous", 
        "C": "Nominal"
        }},
    "data": ""
}}

EXAMPLE 2:
CONTENT:"7. Each individual row of data in the table can be best characterized as:\nA. panel data.\nB. time-series data.\nC. cross-sectional data.\nDATA: An equity analyst gathers total returns for three country equity indexes over the \npast four years. The data are presented below.\n\nLearning Module 2 \nOrganizing, Visualizing, and Describing Data\n152\nTime Period\nIndex A\nIndex B\nIndex C\nYear t–3\n15.56%\n11.84%\n-4.34%\nYear t–2\n-4.12%\n-6.96%\n9.32%\nYear t–1\n11.19%\n10.29%\n-12.72%\nYear t\n8.98%\n6.32%\n21.44%'
JSON:
{{
  "id": "d29ae7d0-b227-410f-b262-02009051a477",
  "question": "Each individual row of data in the table can be best characterized as:",
  "options": {{
    "A": "panel data",
    "B": "time-series data",
    "C": "cross-sectional data"
  }},
  "data": {{
    "description": "An equity analyst gathers total returns for three country equity indexes over the past four years.",
    "table": {{
      "headers": ["Time Period", "Index A", "Index B", "Index C"],
      "rows": [
        {{ "Time Period": "Year t–3", "Index A": "15.56%", "Index B": "11.84%", "Index C": "-4.34%" }},
        {{ "Time Period": "Year t–2", "Index A": "-4.12%", "Index B": "-6.96%", "Index C": "9.32%" }},
        {{ "Time Period": "Year t–1", "Index A": "11.19%", "Index B": "10.29%", "Index C": "-12.72%" }},
        {{ "Time Period": "Year t", "Index A": "8.98%", "Index B": "6.32%", "Index C": "21.44%" }}
      ]
    }}
  }}
}}

EXAMPLE 3:
CONTENT: '23. The annual returns for three portfolios are shown in the following exhibit. Portfo-\nlios P and R were created in Year 1, Portfolio Q in Year 2.\n\xa0\nAnnual Portfolio Returns (%)\n\xa0\nYear 1\nYear 2\nYear 3\nYear 4\nYear 5\nPortfolio P\n-3.0\n4.0\n5.0\n3.0\n7.0\nPortfolio Q\n-3.0\n6.0\n4.0\n8.0\nPortfolio R\n1.0\n-1.0\n4.0\n4.0\n3.0\nThe median annual return from portfolio creation to Year 5 for:\nA. Portfolio P is 4.5%.\nB. Portfolio Q is 4.0%.\n\nPractice Problems\n157\nC. Portfolio R is higher than its arithmetic mean annual return.'
JSON:
{{
    'id': '23',
    'question': 'The median annual return from portfolio creation to Year 5 for:',
    'options':{{
        'A': 'Portfolio P is 4.5%.',
        'B': 'Portfolio Q is 4.0%.',
        'C': 'Portfolio R is higher than its arithmetic mean annual return.'
    }},
    'data': {{
        'table': {{
            'headers': ['Year 1','Year 2','Year 3','Year 4','Year 5'],
        'rows': [
            {{'Portfolio P': '-3.0','Portfolio Q': '-3.0','Portfolio R': '1.0'}},
            {{'Portfolio P': '4.0', 'Portfolio Q': '6.0', 'Portfolio R': '-1.0'}},
            {{'Portfolio P': '5.0', 'Portfolio Q': '4.0', 'Portfolio R': '4.0'}},
            {{'Portfolio P': '3.0', 'Portfolio Q': '8.0', 'Portfolio R': '4.0'}},
            {{'Portfolio P': '7.0', 'Portfolio Q': '', 'Portfolio R': '3.0'}}
        ]}}
    }}
}}

CONTENT: {content}
JSON:"""

    # print(prompt)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful AI Assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.0
    )

    try:
        parsed_json = json.loads(response['choices'][0]['message']['content'])
        return parsed_json
    except json.JSONDecodeError:
        print(f"Failed to parse the following content into JSON:\n{content}")
        return None

def find_practice_problems(json_file):
    # Load the JSON object
    with open(json_file, "r") as f:
        data = json.load(f)

    practice_problems = []

    # Recursive function to search for practice problems in the JSON data
    def search_practice_problems(data):
        if isinstance(data, dict):
            if 'title' in data and data['title'] == "Practice Problems":
                practice_problems.append(data)
            else:
                for key in data:
                    search_practice_problems(data[key])
        elif isinstance(data, list):
            for item in data:
                search_practice_problems(item)

    search_practice_problems(data)
    return practice_problems

def find_and_parse_practice_problems(json_file):
    practice_problems = find_practice_problems(json_file)
    parsed_practice_problems = []

    for problem in practice_problems:
        content = problem.get("content", "")
        if content:
            parsed_json = parse_practice_problems_to_json(content)
            if parsed_json is not None:
                parsed_practice_problems.append(parsed_json)

    return parsed_practice_problems
# Replace 'your_json_file.json' with the path to your JSON file

def parse_questions(text):
    import re

    questions = []
    info_text = ""
    info_first_q = 0
    info_second_q = 0
    max_question_number = 50  # You can change this to the highest expected question number
    # Remove page headers
    text = re.sub(r"Practice Problems \d+", "", text)
    question_end = None
    # text = re.sub(rf'Learning Module {mod_number}\s+{mod_name}', "", text)

    for question_number in range(1, max_question_number + 1):
        # Look for the info text related to the current question
        info_match = re.search(rf"The following information relates to questions[\s\n]*{question_number}-\d+", text)
        # If info text is found, store it for future use
        if info_match:
            info_range = re.findall(r'\d+', info_match.group())
            info_first_q = int(info_range[0])
            info_second_q = int(info_range[1])
            info_start = info_match.end()

            if info_first_q <= question_number <= info_second_q:
                # Find the next question
                next_question_match = re.search(rf"\n{question_number}\.\s.*?A\..*?B\..*?C\.", text, re.DOTALL)

                if next_question_match:
                    info_end = next_question_match.start()
                else:
                    info_end = len(text)

                # Extract the info text
                info_text = text[info_start:info_end].strip()

        # Search for the question with answers (A., B., and C.)
        
        text = text[question_end:] if question_end else text
        question_match = re.search(rf"\n{question_number}\.\s.*?A\..*?B\..*?C\.", text, re.DOTALL)

        if question_match:
            question_start = question_match.start()

            # Find the next question
            next_question_number = question_number + 1
            next_question_match = re.search( rf"\n{next_question_number}\.\s.*?\nA\..*?\nB\..*?\nC\.", text, re.DOTALL)

            if next_question_match:
                if question_match.end() > next_question_match.start():
                    text = text[next_question_match.start():]
                    continue
                else:
                    question_end = next_question_match.start()
            else:
                question_end = len(text)

            # Extract the question
            question = text[question_start:question_end].strip()
            matches = [match for match in re.finditer(r"C\.\s.*?\.", question)]
            last_match = matches[-1] if matches else None
            question = question[:last_match.end()] if last_match else question

            # Prepend the info text, if any, to the question
            if info_text:
                if info_first_q <= question_number <= info_second_q:
                    question = f"{question}\nDATA: {info_text}"
                # else:
                #     info_text = ""
                #     text = text[info_end:]

            questions.append(question)

    return questions


In [3]:

practice_problems = find_practice_problems("./_helpers/output/level_1_volume_1.json")


In [4]:
questions = parse_questions(practice_problems[1]['content'])

In [5]:
q = questions[22]
q

'23. The annual returns for three portfolios are shown in the following exhibit. Portfo-\nlios P and R were created in Year 1, Portfolio Q in Year 2.\n\xa0\nAnnual Portfolio Returns (%)\n\xa0\nYear 1\nYear 2\nYear 3\nYear 4\nYear 5\nPortfolio P\n-3.0\n4.0\n5.0\n3.0\n7.0\nPortfolio Q\n-3.0\n6.0\n4.0\n8.0\nPortfolio R\n1.0\n-1.0\n4.0\n4.0\n3.0\nThe median annual return from portfolio creation to Year 5 for:\nA. Portfolio P is 4.5%.\nB. Portfolio Q is 4.0%.\n\nPractice Problems\n157\nC. Portfolio R is higher than its arithmetic mean annual return.'

In [21]:
from pydantic import BaseModel
from typing import List, Optional
import pandas as pd
from marvin import ai_fn, ai_model

@ai_model(model="gpt-4")
class Question(BaseModel):
    question: str
    options: List[str]
    question_data: Optional[List[dict]] = None


@ai_fn
def parse_question(question: str) -> list[Question]:
    """
    Parse a question string into a structured Question object with fields, "question", "options", and "question_data" (OPTIONAL)
    """

print(parse_question(q))

[Question(question='23. The annual returns for three portfolios are shown in the following exhibit. Portfo-\nlios P and R were created in Year 1, Portfolio Q in Year 2.\n\nAnnual Portfolio Returns (%)\n\nYear 1\nYear 2\nYear 3\nYear 4\nYear 5\nPortfolio P\n-3.0\n4.0\n5.0\n3.0\n7.0\nPortfolio Q\n-3.0\n6.0\n4.0\n8.0\nPortfolio R\n1.0\n-1.0\n4.0\n4.0\n3.0\nThe median annual return from portfolio creation to Year 5 for:\nA. Portfolio P is 4.5%.\nB. Portfolio Q is 4.0%.\n\nPractice Problems\n157\nC. Portfolio R is higher than its arithmetic mean annual return.', options=['A. Portfolio P is 4.5%.', 'B. Portfolio Q is 4.0%.', 'C. Portfolio R is higher than its arithmetic mean annual return.'], question_data=None)]


['1. The table below gives current information on the interest rates for two two-year and two eight-year maturity investments. The table also gives the maturity, liquidity, and default risk characteristics of a new investment possibility (Invest- ment 3). All investments promise only a single payment (a payment at maturity). Assume that premiums relating to inflation, liquidity, and default risk are con- stant across all time horizons. Investment Maturity (in Years) Liquidity Default Risk Interest Rate (%) 1 2 High Low 2.0 2 2 Low Low 2.5 3 7 Low Low r3 4 8 High Low 4.0 5 8 Low High 6.5 Based on the information in the above table, address the following: A. Explain the difference between the interest rates on Investment 1 and Investment',
 '2. B. Estimate the default risk premium. C. Calculate upper and lower limits for the interest rate on Investment 3, r',
 '3. 2. The nominal risk-free rate is best described as the sum of the real risk-free rate and a premium for: A. maturity. B. liqu

In [47]:
parse_practice_problems_to_json(obj[47])

{'number': 48,
 'question': 'Consider two variables, A and B. If variable A has a mean of -0.56, variable B has a mean of 0.23, and the covariance between the two variables is positive, the correlation between these two variables is:',
 'options': ['A. negative.', 'B. zero.', 'C. positive.'],
 'data': {'variable_A_mean': -0.56,
  'variable_B_mean': 0.23,
  'covariance': 'positive'}}

In [165]:
import spacy
from spacy import displacy
import en_core_web_sm
import re

def preprocess_text(text):
    # Replace problem numbers and choices with a unique symbol
    text = re.sub(r'(\d+)\.', r'\1<@>', text)
    text = re.sub(r'(A|B|C)\.', r'\1<#>', text)
    return text

def postprocess_text(text):
    # Replace the unique symbol with a period
    text = text.replace('<@>', '.')
    text = text.replace('<#>', '.')
    return text

In [166]:
def parse_text_to_json(text):
    text = preprocess_text(text)

    nlp = spacy.blank('en')
    nlp.add_pipe('sentencizer') 
    doc = nlp(text)

    problems_json = []
    current_problem = None

    for sentence in doc.sents:
        if "<@>" in sentence.text: 
            # If the sentence contains a problem number, it's a new problem
            if current_problem is not None:
                problems_json.append(current_problem)
            current_problem = {"problem_number": postprocess_text(sentence.text.split('<@>')[0].strip()), "problem_text": "", "choices": []}
            print(current_problem)
            problem_text = sentence.text.split('<@>')[1].strip()
            print(problem_text)
            if "<#>" in problem_text:
                # If the problem text contains a choice, split it
                current_problem["problem_text"] = postprocess_text(problem_text.split('<#>')[0].strip())
                current_problem["choices"].append('A.' + postprocess_text(problem_text.split('<#>')[1].strip()))
            else:
                current_problem["problem_text"] = postprocess_text(problem_text)
        elif "<#>" in sentence.text:
            # If the sentence contains a choice, it's a choice for the current problem
            current_problem["choices"].append(postprocess_text(sentence.text.strip()))
        else:
            # If the sentence doesn't contain a problem number or a choice, it's part of the problem text
            current_problem["problem_text"] += postprocess_text(sentence.text.strip())
    problems_json.append(current_problem)

    return problems_json
print(parse_text_to_json(practice_problems[0]['content'][18:]))


{'problem_number': '1', 'problem_text': '', 'choices': []}
The table below gives current information on the interest rates for two two-year and two eight-year maturity investments.
{'problem_number': 'Investment Maturity (in Years) Liquidity Default Risk Interest Rate (%) 1 2 High Low 2', 'problem_text': '', 'choices': []}
0 2 2 Low Low 2
{'problem_number': 'C. Calculate upper and lower limits for the interest rate on Investment 3, r3', 'problem_text': '', 'choices': []}
2
{'problem_number': '3', 'problem_text': '', 'choices': []}
Which of the following risk premiums is most relevant in explaining the differ- ence in yields between 30-year bonds issued by the US Treasury and 30-year bonds issued by a small private issuer?
{'problem_number': 'A. Inflation B. Maturity C. Liquidity 4', 'problem_text': '', 'choices': []}
The value in six years of $75,000 invested today at a stated annual interest rate of 7% compounded quarterly is closest to: A<#> $112,555
{'problem_number': 'If that rate 

In [163]:
print(parse_text_to_json(practice_problems[0]['content'][18:]))


{'problem_number': '1', 'problem_text': '', 'choices': []}
The table below gives current information on the interest rates for two two-year and two eight-year maturity investments.
{'problem_number': 'Investment Maturity (in Years) Liquidity Default Risk Interest Rate (%) 1 2 High Low 2', 'problem_text': '', 'choices': []}
0 2 2 Low Low 2
{'problem_number': 'C. Calculate upper and lower limits for the interest rate on Investment 3, r3', 'problem_text': '', 'choices': []}
2
{'problem_number': '3', 'problem_text': '', 'choices': []}
Which of the following risk premiums is most relevant in explaining the differ- ence in yields between 30-year bonds issued by the US Treasury and 30-year bonds issued by a small private issuer?
{'problem_number': 'A. Inflation B. Maturity C. Liquidity 4', 'problem_text': '', 'choices': []}
The value in six years of $75,000 invested today at a stated annual interest rate of 7% compounded quarterly is closest to: A<#> $112,555
{'problem_number': 'If that rate 

In [134]:
practice_problems[0]['content'][18:]

'1. The table below gives current information on the interest rates for two two-year and two eight-year maturity investments. The table also gives the maturity, liquidity, and default risk characteristics of a new investment possibility (Invest- ment 3). All investments promise only a single payment (a payment at maturity). Assume that premiums relating to inflation, liquidity, and default risk are con- stant across all time horizons. Investment Maturity (in Years) Liquidity Default Risk Interest Rate (%) 1 2 High Low 2.0 2 2 Low Low 2.5 3 7 Low Low r3 4 8 High Low 4.0 5 8 Low High 6.5 Based on the information in the above table, address the following: A. Explain the difference between the interest rates on Investment 1 and Investment 2. B. Estimate the default risk premium. C. Calculate upper and lower limits for the interest rate on Investment 3, r3. 2. The nominal risk-free rate is best described as the sum of the real risk-free rate and a premium for: A. maturity. B. liquidity. C. 