# Get JSON of an exam paper

In [85]:
import os
import openai
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
import re
import json

# Read the OpenAI API key from the environment (None when the variable is unset)
openai.api_key = os.getenv('OPENAI_API_KEY')

# Location of the exam paper to process
pdf_path = "exam_papers/bpghs_emath_prelim_paper1.pdf"

# Parse the PDF; later cells treat each loaded document as one page
loader = PyPDFLoader(pdf_path)
documents = loader.load()

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)


In [86]:
# Function to extract JSON data from the response
def extract_json_from_response(response):
    """Parse the JSON payload embedded in an LLM response.

    The response is first tried as-is. If that fails, candidate substrings are
    tried in this order until one parses:

    1. the outermost ``[...]`` span (first ``[`` to last ``]``),
    2. the contents of each Markdown code fence (``` or ```json),
    3. the outermost ``{...}`` span (first ``{`` to last ``}``).

    The array span is tried first so that any response that already parsed
    via the array search keeps producing the same value.

    Args:
        response: Raw text returned by the model.

    Returns:
        The decoded JSON value (a list or a dict in practice), or ``None``
        when the response is not text, contains nothing that looks like JSON,
        or no candidate is valid JSON. A diagnostic is printed before
        returning ``None``.
    """
    try:
        return json.loads(response)
    except (json.JSONDecodeError, TypeError):
        # Not pure JSON (or not text at all); fall through to the searches.
        pass

    if not isinstance(response, str):
        print("No JSON data found in the response.")
        return None

    def outermost(opener, closer):
        # Widest span from the first opener to the last closer, if any.
        start = response.find(opener)
        end = response.rfind(closer)
        if start == -1 or end <= start:
            return None
        return response[start:end + 1]

    candidates = [outermost('[', ']')]
    # Fenced blocks rescue the case where brackets in the surrounding prose
    # make the widest array span invalid.
    candidates.extend(
        re.findall(r'```(?:json)?\s*(.*?)```', response, re.DOTALL | re.IGNORECASE)
    )
    candidates.append(outermost('{', '}'))
    candidates = [text.strip() for text in candidates if text and text.strip()]

    if not candidates:
        print("No JSON data found in the response.")
        return None

    last_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            last_error = e

    print("Error parsing JSON:", last_error)
    return None


## Get JSON for each question

In [87]:
# Set maximum pages per chunk to avoid exceeding token limits
max_pages_per_chunk = 5

# Each entry is (combined text of the pages, list of 1-based page numbers)
chunks = []
for start in range(0, len(documents), max_pages_per_chunk):
    batch = documents[start:start + max_pages_per_chunk]
    page_numbers = list(range(start + 1, start + len(batch) + 1))
    # Prefix every page with a marker so the model can report page boundaries
    text_content = ''.join(
        f"\n\n--- Page {number} ---\n\n{page.page_content}"
        for number, page in zip(page_numbers, batch)
    )
    chunks.append((text_content, page_numbers))

In [88]:
# Define the prompt template.
# Literal braces in the JSON example are doubled ({{ }}) because the template is
# rendered with str.format-style substitution, where single braces mark variables.
question_prompt_template =  """
    You are provided with text extracted from a PDF exam paper. The text may include multiple math questions along with diagrams.

    Please identify each question, determine the question_number, which is an integer representing the full question (eg: 1, 2, 3), along with the question_part, a string representing the question with any sub-parts that it belongs to (eg: "1", "2a", "2b", "2ai", "2aii", "3iv", "3v" etc.).
    Please also determine its page boundaries (page_start and page_end), and categorize it into one of the following categories:
        Quadratic Functions, Equations and Inequalities
        Surds
        Polynomials, cubic equations and partial fractions
        Binomial theorem and its application
        Exponential and logarithmic functions
        Coordinate geometry
        Linear law
        Trigonometric functions and equations
        Trigonometric identities and formulae
        Gradients, derivatives and differentiation techniques
        Applications of differentiation
        Differentiation of trigonometric, logarithmic, and exponential functions and their applications
        Integration
        Applications of integration
        Kinematics
        Proofs in plane geometry

    Output your result as a list of JSON objects enclosed in a code block like this:

    ```json
    [{{"question": "...", "question_number": "...", "question_part": "...", "page_start": ..., "page_end": ..., "category": "..."}}]
    ```
    Please ensure the output is valid JSON.

    Here is the text content:
    
    {text_content}
    """
    

In [89]:
# Wrap the template so {text_content} is filled in for each chunk
prompt = PromptTemplate(
    template=question_prompt_template,
    input_variables=["text_content"],
)

# Chat model shared by every chain in this notebook
llm = ChatOpenAI(
    model_name='gpt-4o',
    temperature=0,
    api_key=openai.api_key,
)

# Chain that renders the question prompt and sends it to the model
questions_chain = LLMChain(prompt=prompt, llm=llm)

In [90]:
# Accumulates the question records extracted from every chunk
all_data = []

for idx, (text_content, pages) in enumerate(chunks):
    print(f"Processing chunk {idx+1}/{len(chunks)}, pages {pages[0]}-{pages[-1]}")

    response = questions_chain.run(text_content=text_content)

    data = extract_json_from_response(response)

    # Only a list of question objects is usable here: extending with a dict
    # would add its keys (plain strings), which later break `question.get(...)`.
    if isinstance(data, list):
        records = [item for item in data if isinstance(item, dict)]
        if len(records) != len(data):
            print(f"Skipped {len(data) - len(records)} non-object entries in chunk {idx+1}")
        all_data.extend(records)
    else:
        print(f"Failed to extract data for chunk {idx+1}")

Processing chunk 1/5, pages 1-5
Processing chunk 2/5, pages 6-10
Processing chunk 3/5, pages 11-15
Processing chunk 4/5, pages 16-20
Processing chunk 5/5, pages 21-22


In [91]:
# Dump every extracted question record for a quick visual check
print(all_data)

[{'question': 'Calculate 332.13 81.233.23.524××, giving your answer correct to 4 significant figures.', 'question_number': '1', 'question_part': '1', 'page_start': 3, 'page_end': 3, 'category': 'Surds'}, {'question': 'At a warehouse sale, all prices are reduced by 15%. The price of a set of ear pods during the sale is $221. (i) Find its original price.', 'question_number': '2', 'question_part': '2i', 'page_start': 3, 'page_end': 3, 'category': 'Exponential and logarithmic functions'}, {'question': 'If the salesman still earns a profit of 10.5% during the sale, find the percentage profit he earns from selling the pair of ear pods if it is not on sale.', 'question_number': '2', 'question_part': '2ii', 'page_start': 3, 'page_end': 3, 'category': 'Exponential and logarithmic functions'}, {'question': 'Ethan measures the amount of rain, in millimetres (mm), each day for 31 days. The bar chart shows his results. (a) Write down the median amount of rain.', 'question_number': '3', 'question_pa

In [92]:
# Show the field names of the first extracted record; guard against an empty
# result so this cell does not raise IndexError when no questions were extracted
print(all_data[0].keys() if all_data else "No questions extracted.")

dict_keys(['question', 'question_number', 'question_part', 'page_start', 'page_end', 'category'])


In [93]:
# Load the mapping of topic name -> list of question types
with open('amath_topics.json') as topics_file:
    amath_topics = json.loads(topics_file.read())

print(amath_topics)

{'Quadratic Functions, Equations and Inequalities': ['Solving simultaneous equations by substitution', 'Finding coordinates of intersection points', 'Finding maximum and minimum value of a quadratic function by completing the square', 'Sketching graph of quadratic function', 'Solving quadratic inequalities', 'Intersection of straight line and curve', 'Always positive/negative quadratic expression', 'Quadratic functions in real-world context'], 'Surds': ['Rationalising denominator of surd', 'Adding and/or subtracting surds', 'Solving equations involving surds', 'Word problems involving surds'], 'Polynomials, cubic equations and partial fractions': ['Identities', 'Identities with an unknown quotient', 'Long division/synthetic method/remainder theorem', 'Application of remainder theorem', 'Sum and difference of 2 cubes', 'Solving cubic equation', 'Factor theorem and solving a cubic equation', 'Factor theorem and sketching of cubic curve', 'Forming cubic equation/expression', 'Proper algeb

In [94]:
# Prompt asking the model to pick one question type from a supplied list
question_type_prompt_template = """
Given the following question and its relevant topics, assign the most appropriate question type from the list.

Question: {question_text}

Relevant Question Types:
{question_types_list}

Please return the most appropriate question type from the list above. If none of the question types are relevant, you can return "Unknown".
Simply return the question type as a string, nothing else.
"""

question_type_prompt = PromptTemplate(
    template=question_type_prompt_template,
    input_variables=["question_text", "question_types_list"],
)

# Chain that renders the question-type prompt and sends it to the shared model
topics_chain = LLMChain(prompt=question_type_prompt, llm=llm)

# Tag every extracted question with a question type. The candidate types come
# from amath_topics, looked up by the category assigned during extraction.
for question in all_data:
    candidate_types = amath_topics.get(question.get("category"))

    # Category missing from the mapping (or mapped to nothing): no model call
    if not candidate_types:
        question["question_type"] = "Unknown"
        continue

    # Ask the model to choose among the candidate types, one per line
    answer = topics_chain.run(
        question_text=question.get("question"),
        question_types_list="\n".join(candidate_types),
    )

    # Store the reply without surrounding whitespace
    question["question_type"] = answer.strip()


## Get JSON of paper meta information

In [95]:
# Define the prompt to extract meta information.
# Literal braces in the JSON example are doubled ({{ }}) because the template is
# rendered with str.format-style substitution, where single braces mark variables.
# The example is an array holding one object, which is the shape the notebook
# unwraps after parsing the response.
meta_info_prompt_template = '''
You are provided with text from the first page of a PDF document. Extract the following meta information:
- Subject ("Additional Mathematics", "Elementary Mathematics", "H1 Mathematics", or "H2 Mathematics")
- School
- Level ("O Level" or "A Level")
- Year
- Exam type ("Preliminary Examination", "Mid Year Examination", "Final Year Examination", or "Test")
- Paper (1 or 2)

Please output the result as a JSON array containing a single object with these fields:

```json
[
  {{
    "subject": "...",
    "school": "...",
    "level": "...",
    "year": "...",
    "exam_type": "...",
    "paper": "..."
  }}
]
```

Here is the text content:

{text_content}
'''

# Wrap the meta-information template; {text_content} is its only variable
meta_info_prompt = PromptTemplate(
    template=meta_info_prompt_template,
    input_variables=["text_content"],
)

# Chain that renders the meta prompt and sends it to the shared model
meta_chain = LLMChain(prompt=meta_info_prompt, llm=llm)

# Only the first page is sent to the model for meta information
first_page_content = documents[0].page_content
meta_response = meta_chain.run(text_content=first_page_content)


In [96]:
# Parse the model's reply. The prompt asks for a one-element array, but accept
# a bare object too, and fall back to {} for anything that is not an object so
# the later `meta_info.get(...)` calls cannot fail.
meta_info = extract_json_from_response(meta_response)
if isinstance(meta_info, list):
    meta_info = meta_info[0] if meta_info else {}
if not isinstance(meta_info, dict):
    meta_info = {}

## Combine into one output JSON

In [97]:
# Pull the identifying fields that make up the output file name
# (a field missing from meta_info comes back as None)
subject, school, year, exam_type, paper = (
    meta_info.get(field)
    for field in ("subject", "school", "year", "exam_type", "paper")
)
output_filename = f'{subject}_{school}_{year}_{exam_type}_paper{paper}.json'

# Bundle the paper's meta information with its questions and write it to disk
output_data = {"meta_info": meta_info, "questions": all_data}
with open(output_filename, 'w') as out_file:
    json.dump(output_data, out_file, indent=2)

print("Extraction complete. Data saved to ", output_filename)

Extraction complete. Data saved to  Elementary Mathematics_Bukit Panjang Government High School_2023_Preliminary Examination_paper1.json


## Split PDF into pages

In [98]:
import pypdfium2 as pdfium

# Open the source PDF with pdfium
pdf = pdfium.PdfDocument(pdf_path)

# Page images go in a folder named after the output JSON file
paper_imgs_dir = f"exam_papers/{output_filename}/"
os.makedirs(paper_imgs_dir, exist_ok=True)

# Render each page at scale 4 and save it as pg<N>.jpg, with N starting at 1
for page_number in range(1, len(pdf) + 1):
    rendered = pdf[page_number - 1].render(scale=4).to_pil()
    rendered.save(f'{paper_imgs_dir}pg{page_number}.jpg')