# Get JSON of an exam paper

In [None]:
import os
import openai
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
import re
import json

# Hand the OpenAI client the key exported in the environment (None if unset)
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Exam paper to process; later cells index the loaded documents page by page
pdf_path = "exam_papers/bpghs_emath_prelim_paper1.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Single chat model shared by every chain defined further down
llm = ChatOpenAI(
    model_name='gpt-4o',
    temperature=0,
    api_key=openai.api_key,
)

In [171]:
# Function to extract JSON data from the response
def extract_json_from_response(response):
    """Parse JSON out of an LLM response, tolerating surrounding prose.

    Attempts, in order:
      1. the whole response as JSON;
      2. a JSON array found inside the first fenced code block;
      3. a JSON array spanning the first ``[`` to the last ``]`` of the
         whole response.

    Step 2 exists because the greedy bracket search in step 3 breaks as soon
    as the prose around a fenced block contains a stray ``[`` or ``]``.

    Args:
        response: Text returned by the model. Non-string values (e.g.
            ``None``) that cannot be parsed are treated as "no JSON found".

    Returns:
        The decoded JSON value, or ``None`` when nothing could be parsed
        (a diagnostic message is printed in that case).
    """
    try:
        return json.loads(response)
    except TypeError:
        # json.loads rejects None and other non-text input with TypeError.
        print("No JSON data found in the response.")
        return None
    except json.JSONDecodeError:
        pass

    if not isinstance(response, str):
        # The regex fallbacks below only make sense for text.
        print("No JSON data found in the response.")
        return None

    # Search the fenced block (if any) before the full text so brackets in
    # the surrounding prose cannot widen the match.
    search_regions = []
    fenced = re.search(r'```[a-zA-Z]*\s*(.*?)```', response, re.DOTALL)
    if fenced:
        search_regions.append(fenced.group(1))
    search_regions.append(response)

    last_error = None
    for region in search_regions:
        match = re.search(r'\[.*\]', region, re.DOTALL)
        if not match:
            continue
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError as e:
            last_error = e

    if last_error is not None:
        print("Error parsing JSON:", last_error)
    else:
        print("No JSON data found in the response.")
    return None


## Get meta JSON for document

In [172]:
# Prompt used to extract paper-level metadata from the first page.
# Braces that belong to the JSON example are doubled ({{ and }}) so the
# template formatter prints them as literal "{" / "}" rather than treating
# them as variables; only {text_content} is substituted.
meta_info_prompt_template = '''
You are provided with text from the first page of a PDF document. Extract the following meta information:
- Subject ("additional_mathematics" or "elementary_mathematics")
- School
- Level ("o_level" or "a_level")
- Year
- Exam type ("preliminary_exam", "final_exam", "mid_year_exam")
- Paper (1 or 2)

Please output the result as a JSON array containing a single object with these fields:

```json
[
  {{
    "subject": "...",
    "school": "...",
    "level": "...",
    "year": "...",
    "exam_type": "...",
    "paper": "..."
  }}
]
```

Here is the text content:

{text_content}
'''

# Bind the metadata template (one variable: text_content) to the shared LLM
meta_info_prompt = PromptTemplate(
    template=meta_info_prompt_template,
    input_variables=["text_content"],
)
meta_chain = LLMChain(prompt=meta_info_prompt, llm=llm)

# Only the first loaded document is sent for metadata extraction
first_page_content = documents[0].page_content
meta_response = meta_chain.run(text_content=first_page_content)

In [173]:
# The model may answer with either a one-element array or a bare object;
# normalise both to a dict and fall back to {} when nothing usable came back.
parsed_meta = extract_json_from_response(meta_response)
if isinstance(parsed_meta, dict):
    meta_info = parsed_meta
elif isinstance(parsed_meta, list) and parsed_meta and isinstance(parsed_meta[0], dict):
    meta_info = parsed_meta[0]
else:
    meta_info = {}

## Get JSON for each question

In [174]:
# Cap on how many pages go into a single LLM request (keeps prompts within token limits)
max_pages_per_chunk = 5

# Each entry is (concatenated page text, list of 1-based page numbers)
chunks = []
for start in range(0, len(documents), max_pages_per_chunk):
    batch = documents[start:start + max_pages_per_chunk]
    page_numbers = list(range(start + 1, start + len(batch) + 1))
    # Prefix every page with a marker so the model can report page boundaries
    batch_text = ''.join(
        f"\n\n--- Page {number} ---\n\n{page.page_content}"
        for number, page in zip(page_numbers, batch)
    )
    chunks.append((batch_text, page_numbers))

In [175]:
# Load the topic -> question-type mapping used when the subject is
# "additional_mathematics". The encoding is explicit so decoding does not
# depend on the platform's default locale.
with open('amath_topics.json', encoding='utf-8') as f:
    amath_topics = json.load(f)
print(amath_topics)

{'Quadratic Functions, Equations and Inequalities': ['Solving simultaneous equations by substitution', 'Finding coordinates of intersection points', 'Finding maximum and minimum value of a quadratic function by completing the square', 'Sketching graph of quadratic function', 'Solving quadratic inequalities', 'Intersection of straight line and curve', 'Always positive/negative quadratic expression', 'Quadratic functions in real-world context'], 'Surds': ['Rationalising denominator of surd', 'Adding and/or subtracting surds', 'Solving equations involving surds', 'Word problems involving surds'], 'Polynomials, cubic equations and partial fractions': ['Identities', 'Identities with an unknown quotient', 'Long division/synthetic method/remainder theorem', 'Application of remainder theorem', 'Sum and difference of 2 cubes', 'Solving cubic equation', 'Factor theorem and solving a cubic equation', 'Factor theorem and sketching of cubic curve', 'Forming cubic equation/expression', 'Proper algeb

In [176]:
# Load the topic -> question-type mapping used for every subject other than
# "additional_mathematics". The encoding is explicit so decoding does not
# depend on the platform's default locale.
with open('emath_topics.json', encoding='utf-8') as f:
    emath_topics = json.load(f)
print(emath_topics)

{'Numbers and their operations': ['Classifying numbers', 'Solving a problem involving negative numbers', 'Finding the prime factors of a composite number', 'Finding the highest common factor of 2 numbers', 'Finding the highest common factor of 3 numbers', 'Finding the lowest common factor of 2 numbers', 'Finding the lowest common factor of 3 numbers', 'Evaluating the square root of a number', 'Solving a problem involving a perfect square', 'Evaluating the cube root of a number', 'Solving a problem involving a perfect cube', 'Solving a problem involving a cube root', 'Reciprocals', 'Rounding a whole number to a specified number of significant figures', 'Rounding a decimal to a specified number of significant figures', 'Estimating the answer to a calculation', 'Performing calculations involving prefixes', 'Performing calculations involving standard form', 'Applying the laws of indices', 'Simplifying expressions involving indices'], 'Ratio and proportion': ['Simplifying ratios involving 2

In [177]:
# Define the prompt template used to pull individual questions out of a chunk.
# Braces that belong to the JSON example are doubled ({{ and }}) so the
# formatter emits them literally; only {categories} and {text_content} are
# substituted. Without them the example shown to the model was not valid JSON.
question_prompt_template =  """
    You are provided with text extracted from a PDF exam paper. The text may include multiple math questions along with diagrams. Ignore any solutions or answers provided in the text.

    Please identify each question, determine the question_number, which is an integer representing the full question (eg: 1, 2, 3), along with the question_part, a string representing the question with any sub-parts that it belongs to (eg: "1", "2a", "2b", "2ai", "2aii", "3iv", "3v" etc.).
    After reading each question, express it in LaTeX format so that it can be rendered correctly.
    Please also determine its page boundaries (page_start and page_end), and categorize it into one of the following categories:
        {categories}

    Output your result as a list of JSON objects enclosed in a code block like this:

    ```json
    [{{"question": "...", "question_number": "...", "question_part": "...", "page_start": ..., "page_end": ..., "category": "..."}}]
    ```
    Please ensure the output is valid JSON.

    Here is the text content:

    {text_content}
    """
    

In [178]:
# Question-extraction prompt: filled with a chunk of page text and the category list
prompt = PromptTemplate(
    template=question_prompt_template,
    input_variables=["text_content", "categories"],
)

# Chain that sends the filled prompt to the shared LLM
questions_chain = LLMChain(prompt=prompt, llm=llm)

In [179]:
# Accumulates the question records extracted from every chunk
all_data = []

# Pick the topic map for the detected subject; anything other than
# "additional_mathematics" (including a missing subject) uses the E-Math map.
subject = meta_info.get("subject")
if subject == "additional_mathematics":
    topics = amath_topics
else:
    topics = emath_topics

# Top-level topic names are offered to the model as the allowed categories
categories = list(topics.keys())

for idx, (text_content, pages) in enumerate(chunks):
    print(f"Processing chunk {idx+1}/{len(chunks)}, pages {pages[0]}-{pages[-1]}")

    response = questions_chain.run(text_content=text_content, categories=categories)

    data = extract_json_from_response(response)

    # The parser can return None or a non-list JSON value; neither is a
    # usable list of questions, so report the chunk and move on instead of
    # aborting the whole run.
    if not isinstance(data, list):
        print(f"Failed to extract data for chunk {idx+1}")
        continue

    # Keep only object-shaped items; anything else cannot carry question fields.
    valid_questions = [item for item in data if isinstance(item, dict)]
    skipped = len(data) - len(valid_questions)
    if skipped:
        print(f"Skipped {skipped} malformed item(s) in chunk {idx+1}")

    for question in valid_questions:
        question['score'] = 1  # every question starts with a score of 1
    all_data.extend(valid_questions)

Processing chunk 1/3, pages 1-5
Processing chunk 2/3, pages 6-10
Processing chunk 3/3, pages 11-11


In [180]:
# Dump every extracted question record for a quick visual check
print(all_data)

[{'question': 'Given that \\[ \\begin{pmatrix} 1 & 2 & 0 \\\\ 3 & 0 & 3 \\\\ 2 & 2 & 1 \\end{pmatrix} \\begin{pmatrix} x \\\\ y \\\\ 1 \\end{pmatrix} = \\begin{pmatrix} 5 \\\\ 13 \\\\ 4 \\end{pmatrix} \\], find the value of \\( x \\) and \\( y \\).', 'question_number': '1', 'question_part': '1', 'page_start': 3, 'page_end': 3, 'category': 'Matrices'}, {'question': 'Mr Gan weighed 8 male students. The median mass of the students was 63.5 kg. The interquartile range of the masses of the students was 4.5 kg. Mr Gan realised that the scale of the weighing machine was inaccurate. The correct mass of each student was 1.5 kg less than what he had recorded. Write down the correct values for the median mass and interquartile range.', 'question_number': '2', 'question_part': '2', 'page_start': 3, 'page_end': 3, 'category': 'Data analysis'}, {'question': '27 female patients and 30 male patients living in Xin-town are found to be infected with the E-virus. The ages of the patients are shown in the

In [181]:
# Show which fields the first question record carries (requires all_data to be non-empty)
print(all_data[0].keys())

dict_keys(['question', 'question_number', 'question_part', 'page_start', 'page_end', 'category'])


In [182]:
# Prompt asking the model to pick one question type for a single question.
# It is filled with the question text and a newline-separated list of candidates.
question_type_prompt_template = '''
Given the following question and its relevant topics, assign the most appropriate question type from the list.

Question: {question_text}

Relevant Question Types:
{question_types_list}

Please return the most appropriate question type from the list above. If none of the question types are relevant, you can return "Unknown".
Simply return the question type as a string, nothing else.
'''

question_type_prompt = PromptTemplate(
    template=question_type_prompt_template,
    input_variables=["question_text", "question_types_list"],
)

# Chain that sends the question-type prompt to the shared LLM
topics_chain = LLMChain(prompt=question_type_prompt, llm=llm)

# Annotate every extracted question with a difficulty level and a question type.
# all_data holds the extracted question records; topics maps each category
# (topic) to its list of candidate question types.
for question in all_data:
    # The model may omit or null the question text; treat that as empty text
    # so the word-count heuristic below cannot fail on None.
    q_text = question.get("question") or ""
    if not isinstance(q_text, str):
        q_text = str(q_text)
    q_category = question.get("category")

    # Get relevant question types based on the identified category (topic).
    # A non-string category (e.g. a list) cannot be a key of the topic map.
    relevant_topics = topics.get(q_category) if isinstance(q_category, str) else None

    # Assign difficulty level from the word count of the question text
    word_count = len(q_text.split())
    if word_count < 20:
        question["difficulty_level"] = "easy"
    elif word_count < 50:
        question["difficulty_level"] = "medium"
    else:
        question["difficulty_level"] = "hard"

    # Only ask the model when there is both a question and a candidate list
    if relevant_topics and q_text:
        question_types_list = "\n".join(relevant_topics)

        # Run the LLM chain to get the most appropriate question type
        response = topics_chain.run(question_text=q_text, question_types_list=question_types_list)

        # Store the answer without surrounding whitespace
        question["question_type"] = response.strip()

    else:
        # No candidates for this category (or no question text): mark as unknown
        question["question_type"] = "Unknown"


## Combine into one output JSON

In [183]:
# Combine meta information with extracted questions
output_data = {
    "meta_info": meta_info,
    "questions": all_data
}

# Metadata fields that make up the output file name
subject = meta_info.get("subject")
school = meta_info.get("school")
year = meta_info.get("year")
exam_type = meta_info.get("exam_type")
paper = meta_info.get("paper")

# These values come from the model, so replace path separators, control
# characters and other characters that are illegal in file names. Names
# that were already valid are left unchanged.
raw_filename = f'{subject}_{school}_{year}_{exam_type}_paper{paper}.json'
output_filename = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', raw_filename)

# Save output to a JSON file
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print("Extraction complete. Data saved to ", output_filename)

Extraction complete. Data saved to  elementary_mathematics_XINMIN SECONDARY SCHOOL_2023_mid_year_exam_paper2.json


## Split PDF into pages

In [184]:
# Disabled cell: the page-rendering code below is wrapped in a string literal,
# so running the cell only evaluates (and echoes) the string and renders nothing.
'''
import pypdfium2 as pdfium

# Load a document
pdf = pdfium.PdfDocument(pdf_path)

paper_imgs_dir = f"exam_papers/{output_filename}/"
os.makedirs(paper_imgs_dir, exist_ok=True)

# Loop over pages and render
for i in range(len(pdf)):
    page = pdf[i]
    image = page.render(scale=4).to_pil()
    image.save(f'{paper_imgs_dir}pg{i+1}.jpg')
'''

'\nimport pypdfium2 as pdfium\n\n# Load a document\npdf = pdfium.PdfDocument(pdf_path)\n\npaper_imgs_dir = f"exam_papers/{output_filename}/"\nos.makedirs(paper_imgs_dir, exist_ok=True)\n\n# Loop over pages and render\nfor i in range(len(pdf)):\n    page = pdf[i]\n    image = page.render(scale=4).to_pil()\n    image.save(f\'{paper_imgs_dir}pg{i+1}.jpg\')\n'