# Get JSON of an exam paper

In [33]:
import os
import openai
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
import re
import json

# Set your OpenAI API key (read from the environment so it is never stored in the notebook)
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    # Fail here with an actionable message instead of a harder-to-read error
    # when the chat model is built with `api_key=openai.api_key` further down.
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# Load the PDF file
pdf_path = "exam_papers/chij_amaths_p2.pdf"

# Later cells treat each entry of `documents` as one page of the paper
# (entry i is labelled "Page i+1").
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [34]:
# Function to extract JSON data from the response
def extract_json_from_response(response):
    """Parse the JSON payload contained in an LLM response.

    The response is tried in this order:
      1. as a complete JSON document;
      2. the contents of any ``` fenced code block(s);
      3. the raw text, scanned for the first ``[`` or ``{`` from which a
         complete JSON value can be decoded.

    Scanning with a decoder (instead of a greedy ``[ ... ]`` regex) means
    brackets in the surrounding prose no longer break extraction, and a
    top-level JSON object is recovered as well as a list.

    Args:
        response: Text returned by the model. ``None`` or an empty string is
            treated as "no JSON found".

    Returns:
        The decoded JSON value (typically a list or dict), or ``None`` when
        nothing could be decoded. A short diagnostic is printed in that case.
    """
    if not response:
        print("No JSON data found in the response.")
        return None

    # Fast path: the whole response is already valid JSON.
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    decoder = json.JSONDecoder()
    last_error = None

    # Fenced blocks are what the prompts ask for, so try them before the raw text.
    fenced_blocks = re.findall(
        r"```(?:json)?\s*(.*?)```", response, re.DOTALL | re.IGNORECASE
    )
    for candidate in fenced_blocks + [response]:
        for opener in re.finditer(r"[\[{]", candidate):
            try:
                # raw_decode tolerates trailing text after the JSON value.
                data, _ = decoder.raw_decode(candidate[opener.start():])
                return data
            except json.JSONDecodeError as e:
                last_error = e

    if last_error is not None:
        print("Error parsing JSON:", last_error)
    else:
        print("No JSON data found in the response.")
    return None


## Get JSON for each question

In [35]:
# Set maximum pages per chunk to avoid exceeding token limits
max_pages_per_chunk = 5

# Each entry of `chunks` is a pair: (combined text of the pages, list of their
# 1-based page numbers). Every page's text is preceded by a "--- Page N ---" marker.
chunks = []
for first_index in range(0, len(documents), max_pages_per_chunk):
    page_batch = documents[first_index:first_index + max_pages_per_chunk]
    page_numbers = [first_index + offset + 1 for offset in range(len(page_batch))]
    combined_text = ''.join(
        f"\n\n--- Page {number} ---\n\n{page.page_content}"
        for number, page in zip(page_numbers, page_batch)
    )
    chunks.append((combined_text, page_numbers))

In [36]:
# Define the prompt template
# Literal braces in the JSON example are written as {{ and }} so that they
# survive template formatting; only {text_content} is a placeholder.
prompt_template = """
    You are provided with text extracted from a PDF exam paper. The text may include multiple math questions along with diagrams.

    Please identify each question, determine its page boundaries (page_start and page_end), and categorize it into one of the following categories:

    Quadratic Functions, Equations and Inequalities
    Surds
    Polynomials, cubic equations and partial fractions
    Binomial theorem and its application
    Exponential and logarithmic functions
    Coordinate geometry
    Linear law
    Trigonometric functions and equations
    Trigonometric identities and formulae
    Gradients, derivatives and differentiation techniques
    Applications of differentiation
    Differentiation of trigonometric, logarithmic, and exponential functions and their applications
    Integration
    Applications of integration
    Kinematics
    Proofs in plane geometry

    Output your result as a list of JSON objects enclosed in a code block like this:

    ```json
    [{{"question": "...", "page_start": ..., "page_end": ..., "category": "..."}}, ...]
    ```
    Please ensure the output is valid JSON.

    Here is the text content:
    
    {text_content}
    """
    

In [37]:
# Chat model shared by every chain in this notebook (gpt-4o at temperature 0).
llm = ChatOpenAI(model_name='gpt-4o', temperature=0, api_key=openai.api_key)

# The question-extraction prompt has a single placeholder: the chunk text.
prompt = PromptTemplate(template=prompt_template, input_variables=["text_content"])

# Chain that sends one chunk of pages to the model and returns its raw reply.
questions_chain = LLMChain(prompt=prompt, llm=llm)

In [38]:
# Accumulates one dict per extracted question across all chunks.
all_data = []

for idx, (text_content, pages) in enumerate(chunks, start=1):
    print(f"Processing chunk {idx}/{len(chunks)}, pages {pages[0]}-{pages[-1]}")

    response = questions_chain.run(text_content=text_content)

    data = extract_json_from_response(response)

    # The model occasionally returns a single object instead of a list;
    # wrap it so it is not unpacked into its keys by extend().
    if isinstance(data, dict):
        data = [data]

    if not isinstance(data, list):
        # Covers both "nothing parsed" (None) and unexpected scalar payloads.
        print(f"Failed to extract data for chunk {idx}")
        continue

    # Keep only question records; later cells call .get() on every entry.
    all_data.extend(item for item in data if isinstance(item, dict))

Processing chunk 1/4, pages 1-5
Processing chunk 2/4, pages 6-10
Processing chunk 3/4, pages 11-15
Processing chunk 4/4, pages 16-17


In [39]:
# Dump every extracted question record for a quick visual sanity check.
# NOTE(review): this prints the whole list; a slice such as all_data[:3] would
# keep the cell output short — confirm whether the full dump is needed.
print(all_data)

[{'question': 'Given that f(x) = 3x^3 + 6x^2 - 7x - 2, show that x + 1 is a factor of f(x) and hence factorise f(x) completely.', 'page_start': 3, 'page_end': 3, 'category': 'Polynomials, cubic equations and partial fractions'}, {'question': 'The equation of a circle is (x - 3)^2 + (y + 4)^2 = 26. Determine if the origin O lies inside or outside the circle.', 'page_start': 3, 'page_end': 3, 'category': 'Coordinate geometry'}, {'question': 'Solve the equation e^(2x) - e^x = 3, giving your answer(s) correct to 3 significant figures.', 'page_start': 4, 'page_end': 4, 'category': 'Exponential and logarithmic functions'}, {'question': 'Given that y = 3x^3 - 16x^5, find (i) the value(s) of x for which dy/dx = 1, (ii) the value of the integral from 0 to 1 of y dx, giving your answer correct to 3 significant figures.', 'page_start': 5, 'page_end': 5, 'category': 'Applications of differentiation'}, {'question': '5(a) (i) Using the substitution z = 1 + x, write down all the terms in the expansio

In [40]:
# Show which fields the first extracted question record carries.
first_question = all_data[0]
print(first_question.keys())

dict_keys(['question', 'page_start', 'page_end', 'category'])


In [41]:
# Load the mapping of topic name -> list of question types used to label
# each question below. The encoding is given explicitly so the file is read
# the same way regardless of the platform's default locale encoding.
with open('amath_topics.json', encoding='utf-8') as f:
    amath_topics = json.load(f)

print(amath_topics)

{'Quadratic Functions, Equations and Inequalities': ['Solving simultaneous equations by substitution', 'Finding coordinates of intersection points', 'Finding maximum and minimum value of a quadratic function by completing the square', 'Sketching graph of quadratic function', 'Solving quadratic inequalities', 'Intersection of straight line and curve', 'Always positive/negative quadratic expression', 'Quadratic functions in real-world context'], 'Surds': ['Rationalising denominator of surd', 'Adding and/or subtracting surds', 'Solving equations involving surds', 'Word problems involving surds'], 'Polynomials, cubic equations and partial fractions': ['Identities', 'Identities with an unknown quotient', 'Long division/synthetic method/remainder theorem', 'Application of remainder theorem', 'Sum and difference of 2 cubes', 'Solving cubic equation', 'Factor theorem and solving a cubic equation', 'Factor theorem and sketching of cubic curve', 'Forming cubic equation/expression', 'Proper algeb

In [42]:
# Define a prompt template for assigning question types
question_type_prompt_template = '''
Given the following question and its relevant topics, assign the most appropriate question type from the list.

Question: {question_text}

Relevant Question Types:
{question_types_list}

Please choose the most appropriate question types from the list above. Choose either 1, 2 or 3 most relevant question types only, separated by commas.
'''

question_type_prompt = PromptTemplate(
    input_variables=["question_text", "question_types_list"],
    template=question_type_prompt_template,
)

topics_chain = LLMChain(llm=llm, prompt=question_type_prompt)

# Fallback index keyed by the category name with surrounding whitespace removed
# and case folded, so small formatting drift in the model's category string
# (e.g. different capitalisation) still finds its question types.
topics_by_normalized_category = {
    str(category).strip().casefold(): question_types
    for category, question_types in amath_topics.items()
}

# Assuming all_data contains the extracted questions and amath_topics is the topic-question type mapping
for question in all_data:
    q_text = question.get("question")
    q_category = question.get("category")

    # Get relevant question types based on the identified category (topic):
    # exact match first, then the normalized fallback.
    relevant_topics = None
    if isinstance(q_category, str):
        relevant_topics = amath_topics.get(q_category)
        if relevant_topics is None:
            relevant_topics = topics_by_normalized_category.get(
                q_category.strip().casefold()
            )

    # Only call the model when there is both a question text and a candidate list.
    if relevant_topics and q_text:
        question_types_list = "\n".join(relevant_topics)

        # Run the LLM chain to get the most appropriate question type
        response = topics_chain.run(question_text=q_text, question_types_list=question_types_list)

        # Assign the question type to the question data (stored as the model's
        # comma-separated answer with surrounding whitespace removed)
        question["question_type"] = response.strip()

    else:
        # If no relevant topics (or no question text), mark the question type as unknown
        question["question_type"] = "Unknown"


## Get JSON of paper meta information

In [43]:
# Define the prompt to extract meta information
# Literal braces in the JSON example are written as {{ and }} so that they
# survive template formatting; only {text_content} is a placeholder.
meta_info_prompt_template = '''
You are provided with text from the first page of a PDF document. Extract the following meta information:
- Subject (Additional Mathematics or Elementary Mathematics)
- School
- Year
- Exam type (Preliminary Examination, Mid Year Examination, Final Year Examination, or Test)
- Paper (1 or 2)

Please output the result as a JSON object with these fields:

```json
[
  {{
    "subject": "...",
    "school": "...",
    "year": "...",
    "exam_type": "...",
    "paper": "..."
  }}
]
```

Here is the text content:

{text_content}
'''

# Extract meta information from the first page
first_page_content = documents[0].page_content

# Prompt and chain dedicated to the paper's meta information; reuses the shared `llm`.
meta_info_prompt = PromptTemplate(
    template=meta_info_prompt_template,
    input_variables=["text_content"],
)
meta_chain = LLMChain(prompt=meta_info_prompt, llm=llm)

# Raw model reply; it is parsed into `meta_info` in the next cell.
meta_response = meta_chain.run(text_content=first_page_content)


In [44]:
# The prompt's example wraps the object in a list, but the model may return
# a bare object instead; accept both shapes and fall back to an empty dict so
# the `.get(...)` calls in the next cell always work.
meta_info_raw = extract_json_from_response(meta_response)
if isinstance(meta_info_raw, list) and meta_info_raw and isinstance(meta_info_raw[0], dict):
    meta_info = meta_info_raw[0]
elif isinstance(meta_info_raw, dict):
    meta_info = meta_info_raw
else:
    meta_info = {}

## Combine into one output JSON

In [45]:
# Combine meta information with extracted questions
output_data = {
    "meta_info": meta_info,
    "questions": all_data
}

# Save output to a JSON file
subject = meta_info.get("subject")
school = meta_info.get("school")
year = meta_info.get("year")
exam_type = meta_info.get("exam_type")
paper = meta_info.get("paper")

# The fields come from model output, so replace path separators and other
# characters that are reserved in file names; names without such characters
# are left exactly as before.
raw_filename = f'{subject}_{school}_{year}_{exam_type}_paper{paper}.json'
output_filename = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', raw_filename)

with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print("Extraction complete. Data saved to ", output_filename)

Extraction complete. Data saved to  Additional Mathematics_CHIJ ST. THERESA’S CONVENT_2023_Preliminary Examination_paper2.json


## Split PDF into pages

In [46]:
import pypdfium2 as pdfium

# Load a document
pdf = pdfium.PdfDocument(pdf_path)

# One image directory per paper, named after the JSON output file.
# NOTE(review): the directory name keeps the ".json" suffix of output_filename —
# confirm this is intended.
paper_imgs_dir = f"exam_papers/{output_filename}/"
os.makedirs(paper_imgs_dir, exist_ok=True)

# Loop over pages and render each one to pg<N>.jpg (N is 1-based)
for page_number in range(1, len(pdf) + 1):
    rendered_page = pdf[page_number - 1].render(scale=4)
    rendered_page.to_pil().save(f'{paper_imgs_dir}pg{page_number}.jpg')

In [47]:
!pip freeze > requirements.txt