# Get JSON of an exam paper

In [None]:
import os
import openai
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.document_loaders import PyPDFLoader
import re
import json

# Set your OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable. os.environ.get
# returns None when the variable is unset, and that value is passed on to
# ChatOpenAI below unchanged.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Load the PDF file
# Path of the exam paper processed by this run (hard-coded).
pdf_path = "exam_papers/bpghs_emath_prelim_paper1.pdf"

# NOTE(review): later cells index `documents` by position and read
# `.page_content`, treating each entry as one PDF page -- confirm against the
# PyPDFLoader documentation.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Single chat model instance shared by every chain built in this notebook.
# NOTE(review): temperature=0 is presumably chosen for repeatable output.
llm = ChatOpenAI(temperature=0, model_name='gpt-4o', api_key=openai.api_key)

In [171]:
# Function to extract JSON data from the response
def extract_json_from_response(response):
    """Parse JSON out of a model response.

    Attempts, in order:
      1. The whole response as JSON (any JSON value is returned as-is).
      2. The body of each Markdown code fence (three backticks, optionally
         tagged ``json``); the first one that parses to a list is returned.
      3. The span from the first ``[`` to the last ``]`` in the response.

    Args:
        response: Text returned by the model. ``bytes`` are decoded as UTF-8;
            any other non-string value is treated as "no JSON found".

    Returns:
        The parsed JSON value, or ``None`` when nothing could be parsed
        (a diagnostic message is printed in that case).
    """
    if isinstance(response, (bytes, bytearray)):
        response = response.decode("utf-8", errors="replace")
    if not isinstance(response, str):
        print("No JSON data found in the response.")
        return None

    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass

    # Prefer fenced code blocks: the greedy bracket search below breaks as
    # soon as any ']' appears in prose after the JSON (e.g. "worth [4] marks").
    fence_pattern = r'```(?:json)?\s*(.*?)```'
    for fenced_body in re.findall(fence_pattern, response, re.DOTALL | re.IGNORECASE):
        try:
            data = json.loads(fenced_body)
        except json.JSONDecodeError:
            continue
        # Fallback paths only ever yield arrays, matching the bracket search.
        if isinstance(data, list):
            return data

    # Last resort: everything between the first '[' and the last ']'.
    match = re.search(r'\[.*\]', response, re.DOTALL)
    if match is None:
        print("No JSON data found in the response.")
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError as e:
        print("Error parsing JSON:", e)
        return None


## Get meta JSON for document

In [172]:
# Define the prompt to extract meta information.
# PromptTemplate uses f-string style formatting by default, so the literal
# braces of the JSON example are written doubled ({{ and }}); they render as
# single braces in the prompt that is sent to the model. The example is an
# array with one object because the response parser looks for a JSON array
# and the caller takes its first element.
meta_info_prompt_template = '''
You are provided with text from the first page of a PDF document. Extract the following meta information:
- Subject ("additional_mathematics" or "elementary_mathematics")
- School
- Level ("o_level" or "a_level")
- Year
- Exam type ("preliminary_exam", "final_exam", "mid_year_exam")
- Paper (1 or 2)

Please output the result as a JSON array containing a single object with these fields:

```json
[
  {{
  "subject": "...",
  "school": "...",
  "level": "...",
  "year": "...",
  "exam_type": "...",
  "paper": "..."
  }}
]
```

Here is the text content:

{text_content}
'''

meta_info_prompt = PromptTemplate(
    input_variables=["text_content"],
    template=meta_info_prompt_template,
)

meta_chain = LLMChain(llm=llm, prompt=meta_info_prompt)

# Extract meta information from the first page
if not documents:
    # Fail with a clear message instead of a bare IndexError on documents[0].
    raise ValueError(f"No pages could be loaded from {pdf_path!r}")
first_page_content = documents[0].page_content
meta_response = meta_chain.run(text_content=first_page_content)

In [173]:
# Normalise the parsed meta response to a plain dict.
# The parser can return a list (the expected single-element array), a dict
# (when the model answers with a bare JSON object), or None on failure.
# Indexing a dict with [0] would raise KeyError, so each shape is handled.
meta_info = extract_json_from_response(meta_response)
if isinstance(meta_info, list) and meta_info and isinstance(meta_info[0], dict):
    meta_info = meta_info[0]
elif not isinstance(meta_info, dict):
    meta_info = {}

## Get JSON for each question

In [174]:
# Regular expression to match marks in square brackets (e.g., [4])
marks_pattern = r'\[(\d+)\]'

# `max_pages_per_chunk` is not assigned anywhere in the visible notebook, so
# this cell used to fail with NameError on a fresh kernel. Keep a value set by
# an earlier cell if there is one, otherwise fall back to a default.
# NOTE(review): the default of 5 pages is an assumption -- confirm.
max_pages_per_chunk = globals().get('max_pages_per_chunk', 5)
if max_pages_per_chunk < 1:
    raise ValueError("max_pages_per_chunk must be at least 1")

chunks = []          # list of (chunk_text, [page numbers]) tuples
current_chunk = []   # page texts accumulated for the chunk being built
current_pages = []   # 1-based page numbers accumulated for that chunk
marks_per_page = {}  # 1-based page number -> list of integer marks on it

total_pages = len(documents)
for page_num, doc in enumerate(documents, start=1):
    page_content = doc.page_content

    # Extract marks from the page using the regex
    marks_per_page[page_num] = [
        int(mark) for mark in re.findall(marks_pattern, page_content)
    ]

    # Each page is prefixed with a banner so the model can report page bounds.
    current_chunk.append(f"\n\n--- Page {page_num} ---\n\n{page_content}")
    current_pages.append(page_num)

    # Flush when the chunk is full or when the last page has been consumed.
    if len(current_chunk) >= max_pages_per_chunk or page_num == total_pages:
        chunks.append((''.join(current_chunk), current_pages.copy()))
        current_chunk = []
        current_pages = []

# Now `marks_per_page` contains all extracted marks per page

In [None]:
# Load the additional-mathematics topic table.
# JSON files are read as UTF-8 explicitly so the result does not depend on
# the platform's default locale encoding.
with open('amath_topics.json', encoding='utf-8') as f:
    amath_topics = json.load(f)
print(amath_topics)

In [None]:
# Load the elementary-mathematics topic table.
# JSON files are read as UTF-8 explicitly so the result does not depend on
# the platform's default locale encoding.
with open('emath_topics.json', encoding='utf-8') as f:
    emath_topics = json.load(f)
print(emath_topics)

In [177]:
# Define the prompt template.
# PromptTemplate uses f-string style formatting by default, so the literal
# braces of the JSON example are written doubled ({{ and }}); they render as
# single braces in the prompt that is sent to the model. Only {categories}
# and {text_content} are substituted.
question_prompt_template = """
    You are provided with text extracted from a PDF exam paper. The text may include multiple math questions along with diagrams. Ignore any solutions or answers provided in the text.
    
    Please identify each question, determine the question_number, which is an integer representing the full question (eg: 1, 2, 3), along with the question_part, a string representing the question with any sub-parts that it belongs to (eg: "1", "2a", "2b", "2ai", "2aii", "3iv", "3v" etc.).
    After reading each question, express it in LaTeX format so that it can be rendered correctly.
    Please also determine its page boundaries (page_start and page_end), and categorize it into one of the following categories:
        {categories}

    Output your result as a list of JSON objects enclosed in a code block like this:

    ```json
    [{{"question": "...", "question_number": ..., "question_part": "...", "page_start": ..., "page_end": ..., "category": "..."}}]
    ```
    Please ensure the output is valid JSON (in particular, write every LaTeX backslash as a double backslash inside JSON strings).

    Here is the text content:
    
    {text_content}
    """
    

In [178]:
# Wrap the question-extraction template; it is filled with the chunk text and
# the list of allowed categories.
prompt = PromptTemplate(
    template=question_prompt_template,
    input_variables=["text_content", "categories"],
)

# Chain that sends the rendered question prompt to the shared chat model.
questions_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
def calculate_score(question):
    """Calculate a score for a question from the marks assigned to it.

    The score is ``1.0 + 1.5 * min(marks / 2, 5)``, rounded to 2 decimals.

    Args:
        question: Mapping describing the question. Its optional ``'marks'``
            entry may be a number or a numeric string; a missing, ``None`` or
            non-numeric value is treated as 1 mark.

    Returns:
        The score as a float rounded to 2 decimal places.
    """
    base_score = 1.0
    default_marks = 1.0

    # Records parsed from model output may carry null or string marks, which
    # would otherwise raise TypeError in the division below.
    try:
        marks = float(question.get('marks', default_marks))
    except (TypeError, ValueError):
        marks = default_marks

    # Normalize marks to a score, capping the marks contribution at 5.
    mark_score = min(marks / 2, 5)

    # Marks are weighted by 1.5 on top of the base score.
    return round(base_score + mark_score * 1.5, 2)


In [None]:
# Accumulates the question records extracted from every chunk.
all_data = []

# Pick the topic table that matches the detected subject; anything other than
# additional mathematics falls back to the elementary-mathematics table.
subject = meta_info.get("subject")
if subject == "additional_mathematics":
    topics = amath_topics
else:
    topics = emath_topics

categories = list(topics.keys())

for idx, (text_content, pages) in enumerate(chunks):
    print(f"Processing chunk {idx+1}/{len(chunks)}, pages {pages[0]}-{pages[-1]}")

    response = questions_chain.run(text_content=text_content, categories=categories)

    data = extract_json_from_response(response)

    # The parser may hand back a single object instead of a list.
    if isinstance(data, dict):
        data = [data]
    if not isinstance(data, list):
        print(f"Failed to extract data for chunk {idx+1}")
        continue

    # Keep only object entries; a stray string or number in the array would
    # otherwise crash on the item assignment below.
    questions = [entry for entry in data if isinstance(entry, dict)]
    for question in questions:
        # Uses the record's 'marks' entry when present, else the scoring
        # function's default.
        question['score'] = calculate_score(question)  # Dynamic scoring function
    all_data.extend(questions)


In [None]:
# Dump every extracted question record for a quick visual check.
print(all_data)

In [None]:
# Show the field names of the first extracted question; guard against an
# empty result so the cell does not fail with IndexError.
if all_data:
    print(all_data[0].keys())
else:
    print("No questions were extracted.")

In [182]:
# Define a prompt template for assigning question types
# The template has two placeholders: {question_text} and {question_types_list}.
question_type_prompt_template = '''
Given the following question and its relevant topics, assign the most appropriate question type from the list.

Question: {question_text}

Relevant Question Types:
{question_types_list}

Please return the most appropriate question type from the list above. If none of the question types are relevant, you can return "Unknown".
Simply return the question type as a string, nothing else.
'''

question_type_prompt = PromptTemplate(
    input_variables=["question_text", "question_types_list"],
    template=question_type_prompt_template,
)

# Chain for question-type classification, built on the shared chat model.
# NOTE(review): nothing in the visible part of this notebook invokes
# `topics_chain` -- confirm whether it is still needed.
topics_chain = LLMChain(llm=llm, prompt=question_type_prompt)

# Assuming `all_data` contains the parsed questions and their page information
# Marks found on each page are handed out in order to the questions that
# start on that page. Iterators over copies are used so `marks_per_page`
# itself is left untouched and the cell can be re-run with the same result.
remaining_marks = {
    page: iter(list(page_marks)) for page, page_marks in marks_per_page.items()
}

for question in all_data:
    # The model may report the page as a string ("3") or omit it entirely;
    # `marks_per_page` is keyed by integer page numbers.
    try:
        page_start = int(question.get('page_start'))
    except (TypeError, ValueError):
        page_start = None

    # Assign the first unused mark from that page, or default to 1
    question['marks'] = next(remaining_marks.get(page_start, iter(())), 1)

    # The score depends on the marks, so refresh it now that marks are known.
    question['score'] = calculate_score(question)



## Combine into one output JSON

In [None]:
# Combine meta information with extracted questions
output_data = {
    "meta_info": meta_info,
    "questions": all_data
}

# Save output to a JSON file
subject = meta_info.get("subject")
school = meta_info.get("school")
year = meta_info.get("year")
exam_type = meta_info.get("exam_type")
paper = meta_info.get("paper")

output_filename = f'{subject}_{school}_{year}_{exam_type}_paper{paper}.json'
# The components come from model output (e.g. a school name), so replace path
# separators and characters that are illegal in file names on common
# platforms; otherwise the file could land in another directory or fail to
# open.
output_filename = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', output_filename)

# Write as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2)

print("Extraction complete. Data saved to ", output_filename)

## Split PDF into pages

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. When enabled it renders every page of `pdf_path` with pypdfium2
# and saves each one as pg<N>.jpg under exam_papers/<output_filename>/.
# NOTE(review): `output_filename` ends in ".json", so the image directory name
# would carry that suffix too -- confirm whether that is intended before
# re-enabling.
'''
import pypdfium2 as pdfium

# Load a document
pdf = pdfium.PdfDocument(pdf_path)

paper_imgs_dir = f"exam_papers/{output_filename}/"
os.makedirs(paper_imgs_dir, exist_ok=True)

# Loop over pages and render
for i in range(len(pdf)):
    page = pdf[i]
    image = page.render(scale=4).to_pil()
    image.save(f'{paper_imgs_dir}pg{i+1}.jpg')
'''