# [Experiment] Impact of the Number of Retrieved Documents on the Generated Response

## 1. Setup

In [1]:
###################################
##### Setup Working Directory #####
###################################
import os


def resolve_project_root(path):
    """Return the directory the notebook should run from.

    If ``path`` is a folder named ``experiments``, its parent directory is
    returned; otherwise ``path`` is returned unchanged.

    ``os.path`` is used instead of splitting on a literal backslash so the
    check works with whatever separator the current platform uses.
    """
    if os.path.basename(path) == "experiments":
        return os.path.dirname(path)
    return path


# Change directory to root to import app directory functions
cwd = resolve_project_root(os.getcwd())

os.chdir(cwd)
print("Current Working Directory:")
print(os.getcwd())

Current Working Directory:
c:\Users\shuti\OneDrive\Documents\Term 7 Modules\50.045 Information Retrieval\Project\eduRAG


In [2]:
import os
from pymongo import MongoClient

def redact_mongodb_uri(uri):
    """Return ``uri`` with any ``user:password@`` section replaced by ``***:***@``.

    Notebook outputs are saved with the file, so the raw connection string
    must never be printed. Values without a ``scheme://`` prefix or without
    credentials in the authority part (including ``None``) are returned
    unchanged.
    """
    if not uri:
        return uri
    scheme, scheme_sep, remainder = uri.partition("://")
    # Only look for credentials before the first "/" so an "@" inside the
    # path or query string is not mistaken for the userinfo delimiter.
    authority, path_sep, tail = remainder.partition("/")
    if not scheme_sep or "@" not in authority:
        return uri
    host = authority.rpartition("@")[2]
    return f"{scheme}{scheme_sep}***:***@{host}{path_sep}{tail}"


# connect to MongoDB
MONGODB_URI = os.environ.get("MONGODB_URI")
# Log the redacted form only; the full URI carries the database password.
print(f"Connecting to MongoDB at {redact_mongodb_uri(MONGODB_URI)}")
mongo_client = MongoClient(MONGODB_URI)
db = mongo_client["exam_db"]
question_collection = db["question"]

Connecting to MongoDB at mongodb+srv://***:***@cluster0.x0nvy.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0


In [3]:
import os
import json
from typing import Optional, Any
from dotenv import load_dotenv
from fastapi import HTTPException, Body
from app.utils.format_utils import (
    normalise_query,
    format_first_question_xml,
)
from app.utils.openai_utils import get_embedding
from app.db.vector_search import vector_search
from app.utils.openai_utils import (
    get_generated_questions_and_answers,
)
from app.models import Message
from app.utils.image_utils import extract_question_metadata, find_and_crop_image
from app import constants
from ulid import ULID

def query(
    user_query: list[Message],
    exp_number_of_docs_to_retrieve: int,  # Experimental Feature
    subject: Optional[str] = Body(default="elementary_mathematics"),
    level: Optional[str] = Body(default=None),
    exam_type: Optional[str] = Body(default=None),
):
    """Retrieve similar questions and generate new ones from them.

    Args:
        user_query: Conversation messages; the content of the last message is
            used as the vector-search text after ``normalise_query``.
        exp_number_of_docs_to_retrieve: Experimental knob forwarded to
            ``vector_search`` as ``returnLimit``.
        subject: Passed to ``vector_search`` as the first filter value.
        level: Passed to ``vector_search`` as the second filter value.
        exam_type: Passed to ``vector_search`` as the third filter value.

    Returns:
        A dict with ``"response"`` (a one-element list holding the generated
        response plus a ``"ground_truth"`` entry with the aggregated metadata)
        and ``"first_question"`` (the concatenated XML of all retrieved
        questions).

    Raises:
        HTTPException: With status 404 when the vector search returns nothing.

    Side effects:
        Calls ``find_and_crop_image`` for every retrieved question and writes
        the response to a new ``<ULID>.json`` file in ``constants.OUTPUT_DIR``.
    """
    # ``Body(...)`` defaults are only resolved by FastAPI during request
    # handling. This function is called directly in the notebook, so an
    # omitted argument arrives as the raw marker object; unwrap it to the
    # default it declares. Plain ``str``/``None`` values have no ``default``
    # attribute and pass through untouched.
    subject, level, exam_type = (
        getattr(param, "default", param) for param in (subject, level, exam_type)
    )

    # Normalize the query
    user_query = normalise_query(user_query)

    # Perform vector search to find similar questions
    results = vector_search(
        user_query[-1].content,
        question_collection,
        [subject, level, exam_type],
        returnLimit=exp_number_of_docs_to_retrieve,  # Experimental Feature
    )
    if not results:
        raise HTTPException(
            status_code=404,
            detail="No similar questions found. Please try again with a different question.",
        )

    # Initialize variables to store aggregated metadata and output
    aggregated_metadata = {"topics": set(), "sub_topics": set(), "links": set()}
    question_xml_parts = []
    output_jsons = []

    # Process each result to aggregate metadata and prepare XML
    for result in results:
        (
            question_paper_filepath,
            question_body,
            image_filename,
            page_start,
            page_end,
        ) = extract_question_metadata(result)

        # Crop the question image
        find_and_crop_image(
            pdf_url=question_paper_filepath,
            search_text=question_body,
            question_filename=image_filename,
            page_start=page_start,
            page_end=page_end,
        )
        # NOTE(review): this is overwritten on every iteration, so only the
        # LAST retrieved question's image reaches the generator below --
        # confirm that is intended when more than one document is retrieved.
        image_filepath = f"{constants.TEMP_DIR}/{image_filename}.png"

        # Aggregate metadata
        aggregated_metadata["topics"].add(result["topic"])
        aggregated_metadata["sub_topics"].add(result["sub_topic"])
        aggregated_metadata["links"].add(result.get("question_paper_filepath", ""))

        # Generate XML for this question
        question_xml_parts.append(format_first_question_xml([result]) + "\n")

    questions_xml = "".join(question_xml_parts)

    # Convert aggregated metadata sets to lists (sets are not JSON serialisable)
    aggregated_metadata = {key: list(value) for key, value in aggregated_metadata.items()}

    # Generate questions based on aggregated metadata and all ground-truth documents
    response = get_generated_questions_and_answers(
        question_details=questions_xml,
        image_filepath=image_filepath,
        aggregated_metadata=aggregated_metadata,  # Pass aggregated context
    )

    # Save the generated questions and answers to a JSON file
    os.makedirs(constants.OUTPUT_DIR, exist_ok=True)
    json_filepath = f"{constants.OUTPUT_DIR}/{str(ULID())}.json"
    response_dict = response.model_dump()
    response_dict["ground_truth"] = aggregated_metadata
    output_jsons.append(response_dict)

    # Explicit encoding keeps the file identical regardless of the OS locale.
    with open(json_filepath, "w", encoding="utf-8") as f:
        json.dump(output_jsons, f, indent=4)

    return {"response": output_jsons, "first_question": questions_xml}

## 2. Experiments

In [4]:
# Values tried for the number of documents passed to `query` in each run.
EXPERIMENT_SEARCH_SPACE = {
    "exp_number_of_docs_to_retrieve": [1, 2, 3, 5, 10, 24]
}
# Backward-compatible alias: the original (misspelled) name is still used by
# other cells in this notebook.
EXPERIMENT_SEARCH_SAPCE = EXPERIMENT_SEARCH_SPACE

# Fixed request used for every run so that only the retrieval count varies.
MOCK_DATA = {
    "user_query": [
        Message(
            role="user",
            content="Give me questions related to binomial theorem."
        )
    ],
    "subject": "additional_mathematics"
}

In [5]:
# Run the same mock query once per retrieval count and keep every response,
# keyed as "<n>_retrieved_docs".
results = {}
doc_counts = EXPERIMENT_SEARCH_SAPCE["exp_number_of_docs_to_retrieve"]
for num_docs in doc_counts:
    print(f"----- {num_docs} docs retrieved -----")

    # Only the retrieval count changes between runs.
    response = query(
        exp_number_of_docs_to_retrieve=num_docs,
        user_query=MOCK_DATA["user_query"],
        subject=MOCK_DATA["subject"],
    )

    # Print Results
    print(response)
    results[f"{num_docs}_retrieved_docs"] = response


----- 1 docs retrieved -----
Found match on page 21. Saved to temp/fairfield_methodist_school_secondary_elementary_mathematics_preliminary_exam_2024_1_24i.png
{'response': [{'questions': [{'question_text': 'A jar contains 8 red marbles, 15 blue marbles, and 7 white marbles. A marble is randomly selected from the jar and then put back. Find the probability of not selecting a red marble, and express your answer in simplest form.', 'topic': 'Probability', 'sub_topic': 'Finding the probability of single events', 'steps': ['Calculate the total number of marbles: 8 (red) + 15 (blue) + 7 (white) = 30 marbles.', 'The number of marbles that are not red: 15 (blue) + 7 (white) = 22 marbles.', 'Calculate the probability of not selecting a red marble: 22/30.', 'Simplify the fraction: 22/30 = 11/15.'], 'answer': '11/15'}, {'question_text': 'A box contains 12 black pens, 9 blue pens, and 5 green pens. A pen is selected at random and replaced. What is the probability of not choosing a green pen? Provi

In [6]:
# Dump the whole results mapping (one "<n>_retrieved_docs" entry per run) for a quick look.
print(results)

{'1_retrieved_docs': {'response': [{'questions': [{'question_text': 'A jar contains 8 red marbles, 15 blue marbles, and 7 white marbles. A marble is randomly selected from the jar and then put back. Find the probability of not selecting a red marble, and express your answer in simplest form.', 'topic': 'Probability', 'sub_topic': 'Finding the probability of single events', 'steps': ['Calculate the total number of marbles: 8 (red) + 15 (blue) + 7 (white) = 30 marbles.', 'The number of marbles that are not red: 15 (blue) + 7 (white) = 22 marbles.', 'Calculate the probability of not selecting a red marble: 22/30.', 'Simplify the fraction: 22/30 = 11/15.'], 'answer': '11/15'}, {'question_text': 'A box contains 12 black pens, 9 blue pens, and 5 green pens. A pen is selected at random and replaced. What is the probability of not choosing a green pen? Provide your answer in its simplest form.', 'topic': 'Probability', 'sub_topic': 'Finding the probability of single events', 'steps': ['Calcula

### Export Results

In [7]:
import json

# Persist every run's response as pretty-printed UTF-8 JSON so it can be
# inspected outside the notebook; non-ASCII characters are written as-is.
with open(
    'experiments/exp1_output.json',
    mode='w',
    encoding='utf-8',
) as output_file:
    json.dump(
        results,
        output_file,
        indent=4,
        ensure_ascii=False,
    )


## 3. Analysis

Goal: Ensure that the generated questions are diverse yet still sourced from the retrieved documents, and that they are answerable and conform to the SEAB syllabus.

By inspecting `experiments/exp1_output.json`, we can derive the following insights:

- `2` is the best number of retrieved documents:
  - It generated the largest number of questions, with the most variety (the most inter-topic and intra-topic differences).
  - The generated questions are answerable.
  - The generated questions conform to the SEAB syllabus.
