# Question Generator for Contextual RAG

This notebook generates realistic sample questions and answers based on your document content, which can be used to evaluate your RAG system.

### 0. Prerequisites

In [None]:
%load_ext autoreload
%autoreload 2

# Install required packages
%pip install ipywidgets python-dotenv tqdm

# Import basic dependencies
import os
import sys
import json
import uuid
import random
from pathlib import Path
from tqdm.notebook import tqdm

# Create output directory
os.makedirs("output", exist_ok=True)

# Load environment variables from .env file.
# load_dotenv() returns False when the file is missing or defines no variables,
# so success is only reported when something was actually loaded.
try:
    from dotenv import load_dotenv
except ImportError:
    print("python-dotenv not installed, skipping .env loading")
else:
    if load_dotenv('.env'):
        print("Environment variables loaded from .env file")
    else:
        print("No environment variables loaded: .env file not found or empty")

### 1. Setup File Information

In [None]:
# Input document and chunking parameters
input_file = "data/bedrock-ug.pdf"
chunk_size = 1000
# Page window to process; an end_page of -1 means process all pages
start_page, end_page = 0, -1

# The document name is the resolved file name without its final extension
resolved_input_path = Path(input_file).resolve()
document_name = resolved_input_path.stem
print("Document name:", document_name)

In [None]:
try:
    # Import required services and configuration.
    # If the project modules are not importable from the current directory,
    # add the parent directory to the search path once and retry, so the cell
    # succeeds on a fresh "Restart & Run All" without a manual re-run.
    try:
        from libs.bedrock_service import BedrockService
        from config import Config
    except ImportError:
        if '..' in sys.path:
            # Parent directory was already searched; retrying cannot help.
            raise
        sys.path.append('..')
        print("Added parent directory to Python path, retrying import...")
        from libs.bedrock_service import BedrockService
        from config import Config

    # Load configuration
    config = Config.load()

    # Environment variables take precedence over the values from the loaded config
    config.aws.region = os.environ.get("AWS_DEFAULT_REGION", config.aws.region)
    config.aws.profile = os.environ.get("AWS_PROFILE", config.aws.profile)
    config.bedrock.model_id = os.environ.get("BEDROCK_MODEL_ID", config.bedrock.model_id)
    config.bedrock.embed_model_id = os.environ.get("EMBED_MODEL_ID", config.bedrock.embed_model_id)

    # Initialize Bedrock service (arguments are positional; keep this order)
    bedrock_service = BedrockService(
        config.aws.region,
        config.aws.profile,
        config.bedrock.retries,
        config.bedrock.embed_model_id,
        config.bedrock.model_id,
        config.model.max_tokens,
        config.model.temperature,
        config.model.top_p
    )

    print("✅ Bedrock service initialized successfully")
    print(f"Model ID: {config.bedrock.model_id}")

except ImportError as e:
    # Reached only when the import still fails after the parent-directory retry
    print(f"❌ Error importing required modules: {str(e)}")
    print("Make sure all dependencies are installed and the paths are correct")
    raise
except Exception as e:
    print(f"❌ Error initializing Bedrock service: {str(e)}")
    raise

### 2. Split Document into Chunks

In [None]:
# Import DocumentParser from the local library. If it is not importable from
# the current directory, add the parent directory to the search path once and retry.
try:
    from libs.document_parser import DocumentParser
except ImportError:
    print("Error importing DocumentParser. Make sure the libs directory is available.")
    if '..' in sys.path:
        # Parent directory was already searched; retrying cannot help.
        raise
    print("Adding the parent directory to Python path and retrying...")
    sys.path.append('..')
    from libs.document_parser import DocumentParser

try:
    print(f"Loading PDF from {input_file}...")
    print(f"Pages: {start_page} to {'end' if end_page == -1 else end_page}")

    # Load and split document
    full_text = DocumentParser.load_pdf(input_file, start_page, end_page)
    chunked_document = DocumentParser.split(full_text, chunk_size, -1)

    # Fail with a clear message instead of a bare IndexError when nothing was produced
    if len(chunked_document) == 0:
        raise ValueError(f"No chunks were produced from {input_file}")
    chunks = chunked_document[0]['chunks']

    print(f"✅ Document loaded and split into {len(chunks)} chunks")

except Exception as e:
    print(f"❌ Error loading or chunking document: {str(e)}")
    raise

### 3. Build Prompt and Tool Config

Define the system prompts for the two question types (simple and complex) and the tool configuration that makes the model return each question–answer pair as structured output.

In [None]:
# System prompt used when asking for complex, reasoning-style questions
complex_system_prompt = """
        You are an expert at generating practical questions based on given documentation.
        Your task is to generate complex, reasoning questions and answers.

        Follow these rules:
        1. Generate questions that reflect real user information needs related to the document's subject matter (e.g., technical docs : feature availability, implementation details)
        2. Ensure questions are relevant, concise, preferably under 25 words, and fully answerable with the provided information
        3. Focus on extracting key information that users are likely to seek, while avoiding narrow or less important questions.
        4. When provided with code blocks, focus on understanding the overall functionality rather than the specific syntax or variables. Feel free to request examples of how to use key APIs or features.
        5. Do not use phrases like 'based on the provided context' or 'according to the context'.
    """

# System prompt used when asking for simple, directly answerable questions
simple_system_prompt = """
        You are an expert at generating practical questions based on given documentation.
        Your task is to create simple, directly answerable questions from the given context.

        Follow these rules:
        1. Generate questions that reflect real user information needs related to the document's subject matter (e.g., technical docs : feature availability, implementation details)
        2. Ensure questions are relevant, concise, preferably under 10 words, and fully answerable with the provided information
        3. Focus on extracting key information that users are likely to seek, while avoiding narrow or less important questions.
        4. When provided with code blocks, focus on understanding the overall functionality rather than the specific syntax or variables. Feel free to request examples of how to use key APIs or features.
        5. Do not use phrases like 'based on the provided context' or 'according to the context'.
    """

# Lookup table: question type -> system prompt
sys_template = dict(complex=complex_system_prompt, simple=simple_system_prompt)

print("System prompts defined for simple and complex questions")

In [None]:
# Tool configuration for question-answer generation


def _string_property(description):
    """Return a JSON-schema fragment for a string field with the given description."""
    return {"type": "string", "description": description}


# Input schema of the tool: an object with two required string fields
qa_input_schema = {
    "type": "object",
    "properties": {
        "question": _string_property("The generated question"),
        "answer": _string_property("The answer to the generated question"),
    },
    "required": ["question", "answer"],
}

# Specification of the single tool offered to the model
qa_tool_spec = {
    "name": "QuestionAnswerGenerator",
    "description": "Generates questions and answers based on the given context.",
    "inputSchema": {"json": qa_input_schema},
}

tool_config = {"tools": [{"toolSpec": qa_tool_spec}]}

print("Tool configuration defined for QuestionAnswerGenerator")

### 4. Generate Questions

In [None]:
# Number of question-answer pairs per question type:
# 5 simple + 5 complex questions, 10 in total
num_pairs = 5

# Where the generated questions are written (one JSON object per line)
output_file = "output/{}_sample_questions.jsonl".format(document_name)
print("Questions will be saved to: " + output_file)

# Remove the output of a previous run, if there is one
stale_output_present = os.path.exists(output_file)
if stale_output_present:
    os.remove(output_file)
    print("Removed existing file: " + output_file)

In [None]:
# Generate question-answer pairs from random windows of three consecutive chunks.
#
# Each iteration picks a random start chunk, joins three consecutive chunks into
# one context, asks the model (through the QuestionAnswerGenerator tool) for a
# complex or simple question, and appends the result to both `dataset` and the
# JSONL output file. A failed iteration is reported and skipped, so fewer than
# num_pairs * 2 questions may be produced.

# Initialize dataset to store generated questions
total_chunks = len(chunks)
dataset = []

# Track questions by type
generated_question = {"simple": [], "complex": []}

# Generation parameters
temperature = 0.0
top_p = 0.5

# Validate that we have enough chunks
if total_chunks < 3:
    raise ValueError(f"Not enough chunks to generate questions. Found {total_chunks}, need at least 3.")

# `dataset` is reset above, so reset the output file as well: otherwise re-running
# this cell on its own would append duplicates and the file would no longer match
# the in-memory dataset reported at the end.
with open(output_file, 'w', encoding='utf-8'):
    pass

print(f"Generating {num_pairs*2} questions ({num_pairs} simple + {num_pairs} complex)...")

# Generate questions
for i in tqdm(range(num_pairs * 2)):
    try:
        # Select random starting chunk position (leaves room for two following chunks)
        start_id = random.randint(0, total_chunks - 3)

        # Combine three consecutive chunks into one context
        context = " ".join(chunks[start_id + offset]['content'] for offset in range(3))

        # Alternate between complex (even iterations) and simple (odd iterations) questions
        question_type = "complex" if i % 2 == 0 else "simple"

        # Create user prompt
        user_template = f"""
        Generate a {question_type} question and its answer based on the following context:

        Context: {context}

        Use the QuestionAnswerGenerator tool to provide the output.
        """

        # Prepare prompt and inference config
        sys_prompt = sys_template[question_type]
        user_prompt = [{"role": "user", "content": [{"text": user_template}]}]

        # Call Bedrock with tool configuration
        response = bedrock_service.converse_with_tools(
            messages=user_prompt,
            system_prompt=sys_prompt,
            tools=tool_config,
            temperature=temperature,
            top_p=top_p,
            max_tokens=4096
        )

        stop_reason = response['stopReason']

        # Only a tool call carries the structured question/answer output
        if stop_reason != 'tool_use':
            print(f"⚠️ Warning: Question generation stopped with reason '{stop_reason}' instead of 'tool_use'")
            continue

        # Process the tool response
        tool_requests = response['output']['message']['content']

        for tool_request in [x for x in tool_requests if 'toolUse' in x]:
            if tool_request['toolUse']['name'] != 'QuestionAnswerGenerator':
                continue

            # Extract question and answer
            tool_input = tool_request['toolUse']['input']
            question = tool_input['question']
            answer = tool_input['answer']

            # Create QA item
            qa_item = {
                "question": question,
                "ground_truth": answer,
                "question_type": question_type,
                "context": context
            }

            # Save to JSONL file immediately so finished items survive a later failure
            with open(output_file, 'a', encoding='utf-8') as f:
                json.dump(qa_item, f)
                f.write('\n')

            # Add to dataset
            dataset.append(qa_item)
            generated_question[question_type].append(question)

            print(f"Question {i+1}/{num_pairs*2} ({question_type}): {question[:50]}...")

    except Exception as e:
        # Best effort: report the failure and continue with the next question
        print(f"❌ Error generating question {i+1}: {str(e)}")

print(f"\n✅ Generated {len(dataset)} questions ({len(generated_question['simple'])} simple + {len(generated_question['complex'])} complex)")
print(f"Questions saved to {output_file}")

### 5. Display Sample Questions

In [None]:
# Print up to three generated questions per type, simple ones first
for section_label, section_key in (("Simple", "simple"), ("Complex", "complex")):
    print(f"\n=== Sample {section_label} Questions ===")
    preview = generated_question[section_key][:3]
    for position, sample_question in enumerate(preview, start=1):
        print(f"{position}. {sample_question}")

print(f"\n✅ Complete dataset saved to {output_file}")