### 0. Prerequisites

In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each
# cell runs, so edits to the project's local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Install ipywidgets into the kernel's environment (presumably for the
# tqdm.notebook progress bar used by the generation loop — confirm).
# NOTE(review): the version is not pinned, so a fresh install may pull a different release.
%pip install ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### 1. Set Up File Information

In [2]:
from pathlib import Path

# Source PDF and the page window passed to the PDF loader.
# NOTE(review): end_page = -1 presumably means "through the last page" — confirm
# against DocumentHelper.load_pdf.
input_file = "data/bedrock-ug.pdf"
start_page, end_page = 0, -1

# Chunk size handed to the document splitter.
chunk_size = 1000

# File name without directory or extension; reused to name the output file.
document_name = Path(input_file).resolve().stem
document_name

'bedrock-ug'

In [None]:
from libs.bedrock_service import BedrockService
from config import Config

# Project configuration object; its aws / bedrock / model sections are read below.
config = Config.load()

# Arguments are positional, so their order must stay exactly as listed here.
bedrock_service = BedrockService(
    config.aws.region,
    config.aws.profile,
    config.bedrock.retries,
    config.bedrock.embed_model_id,
    config.bedrock.model_id,
    config.model.max_tokens,
    config.model.temperature,
    config.model.top_p,
)

us-west-2


### 2. Split Document into Chunks

In [3]:
from helpers.document_helper import DocumentHelper

# Step 1: read the configured page range of the PDF.
pdf_document = DocumentHelper.load_pdf(input_file, start_page, end_page)

# Step 2: split what was loaded into chunks of the configured size.
# NOTE(review): the meaning of the trailing -1 argument is not visible here — confirm
# against DocumentHelper.split.
chunked_document = DocumentHelper.split(pdf_document, chunk_size, -1)

# Only the chunk list of the first entry is used by the rest of the notebook.
chunks = chunked_document[0]['chunks']


100%|██████████| 1671/1671 [01:19<00:00, 20.99it/s]


PDF Load: data/bedrock-ug.pdf


100%|██████████| 1/1 [00:00<00:00, 93.94it/s]


### 3. Build Prompt and Tool Config

In [7]:
# Shared skeleton for both system prompts. Only the task sentence and the preferred
# question length differ between question types; every other character is common.
_SYS_PROMPT_SKELETON = """
        You are an expert at generating practical questions based on given documentation.
        {task}

        Follow these rules:
        1. Generate questions that reflect real user information needs related to the document's subject matter (e.g., technical docs : feature availability, implementation details)
        2. Ensure questions are relevant, concise, preferably under {word_limit} words, and fully answerable with the provided information
        3. Focus on extracting key information that users are likely to seek, while avoiding narrow or less important questions.
        4. When provided with code blocks, focus on understanding the overall functionality rather than the specific syntax or variables. Feel free to request examples of how to use key APIs or features.
        5. Do not use phrases like 'based on the provided context' or 'according to the context'.
    """

# question type -> (task sentence, preferred maximum question length in words)
_SYS_PROMPT_VARIANTS = {
    "complex": ("Your task is to generate complex, reasoning questions and answers.", 25),
    "simple": ("Your task is to create simple, directly answerable questions from the given context.", 10),
}

# System prompt text keyed by question type ("complex" first, then "simple").
sys_template = {
    kind: _SYS_PROMPT_SKELETON.format(task=task, word_limit=word_limit)
    for kind, (task, word_limit) in _SYS_PROMPT_VARIANTS.items()
}

In [6]:
# JSON schema of the tool input: both fields are strings and both are mandatory.
_qa_input_schema = {
    "type": "object",
    "properties": {
        "question": {
            "type": "string",
            "description": "The generated question",
        },
        "answer": {
            "type": "string",
            "description": "The answer to the generated question",
        },
    },
    "required": ["question", "answer"],
}

# Specification of the single tool offered to the model.
_qa_tool_spec = {
    "name": "QuestionAnswerGenerator",
    "description": "Generates questions and answers based on the given context.",
    "inputSchema": {"json": _qa_input_schema},
}

# Tool configuration passed along with each generation request.
tool_config = {"tools": [{"toolSpec": _qa_tool_spec}]}

### 4. Generate Question-Answer Pairs

In [8]:
# Target number of pairs per question type: the generation loop runs
# num_pairs * 2 iterations, alternating between complex and simple questions.
num_pairs = 5

# JSON Lines file that the generated question/answer items are appended to.
output_file = f"output/{document_name}_sample_questions.jsonl"
output_file

In [20]:
import random
import json
import uuid
from pathlib import Path
from tqdm.notebook import tqdm

# Generates question/answer pairs from randomly chosen runs of consecutive chunks.
# Every generated item is appended to `output_file` as one JSON line and is also
# collected in `dataset`, which is displayed at the end of the cell.

total_chunks = len(chunks)
if total_chunks == 0:
    # random.randint would raise an opaque "empty range" ValueError here anyway.
    raise ValueError("No chunks available: the document produced nothing to sample from.")

# A context is a run of three consecutive chunks. Documents with fewer than three
# chunks use every chunk instead of failing inside random.randint.
context_window = min(3, total_chunks)

# open(..., 'a') creates the file but not its parent directory.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

dataset = []

generated_question = {"simple": [], "complex": []}

# Sampling settings are identical for every request, so they are set once.
temperature = 0.0
top_p = 0.5

for i in tqdm(range(num_pairs * 2)):
    # Pick a random start so that the whole window stays inside the chunk list.
    start_id = random.randint(0, total_chunks - context_window)
    context = " ".join(
        chunks[start_id + offset]['content'] for offset in range(context_window)
    )

    # Even iterations ask for a complex question, odd iterations for a simple one.
    question_type = "complex" if i % 2 == 0 else "simple"

    user_template = f"""
    Generate a {question_type} question and its answer based on the following context:

    Context: {context}

    Use the QuestionAnswerGenerator tool to provide the output.
    """

    user_prompt = [{"role": "user", "content": [{"text": user_template}]}]

    response = bedrock_service.converse_with_tools(
        messages=user_prompt,
        system_prompt=sys_template[question_type],
        tools=tool_config,
        temperature=temperature,
        top_p=top_p,
        max_tokens=4096
    )

    stop_reason = response['stopReason']

    # Only responses that stopped for a tool call are processed; anything else is
    # skipped, so fewer than num_pairs * 2 items may end up in the dataset.
    if stop_reason != 'tool_use':
        continue

    for tool_request in response['output']['message']['content']:
        # Content entries without a tool call are ignored.
        if 'toolUse' not in tool_request:
            continue
        if tool_request['toolUse']['name'] != 'QuestionAnswerGenerator':
            continue

        tool_input = tool_request['toolUse']['input']

        qa_item = {
            "question_id": f"{uuid.uuid4()}",
            "question": tool_input['question'],
            "ground_truth": tool_input['answer'],
            "question_type": question_type,
            "context": context
        }

        # Append item by item so results already generated survive a later failure.
        with open(output_file, 'a', encoding='utf-8') as f:
            json.dump(qa_item, f)
            f.write('\n')

        dataset.append(qa_item)

dataset


  0%|          | 0/10 [00:00<?, ?it/s]

[{'question_id': '154444d4-a694-4dea-bb47-fac2b39ba251',
  'question': "How do the S3 storage and S3 retrieval nodes in Amazon Bedrock's prompt flow differ in their functionality, inputs, and outputs, and what potential use case might combine these two node types?",
  'ground_truth': "The S3 storage and S3 retrieval nodes in Amazon Bedrock's prompt flow have distinct but complementary functions:\n\n1. S3 storage node:\n   - Function: Stores data in an Amazon S3 location\n   - Inputs: Content to store and the object key\n   - Output: URI of the S3 location\n   - Configuration: Specifies the S3 bucket for data storage\n\n2. S3 retrieval node:\n   - Function: Retrieves data from an Amazon S3 location\n   - Input: Object key\n   - Output: Content from the S3 location (currently limited to UTF-8 encoded strings)\n   - Configuration: Specifies the S3 bucket for data retrieval\n\nA potential use case combining these nodes could be a multi-step data processing workflow:\n1. Use the S3 storage 