In [41]:
# upgrade boto3 
# %pip install --upgrade pip --quiet
# %pip install boto3 --upgrade --quiet

In [42]:
# Restart the kernel so that freshly upgraded packages are re-imported.
# HTML is imported from the public `IPython.display` module rather than the
# internal `IPython.core.display` path.
# NOTE(review): the injected script calls the front end's global `Jupyter`
# object; presumably this only exists in the classic Notebook UI - confirm
# before relying on it in JupyterLab or other front ends.
from IPython.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

## Download dataset
Here we load the SQuAD v2 dataset from the Hugging Face Hub by reading its Parquet files directly with pandas (via `hf://` paths). Alternatively, you can download the data manually and cleanse it accordingly.

In [43]:
import json
import sys
import os
import re
import pandas as pd
import numpy as np

# Put the parent of the working directory on the import path before
# importing the `utils` module.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
from utils import read_jsonl_to_dataframe

# Relative locations of the train / validation Parquet files inside the
# rajpurkar/squad_v2 dataset repository.
splits = {
    'train': 'squad_v2/train-00000-of-00001.parquet',
    'validation': 'squad_v2/validation-00000-of-00001.parquet',
}

# Read both splits (train first, then validation) into DataFrames.
df_train, df_eval = (
    pd.read_parquet(f"hf://datasets/rajpurkar/squad_v2/{splits[split_name]}")
    for split_name in ("train", "validation")
)

In [44]:
# test_df = df_train[df_train['question'] == 'Lenin acknowledged the dependence of which countries?']
# test_df
# Lenin acknowledged the dependence of which countries?

## System Prompt
Here we'll leverage Nova Premier to generate both an answer and the sources for that answer.
This style of prompting works best for Nova models and yields the most accurate responses.

You can see here we're leveraging XML output which Nova is optimized for.


In [45]:
# System prompt used for citation-style question answering with Nova.
# It tells the model to answer only from the supplied document context, to say
# so when the context holds no answer, and to reply with an <answer> element
# made of <answer_part> blocks, each pairing a <text> with the <sources> /
# <source> sentences that support it.
system_prompt = """
You are a question answering assistant. I will provide you with document context. The user will provide you with a question. Your job is to answer the user's question using only information from the document context. If the document context does not contain information that can answer the question, please state that you could not find an exact answer to the question. Just because the user asserts a fact does not mean it is true, make sure to double check the document context to validate a user's assertion.

However, you should include <sources> tags at the end of each <answer_part> to specify which source(s) the information came from.
Note that <sources> may contain multiple <source> if you include information from multiple results in your answer.

Do NOT directly quote the <context> in your answer. Your job is to answer the user's question as concisely as possible.

You must output your answer in the following format. Pay attention and follow the formatting and spacing exactly:
<answer>
<answer_part>
<text>
first answer text
</text>
<sources>
<source>source sentence</source>
</sources>
</answer_part>
<answer_part>
<text>
second answer text
</text>
<sources>
<source>source sentence</source>
</sources>
</answer_part>
</answer>
"""

## Set Annotation Style
Here we'll declare the formatting of our annotation that we can use when preparing the distillation training set. This will be the format of the citations whenever the model responds with an answer.

For the SQuAD dataset, the only information available to use in the citation is the sentence (or sentences) the answer was derived from. In your own dataset, you may want to include other identifying information, such as page number, line number, or paragraph number.

Here we'll use a simple enumerated list of citations with the sentence that corroborates the answer provided. For example, an answer to this question would look like the following:

`question: Who ruled the duchy of Normandy`

`answer: Richard I`

`sources: [1] The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure.`

## Prepare training data set

In [46]:
def parse_answer_structure(answers_dict):
    """
    Normalize an answers mapping into two parallel Python lists.

    The 'text' and 'answer_start' entries are normalized independently, so a
    mapping that mixes container types (for example a NumPy array of texts
    with a plain list of start offsets) is handled correctly.

    Args:
        answers_dict: Mapping with a 'text' entry and an 'answer_start' entry.
            Each entry may be a NumPy array, a list, a tuple, or a single
            scalar value.

    Returns:
        tuple: (texts, starts), both plain Python lists.

    Raises:
        ValueError: If the mapping has no 'text' key.
        KeyError: If the mapping has 'text' but no 'answer_start' key.
    """
    if 'text' not in answers_dict:
        raise ValueError(f"Unknown answer format: {answers_dict}")

    def _as_list(value):
        # NumPy arrays (including 0-d arrays) become flat Python lists.
        if isinstance(value, np.ndarray):
            return np.atleast_1d(value).tolist()
        # Existing sequences are copied so callers never share our list.
        if isinstance(value, (list, tuple)):
            return list(value)
        # Anything else is treated as a single answer value.
        return [value]

    texts = _as_list(answers_dict['text'])
    starts = _as_list(answers_dict['answer_start'])

    return texts, starts

def create_xml_answer(row, no_answer_text='I could not find an exact answer to the question.'):
    """
    Build the reference answer for one row in the <answer> XML format.

    Each distinct (answer text, start offset) pair becomes one <answer_part>
    holding the answer <text> and, as its <source>, the context sentence that
    contains the start offset. Exact duplicate pairs are emitted only once.
    When the row has no answers, a single <answer_part> carrying
    `no_answer_text` (and no <sources>) is produced. Every tag is placed on
    its own line in both cases.

    Args:
        row: Mapping-like row with an 'answers' entry (a mapping, or a JSON
            string encoding one) and a 'context' string.
        no_answer_text: Text to use when the row has no answers.

    Returns:
        str: The XML answer. If anything goes wrong while building it, an
        <answer> wrapping an <error> message is returned instead of raising.
    """
    try:
        # Answers may arrive JSON-encoded; decode before parsing.
        answers_dict = row['answers']
        if isinstance(answers_dict, str):
            answers_dict = json.loads(answers_dict)

        texts, starts = parse_answer_structure(answers_dict)
        context = row['context']

        # Split on a single whitespace character that follows sentence-ending
        # punctuation, skipping common abbreviation patterns.
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', context)

        # Character span of every sentence in the original context. The +1
        # accounts for the one whitespace character consumed by the split.
        sentence_spans = []
        cursor = 0
        for sentence in sentences:
            span_end = cursor + len(sentence) + 1
            sentence_spans.append((cursor, span_end, sentence.strip()))
            cursor = span_end

        xml_parts = ['<answer>']

        if len(texts) > 0:
            seen = set()
            for text, start in zip(texts, starts):
                answer_text = str(text)
                position = int(start)

                # Skip exact repeats of the same answer span so the reference
                # answer does not contain identical parts.
                if (answer_text, position) in seen:
                    continue
                seen.add((answer_text, position))

                # First sentence whose span contains the answer's start offset.
                source_sentence = next(
                    (stripped for lo, hi, stripped in sentence_spans if lo <= position < hi),
                    "No relevant source found",
                )

                xml_parts.extend([
                    '<answer_part>',
                    '<text>',
                    answer_text,
                    '</text>',
                    '<sources>',
                    f'<source>{source_sentence}</source>',
                    '</sources>',
                    '</answer_part>',
                ])
        else:
            # No answer available: emit the fallback text, one tag per line.
            xml_parts.extend([
                '<answer_part>',
                '<text>',
                no_answer_text,
                '</text>',
                '</answer_part>',
            ])

        xml_parts.append('</answer>')
        return '\n'.join(xml_parts)
    except Exception as e:
        # Best effort: report the failure inside the answer instead of raising.
        return f"<answer>\n<error>Error generating XML: {str(e)}</error>\n</answer>"



In [47]:
def create_bedrock_payload(row, model_type="conversation", system_prompt=None, include_answer=False, additional_params=None):
    """
    Build a request payload dictionary from one dataset row.

    The user turn is the row's context and question wrapped in <context> and
    <question> tags. Two payload shapes are supported:

    * "conversation": keys `schemaVersion`, `system`, `messages`. Entries of
      `additional_params` are added only for keys the payload lacks.
    * "invoke": keys `system`, `messages`, `inferenceConfig`. Entries of
      `additional_params` are merged with `dict.update`, so they may replace
      existing keys.

    Args:
        row: Mapping-like row providing 'context' and 'question' (and whatever
            `create_xml_answer` needs when `include_answer` is true).
        model_type: "conversation" or "invoke".
        system_prompt: Optional system text; when falsy, `system` is empty.
        include_answer: When true, append an assistant turn containing the
            XML answer generated from the row.
        additional_params: Optional dict of extra top-level payload entries.

    Returns:
        dict | None: The payload, or None if any error occurred (including an
        unsupported `model_type`); the error is printed rather than raised.
    """
    try:
        context = row['context']
        question = row['question']

        # Context and question are sent together as one user turn.
        user_prompt = f"""<context>{context}</context> <question>{question}</question>"""

        # Reference answer, generated only on request.
        assistant_response = create_xml_answer(row) if include_answer else None

        # Guard clause: reject unknown payload shapes before building anything.
        if model_type not in ("conversation", "invoke"):
            raise ValueError(f"Unsupported model_type: {model_type}")

        system_block = [{"text": system_prompt}] if system_prompt else []

        messages = [{"role": "user", "content": [{"text": user_prompt}]}]
        if include_answer and assistant_response:
            messages.append({"role": "assistant", "content": [{"text": assistant_response}]})

        if model_type == "conversation":
            payload = {
                "schemaVersion": "bedrock-conversation-2024",
                "system": system_block,
                "messages": messages,
            }
            # Extra parameters never override the keys set above.
            if additional_params:
                for key, value in additional_params.items():
                    payload.setdefault(key, value)
        else:
            payload = {
                "system": system_block,
                "messages": messages,
                "inferenceConfig": {
                    # "maxTokens" is intentionally not set here.
                    "temperature": .1,
                    "topP": .9,
                    "topK": 50,
                    # Stop at the closing tag of the answer block.
                    "stopSequences": ['</answer>'],
                },
            }
            # Extra parameters are merged in and may replace existing keys.
            if additional_params:
                payload.update(additional_params)

        return payload

    except Exception as e:
        print(f"Error creating payload for row: {str(e)}")
        return None

In [48]:
def create_batch_inf_record(row, system_prompt, include_answer=False):
    """
    Wrap one dataset row as a record with `recordId` and `modelInput`.

    Args:
        row: Mapping-like row providing 'id' plus the fields needed by
            `create_bedrock_payload`.
        system_prompt: System text forwarded to the payload builder.
        include_answer: Forwarded to the payload builder; when true the
            payload also carries the assistant turn with the row's answer.

    Returns:
        dict: {"recordId": <row id>, "modelInput": <"invoke" payload>}.
    """
    model_input = create_bedrock_payload(
        row=row,
        model_type="invoke",
        system_prompt=system_prompt,
        include_answer=include_answer,
        additional_params={},
    )
    record = {"recordId": row['id']}
    record["modelInput"] = model_input
    return record

In [None]:
# Rows whose answers hold neither text nor start offsets.
empty_answers_df = df_train.loc[
    df_train['answers'].apply(
        lambda ans: not len(ans['text']) and not len(ans['answer_start'])
    )
]

# Rows that carry at least one answer text.
with_answers_df = df_train.loc[
    df_train['answers'].apply(lambda ans: len(ans['text']) > 0)
]

# Draw 7,500 rows from each group with a fixed seed and stack them:
# 15k rows in total (max 15k for bedrock distillation).
df_train_revised = pd.concat(
    [
        subset.sample(n=7500, random_state=42)
        for subset in (empty_answers_df, with_answers_df)
    ],
    ignore_index=True,
)

## Create distillation data set

In [50]:
# Build one "conversation" payload per sampled row (prompt only, no answer).
df_train_revised['conversation'] = df_train_revised.apply(
    lambda record: create_bedrock_payload(
        row=record,
        model_type="conversation",
        system_prompt=system_prompt,
    ),
    axis=1,
)

# Write the payloads as JSON Lines, one payload per line.
df_train_revised['conversation'].to_json(
    'distillation_data.jsonl',
    orient='records',
    lines=True,
)

## Create batch inference data set
We'll use this batch inference data set to run the inferences on both our distilled model and the off-the-shelf models for evaluation

In [None]:
# Evaluation rows whose answers hold neither text nor start offsets.
eval_empty_answers_df = df_eval.loc[
    df_eval['answers'].apply(
        lambda ans: not len(ans['text']) and not len(ans['answer_start'])
    )
]

# Evaluation rows that carry at least one answer text.
eval_with_answers_df = df_eval.loc[
    df_eval['answers'].apply(lambda ans: len(ans['text']) > 0)
]

# Draw 250 rows from each group with a fixed seed and stack them.
batch_inf_df = pd.concat(
    [
        subset.sample(n=250, random_state=15)
        for subset in (eval_empty_answers_df, eval_with_answers_df)
    ],
    ignore_index=True,
)

# One record per row (without the answer), written as JSON Lines.
(
    batch_inf_df
    .apply(lambda record: create_batch_inf_record(record, system_prompt), axis=1)
    .to_json('batch_inf_data.jsonl', orient='records', lines=True)
)

## Create Labeled data set for BYOI Bedrock Evaluation
Here we'll include the reference answer in each record so we can use it in our evaluation.

In [52]:
# Same sampled rows as the batch-inference set, this time with the answer
# included in each record, written as JSON Lines.
(
    batch_inf_df
    .apply(
        lambda record: create_batch_inf_record(
            record,
            system_prompt=system_prompt,
            include_answer=True,
        ),
        axis=1,
    )
    .to_json('labeled_data.jsonl', orient='records', lines=True)
)

In [53]:
# import boto3
# import json
# model_id = 'us.amazon.nova-premier-v1:0'
# bedrock_runtime = boto3.client(
#     service_name="bedrock-runtime",
#     region_name="us-east-1"
# )

# # Make the invoke call to Bedrock
# response = bedrock_runtime.invoke_model(
#     modelId=model_id,
#     body=json.dumps(sample_payload)
# )

# # Parse and return the response
# response_body = json.loads(response.get('body').read())
# print(response_body['output']['message']['content'][0])

In [54]:
# response_body['output']['message']['content'][0]['text']

In [55]:
# check for dupes
# df_train_revised[df_train_revised.duplicated(subset=['conversation'], keep=False)]