In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [None]:
# Restore the three prompt identifiers from IPython's %store storage
# (presumably saved there by an earlier notebook in this series -- confirm).
# They are used below as `promptIdentifier` when fetching each prompt from Bedrock.
%store -r sentiment_categorisation_prompt_id
%store -r summarisation_prompt_id
%store -r extraction_prompt_id

In [None]:
#or set them manually
#sentiment_categorisation_prompt_id = ""
#summarisation_prompt_id = ""
#extraction_prompt_id = "" 

In [None]:
!pip install -q boto3==1.34.149
!pip install -q langgraph==0.1.17
!pip install -q langchain==0.2.11
!pip install -q langchain-community==0.2.10
!pip install -q langchain-aws==0.1.12
!pip install nest_asyncio

In [None]:
import boto3
import importlib

#adding our utils library to sys path
import sys
sys.path.append("../src/utils/")
import llm_utils
import reflection_graph
from reflection_graph import ReflectionGraph

# Reload the local helper modules so that edits made under ../src/utils/ are
# picked up without restarting the kernel.
importlib.reload(llm_utils)
importlib.reload(reflection_graph)
# importlib.reload() does not rebind names that were obtained through
# `from module import name`, so re-import the class here; otherwise
# `ReflectionGraph` would keep pointing at the pre-reload class definition.
from reflection_graph import ReflectionGraph

# 'bedrock-agent' is used below to fetch prompts (get_prompt);
# 'bedrock-runtime' is passed to the llm_utils converse helper for inference.
bedrock_agent_client = boto3.client('bedrock-agent')
bedrock_runtime = boto3.client(service_name='bedrock-runtime')

### Loading Transcripts to be used later in the notebook.

In [None]:
# Read the transcripts through the llm_utils JSON-lines helper.
transcripts_file = "../generated/transcripts/transcripts.jsonl"
transcripts = llm_utils.load_jsonlines_file(transcripts_file)

# Display the first record as a quick sanity check of what was loaded.
transcripts[0]

### Retrieve prompt from Bedrock Prompt Management

In [None]:
bedrock_agent_client = boto3.client('bedrock-agent')


def _fetch_prompt_with_settings(prompt_id):
    """Fetch one prompt from Bedrock Prompt Management and parse it.

    Returns a tuple ``(raw_response, parsed_dict)`` where ``raw_response`` is
    the ``get_prompt`` response and ``parsed_dict`` is whatever
    ``llm_utils.get_elts_from_prompt_get_response`` extracts from it (later
    cells read the keys ``prompt_text``, ``modelId``, ``temperature``,
    ``topP`` and ``maxTokens`` from that dict).
    """
    raw_response = bedrock_agent_client.get_prompt(promptIdentifier=prompt_id)
    parsed = llm_utils.get_elts_from_prompt_get_response(raw_response)
    return raw_response, parsed


# Sentiment categorisation prompt
sentiment_categorisation_prompt, sentiment_prompt_dict = _fetch_prompt_with_settings(
    sentiment_categorisation_prompt_id
)

# Summarisation prompt
summarisation_prompt, summarisation_prompt_dict = _fetch_prompt_with_settings(
    summarisation_prompt_id
)

# Theme extraction prompt
extraction_prompt, extraction_prompt_dict = _fetch_prompt_with_settings(
    extraction_prompt_id
)

# Groundtruth data generation

Ideally, your groundtruth data should be generated either by your live production system from end users or from manually crafted evals as you want those to be of high quality and reviewed/approved. 

An intermediary method could be to manually create a small sample of ground truth data and use LLMs to create variations of them.

In this notebook, because we are primarily focusing on the tools and process for our evaluation pipeline, we take a shortcut and use an LLM to generate our groundtruth data, which is not recommended for real-life use cases.

To improve the groundtruth LLM generated data in comparison to a normal output from our prompt, we have implemented and added a reflection loop for an "Evaluator/Inspector" LLM to review the generated output of an "Actor".

See the link below for more info on that pattern:
https://www.promptingguide.ai/techniques/reflexion


### Reflection graph with LangGraph library

If you are interested in the details of the implementation, uncomment the below cell.

In [None]:
#!pygmentize ../src/utils/reflection_graph.py

### Model Configuration used for our reflection evaluator

Because we're using the LangGraph library, we use the ChatBedrockConverse wrapper from the langchain_aws library.

In [None]:
from langchain_aws import ChatBedrockConverse

# Settings for the "Evaluator/Inspector" model that reviews the actor's output
# inside the reflection graph.
evaluator_llm_settings = {
    "model_id": "anthropic.claude-3-sonnet-20240229-v1:0",
    "max_tokens": 4096,
    "temperature": 0,
    "top_p": 0.6,
}
evaluator_llm = ChatBedrockConverse(**evaluator_llm_settings)

## Groundtruth data generation for sentiment classification

The code below should take about 2 minutes to execute. We're using the asyncio library to parallelise the calls to the Converse API.

You can have a look at the reflection_graph.py while you wait.

In [None]:
import concurrent.futures
import asyncio

import nest_asyncio
#This line is required to allow Jupyter Notebook to run asynchronous code correctly, as Jupyter Notebook has its own event loop running in the background
nest_asyncio.apply()

def on_task_done(task, task_number, total_tasks):
    """Done-callback that reports how a generation task finished.

    Args:
        task: The finished task/future. Only ``cancelled()`` and
            ``exception()`` are called on it.
        task_number: 1-based position of the task, used in the message.
        total_tasks: Total number of scheduled tasks (accepted for
            interface compatibility; not used in the messages).

    A done-callback fires for failed and cancelled tasks too, so the outcome
    is inspected instead of always reporting success.
    """
    # cancelled() must be checked first: exception() raises on a cancelled task.
    if task.cancelled():
        print(f"Task #{task_number} was cancelled")
        return

    error = task.exception()
    if error is not None:
        print(f"Task #{task_number} failed: {error!r}")
        return

    print(f"Task #{task_number} completed successfully")


#Util function that generates our dataset
async def generate_dataset_for_evaluation(semaphore, prompt_dict, evaluator_llm, actor_llm, reflection_graph, transcript, combined_data):
    """Generate one question/answer/groundtruth record for a transcript.

    Args:
        semaphore: asyncio semaphore bounding how many transcripts are
            processed concurrently.
        prompt_dict: Parsed prompt settings; the keys ``prompt_text``,
            ``modelId``, ``temperature``, ``topP`` and ``maxTokens`` are read.
        evaluator_llm: Unused here (the reflection graph already holds it);
            kept for interface compatibility.
        actor_llm: Unused here; kept for interface compatibility.
        reflection_graph: Object exposing an awaitable ``run_graph(prompt)``
            that produces the groundtruth.
        transcript: Mapping with a ``"transcript"`` entry that is injected
            into the prompt template.
        combined_data: List the resulting record is appended to (mutated
            in place).
    """
    async with semaphore:

        #format prompt with transcript
        prompt = prompt_dict["prompt_text"].format(transcript=transcript["transcript"])

        #generate response from LLM
        # The converse helper is a blocking call: run it in a worker thread so the
        # event loop stays free and the other tasks can actually progress in parallel.
        generated_answer = await asyncio.to_thread(
            llm_utils.converse_api_call_no_tool,
            prompt,
            "",
            bedrock_runtime,
            conversation_history=[],
            prefill="",
            model_id=prompt_dict["modelId"],
            temperature=prompt_dict["temperature"],
            top_p=prompt_dict["topP"],
            max_tokens=prompt_dict["maxTokens"],
            debug=False,
        )

        #generate groundtruth
        groundtruth = await reflection_graph.run_graph(prompt)

        combined_data.append({"question": transcript, "answer": generated_answer, "groundtruth": groundtruth})

async def run_generate_for_all_dataset(transcripts, prompt_dict, evaluator_llm, max_concurrency=10):
    """Generate answer/groundtruth records for every transcript.

    Args:
        transcripts: Sequence of transcript records (``len()`` is taken).
        prompt_dict: Parsed prompt settings; ``modelId``, ``maxTokens``,
            ``temperature`` and ``topP`` configure the actor model.
        evaluator_llm: Chat model used as the evaluator in the reflection graph.
        max_concurrency: Maximum number of transcripts processed at the same
            time. Defaults to 10, the previously hard-coded value.

    Returns:
        A list of dicts with the keys ``question``, ``answer`` and
        ``groundtruth``. Records are appended as tasks finish, so their order
        may differ from the order of ``transcripts``.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        # A zero-sized semaphore would make every task wait forever.
        raise ValueError("max_concurrency must be at least 1")

    #used to store results
    output = []

    #configuring a ChatBedrockConverse llm object to pass to our langgraph reflection graph
    actor_llm = ChatBedrockConverse(
        model_id=prompt_dict["modelId"],
        max_tokens = prompt_dict["maxTokens"],
        temperature = prompt_dict["temperature"],
        top_p = prompt_dict["topP"]
    )
    #instantiate our reflection graph.
    reflection_graph = ReflectionGraph(actor_llm, evaluator_llm)

    # Create a semaphore with the specified concurrency limit
    semaphore = asyncio.Semaphore(max_concurrency)

    #create tasks
    total_tasks = len(transcripts)
    tasks = []

    for i, transcript in enumerate(transcripts, start=1):
        task = asyncio.create_task(
            generate_dataset_for_evaluation(semaphore, prompt_dict, evaluator_llm, actor_llm, reflection_graph, transcript, output)
        )
        # Bind the loop values as lambda defaults so each callback reports its own task number.
        task.add_done_callback(lambda t, task_number=i, total=total_tasks: on_task_done(t, task_number, total))
        tasks.append(task)

    #waiting to complete; an exception raised by any task propagates from here.
    await asyncio.gather(*tasks)

    return output


In [None]:
# asyncio.run() is invoked while Jupyter's own event loop is already running;
# this relies on the nest_asyncio.apply() call made in the cell above.
sentiment_gen_data = asyncio.run(run_generate_for_all_dataset(transcripts, sentiment_prompt_dict, evaluator_llm))

In [None]:
#reshaping the dataset to use it later
def reshape_data(data):
    """Turn a list of records into a dict of parallel lists.

    Each item of ``data`` must provide the keys ``question``, ``answer`` and
    ``groundtruth``. The result maps each of those keys to the list of the
    corresponding values, in the order the records were given.
    """
    fields = ("question", "answer", "groundtruth")
    columns = {field: [] for field in fields}

    # Single pass over the input, one append per field and record.
    for record in data:
        for field in fields:
            columns[field].append(record[field])

    return columns

In [None]:
# Convert the list of per-transcript records into a dict of parallel lists
# (question / answer / groundtruth).
sentiment_combined_data = reshape_data(sentiment_gen_data)

We quickly check how many differences we have between the groundtruth and the answers.

In [None]:
# Count the records whose plain LLM answer is not identical to the groundtruth
# produced by the reflection graph.
number_questions = len(sentiment_combined_data["question"])
answer_truth_pairs = zip(
    sentiment_combined_data["answer"],
    sentiment_combined_data["groundtruth"],
)
difference_counter = sum(1 for answer, truth in answer_truth_pairs if answer != truth)
print(f"we have {difference_counter}/{number_questions} differences between groundtruth and answers")
        

### Export to json file

In [None]:
# Persist the reshaped sentiment dataset through the llm_utils helper
# (presumably serialised as JSON, judging by the helper name and target path).
llm_utils.save_dict_to_json(sentiment_combined_data, "../generated/groundtruth/sentiment_gt.json")

## Groundtruth data generation for summarisation

We run the same functions as for the sentiment data generation but with a different prompt. 

Running the below cell should take 8-9min.

In [None]:
# Same generation pipeline as for sentiment, driven by the summarisation prompt.
# asyncio.run() inside the notebook relies on the earlier nest_asyncio.apply() call.
summarisation_gen_data = asyncio.run(run_generate_for_all_dataset(transcripts, summarisation_prompt_dict, evaluator_llm))

In [None]:
# Convert the list of per-transcript records into a dict of parallel lists
# (question / answer / groundtruth).
summarisation_combined_data = reshape_data(summarisation_gen_data)

In [None]:
# printing few examples
number_questions = len(summarisation_combined_data["question"])
# Cap the loop at the number of available records so a short dataset does not
# raise an IndexError.
for i in range(min(2, number_questions)):
    print(f"Example {i}:")
    print("-----------------------")
    # Single quotes inside the f-string: reusing the outer double quotes is only
    # valid syntax from Python 3.12 onwards.
    print(f"LLM answer:\n{summarisation_combined_data['answer'][i]}\n")
    print(f"Groundtruth:\n{summarisation_combined_data['groundtruth'][i]}\n")
    print("--------------------------------------------------------------------------\n\n")

### Export to JSON file

In [None]:
# Persist the reshaped summarisation dataset through the llm_utils helper
# (presumably serialised as JSON, judging by the helper name and target path).
llm_utils.save_dict_to_json(summarisation_combined_data, "../generated/groundtruth/summary_gt.json")

## Groundtruth data generation for Theme extraction

The below cell should take 2-3min to run.

In [None]:
# Same generation pipeline again, driven by the theme-extraction prompt.
# asyncio.run() inside the notebook relies on the earlier nest_asyncio.apply() call.
extraction_gen_data = asyncio.run(run_generate_for_all_dataset(transcripts, extraction_prompt_dict, evaluator_llm))

In [None]:
# Convert the list of per-transcript records into a dict of parallel lists
# (question / answer / groundtruth).
extraction_combined_data = reshape_data(extraction_gen_data)

In [None]:
# printing few examples
number_questions = len(extraction_combined_data["question"])
# Cap the loop at the number of available records so a short dataset does not
# raise an IndexError.
for i in range(min(3, number_questions)):
    print(f"Example {i}:\n")
    # Single quotes inside the f-string: reusing the outer double quotes is only
    # valid syntax from Python 3.12 onwards.
    print(f"LLM answer:\n{extraction_combined_data['answer'][i]}\n")
    print(f"Groundtruth:\n{extraction_combined_data['groundtruth'][i]}\n")

### Export to JSON file

In [None]:
# Persist the reshaped extraction dataset through the llm_utils helper
# (presumably serialised as JSON, judging by the helper name and target path).
llm_utils.save_dict_to_json(extraction_combined_data, "../generated/groundtruth/extraction_gt.json")