In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

# Synthetic Data generation

We generate synthetic data covering a variety of use cases, which we will then use throughout our evaluation process.

We choose to generate call centre transcripts, from which we can perform theme extraction, categorisation and summarisation: three very common use cases for generative AI. However, this is just an example; the prompts can easily be customised to generate doctors' transcripts or legal documents if your use case requires it.

In [None]:
!pip install -q boto3==1.34.144 reportlab==4.2.2

In [None]:
import boto3
import json
import os
import importlib

#adding our utils library to sys path
import sys
sys.path.append("../src/utils/")
import llm_utils
importlib.reload(llm_utils)

# A single boto3 session (created with default settings) backs both Bedrock clients.
session = boto3.Session()

# "bedrock-runtime" client: handed to the llm_utils helper for every model call in this notebook.
bedrock_runtime = session.client("bedrock-runtime")

# "bedrock" client: created here but not used by the cells visible in this notebook.
bedrock = session.client("bedrock")

## Transcripts generation

### Generate call topics

In [None]:
# How many distinct conversation topics to request from the model.
number_of_topics = 20

# System prompt: sets the model's role for topic generation.
topic_system_prompt = """
You are a synthetic data generator expert at generating topics of conversations given a specific context.
"""

# User prompt template. It describes the fictional company BCD and asks for a JSON object
# holding a 'topics' list inside an <answer> tag. The {number} placeholder is meant to be
# substituted with number_of_topics before the prompt is sent.
topic_prompt = """ 
Your task is to generate topics of conversations between a customer and a call centre agent working for company BCD.

Company BCD provides an online video on demand platform streaming movies, series and live sport.
Viewers are subscribing to company BCD's platform for an annual or monthly fee with different level of access to content (premium, sport, paid per view).
Customer can access the service via various applications: Web, IOS, Android, Smart TV, PS5 and XBOX.

Generate <number>{number}</number> different topics of conversation using factual and concise language.

Output the response in well formatted JSON format in <answer> tag as per the example below.

<example>
    {
        'topics': [
            'Topic 1',
            'Topic 2',
            'Topic 3',
            ...
        ]
    }
</example>
"""

In [None]:
# Model used to generate the list of topics.
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"

# Substitute only the "{number}" placeholder. Replacing the bare word "number" would also
# rewrite the surrounding <number>...</number> tags (producing "<20>{20}</20>").
user_input = topic_prompt.replace("{number}", str(number_of_topics))

# The helper lives in llm_utils (not shown here). Later cells read topics_dict['topics'],
# so with json_check=True it presumably returns the parsed JSON answer - confirm in llm_utils.
topics_dict = llm_utils.converse_api_call_no_tool(user_input, 
                              topic_system_prompt, 
                              bedrock_runtime, 
                              conversation_history= [], 
                              prefill="<answer>{", 
                              model_id=model_id, 
                              temperature=0, 
                              top_p=1, 
                              max_tokens=4096,
                              json_check=True)

In [None]:
# Display the generated topics; the generation cells below iterate over topics_dict['topics'].
topics_dict

### Generate Transcripts based on topics

Note: the model did not consistently close each JSON object, so the prompt includes an explicit reminder. See the line `IMPORTANT: Do not forget the closing curly bracket "}" for each transcript as shown in the example below.` in the prompt.

In [None]:
# System prompt: sets the model's role for transcript generation.
transcript_system_prompt = """
You are a synthetic data generator expert at generating conversations given a specific context.
"""

# User prompt template for transcripts. The {number}, {topic} and {word_count} placeholders
# are substituted by generate_item before the prompt is sent. The expected answer is a JSON
# list of {"topic", "transcript"} objects wrapped in an <answer> tag.
transcript_prompt = """ 
Your task is to generate <number>{number}</number> transcripts between a customer and a call centre agent on the following topic <topic>{topic}</topic>.

Make sure that the transcripts are anchored into the following context: 
<context>Company BCD provides an online video on demand platform streaming movies, series and live sport.
Viewers are subscribing to company BCD's platform for an annual or monthly fee with different level of access to content (premium, sport, paid per view).
Customer can access the service via various applications: Web, IOS, Android, Smart TV, PS5 and XBOX.</context>

The customer might express some of the following emotions when calling the support call centre: Satisfaction, Frustration, Anger, Anxiety, Impatience, Confusion, Urgency, Relief, Gratitude, Disappointment, Resignation

The call centre agent should always keep professional and helpul and use an empathetic tone where required even when face with animosity from the customer.

The conversation should be at least <word count>{word_count}</word count> words.

Output the response as a well formatted JSON in <answer> tag as per the example below. 

IMPORTANT: Do not forget the closing curly bracket "}" for each transcript as shown in the example below.

<example>
    <answer>
        [
            {"topic" : "{topic}", "transcript":"Customer 1: text, Agent 1: text, Customer 1: text, Agent 1: text"},
            {"topic" : "{topic}", "transcript":"Customer 2: text, Agent 2: text, Customer 2: text, Agent 2: text"},
            {"topic" : "{topic}", "transcript":"Customer 3: text, Agent 3: text, Customer 3: text, Agent 3: text"}
        ]
    </answer>
</example>
"""

We generate the transcripts concurrently to speed up the generation (this should take about 2 minutes with 10 threads).

In [None]:
import concurrent.futures

# Minimum length requested for each transcript; fills the {word_count} placeholder.
word_count = 500
# Number of transcripts requested per topic; fills the {number} placeholder.
number_of_transcripts = 5
# Model used to generate the transcripts.
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
# Accumulates one result per topic; reset here so re-running the cell starts from an empty list.
transcripts = []

def generate_item(topic, word_count, number, prompt, system_prompt, bedrock_runtime, model_id):
    """Fill a prompt template and send it to the model through ``llm_utils``.

    Args:
        topic: Text substituted for every ``{topic}`` placeholder.
        word_count: Value substituted (as a string) for ``{word_count}``.
        number: Value substituted (as a string) for ``{number}``.
        prompt: Prompt template containing the placeholders above.
        system_prompt: System prompt forwarded unchanged to the helper.
        bedrock_runtime: Client object forwarded unchanged to the helper.
        model_id: Identifier of the model to invoke.

    Returns:
        Whatever ``llm_utils.converse_api_call_no_tool`` returns for the request.
    """
    # Placeholders are filled one at a time, in this order.
    substitutions = (
        ("{topic}", topic),
        ("{word_count}", str(word_count)),
        ("{number}", str(number)),
    )
    filled_prompt = prompt
    for placeholder, value in substitutions:
        filled_prompt = filled_prompt.replace(placeholder, value)

    return llm_utils.converse_api_call_no_tool(
        filled_prompt,
        system_prompt,
        bedrock_runtime,
        conversation_history=[],
        prefill="<answer>",
        model_id=model_id,
        temperature=0.6,
        top_p=0.8,
        max_tokens=4096,
        json_check=True,
    )

# Fan the topics out over a pool of 10 worker threads: one generate_item call per topic.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    pending = [
        pool.submit(
            generate_item,
            topic,
            word_count,
            number_of_transcripts,
            transcript_prompt,
            transcript_system_prompt,
            bedrock_runtime,
            model_id,
        )
        for topic in topics_dict['topics']
    ]
    # Collect each result as soon as its call finishes, so the order of `transcripts`
    # follows completion order rather than topic order.
    for finished in concurrent.futures.as_completed(pending):
        transcripts.append(finished.result())
        


In [None]:
# Shows the first result appended to `transcripts`. Results are collected as the calls
# complete, so this is not necessarily the first topic of topics_dict['topics'].
transcripts[0]

We export the output to a JSON Lines (`.jsonl`) file.

In [None]:
# Export the transcripts as JSON Lines: one JSON object per line.
file_path = '../generated/transcripts/transcripts.jsonl'

# Create the output directory if it is missing, so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w', encoding='utf-8') as outfile:
    # `transcripts` holds one result per topic; each result is iterated to get its entries.
    for topic_entry in transcripts:
        for entry in topic_entry:
            json.dump(entry, outfile)
            outfile.write('\n')

## Generate FAQ documents

These documents will be used by our RAG solution, which is one of the components that we need to optimise and evaluate.

In [None]:
# System prompt: sets the model's role for FAQ generation.
faq_system_prompt = """
You are a synthetic data generator expert at generating customer support FAQs.
"""

# User prompt template for FAQs. It is filled by generate_item, which substitutes the
# {number}, {topic} and {word_count} placeholders, so the placeholder for the FAQ count
# must be spelled {number} (a {number_faq} placeholder would be sent to the model unfilled).
faq_prompt = """ 
Your task is to generate <number>{number}</number> FAQs based on the following topic <topic>{topic}</topic>.

Make sure that the faqs are anchored into the following context: 
<context>Company BCD provides an online video on demand platform streaming movies, series and live sport.
Viewers are subscribing to company BCD's platform for an annual or monthly fee with different level of access to content (premium, sport, paid per view).
Customer can access the service via various applications: Web, IOS, Android, Smart TV, PS5 and XBOX.</context>

The FAQ should include a question from a customer and a detailed response of at least <number>{word_count}</number> words.

Output the response in <answer> tag using the well formatted JSON format as shown in the example.

IMPORTANT: Do not forget the closing curly bracket "}" for each FAQ as shown in the example below.

<example>
  <answer>
    [
      {"topic":"{topic}", "faq":"Q: question 1, R: response 1"},
      {"topic":"{topic}", "faq":"Q: question 2, R: response 2"},
      {"topic":"{topic}", "faq":"Q: question 3, R: response 3"}
    ]
  </answer>
</example>
"""

We generate the FAQs (this should take 2-3 minutes).

In [None]:
# FAQ generation settings. word_count is rebound here to the minimum length of an FAQ
# response; number_faq is the number of FAQs requested per topic.
word_count = 300
number_faq = 3
model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
#model_id = "anthropic.claude-3-haiku-20240307-v1:0"

# Accumulates one result per topic; reset here so re-running the cell starts from an empty list.
faqs = []

# Same generate_item function as for the transcripts, this time with the FAQ prompts.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    pending = [
        pool.submit(
            generate_item,
            topic,
            word_count,
            number_faq,
            faq_prompt,
            faq_system_prompt,
            bedrock_runtime,
            model_id,
        )
        for topic in topics_dict['topics']
    ]
    # Results are appended in completion order, not in topic order.
    for finished in concurrent.futures.as_completed(pending):
        faqs.append(finished.result())


In [None]:
# Shows the first result appended to `faqs`. Results are collected as the calls complete,
# so this is not necessarily the first topic of topics_dict['topics'].
faqs[0]

We export the output in a jsonlines formatted file in case we need it for later in the workshop

In [None]:
# Export the FAQs as JSON Lines: one JSON object per line.
file_path = '../generated/faqs/faqs.jsonl'

# Create the output directory if it is missing, so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w', encoding='utf-8') as outfile:
    # `faqs` holds one result per topic; each result is iterated to get its entries.
    for faq_entry in faqs:
        for entry in faq_entry:
            json.dump(entry, outfile)
            outfile.write('\n')

We also export each FAQ as a PDF document to upload to the knowledge base.

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet


# Standard-library helper used to escape the generated text before it is given to reportlab.
from xml.sax.saxutils import escape

# Paragraph styles: Heading1 for the topic title, BodyText for the FAQ text and the spacer.
styles = getSampleStyleSheet()
topic_style = styles['Heading1']
faq_style = styles['BodyText']

# Make sure the output directory exists so this cell does not depend on an earlier export.
os.makedirs('../generated/faqs', exist_ok=True)

# One PDF per FAQ entry, numbered sequentially across all topics.
counter = 1
for faq_entry in faqs:
    for entry in faq_entry:

        file_path = f'../generated/faqs/faq-doc-{counter}.pdf'
        counter += 1

        # Create a PDF document
        doc = SimpleDocTemplate(file_path, pagesize=letter)

        # Paragraph interprets its text as XML-like markup. The topic and FAQ are free text
        # produced by the model, so "&", "<" and ">" are escaped to render them literally.
        elements = [
            Paragraph(escape(str(entry['topic'])), topic_style),
            # Empty paragraph between the heading and the FAQ body.
            Paragraph('', faq_style),
            Paragraph(escape(str(entry['faq'])), faq_style),
        ]

        # Build the PDF
        doc.build(elements)

### Create S3 bucket to store FAQ docs and upload to S3

In [None]:
# The session is only used to read the configured region, which goes into the bucket name.
boto3_session = boto3.session.Session()
region_name = boto3_session.region_name

# S3 client used by the following cells to check/create the bucket and upload the PDFs.
s3_client = boto3.client('s3')

# STS is queried for the id of the calling AWS account, also part of the bucket name.
sts_client = boto3.client('sts')
account_id = sts_client.get_caller_identity()["Account"]

In [None]:
## Create S3 bucket to store the KB data source
s3_suffix = f"{region_name}-{account_id}"
bucket_name = f'bedrock-kb-eval-{s3_suffix}' 
# Check if bucket exists, and if not create S3 bucket for knowledge base data source
try:
    s3_client.head_bucket(Bucket=bucket_name)
    print(f'Bucket {bucket_name} Exists')
except s3_client.exceptions.ClientError as e:
    # Only "not found" means the bucket must be created. Any other error (for example
    # 403: the bucket exists but is not accessible) is re-raised instead of being hidden.
    error_code = e.response.get("Error", {}).get("Code")
    if error_code not in ("404", "NoSuchBucket"):
        raise
    print(f'Creating bucket {bucket_name}')
    create_bucket_kwargs = {"Bucket": bucket_name}
    # Outside us-east-1, S3 requires the target region as an explicit LocationConstraint.
    if region_name and region_name != "us-east-1":
        create_bucket_kwargs["CreateBucketConfiguration"] = {"LocationConstraint": region_name}
    s3bucket = s3_client.create_bucket(**create_bucket_kwargs)

Upload the PDF documents from the local directory to the S3 bucket.

In [None]:
local_pdf_dir = "../generated/faqs"
s3_prefix = "faqs"

# S3 locations always use "/" as separator, so the path is built explicitly rather than
# with os.path.join, which would insert "\" on Windows.
target_dir_s3 = f"{bucket_name}/{s3_prefix}"

# Iterate over all files in the local directory
for filename in os.listdir(local_pdf_dir):
    if filename.endswith('.pdf'):
        file_path = os.path.join(local_pdf_dir, filename)

        # Upload the file to S3 under the "faqs/" prefix. A failed upload is reported and
        # the loop continues with the remaining files.
        try:
            s3_client.upload_file(file_path, bucket_name, f"{s3_prefix}/{filename}")
            print(f"Uploaded {filename} to {bucket_name}/{s3_prefix}/")
        except Exception as e:
            print(f"Error uploading {filename}: {e}")


In [None]:
# Persist the S3 location and the bucket name with IPython's %store magic, so that they
# can be restored in another notebook with `%store -r`.
%store target_dir_s3
%store bucket_name