# Hands-on Tutorial
## Domain-Driven LLM Development: Insights into RAG and Fine-Tuning Practices
### Lab 2.1 : Synthetic Test Data Generation 
#### Summary: 
This lab focuses on generating synthetic data for testing the fine-tuned model in the next lab. We use Claude 3 Sonnet on Amazon Bedrock.

- The question-context-answer pairs provided by the CUAD dataset are used as "seed data"
- The generated dataset is in the format [context, seed_question, question, answer]


### Initialization

In [None]:
!pip install langchain

In [None]:
import json
import os
import sys

import boto3
import botocore

In [None]:
import numpy as np
import time
import pandas as pd

from langchain.prompts import PromptTemplate

In [None]:
# Bedrock clients, both pinned to us-west-2.
# `bedrock-runtime` is the client QA_Gen_Bedrock uses for `invoke_model`.
# NOTE(review): `boto3_bedrock` is not referenced by any other cell visible here.
boto3_bedrock = boto3.client(service_name="bedrock", region_name="us-west-2")
boto3_bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name="us-west-2")

In [None]:
def QA_Gen_Bedrock(model_id,model_kwargs,prompt):
    """Send ``prompt`` to a Bedrock text model and return its reply with rough metrics.

    The request/response schema is picked from substrings of ``model_id``
    ('titan', 'claude-3', 'llama', 'claude-v2' / 'claude-instant-v1',
    'mistral').

    Args:
        model_id: Bedrock model identifier.
        model_kwargs: Inference parameters (dict or None). For Titan they are
            sent as ``textGenerationConfig``; for Claude 3 the Messages API
            parameters among them (max_tokens, temperature, top_p, top_k,
            stop_sequences, system) override the defaults; for every other
            model they are merged into the top level of the request body.
        prompt: Prompt text.

    Returns:
        Tuple ``(llm_response, elapsed_time, input_token, output_token,
        throuput)``. Token counts are estimated as whitespace-separated word
        count divided by 0.75 (a heuristic, not a tokenizer count);
        ``elapsed_time`` is the duration of the ``invoke_model`` call in
        seconds; ``throuput`` is estimated output tokens per second.
        For an unrecognised model family ``llm_response`` is the literal
        'MODEL TYPE NOT YET SUPPORTED.'.
    """
    input_token = len(prompt.split())/0.75

    # Work on a copy so the caller's dict is never mutated; tolerate None.
    model_kwargs = dict(model_kwargs or {})

    is_titan = 'titan' in model_id
    is_claude3 = 'claude-3' in model_id

    if is_titan:
        model_body = {
            "inputText": f"{prompt}",
            "textGenerationConfig": model_kwargs,
        }
    elif is_claude3:
        model_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1024,
            "messages": [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}],
                }
            ],
        }
        # Forward the caller's sampling settings instead of silently dropping
        # them. Only Messages API parameters are passed through so that
        # legacy keys cannot make the request fail validation.
        claude3_params = ("max_tokens", "temperature", "top_p", "top_k",
                          "stop_sequences", "system")
        model_body.update(
            {key: model_kwargs[key] for key in claude3_params if key in model_kwargs}
        )
    else:
        model_body = {"prompt": f"{prompt}"}
        model_body.update(model_kwargs)

    body_bytes = json.dumps(model_body).encode('utf-8')

    invoke_args = {"modelId": model_id, "body": body_bytes}
    if not is_claude3:
        invoke_args["contentType"] = "application/json"
        invoke_args["accept"] = "*/*"

    # perf_counter is monotonic, so the measured duration cannot go negative.
    st = time.perf_counter()
    response = boto3_bedrock_runtime.invoke_model(**invoke_args)
    elapsed_time = time.perf_counter() - st

    def read_payload():
        # The response body is a stream; read and decode it exactly once.
        return json.loads(response['body'].read().decode('utf-8'))

    if is_titan:
        llm_response = read_payload()["results"][0]["outputText"].strip()
    elif 'llama' in model_id:
        llm_response = read_payload()["generation"].strip()
    elif 'claude-v2' in model_id or 'claude-instant-v1' in model_id:
        llm_response = read_payload()["completion"].strip()
    elif is_claude3:
        llm_response = read_payload()["content"][0]["text"].strip()
    elif 'mistral' in model_id:
        llm_response = read_payload()["outputs"][0]["text"].strip()
    else:
        llm_response = 'MODEL TYPE NOT YET SUPPORTED.'

    output_token = len(llm_response.split())/0.75

    # Guard against a zero-length interval on coarse clocks.
    throuput = output_token/elapsed_time if elapsed_time > 0 else 0.0

    return llm_response, elapsed_time, input_token, output_token, throuput

In [None]:
def extract_strings_recursive(test_str, tag):
    """Return the text of every ``<tag>...</tag>`` element found in ``test_str``.

    Elements are matched left to right: each opening tag is paired with the
    next closing tag after it, so nested elements are not resolved.

    Args:
        test_str: Text to scan.
        tag: Tag name without angle brackets, e.g. ``"question"``.

    Returns:
        A list with the inner text of each element (empty when no opening
        tag is present), or the string ``"bad format"`` when the input is
        malformed: ``test_str`` or ``tag`` is not a string, or an opening tag
        has no closing tag (for instance a truncated model response).
    """
    if not isinstance(test_str, str) or not isinstance(tag, str):
        return "bad format"

    open_tag = "<" + tag + ">"
    close_tag = "</" + tag + ">"

    res = []
    # Iterative scan: no recursion limit, no repeated string slicing.
    start_idx = test_str.find(open_tag)
    while start_idx != -1:
        body_start = start_idx + len(open_tag)
        end_idx = test_str.find(close_tag, body_start)
        if end_idx == -1:
            # Unterminated element: slicing with -1 would silently yield a
            # truncated fragment, so report the malformed input instead.
            return "bad format"
        res.append(test_str[body_start:end_idx])
        start_idx = test_str.find(open_tag, end_idx + len(close_tag))

    return res

### Prepare seed data

In [None]:
# Load the CUAD dataset from JSON into a DataFrame.
JSON_FILE = '../lab-data/CUAD_v1.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can fail on non-ASCII characters in the JSON.
with open(JSON_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

In [None]:
def create_qa_file(file_name, file_index):
    """Write the seed question/answer pairs of one dataset entry to a CSV file.

    Reads ``df['data'][file_index]['paragraphs'][0]['qas']`` from the
    module-level ``df`` and, for each entry:

    * takes the text after ``"Details: "`` in the entry's question as the
      seed question (the whole question when that marker is absent) and
      wraps it as ``"What is ...?"`` if it contains no question mark;
    * joins the texts of all answers with ``", "``.

    Entries whose joined answer is empty are dropped. The result is written
    to ``../lab-data/<file_name>_qa.csv``.

    Args:
        file_name: Prefix of the output CSV file name.
        file_index: Position of the entry within ``df['data']``.

    Returns:
        Number of rows written.
    """
    qas_entries = df['data'][file_index]['paragraphs'][0]['qas']

    rows = []
    for qa in qas_entries:
        qas_input = qa['question']

        # Fall back to the full text instead of raising IndexError when the
        # "Details: " marker is missing.
        parts = qas_input.split("Details: ")
        qas_question = (parts[1] if len(parts) > 1 else qas_input).strip()
        if '?' not in qas_question:
            qas_question = "What is " + qas_question + '?'

        answer = ', '.join(ans['text'] for ans in qa['answers'])

        rows.append({
            'question': qas_question,
            'input': qas_input,
            'answer': answer,
            'qa_id': qa['id'],
        })

    # build dataframe
    df_seed_data = pd.DataFrame(rows, columns=['question', 'input', 'answer', 'qa_id'])

    # remove entries with an empty answer
    df_seed_data = df_seed_data[df_seed_data['answer'] != '']
    # reset_index() without drop=True keeps the old row labels as an 'index'
    # column, which is therefore also written to the CSV.
    df_seed_data = df_seed_data.reset_index()

    QA_FILE = '../lab-data/' + file_name + '_qa.csv'

    df_seed_data.to_csv(QA_FILE, encoding='utf-8', sep=',', index=False)

    return len(df_seed_data)

In [None]:
# Build the seed CSV for one dataset entry.
# FILE_INDEX selects the entry in df['data']; FILE_NAME only names the output file.
# NOTE(review): presumably entry 38 is the ENERGOUSCORP contract — confirm against the dataset.
FILE_INDEX = 38
FILE_NAME = 'ENERGOUSCORP'
row_num = create_qa_file(FILE_NAME, FILE_INDEX)
# Number of seed rows written to the CSV.
row_num

### Define prompt for synthetic data generation

In [None]:
from langchain.prompts import PromptTemplate

prompt_template_trngen = """
Human:

You are an AI assistant, your task is to generate question-answer pair from the given context. 

Analyze the context within the <context> XML tag and the seed question in <seed> XML tag, 
generate one question that rephrases the seed question within the <seed> XML tag. 
Make sure the generated questions are also relevant to the context within the <context> XML tag. 

In your response, present the question within the <question> tag.
DO NOT nest <question> element. 
DO NOT put any extra attribute in the <question> tag. 

<context>
{context}
</context>

<seed>
{seed_question}
</seed>

Assistant:
"""

PROMPT_trngen = PromptTemplate(template=prompt_template_trngen, input_variables=["context","seed_question"])

### Load seed data

In [None]:
# Read back the seed CSV written by create_qa_file and preview the first rows.
INPUT_FILE = "../lab-data/ENERGOUSCORP_qa.csv"
df_input = pd.read_csv (INPUT_FILE)
df_input.head(5)

In [None]:
# Parallel lists, one element per seed row.
# The 'input' column is what later cells pass to the prompt as {context}.
context_list = df_input["input"].tolist()
question_list = df_input["question"].tolist()
answer_list = df_input["answer"].tolist()

In [None]:
# Number of seed questions loaded.
len(question_list)

### Generate test data from a single seed example

In [None]:
# Bedrock model used for generation.
model_id = 'anthropic.claude-3-sonnet-20240229-v1:0' 

# Inference parameters handed to QA_Gen_Bedrock together with model_id.
model_kwargs = {
        "max_tokens": 1024,
        "top_p": 0.95,
        "temperature": 0.05
}   

In [None]:
# Show the seed question used for the single-sample run below.
question_list[25]

In [None]:
# Render the prompt for seed row 25 and call the model once.
prompt = PROMPT_trngen.format(context = context_list[25], seed_question = question_list[25])

# qa_response is the tuple returned by QA_Gen_Bedrock; element 0 is the model's text.
qa_response = QA_Gen_Bedrock(model_id,model_kwargs,prompt)

In [None]:
# Print the contents of the <question> element(s) in the model's reply.
print(extract_strings_recursive(qa_response[0], "question"))

### Generate training/testing data in batch

In [None]:
# Accumulators for the generated dataset; only seeds whose model reply
# contains a usable <question> element contribute a row.
val_context_list = []
val_seed_list = []
val_question_list = []
val_answer_list = []

seed_rows = zip(context_list, question_list, answer_list)
for i, (seed_context, seed_question, seed_answer) in enumerate(seed_rows, start=1):

    # Progress output: "<n>: " per seed, followed by "*" when a question was kept.
    print(i, end=': ')
    prompt = PROMPT_trngen.format(context=seed_context, seed_question=seed_question)

    qa_response = QA_Gen_Bedrock(model_id, model_kwargs, prompt)

    res_q = extract_strings_recursive(qa_response[0], "question")

    # Skip replies that are malformed or contain no <question> element.
    if "bad format" in res_q or len(res_q) == 0:
        continue

    val_context_list.append(seed_context)
    val_seed_list.append(seed_question)
    val_question_list.append(res_q[0])
    val_answer_list.append(seed_answer)
    print('*', end='')

# Report how many questions were actually kept, not how many seeds were tried.
print("\nCompleted: generated ", len(val_question_list), " of ", len(question_list))

In [None]:
# Inspect the generated questions.
val_question_list

### Store generated dataset 

In [None]:
# Destination of the generated test set.
VAL_FILE = "../lab-data/ENERGOUSCORP_qa_test.csv"

# Assemble the four parallel lists into one table; the key order fixes the
# CSV column order.
df_val_dataset = pd.DataFrame(
    {
        "context": val_context_list,
        "seed_question": val_seed_list,
        "question": val_question_list,
        "answer": val_answer_list,
    }
)

df_val_dataset.to_csv(VAL_FILE, index=False)

In [None]:
# Display the generated dataset.
df_val_dataset