# Synthetic Data Generation     
   
   
This notebook provides a comprehensive guide to synthetic data generation using the self-instruct framework, in order to prepare training and validation datasets for Supervised Fine-Tuning (SFT) of an LLM.

In this notebook, we use the self-instruct framework to process a document in HTML format and generate question-answer pairs with an LLM on Amazon Bedrock.


### Step 0. Initialization

In [None]:
# Uncomment the following lines to install the required Python packages 

#!pip install -q -U git+https://github.com/huggingface/transformers.git
#!pip install -q -U git+https://github.com/huggingface/peft.git
#!pip install -q -U git+https://github.com/huggingface/accelerate.git
#!pip install unstructured

In [None]:
import json
import os
import sys

import boto3
import botocore

In [None]:
import numpy as np
import time
import pandas as pd

from langchain.prompts import PromptTemplate

In [None]:
# Initiate Bedrock in AWS region.
# The region is defined once so both clients always target the same region;
# change it here to run the notebook against another region.
AWS_REGION = "us-west-2"

# Client for the "bedrock" service (not used by the other cells shown in this notebook).
boto3_bedrock = boto3.client(service_name="bedrock", region_name=AWS_REGION)
# Client for the "bedrock-runtime" service; the invoke_model calls below go through it.
boto3_bedrock_runtime = boto3.client(service_name="bedrock-runtime", region_name=AWS_REGION)

In [None]:
def extract_strings_recursive(test_str, tag):
    """Return the text enclosed by every ``<tag>...</tag>`` pair in ``test_str``.

    The string is scanned left to right. Each match is the text between an
    opening ``<tag>`` and the next closing ``</tag>``; scanning then resumes
    right after that closing tag, so nested tags of the same name are not
    supported. The scan is iterative (the function name is kept for existing
    callers), so the number of tags is not limited by the recursion depth.

    Args:
        test_str: Text to search, e.g. the raw text of an LLM response.
        tag: Tag name without angle brackets, e.g. ``"question"``.

    Returns:
        A list with one string per complete tag pair, in order of appearance
        (an empty list when there is none). An opening tag that is never
        closed is ignored together with the text after it. The string
        ``"bad format"`` is returned instead of a list when either argument
        is not a string; callers in this notebook test for that sentinel.
    """
    if not isinstance(test_str, str) or not isinstance(tag, str):
        return "bad format"

    open_tag = "<" + tag + ">"
    close_tag = "</" + tag + ">"

    extracted = []
    cursor = 0
    while True:
        start_idx = test_str.find(open_tag, cursor)
        if start_idx == -1:
            break

        content_start = start_idx + len(open_tag)
        end_idx = test_str.find(close_tag, content_start)
        if end_idx == -1:
            # Unterminated tag (e.g. a truncated response): slicing with -1
            # would silently drop the last character, so stop here instead.
            break

        extracted.append(test_str[content_start:end_idx])
        cursor = end_idx + len(close_tag)

    return extracted

In [None]:
def QA_Gen_Bedrock(model_id,model_kwargs,prompt):
    """Send ``prompt`` to a Bedrock text model and return its reply with rough metrics.

    The request body and the response parsing are selected by substring match
    on ``model_id`` ('titan', 'claude-3', 'llama', 'claude-v2',
    'claude-instant-v1', 'mistral').

    Args:
        model_id: Bedrock model identifier.
        model_kwargs: Inference parameters. Sent as ``textGenerationConfig``
            for Titan models and merged into the top level of the request
            body for the other models. Not applied to 'claude-3' models,
            whose request uses a fixed ``max_tokens`` of 1024.
        prompt: Prompt text.

    Returns:
        Tuple ``(llm_response, elapsed_time, input_token, output_token, throuput)``:
        the stripped model text (or 'MODEL TYPE NOT YET SUPPORTED.' when the
        response layout of the model is not known), the wall-clock seconds
        spent inside ``invoke_model``, word-count based token estimates for
        the prompt and the reply, and the estimated output tokens per second
        (0.0 when no measurable time elapsed).
    """
    # Rough token estimate: number of whitespace-separated words divided by 0.75.
    input_token = len(prompt.split())/0.75

    is_claude3 = 'claude-3' in model_id

    # --- Build the request body for the model family -------------------------
    if 'titan' in model_id:
        model_body = {
            "inputText": prompt,
            "textGenerationConfig": model_kwargs,
        }
    elif is_claude3:
        model_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1024,
            "messages": [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}],
                }
            ],
        }
    else:
        model_body = {"prompt": prompt}
        model_body.update(model_kwargs)

    invoke_args = {
        "modelId": model_id,
        "body": json.dumps(model_body).encode('utf-8'),
    }
    if not is_claude3:
        invoke_args["contentType"] = "application/json"
        invoke_args["accept"] = "*/*"

    # --- Invoke the model; only the service call itself is timed -------------
    st = time.time()
    response = boto3_bedrock_runtime.invoke_model(**invoke_args)
    elapsed_time = time.time() - st

    # --- Extract the generated text ------------------------------------------
    # The branch order matters when an id matches several substrings, so it is
    # kept as: titan, llama, claude-v2 / claude-instant-v1, claude-3, mistral.
    known_families = ('titan', 'llama', 'claude-v2', 'claude-instant-v1', 'claude-3', 'mistral')
    if any(family in model_id for family in known_families):
        response_body_json = json.loads(response['body'].read().decode('utf-8'))
        if 'titan' in model_id:
            generated = response_body_json["results"][0]["outputText"]
        elif 'llama' in model_id:
            generated = response_body_json["generation"]
        elif 'claude-v2' in model_id or 'claude-instant-v1' in model_id:
            generated = response_body_json["completion"]
        elif is_claude3:
            generated = response_body_json["content"][0]["text"]
        else:
            generated = response_body_json["outputs"][0]["text"]
        llm_response = generated.strip()
    else:
        # The response body is left unread for model families without a parser.
        llm_response = 'MODEL TYPE NOT YET SUPPORTED.'

    output_token = len(llm_response.split())/0.75

    # Guard against a zero interval from a coarse clock instead of raising.
    throuput = output_token/elapsed_time if elapsed_time > 0 else 0.0

    return llm_response, elapsed_time, input_token, output_token, throuput

### Step 1: Generate seed q-a pairs

Create a prompt template to generate the seed questions, one question for each paragraph.

In [None]:
# Prompt used in Step 1: asks the model for one question-answer pair about the
# text substituted for {context}, with the output wrapped in <question> and
# <answer> tags so it can be extracted by tag name afterwards.
# NOTE(review): the <s>[INST] ... [/INST] </s> wrapper looks Mistral-specific, and
# the trailing </s> after [/INST] is unusual - confirm against the prompt format
# documented for the selected model.
prompt_template_qagen = """
<s>[INST] 
You are an AI assistant, your task is to generate question-answer pair from the given context. 

Analyze the context within the <context> XML tag, generate one question from the context. 
In the question, DO NOT refer to the context.  
provide answer to each question according to the content in the context. 
In your response, present the question within the <question> tag, and the answer within the <answer> tag.
DO NOT nest <question> and <answer> element. 
DO NOT put any extra attribute in the <question> and <answer> tag. 

<context>
{context}
</context>

[/INST] </s>
"""

# LangChain template object; "context" is the only placeholder to fill in.
PROMPT_qagen = PromptTemplate(template=prompt_template_qagen, input_variables=["context"])

Load and process sections in HTML format using Langchain

In [None]:
from langchain.document_loaders import UnstructuredHTMLLoader

In [None]:
# Put your HTML document under ./data and set its path below.
# TODO: replace this placeholder with the path of your own document before
# running the following cells. The generated CSV files are written next to it.
html_file = "./data/your_document.html"

In [None]:
# Read the HTML file with the unstructured-based LangChain loader.
loader = UnstructuredHTMLLoader(html_file)
# `data` keeps the result of a plain load; `pages` holds the split documents
# that the generation loop iterates over.
data = loader.load()
pages = loader.load_and_split()
print(f"The documents contain {len(pages)} pages.")

Generate question-answer pairs using the Mistral-7B-Instruct model on Bedrock. You can choose a different model and configure the LLM hyper-parameters.

In [None]:
# Bedrock model used for both generation steps of this notebook.
model_id = "mistral.mistral-7b-instruct-v0:2"

# Inference parameters passed to QA_Gen_Bedrock together with the model id.
model_kwargs = dict(
    max_tokens=1024,
    top_p=0.95,
    temperature=0.05,
)

In [None]:
# Build the seed dataset: one question-answer pair per paragraph of every page.
i = 0  # number of accepted question-answer pairs so far

context_list = []
question_list = []
answer_list = []

for k, page in enumerate(pages):
    print("\nPage ",k+1)

    # Paragraphs are separated by blank lines in the page text.
    paragraphs = page.page_content.split('\n\n')

    for text in paragraphs:
        # Skip fragments of 10 characters or fewer.
        if len(text) <= 10:
            continue

        print(i+1,end=': ')
        prompt = PROMPT_qagen.format(context = text)

        qa_response = QA_Gen_Bedrock(model_id,model_kwargs,prompt)

        questions = extract_strings_recursive(qa_response[0], "question")
        answers = extract_strings_recursive(qa_response[0], "answer")

        # The extractor returns the string "bad format" on failure and an empty
        # list when the tag is absent. Both must be checked BEFORE taking the
        # first element, otherwise indexing raises IndexError or yields "b".
        if isinstance(questions, str) or isinstance(answers, str):
            continue
        if not questions or not answers:
            continue

        res_q = questions[0]
        res_a = answers[0]

        # Reject pairs whose question or answer tag was present but empty.
        if len(res_q)==0 or len(res_a)==0:
            continue

        context_list.append(text)
        question_list.append(res_q)
        answer_list.append(res_a)
        i=i+1

print("\nCompleted...")

Save the seed question-answer pairs in csv format

In [None]:
# The seed CSV is written next to the source document (same path plus a suffix).
seed_data_file = html_file+"_seed_selfinstruct_mistral-7b.csv"

# One row per accepted seed pair; the three lists are index-aligned.
df_train_dataset = pd.DataFrame(
    {
        "context": context_list,
        "question": question_list,
        "answer": answer_list,
    }
)

df_train_dataset.to_csv(seed_data_file, index=False)

### Step 2: Human validation

Leverage the domain SMEs' expertise to validate the seed question-answer pairs in Step 1, and update the dataset to get ready for the next step. 

### Step 3: Generate training and validation dataset from seed dataset

Create a prompt template for generating more question-answer pairs from the seed dataset.

In [None]:
from langchain.prompts import PromptTemplate

# Prompt used in Step 3: asks the model for four rephrasings of {seed_question}
# that stay relevant to {context}, each wrapped in a <question> tag so they can
# be extracted by tag name afterwards.
# NOTE(review): the <s>[INST] ... [/INST] </s> wrapper looks Mistral-specific, and
# the trailing </s> after [/INST] is unusual - confirm against the prompt format
# documented for the selected model.
prompt_template_trngen = """
<s>[INST]
You are an AI assistant, your task is to generate question-answer pair from the given context. 

Analyze the context within the <context> XML tag and the seed question in <seed> XML tag, 
generate four questions that rephrases the seed question within the <seed> XML tag. 
Make sure the generated questions are also relevant to the context within the <context> XML tag. 

In your response, present the question within the <question> tag.
DO NOT nest <question> element. 
DO NOT put any extra attribute in the <question> tag. 

<context>
{context}
</context>

<seed>
{seed_question}
</seed>

[/INST] </s>
"""

# LangChain template object with two placeholders: "context" and "seed_question".
PROMPT_trngen = PromptTemplate(template=prompt_template_trngen, input_variables=["context","seed_question"])

Generate more question-answer pairs to scale up the training and validation datasets for LLM fine-tuning   

Here for each seed question-answer pair, we generate    
- 3 additional q-a pairs for training data
- 1 additional q-a pair for validation data

In [None]:
# Reload the seed pairs from the CSV file (which may have been edited during
# the human validation step) into three index-aligned lists.
df_input = pd.read_csv(seed_data_file)
context_list, question_list, answer_list = (
    df_input[column].values.tolist()
    for column in ("context", "question", "answer")
)

In [None]:
# Generate question-answer pairs
# For every seed pair the model is asked for rephrased questions; the first
# three go to the training set and the fourth (when present) to the validation
# set. Every generated question reuses the seed pair's context and answer.
trn_context_list = []
trn_seed_list = []
trn_question_list = []
trn_answer_list = []

val_context_list = []
val_seed_list = []
val_question_list = []
val_answer_list = []

for i, seed_question in enumerate(question_list):
    seed_context = context_list[i]
    seed_answer = answer_list[i]

    # The seed pair itself is always part of the training set.
    trn_context_list.append(seed_context)
    trn_seed_list.append(seed_question)
    trn_question_list.append(seed_question)
    trn_answer_list.append(seed_answer)

    print(i+1,end=': ')
    prompt = PROMPT_trngen.format(context = seed_context, seed_question = seed_question)

    qa_response = QA_Gen_Bedrock(model_id,model_kwargs,prompt)

    res_q = extract_strings_recursive(qa_response[0], "question")

    # The extractor returns the string "bad format" on failure and an empty
    # list when no <question> tag was produced; nothing to add in either case.
    # (Only questions are generated here, so there is no answer list to check.)
    if isinstance(res_q, str) or len(res_q) == 0:
        continue

    # Up to 3 rephrased questions for training.
    for rephrased in res_q[:3]:
        trn_context_list.append(seed_context)
        trn_seed_list.append(seed_question)
        trn_question_list.append(rephrased)
        trn_answer_list.append(seed_answer)
        print('.',end='')

    # 1 for validation, only if the model actually returned a fourth question.
    if len(res_q) > 3:
        val_context_list.append(seed_context)
        val_seed_list.append(seed_question)
        val_question_list.append(res_q[3])
        val_answer_list.append(seed_answer)
        print('*',end='')

print("\nCompleted: generated ", len(question_list))

Save the training and validation datasets in csv format

In [None]:
# The training CSV is written next to the source document (same path plus a suffix).
TRN_FILE = html_file+"_trnfromseed_mistral-7b.csv"

# One row per training question; the four lists are index-aligned.
df_trn_dataset = pd.DataFrame(
    {
        "context": trn_context_list,
        "seed_question": trn_seed_list,
        "question": trn_question_list,
        "answer": trn_answer_list,
    }
)

df_trn_dataset.to_csv(TRN_FILE, index=False)

In [None]:
# The validation CSV is written next to the source document (same path plus a suffix).
VAL_FILE = html_file+"_valfromseed_mistral-7b.csv"

# One row per validation question; the four lists are index-aligned.
df_val_dataset = pd.DataFrame(
    {
        "context": val_context_list,
        "seed_question": val_seed_list,
        "question": val_question_list,
        "answer": val_answer_list,
    }
)

df_val_dataset.to_csv(VAL_FILE, index=False)

### Step 4: Semantic verification 
      
Calculate the Semantic Similarity Score between the seed dataset and the training/validation datasets. Here we use the Titan-text-embedding-v1 model for the calculation; you can choose other embedding models on Bedrock or from a 3rd party.


In [None]:
from sentence_transformers import SentenceTransformer, util

# SS using Titan embedding model
def get_titan_embedding(text):
    """Return the embedding that the Titan text embedding model produces for ``text``.

    The text is sent as ``inputText`` to 'amazon.titan-embed-text-v1' through
    the Bedrock runtime client, and the ``embedding`` field of the JSON
    response is returned (``None`` if the response has no such field).
    """
    request_body = json.dumps({"inputText": text})

    response = boto3_bedrock_runtime.invoke_model(
        body=request_body,
        modelId='amazon.titan-embed-text-v1',    # support 8K token
        accept='application/json',
        contentType='application/json',
    )

    payload = json.loads(response.get('body').read())
    return payload.get('embedding')
    
def calculate_semantic_sim_titan(pred_list,ref_list):
    """Return the mean cosine similarity between index-aligned texts of two lists.

    For every position of ``ref_list`` the reference text and the text at the
    same position of ``pred_list`` are embedded with ``get_titan_embedding``
    and compared with ``util.cos_sim``. ``pred_list`` therefore needs at least
    as many entries as ``ref_list``.

    Returns:
        ``np.mean`` of the per-pair similarity scores.
    """
    similarities = []

    for position, reference_text in enumerate(ref_list):
        # Embed the reference first, then the prediction at the same position.
        reference_vec = get_titan_embedding(reference_text)
        predicted_vec = get_titan_embedding(pred_list[position])

        # Take the single entry of the similarity result as a plain Python float.
        pair_similarity = util.cos_sim(reference_vec, predicted_vec)
        similarities.append(pair_similarity[0][0].item())

    return np.mean(similarities)

In [None]:
# Reload the training CSV and pull out the seed and generated questions as
# two index-aligned lists.
TRN_FILE = html_file+"_trnfromseed_mistral-7b.csv"

df_trn = pd.read_csv(TRN_FILE)
trn_seed_list, trn_question_list = (
    df_trn[column].values.tolist()
    for column in ("seed_question", "question")
)

In [None]:
# Reload the validation CSV and pull out the seed and generated questions as
# two index-aligned lists.
VAL_FILE = html_file+"_valfromseed_mistral-7b.csv"

df_val = pd.read_csv(VAL_FILE)
val_seed_list, val_question_list = (
    df_val[column].values.tolist()
    for column in ("seed_question", "question")
)

In [None]:
# Mean similarity between each seed question and the training question derived
# from it; the bare name on the last line makes the notebook display the value.
trn_ss_score = calculate_semantic_sim_titan(
    pred_list=trn_seed_list,
    ref_list=trn_question_list,
)
trn_ss_score

In [None]:
# Mean similarity between each seed question and the validation question derived
# from it; the bare name on the last line makes the notebook display the value.
val_ss_score = calculate_semantic_sim_titan(
    pred_list=val_seed_list,
    ref_list=val_question_list,
)
val_ss_score

### Step 5: LLM Fine-Tuning using the generated datasets
      
Please continue this step with the "fine_tuning_self-instruct_notebook".   
