In [1]:
import boto3
import json
import random
import re
import pandas as pd
import time

# Seed Python's RNG so the random example selection and the random sleep
# durations used further down are repeatable between runs.
random.seed(1)

# AWS clients shared by the notebook: Bedrock runtime (pinned to us-east-1)
# for model invocation, and S3.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')
s3 = boto3.client('s3')

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


## Product Catalog

In [2]:
# Prompt templates, keyed by generation task. Each template has exactly two
# positional `{}` placeholders:
#   - 'generate_product_description': (example description, existing product names)
#   - every other template:           (product description, example document)
# The <begin>/<end> markers delimit the sections the model is asked to return.
prompts = {
    'generate_product_description': """
        Generate an imaginary product description document for the imaginary manufacturing company called MANUT GmbH. 
        The description should be long (at least 4000 letters) and descriptive. It should describe one specific product, not a product family. 
        An example of a product description is given below:
        
        {}
        
        Make sure that product name is not similar to these : {}
        
        Provide the product name and desctiption in the following format:
        <begin>
        Only the product name is here
        <end>
        <begin>
        Description text is here
        <end>

    """,
    'generate_safety_instructions' : """
        Create an imaginary safety instructions for the manufacturing product developed by MANUT. The safety instructions should be clear, detailed and long (at least 4000 letters). 
        
        The product description:
        
        {}
        
        An example of security description:
        
        {}
        
        Provide the security instructions in the following format: 
        <begin>
        Security Instruction text is here
        <end>
    """,
    'generate_configuration' : """
        Create an imaginary product configuration instructions for the manufacturing product developed by MANUT. The configuration instructions should be clear, detailed and long (at least 4000 letters).
        
        The product description:
        
        {}
        
        An example of configuration document:
        
        {}
        
        Provide the configuration instructions in the following format: 
        <begin>
        Product Configuration Instructions is here
        <end>

    """,
    'generate_operation_guide' : """
        Create an imaginary product operation guide for the manufacturing product developed by MANUT. The product operation guide should be clear, detailed and long (at least 4000 letters).
        
        The product description:
        
        {}
        
        An example of product operation guide:
        
        {}
        
        Provide the product operation guide in the following format: 
        <begin>
        Product operation guide is here
        <end>

    """,
}

In [3]:
# We use an actual product catalog to make our synthetic dataset more realistic.
# Fill the 'text' fields with real-world examples.
# You can add more examples to each list; one of them is chosen at random in
# few_shot_prompting_claude_3_sonnet().
operation_examples = [
   {
    'name': 'example_product_1',
    'text': """
            Example operation manual
            """
    }
    
]
configuration_examples = [
    {
    'name': 'example_product_1',
    'text': """
            Example configuration manual
            """
    }
        
]

safety_instructions_examples = [
    {
        'name': 'example_product_1',
        'text': """
                Example safety instructions
                """
    }

]

product_description_examples = [
    {
        'name': 'example_product_1',
        'text': """
                Example product description
                """   
    }   
]

In [4]:
def few_shot_prompting_claude_3_sonnet(prompt_dict, prompt, given_text = "", example_list=None):
    """Fill a prompt template with one random example and send it to Claude 3 Sonnet.

    Args:
        prompt_dict: Mapping from prompt key to a template string with two
            positional ``{}`` placeholders.
        prompt: Key into ``prompt_dict``. For 'generate_product_description' the
            template is filled as (example, given_text); for every other key it
            is filled as (given_text, example).
        given_text: Existing product names (for the product description prompt)
            or the product description (for the other prompts).
        example_list: Non-empty list of dicts, each with a 'text' entry. One
            entry is picked at random with the module-level ``random`` state.

    Returns:
        The text of the first content item in the model response.

    Raises:
        ValueError: If ``example_list`` is missing or empty.
        KeyError: If ``prompt`` is not a key of ``prompt_dict``.
    """
    # Fail early with a clear message instead of the opaque error that
    # random.sample() raises for an empty population.
    if not example_list:
        raise ValueError("example_list must contain at least one example with a 'text' entry")

    example_count = len(example_list)
    print(f"Example count: {example_count}")
    # random.sample is kept (rather than random.choice) so the seeded random
    # stream is consumed exactly as before.
    example_index = random.sample(range(example_count), 1)

    print(f'''Choosen examples for the prompt: {example_index}''')

    example_text = example_list[example_index[0]]['text']
    # given_text holds product names (product description generator) or the
    # product description (all other generators), so the placeholder order differs.
    if prompt == 'generate_product_description':
        prompt = prompt_dict[prompt].format(example_text, given_text)
    else:
        prompt = prompt_dict[prompt].format(given_text, example_text)

    print(f"prompt: {prompt}")

    kwargs = {
      "modelId": "anthropic.claude-3-sonnet-20240229-v1:0",
      "contentType": "application/json",
      "accept": "application/json",
      "body": json.dumps ({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4096,
        "messages": [
          {
            "role": "user",
            "content": [
              {
                "type": "text",
                "text": prompt
              }
            ]
          }
        ]
      })
    }
    response = bedrock_runtime.invoke_model(**kwargs)
    body = json.loads(response['body'].read())
    text_content = body['content'][0]['text']
    return text_content

In [5]:
def extract_text(text_content):
    """Split a model response into the product name and the generated text.

    The response must contain two ``<begin>``/``<end>`` delimited sections, in
    this order:

        <begin>
        Product name is here
        <end>
        <begin>
        Generated text is here
        <end>

    Whitespace around the markers is flexible: blank lines, a single newline or
    indentation between the sections are all accepted.

    Args:
        text_content: Raw text returned by the model.

    Returns:
        A ``(product_name, extracted_text)`` tuple; both values are stripped.

    Raises:
        ValueError: If the two delimited sections cannot be found.
    """
    # \s* instead of fixed newlines: the prompt template shows <end> directly
    # followed by <begin>, so a blank line between sections cannot be assumed.
    pattern = r"<begin>\s*(.+?)\s*<end>\s*<begin>\s*(.+?)\s*<end>"

    # re.DOTALL lets the generated text span multiple lines.
    match = re.search(pattern, text_content, re.DOTALL)

    if match is None:
        # Raise explicitly rather than returning names that were never bound.
        raise ValueError(
            "WARNING! No match found. Expected two <begin>...<end> sections "
            "(product name, then generated text) in the model response."
        )

    product_name = match.group(1).strip()
    extracted_text = match.group(2).strip()  # e.g. the product description
    return product_name, extracted_text

In [None]:
product_list = []
# NOTE: stripping the <begin>/<end> markers from the generated documents has not
# been tested yet. Check it before running.

for generation_idx in range(5):  # number of products to generate
    # Names generated so far are passed to the prompt so the model avoids
    # producing a product with a similar name.
    product_names_str = ''.join(existing['name'] + ', ' for existing in product_list)

    product_description_text = few_shot_prompting_claude_3_sonnet(
        prompts,
        'generate_product_description',
        product_names_str,
        product_description_examples,
    )
    print("INFO: Product name and description are generated.")
    # Random pause between model calls.
    time.sleep(random.uniform(30, 60))
    product_name, product_description = extract_text(product_description_text)

    safety_instructions = few_shot_prompting_claude_3_sonnet(
        prompts,
        'generate_safety_instructions',
        product_description,
        safety_instructions_examples,
    )
    for marker in ("<begin>", "<end>"):
        safety_instructions = safety_instructions.replace(marker, "")
    safety_instructions = safety_instructions.strip()
    print("INFO: Safety instructions are generated.")
    time.sleep(random.uniform(30, 60))

    configuration = few_shot_prompting_claude_3_sonnet(
        prompts,
        'generate_configuration',
        product_description,
        configuration_examples,
    )
    for marker in ("<begin>", "<end>"):
        configuration = configuration.replace(marker, "")
    configuration = configuration.strip()
    print("INFO: Configuration instructions are generated.")
    time.sleep(random.uniform(30, 60))

    operation = few_shot_prompting_claude_3_sonnet(
        prompts,
        'generate_operation_guide',
        product_description,
        operation_examples,
    )
    for marker in ("<begin>", "<end>"):
        operation = operation.replace(marker, "")
    operation = operation.strip()
    print("INFO: Operation instructions are generated.")
    time.sleep(random.uniform(30, 60))

    # One record per product, holding all four generated documents.
    product = dict(
        name=product_name,
        description=product_description,
        safety_instructions=safety_instructions,
        configuration=configuration,
        operation=operation,
    )
    product_list.append(product)
    print(f"for {generation_idx+1}. generation, total product count: {len(product_list)}")

Example count: 1
Choosen examples for the prompt: [0]
prompt: 
        Generate an imaginary product description document for the imaginary manufacturing company called MANUT GmbH. 
        The description should be long (at least 4000 letters) and descriptive. It should describe one specific product, not a product family. 
        An example of a product description is given below:
        
        
            The SIRIUS 3RV motor starter protectors described here have been developed to carry out switching and protection functions as part of a plant or machine.
            SIRIUS 3RV2 motor starter protectors are available in the following versions:
            • Motor starter protectors, standard version (3RV20) Short-circuit and overload protection
            • Motor starter protectors with relay function (3RV21)
            Short-circuit protection and auto-RESET in the event of overload in one device
            • MSP for starter combinations (3RV23)
            Short-circuit pr

In [22]:
# Sanity check: show which fields were stored for the first generated product.
product_list[0].keys()

dict_keys(['name', 'description', 'safety_instructions', 'configuration', 'operation'])

## Question-Answer Dataset

In [24]:
# Prompt template for question/answer generation. It has five positional `{}`
# placeholders, in order: product name, description, safety instructions,
# configuration manual, operation instructions. The question/answer tags below
# must match the tags parsed in create_qa_dataset().
qa_prompt = """
I will provide you a description, safety_instructions, configuration, and operation documents for a manufacturing product. 
Generate a question and the correct and detailed answer for the given content and make sure questions will cover the important aspects of the product.

Product name: {}
Desctription: {}
Safety instructions: {}
Configuration manual: {}
Operation Instructions: {}

Provide the Question&Answer pairs in the following format:

<begin_question>
Question only
<end_question>
<begin_answer>
Answer only
<end_answer>

"""

In [25]:
def prompting_claude_3_sonnet(prompt, product_document):
    """Ask Claude 3 Sonnet for Q&A text about one product.

    Args:
        prompt: Template with five positional ``{}`` placeholders, filled in the
            order name, description, safety instructions, configuration,
            operation.
        product_document: Dict with the keys 'name', 'description',
            'safety_instructions', 'configuration' and 'operation'.

    Returns:
        A ``(text_content, product_name)`` tuple, where ``text_content`` is the
        text of the first content item in the model response.
    """
    product_name = product_document['name']
    document_fields = ('description', 'safety_instructions', 'configuration', 'operation')
    filled_prompt = prompt.format(
        product_name,
        *(product_document[field] for field in document_fields),
    )

    # Single-turn conversation carrying the filled prompt as one text item.
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": filled_prompt}],
    }
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4096,
        "messages": [user_message],
    }

    response = bedrock_runtime.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        contentType="application/json",
        accept="application/json",
        body=json.dumps(request_body),
    )
    payload = json.loads(response['body'].read())
    return payload['content'][0]['text'], product_name

In [26]:
def create_qa_dataset(prompt, products):
    """Generate question/answer pairs for every product.

    For each product the model is queried once through
    ``prompting_claude_3_sonnet`` and the ``<begin_question>``/``<end_question>``
    and ``<begin_answer>``/``<end_answer>`` sections are parsed out of the reply.

    Args:
        prompt: Q&A prompt template forwarded to ``prompting_claude_3_sonnet``.
        products: Iterable of product documents (dicts) to generate Q&A for.

    Returns:
        A list of dicts with the keys 'product_name', 'question' and 'answer'.
        Questions and answers are paired by position; if their counts differ,
        a warning is printed and the unpaired trailing items are skipped.
    """
    question_answer_list = []
    for product in products:
        qa_text, product_name = prompting_claude_3_sonnet(prompt, product)

        questions = [
            q.strip()
            for q in re.findall(r'<begin_question>(.*?)<end_question>', qa_text, re.DOTALL)
        ]
        # "answ?er" also accepts the misspelled opening tag "<begin_anser>",
        # which the model may echo if the prompt spells it that way.
        answers = [
            a.strip()
            for a in re.findall(r'<begin_answ?er>(.*?)<end_answer>', qa_text, re.DOTALL)
        ]

        if len(questions) != len(answers):
            # Indexing answers by question position used to raise IndexError here.
            print(
                f"WARNING: parsed {len(questions)} questions but {len(answers)} answers "
                f"for {product_name}; unpaired items are skipped."
            )

        for question, answer in zip(questions, answers):
            qa_pair = {
                'product_name': product_name,
                'question': question,
                'answer': answer
            }
            question_answer_list.append(qa_pair)
            print(f"INFO: Q&A pair is generated for {product_name}")

        # Random pause between model calls.
        random_sleep = random.uniform(90, 120)
        time.sleep(random_sleep)
    return question_answer_list

In [27]:
# Build the Q&A dataset: one model call per product in product_list.
question_answer_list = create_qa_dataset(qa_prompt, product_list)

INFO: Q&A pair is generated for QUANTUM PowerTrain X3000
INFO: Q&A pair is generated for QUANTUM PowerTrain X3000
INFO: Q&A pair is generated for QUANTUM PowerTrain X3000
INFO: Q&A pair is generated for QUANTUM PowerTrain X3000
INFO: Q&A pair is generated for MANUTRON X5-Series
INFO: Q&A pair is generated for MANUTRON X5-Series
INFO: Q&A pair is generated for MANUTRON X5-Series
INFO: Q&A pair is generated for MANUTRON X5-Series
INFO: Q&A pair is generated for MANUTRON X5-Series
INFO: Q&A pair is generated for MANUTRIX Servo-Drive SL8000
INFO: Q&A pair is generated for MANUTRIX Servo-Drive SL8000
INFO: Q&A pair is generated for MANUTRIX Servo-Drive SL8000
INFO: Q&A pair is generated for MANUTRIX Servo-Drive SL8000
INFO: Q&A pair is generated for MANUTRON X7 Precision Gearbox
INFO: Q&A pair is generated for MANUTRON X7 Precision Gearbox
INFO: Q&A pair is generated for MANUTRON X7 Precision Gearbox
INFO: Q&A pair is generated for MANUTRON X7 Precision Gearbox
INFO: Q&A pair is generated f

In [29]:
# Inspect the first generated question/answer record.
question_answer_list[0]

{'product_name': 'QUANTUM PowerTrain X3000',
 'question': 'What are the key components of the QUANTUM PowerTrain X3000, and how do they work together to deliver outstanding performance and efficiency?',
 'answer': "The QUANTUM PowerTrain X3000 combines an electric motor, a high-efficiency internal combustion engine, and a lithium-ion battery pack through MANUT's proprietary hybrid system. The electric motor provides instant torque and acceleration, while the optimized combustion engine ensures extended range and high-speed capabilities.\n\nThe intelligent energy management system continuously monitors driving conditions and dynamically adjusts the power distribution between the electric motor and combustion engine for optimal efficiency. The electric motor is powered by the high-capacity lithium-ion battery pack, delivering smooth and silent operation in urban environments.\n\nThe advanced internal combustion engine incorporates technologies like direct injection, variable valve timing