In [70]:
import os
import json
import openai
import tiktoken
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
from datetime import datetime
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)
from dotenv import find_dotenv, load_dotenv

In [2]:
load_dotenv(find_dotenv())

True

In [3]:
model_id = "gpt-3.5-turbo-1106"
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
tokenizer = tiktoken.encoding_for_model(model_id)

In [6]:
prompt_updated = """
# TASK DESCRIPTION {rand_num}
Create a dataset for LLM fine-tuning consisting of user queries about industrial robotic operations and their corresponding JSON function calls. Focus on generating queries with varying complexity levels, from simple to advanced, and provide examples in real-world scenarios.

# JSON PARAMETER VALUES
- query: User-generated query, varying in complexity (contains different number of functions in query).
- functions: Array of function calls derived from the query, identified using separators like "and", "then", ",", etc.
- function_name: Specific robot function ("move_tcp", "move_joint", "get_joint_values").
- input_name: Name of the function's input parameter, specifying the type (integer, float, string).
- input_value: Value of the input parameter, adhering to the specified type.

# FUNCTIONS EXPLAINED WITH EXAMPLES
- move_tcp: Moves the robot's TCP. 
    Inputs: x, y, z (position; type: float), q1, q2, q3, q4 (quaternion values; type: float), unit (correlates to x, y, z; type: str, ["m", "cm", "mm"]). 
    Example: "Move TCP to coordinates (0.5, 0.3, 0.7) m" translates to x: 0.5, y: 0.3, z: 0.7; unit: m
- move_joint: Rotates/moves a robot joint. 
    Inputs: joint (index number; type: array of integers; note: joints go from 0 to n-1), angle (degrees or radians, type: array of floats), unit (correlates to angle; type: str, ["rad", "deg"]). 
    Example: "Rotate joint 2 by 45 degrees" results in joint: [2], angle: [45], unit: deg.
- get_joint_values: Retrieves robot joint statuses, no input parameters required. 
    Example: "Get the current status of robot joints."

# GUIDANCE
- Focus on verbs and technical terms for function selection.
- Ensure queries are deterministic and precise.
- Queries should vary in complexity, from direct instructions to those requiring contextual understanding.
- Real-World Scenarios: Frame queries in practical industrial settings.

# RESPONSE FORMAT
- Include only necessary functions as per the query.
- Maintain the sequence of functions as in the query.
- Specify units only when required, omit default values.

# JSON FORMAT
{{
    "query": "<generated_query>",
    "functions": [
        {{
            "function_name": "<name>",
            "inputs": [
                {{"name": "<name>", "value": "<value>", "unit": "<unit>"}},
                {{"name": "<name>", "value": "<value>", "unit": "<unit>"}}
            ]
        }},
        {{
            "function_name": "<name>",
            "inputs": [{{"name": "<name>", "value": "<value>", "unit": "<unit>"}}]
        }}
    ]
}}

# GENERATED EXAMPLES:
    NOTE: DO NOT copy these examples, only use them as a reference!
    {{
        'query': 'Move robot tcp along x and y for 1000mm',
        'functions': [{{'function_name': 'move_tcp',
            'inputs': [
                {{'name': 'x', 'value': 1000.0, 'unit': 'mm'}},
                {{'name': 'y', 'value': 1000.0, 'unit': 'mm'}}
                ]
            }}
        ]
    }}
    {{
        'query': 'Move robot sixth joint for -30 degrees',
        'functions': [{{'function_name': 'move_joint',
            'inputs': [
                {{'name': 'joint', 'value': [5], 'unit': None}},
                {{'name': 'angle', 'value': [-30.0], 'unit': 'deg'}}
                ]
            }}
        ]
    }}
    {{
        'query': 'Give me robot joints info',
        'functions': [{{'function_name': 'get_joint_values',
            'inputs': []
            }}
        ]
    }}
    {{
        'query': 'Rotate robot base for 45 and move TCP along x axis for 50 milimeters.',
        'functions': [
            {{'function_name': 'move_joint',
                'inputs': [
                    {{'name': 'joint', 'value': [0], 'unit': None}},
                    {{'name': 'angle', 'value': [45.0], 'unit': 'deg'}}
                ]
            }},
            {{'function_name': 'move_tcp',
                'inputs': [{{'name': 'x', 'value': 50.0, 'unit': 'mm'}}]
            }}
        ]
    }}
    {{
        'query': 'I want you to rotate joint 2 for 30 and joint 7 for 45 degrees then joint 3 for pi/4',
        'functions': [
            {{'function_name': 'move_joint',
                'inputs': [
                    {{'name': 'joint', 'value': [2, 7], 'unit': None}},
                    {{'name': 'angle', 'value': [30.0, 45.0], 'unit': 'deg'}}
                    ]
            }},
            {{'function_name': 'move_joint',
                'inputs': [
                    {{'name': 'joint', 'value': [3], 'unit': None}},
                    {{'name': 'angle', 'value': [0.785398], 'unit': 'rad'}}
                    ]
            }}
        ]
    }}
"""

In [126]:
prompt_json = """
# TASK DESCRIPTION
Create a dataset for LLM fine-tuning from user queries about industrial robotic operations by formatting them to corresponding JSON function calls.

# JSON PARAMETER VALUES
- functions: Array of function calls derived from the query, identified using separators like "and", "then", ",", etc.
- function_name: Specific robot function ("move_tcp", "move_joint", "get_joint_values").
- input_name: Name of the function's input parameter, specifying the type (integer, float, string).
- input_value: Value of the input parameter, adhering to the specified type.

# FUNCTIONS EXPLAINED WITH EXAMPLES
- move_tcp: Moves the robot's TCP. 
    Inputs: x, y, z (position; type: float), q1, q2, q3, q4 (quaternion values; type: float), unit (correlates to x, y, z; type: str, ["m", "cm", "mm"]). 
    Example: "Move TCP to coordinates (0.5, 0.3, 0.7) m" translates to x: 0.5, y: 0.3, z: 0.7; unit: m
- move_joint: Rotates/moves a robot joint. 
    Inputs: joint (index number; type: array of integers; note: joints go from 0 to n-1), angle (degrees or radians, type: array of floats), unit (correlates to angle; type: str, ["rad", "deg"]). 
    Example: "Rotate joint 2 by 45 degrees" results in joint: [2], angle: [45], unit: deg.
- get_joint_values: Retrieves robot joint statuses, no input parameters required. 
    Example: "Get the current status of robot joints."

# GUIDANCE
- Focus on verbs and technical terms for function selection.
- Include only the necessary functions directly implied by the query.
- Maintain the sequence of functions as in the query.
- Include only necessary functions as per the query.
- Specify units only when required, omit default values.

# RESPONSE JSON FORMAT
{{
    "query": "<generated_query>",
    "functions": [
        {{
            "function_name": "<name>",
            "inputs": [
                {{"name": "<name>", "value": "<value>", "unit": "<unit>"}},
                {{"name": "<name>", "value": "<value>", "unit": "<unit>"}}
            ]
        }},
        {{
            "function_name": "<name>",
            "inputs": [{{"name": "<name>", "value": "<value>", "unit": "<unit>"}}]
        }}
    ]
}}

USER QUERY: Move robot tcp along x and y for 1000mm
RESPONSE:
{{
    'functions': [{{'function_name': 'move_tcp',
        'inputs': [
            {{'name': 'x', 'value': 1000.0, 'unit': 'mm'}},
            {{'name': 'y', 'value': 1000.0, 'unit': 'mm'}}
            ]
        }}
    ]
}}

USER QUERY: Move robot sixth joint for -30 degrees
RESPONSE:
{{
    'functions': [{{'function_name': 'move_joint',
        'inputs': [
            {{'name': 'joint', 'value': [5], 'unit': None}},
            {{'name': 'angle', 'value': [-30.0], 'unit': 'deg'}}
            ]
        }}
    ]
}}

USER QUERY: Give me robot joints info
RESPONSE:
{{
    'functions': [{{'function_name': 'get_joint_values',
        'inputs': []
        }}
    ]
}}

USER QUERY: Rotate robot base for 45 and move TCP along x axis for 50 milimeters.
RESPONSE:
{{
    'functions': [
        {{'function_name': 'move_joint',
            'inputs': [
                {{'name': 'joint', 'value': [0], 'unit': None}},
                {{'name': 'angle', 'value': [45.0], 'unit': 'deg'}}
            ]
        }},
        {{'function_name': 'move_tcp',
            'inputs': [{{'name': 'x', 'value': 50.0, 'unit': 'mm'}}]
        }}
    ]
}}

USER QUERY: I want you to rotate joint 2 for 30 and joint 7 for 45 degrees then joint 3 for pi/4
RESPONSE:
{{
    'functions': [
        {{'function_name': 'move_joint',
            'inputs': [
                {{'name': 'joint', 'value': [2, 7], 'unit': None}},
                {{'name': 'angle', 'value': [30.0, 45.0], 'unit': 'deg'}}
                ]
        }},
        {{'function_name': 'move_joint',
            'inputs': [
                {{'name': 'joint', 'value': [3], 'unit': None}},
                {{'name': 'angle', 'value': [0.785398], 'unit': 'rad'}}
                ]
        }}
    ]
}}

USER QUERY: {user_query}
RESPONSE:
"""

In [119]:
prompt_tokenized = tokenizer.encode(prompt_json)
len(prompt_tokenized)

1097

In [132]:
queries[0]

'Move robot TCP to coordinates (0.5, 0.3, 0.7) in meters'

In [133]:
messages = [
    {
        "role": "user",
        "content": prompt_json.format(user_query=queries[0]),
    }
]
messages

[{'role': 'user',
  'content': '\n# TASK DESCRIPTION\nCreate a dataset for LLM fine-tuning from user queries about industrial robotic operations by formatting them to corresponding JSON function calls.\n\n# JSON PARAMETER VALUES\n- functions: Array of function calls derived from the query, identified using separators like "and", "then", ",", etc.\n- function_name: Specific robot function ("move_tcp", "move_joint", "get_joint_values").\n- input_name: Name of the function\'s input parameter, specifying the type (integer, float, string).\n- input_value: Value of the input parameter, adhering to the specified type.\n\n# FUNCTIONS EXPLAINED WITH EXAMPLES\n- move_tcp: Moves the robot\'s TCP. \n    Inputs: x, y, z (position; type: float), q1, q2, q3, q4 (quaternion values; type: float), unit (correlates to x, y, z; type: str, ["m", "cm", "mm"]). \n    Example: "Move TCP to coordinates (0.5, 0.3, 0.7) m" translates to x: 0.5, y: 0.3, z: 0.7; unit: m\n- move_joint: Rotates/moves a robot joint. 

In [134]:
response = client.chat.completions.create(
    model=model_id,
    temperature=0,
    response_format={"type": "json_object"},
    frequency_penalty=None,
    messages=messages,
)

In [135]:
formatted_data = json.loads(response.choices[0].message.content)
formatted_data

{'functions': [{'function_name': 'move_tcp',
   'inputs': [{'name': 'x', 'value': 0.5, 'unit': 'm'},
    {'name': 'y', 'value': 0.3, 'unit': 'm'},
    {'name': 'z', 'value': 0.7, 'unit': 'm'}]}]}

In [68]:
prompt_queries = """
# TASK DESCRIPTION {rand_num}
Create a dataset for LLM fine-tuning consisting of user queries about industrial robotic operations and their corresponding JSON function calls. Focus on generating queries with varying complexity levels, from simple to advanced, and provide examples in real-world scenarios.

# FUNCTIONS EXPLAINED WITH EXAMPLES
- move_tcp: Moves the robot's TCP. 
    Inputs: x, y, z (position; type: float), q1, q2, q3, q4 (quaternion values; type: float), unit (correlates to x, y, z; type: str, ["m", "cm", "mm"]). 
    Example: "Move TCP to coordinates (0.5, 0.3, 0.7) m" translates to x: 0.5, y: 0.3, z: 0.7; unit: m
- move_joint: Rotates/moves a robot joint. 
    Inputs: joint (index number; type: array of integers; note: joints go from 0 to n-1), angle (degrees or radians, type: array of floats), unit (correlates to angle; type: str, ["rad", "deg"]). 
    Example: "Rotate joint 2 by 45 degrees" results in joint: [2], angle: [45], unit: deg.
- get_joint_values: Retrieves robot joint statuses, no input parameters required. 
    Example: "Get the current status of robot joints."

# GUIDANCE
- Focus on verbs and technical terms for function selection.
- Ensure queries are deterministic and precise.
- Queries should vary in complexity, from direct instructions to those requiring contextual understanding.
- Real-World Scenarios: Frame queries in practical industrial settings.

# RESPONSE FORMAT
- Return format is JSON.
- Only one key: "queries"
- Values are list with strings that are generated queries.
- Generate {num_examples_per_prompt} unique queries.
- Query examples:
    - Move robot tcp along x and y for 1000mm
    - Move robot sixth joint for -30 degrees
    - Give me robot joints info
    - Rotate robot base for 45 and move TCP along x axis for 50 milimeters.
    - I want you to rotate joint 2 for 30 and joint 7 for 45 degrees then joint 3 for pi/4
    - 
- Output format example:
    {{
        "queries" : [<generated_query>, <generated_query>, ..., <generated_query>]
    }}

"""

In [16]:
prompt_tokenized = tokenizer.encode(prompt_queries)
len(prompt_tokenized)

522

In [136]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def generate_data(
    temperature: float, messages: list, frequency_penalty: float = None
) -> dict:
    response = client.chat.completions.create(
        model=model_id,
        temperature=temperature,
        response_format={"type": "json_object"},
        frequency_penalty=frequency_penalty,
        messages=messages,
    )
    return json.loads(response.choices[0].message.content)

In [122]:
def save_data(output_path: Path, queries: list) -> None:
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(queries, file, indent=4, ensure_ascii=False)

### Generate queries

In [106]:
current_dt = datetime.now().strftime("%Y-%b-%d_%H-%M-%S")
output_path = Path(f"./generated-queries_{current_dt}.json")

num_examples = 1_000
num_examples_per_prompt = 20
iterations = int(num_examples / num_examples_per_prompt)
check_point = 5

queries = []
for iter in tqdm(range(iterations), total=iterations):
    try:
        messages = [
            {
                "role": "user",
                "content": prompt_queries.format(
                    rand_num=np.random.rand().__round__(5),
                    num_examples_per_prompt=num_examples_per_prompt,
                ),
            },
        ]
        result = generate_data(
            temperature=1,
            frequency_penalty=round(np.random.rand() * 0.4, 4),
            messages=messages,
        )

        queries.extend(
            [query for query in result["queries"] if len(query)]
        )  # Check for empty string
    except json.JSONDecodeError as e:
        print(f"ERROR happend while parsing response: {e}")
    if (iter + 1) % check_point == 0:
        save_data(output_path, queries)

save_data(output_path, queries)
print(f"Number of generated queries: {len(queries)}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [141]:
len(queries)

986

### Generate JSONs

In [140]:
current_dt = datetime.now().strftime("%Y-%b-%d_%H-%M-%S")
output_path = Path(f"./formatted-queries_{current_dt}.json")

check_point = 5

formatted_queries = []
for index, query in tqdm(enumerate(queries), total=len(queries)):
    try:
        messages = [
            {
                "role": "user",
                "content": prompt_json.format(user_query=query),
            }
        ]
        result = generate_data(
            temperature=0,
            messages=messages,
            frequency_penalty=None,
        )
        formatted_queries.append((query, result))

    except json.JSONDecodeError as e:
        print(f"ERROR happend while parsing response: {e}")

    if (index + 1) % check_point == 0:
        save_data(output_path, formatted_queries)

save_data(output_path, formatted_queries)
print(f"Number of formatted queries: {len(formatted_queries)}")

  0%|          | 0/986 [00:00<?, ?it/s]

Number of formatted queries: 986


### Create JSONL file for parallel request

In [11]:
filename = Path("./example_requests_to_parallel_process.jsonl")
n_requests = 10  # 10_000
jobs = [
    {
        "model": model_id,
        "response_format": {"type": "json_object"},
        "temperature": 1,
        "messages": [
            {
                "role": "user",
                "content": prompt_updated.format(
                    rand_num=np.random.rand().__round__(5)
                ),
            },
        ],
    }
    for _ in range(n_requests)
]
with open(filename, "w") as f:
    for job in jobs:
        json_string = json.dumps(job)
        f.write(json_string + "\n")

#### Command to call script:

python ./DATA/api_request_parallel_processor.py \
  --requests_filepath ./DATA/example_requests_to_parallel_process.jsonl \
  --save_filepath ./DATA/example_requests_to_parallel_process_results.jsonl \
  --request_url https://api.openai.com/v1/embeddings \
  --max_requests_per_minute 500 \
  --max_tokens_per_minute 1000000 \
  --token_encoding_name cl100k_base \
  --max_attempts 5 \
  --logging_level 20