In [1]:
import io
import json
import os
from pprint import pp

import pandas as pd
from termcolor import colored

# Parse Result

In [2]:
def get_directories(path):
    """Return the names of the immediate sub-directories of ``path``.

    Only entry names are returned (not full paths), in the order in which
    ``os.listdir`` yields them. Entries that are not directories are left out.
    """
    subdirs = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirs.append(entry)
    return subdirs

In [3]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the file to read. Every non-blank line must hold one
            JSON document.

    Returns:
        list: The decoded value of each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Decode explicitly as UTF-8: the platform default encoding may differ and
    # would garble or reject non-ASCII text in the records.
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line:
                continue
            records.append(json.loads(line))
    return records

In [4]:
def get_model_dir(model_dirs, gen_mode, model_name_escaped, dedup_str, n_tool_calls):
    """Pick the first directory name that contains every requested run identifier.

    Matching is done by substring, so two guards exclude names that would
    otherwise match by accident:

    * ``gen_mode == "structured"`` never selects a name containing
      ``"unstructured"``.
    * an empty ``dedup_str`` (no dedup requested) never selects a name
      containing ``"dedup"``; without this guard ``"" in name`` is always true
      and a dedup run could be returned for a non-dedup request.

    Args:
        model_dirs: Iterable of candidate directory names.
        gen_mode: Generation-mode token that must occur in the name.
        model_name_escaped: Model-name token that must occur in the name.
        dedup_str: Dedup token that must occur in the name, or ``""``.
        n_tool_calls: Tool-call-count token that must occur in the name;
            converted with ``str`` so numeric values are accepted.

    Returns:
        The first matching directory name, or ``None`` when nothing matches.
    """
    required_tokens = [str(token) for token in (gen_mode, model_name_escaped, dedup_str, n_tool_calls)]

    for model_dir in model_dirs:
        if gen_mode == "structured" and "unstructured" in model_dir:
            continue

        # Same pitfall as above: the empty string is a substring of everything.
        if not dedup_str and "dedup" in model_dir:
            continue

        if all(token in model_dir for token in required_tokens):
            return model_dir
    return None

In [5]:
def index_it(idx, questions, solutions, generations, scores, idx_type="error"):
    """Select one example and its score information.

    The first record of ``scores`` is never treated as an example entry
    (presumably a summary row -- confirm against the score files). Every later
    record carries an ``"id"`` that is the 1-based position of the example in
    ``questions`` / ``solutions`` / ``generations``.

    Args:
        idx: 0-based index; its meaning depends on ``idx_type``.
        questions: Sequence of question records.
        solutions: Sequence of solution records, parallel to ``questions``.
        generations: Sequence of generation records, parallel to ``questions``.
        scores: Sequence of score records as described above.
        idx_type: ``"absolute"`` to index the examples directly, or
            ``"error"`` to pick the ``idx``-th record after the first one in
            ``scores`` and resolve the example through its ``"id"``.

    Returns:
        tuple: ``(question, solution, generation, score)``. In ``"error"``
        mode ``score`` is the selected score record; in ``"absolute"`` mode it
        is the (possibly empty) list of score records whose id matches the
        example.

    Raises:
        ValueError: If ``idx_type`` is unknown, or if ``idx`` is past the last
            error record in ``"error"`` mode.
    """
    if idx_type == "absolute":
        question = questions[idx]
        solution = solutions[idx]
        generation = generations[idx]
        # Ids are 1-based (the error branch maps id -> id - 1), so the example
        # at position idx is recorded under id idx + 1.
        score = [entry for entry in scores[1:] if entry["id"] == idx + 1]
    elif idx_type == "error":
        # Valid error indices are 0 .. len(scores) - 2 because scores[0] is skipped.
        if idx + 1 >= len(scores):
            raise ValueError(f"There are {max(len(scores) - 1, 0)} errors in this file. But you requested error {idx} (0-index).")
        score = scores[idx + 1]
        abs_idx = score["id"] - 1
        question = questions[abs_idx]
        solution = solutions[abs_idx]
        generation = generations[abs_idx]
    else:
        raise ValueError("Wrong idx_type.")

    return question, solution, generation, score

In [6]:
def extract_values(question, solution, generation, score):
    """Flatten one example's question, solution, generation and score into a dict.

    Args:
        question: Mapping with keys ``"question"`` and ``"function"``.
        solution: Expected answer for the example; stored unchanged.
        generation: Mapping with keys ``"result"``, ``"messages"``,
            ``"tool_calls"`` and ``"n_tool_calls"``.
        score: Either a score mapping with keys ``"valid"``, ``"error"``,
            ``"error_type"`` and ``"model_result_decoded"``, or a list of such
            mappings as returned by ``index_it(..., idx_type="absolute")``.
            From a list the first entry is used; an empty list is reported as
            a valid example with no error details.

    Returns:
        dict: Keys ``user_query``, ``tools``, ``valid``, ``tool_calls``,
        ``solution``, ``error``, ``error_type``, ``raw_result``,
        ``decoded_result``, ``messages`` and ``n_tool_calls``.
    """
    if isinstance(score, list):
        if score:
            score = score[0]
        else:
            # Assumes the score file only lists failing examples (index_it's
            # error message counts its entries as errors), so "no entry" means
            # the example passed -- TODO confirm against the score writer.
            score = {
                "valid": True,
                "error": None,
                "error_type": None,
                "model_result_decoded": None,
            }

    # A single tool description is normalised to a one-element list.
    tools = question["function"]
    if not isinstance(tools, list):
        tools = [tools]

    return {
        "user_query": question["question"],
        "tools": tools,
        "valid": score["valid"],
        "tool_calls": generation["tool_calls"],
        "solution": solution,
        "error": score["error"],
        "error_type": score["error_type"],
        "raw_result": generation["result"],
        "decoded_result": score["model_result_decoded"],
        "messages": generation["messages"],
        "n_tool_calls": generation["n_tool_calls"],
    }

In [16]:
def spec_to_result(spec, idx, idx_type):
    """Load one benchmark example described by ``spec`` and flatten it into a dict.

    Questions and possible answers are read from ``./data`` and
    ``./data/possible_answer`` (relative to the working directory). Generations
    and scores are read from the sub-directory of ``spec["out_dir"]`` chosen by
    ``get_model_dir``.

    Args:
        spec: Mapping with keys ``"bfcl_category"``, ``"model_name"``,
            ``"dedup"``, ``"out_dir"``, ``"gen_mode"`` and ``"n_tool_calls"``.
        idx: Index of the example, forwarded to ``index_it``.
        idx_type: Index interpretation, forwarded to ``index_it``.

    Returns:
        dict: ``spec`` merged with the output of ``extract_values`` plus the
        keys ``"idx"`` and ``"idx_type"``.

    Raises:
        FileNotFoundError: If no directory under ``out_dir`` matches the spec.
    """
    # Unpack spec
    bfcl_category = spec["bfcl_category"]
    model_name = spec["model_name"]
    dedup = spec["dedup"]
    out_dir = spec["out_dir"]
    gen_mode = spec["gen_mode"]
    n_tool_calls = spec["n_tool_calls"]

    # Get paths
    test_file = f"gorilla_openfunctions_v1_test_{bfcl_category}.json"
    questions_path = os.path.join("./data", test_file)
    solutions_path = os.path.join("./data/possible_answer", test_file)

    # Slashes in the model name are replaced to match the directory names.
    model_name_escaped = model_name.replace("/", "_")
    dedup_str = "dedup" if dedup else ""
    model_dirs = get_directories(out_dir)
    model_dir = get_model_dir(model_dirs, gen_mode, model_name_escaped, dedup_str, n_tool_calls)
    if model_dir is None:
        # Fail with an actionable message instead of the TypeError that
        # os.path.join would raise on a None path component.
        raise FileNotFoundError(
            f"No directory in {out_dir!r} matches gen_mode={gen_mode!r}, "
            f"model={model_name_escaped!r}, dedup={bool(dedup)}, n_tool_calls={n_tool_calls!r}. "
            f"Available directories: {model_dirs}"
        )
    generations_path = os.path.join(out_dir, model_dir, "generations", test_file)
    scores_path = os.path.join(out_dir, model_dir, "scores", bfcl_category + "_score.json")

    # Load all files
    questions = load_jsonl(questions_path)
    solutions = load_jsonl(solutions_path)
    generations = load_jsonl(generations_path)
    scores = load_jsonl(scores_path)

    # Index into the exact question, solution, generation, score
    question, solution, generation, score = index_it(idx, questions, solutions, generations, scores, idx_type=idx_type)
    values = extract_values(question, solution, generation, score)

    result = spec | values | {"idx": idx, "idx_type": idx_type}
    return result

# Pretty Print

In [14]:
def pretty_print_conversation(messages):
    """Print a chat transcript, one colored entry per message.

    Each message is a mapping with a ``"role"`` key. System and user messages
    print their ``"content"``. Assistant messages print their
    ``"function_call"`` when it is present and truthy, otherwise their
    ``"content"``. Function messages print ``"name"`` and ``"content"``.
    Messages with any other role are skipped.
    """
    palette = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "function": "magenta",
    }

    for msg in messages:
        role = msg["role"]

        if role == "system":
            line = f"system: {msg['content']}\n"
        elif role == "user":
            line = f"user: {msg['content']}\n"
        elif role == "assistant":
            if msg.get("function_call"):
                line = f"assistant: {msg['function_call']}\n"
            else:
                line = f"assistant: {msg['content']}\n"
        elif role == "function":
            line = f"function ({msg['name']}): {msg['content']}\n"
        else:
            # Unknown roles produce no output.
            continue

        print(colored(line, palette[role]))

In [15]:
def pp_result(result, width=200, verbose=0):
    """Print a human-readable report for one result dict.

    Args:
        result: Mapping as produced by ``spec_to_result``. Keys read here:
            ``idx``, ``bfcl_category``, ``model_name``, ``gen_mode``,
            ``n_tool_calls``, ``dedup`` (optional, treated as false when
            absent), ``user_query``, ``valid``, ``error``, ``error_type``,
            ``tool_calls``, ``solution`` (a mapping of tool name to
            arguments), ``raw_result``; with ``verbose >= 1`` also
            ``decoded_result`` and ``tools``; with ``verbose >= 2`` also
            ``messages``.
        width: Line width passed to ``pprint.pp`` for the tool descriptions.
        verbose: ``0`` prints the summary, ``1`` adds the decoded result and
            the available tools, ``2`` adds the full conversation.
    """
    rule = "-" * 120

    print(rule)
    # Take the flag from the result being printed, not from a notebook-level
    # global, so the header always describes this result.
    dedup_str = "-dedup" if result.get("dedup", False) else ""
    print(f"Example {result['idx']} of {result['bfcl_category']} BFCL\t\t\t\t\t({result['model_name']}-{result['gen_mode']} with {result['n_tool_calls']}{dedup_str})")
    print(rule, end="\n\n")

    print(colored(f"User Query:\n{result['user_query']}", "yellow"), end="\n\n")

    print(colored(f"Valid: {result['valid']}", "blue"), end="\n\n")

    print(colored(f"Error:\n{result['error']}\n{result['error_type']}", "blue"), end="\n\n")

    print(colored("My Tool Calls:", "red"))
    for tool_call in result['tool_calls']:
        print(colored(tool_call, "red"))
    print()

    print(colored("Correct Tool Calls:", "green"))
    for tool_name, tool_args in result['solution'].items():
        print(colored(f"{{'tool_name': '{tool_name}', 'tool_arguments': {tool_args}}}", "green"))
    print()

    print(colored(f"Raw Result:\n{result['raw_result']}", "yellow"), end="\n\n")

    if verbose >= 1:

        print(colored(f"Decoded Result:\n{result['decoded_result']}", "blue"), end="\n\n")

        print("Given Tool Calls:")
        for tool in result["tools"]:
            pp(tool, width=width)
        print()

    if verbose >= 2:
        print("Messages:")
        pretty_print_conversation(result['messages'])

# Run it

In [18]:
# Inputs: which benchmark run to inspect.
out_dir = "./outputs"
model_name = "databricks/dbrx-instruct"
bfcl_category = "parallel_function"
gen_mode = "meta_tool"
n_tool_calls = "auto"
dedup = False

# Bundle the settings into the mapping consumed by spec_to_result.
spec = dict(
    bfcl_category=bfcl_category,
    gen_mode=gen_mode,
    n_tool_calls=n_tool_calls,
    model_name=model_name,
    dedup=dedup,
    out_dir=out_dir,
)

In [19]:
# Show the first entry (0-based) among the recorded errors of the configured run.
idx = 0
result = spec_to_result(spec, idx=idx, idx_type="error")
pp_result(result=result)

------------------------------------------------------------------------------------------------------------------------
Example 0 of parallel_function BFCL					(databricks/dbrx-instruct-meta_tool with auto)
------------------------------------------------------------------------------------------------------------------------

[33mUser Query:
Calculate the resistance of a wire with a length of 5m and cross sectional area 0.01m² with resistivity of copper and aluminum[0m

[34mValid: False[0m

[34mError:
['Wrong number of functions.']
parallel_function_checker_no_order:wrong_count[0m

[31mMy Tool Calls:[0m
[31m{'tool_name': 'calculate_resistance', 'tool_arguments': {'length': 5, 'area': 0.01, 'resistivity': 'copper'}}[0m

[32mCorrect Tool Calls:[0m
[32m{'tool_name': 'calculate_resistance_1', 'tool_arguments': {'length': [5], 'area': [0.01], 'resistivity': ['copper']}}[0m
[32m{'tool_name': 'calculate_resistance_2', 'tool_arguments': {'length': [5], 'area': [0.01], 'resisti