# Creating Pydantic Models from the BFCL dataset

Code based on this repository:
> Connesson, Rémi. (Apr 2024). Outlines Function Call Gorilla Leaderboard Experiment. GitHub. https://github.com/remiconnesson/outlines-func-call-gorilla-leaderboard-experiment/tree/main.

In [1]:
import json
import subprocess
from pathlib import Path

import pandas as pd

In [2]:
# Dataset categories processed by this notebook.
categories = [
    "simple",
    "java",
    "javascript",
]
# Absolute path of the directory that receives the generated model modules,
# resolved from the parent of the current working directory.
pydantic_models_dir = Path("..").resolve().joinpath("data/processed", "pydantic_models")

In [3]:
# Code copied from :
# https://github.com/ShishirPatil/gorilla/blob/1d8d51d0d091c33e730d38745745005c9bc7dfc0/berkeley-function-call-leaderboard/model_handler/constant.py#L13
# https://github.com/ShishirPatil/gorilla/blob/1d8d51d0d091c33e730d38745745005c9bc7dfc0/berkeley-function-call-leaderboard/model_handler/utils.py#L9

GORILLA_TO_OPENAPI = {
    "integer": "integer",
    "number": "number",
    "float": "number",
    "string": "string",
    "boolean": "boolean",
    "bool": "boolean",
    "array": "array",
    "list": "array",
    "dict": "object",
    "object": "object",
    "tuple": "array",
    "any": "string",
    "byte": "integer",
    "short": "integer",
    "long": "integer",
    "double": "number",
    "char": "string",
    "ArrayList": "array",
    "Array": "array",
    "HashMap": "object",
    "Hashtable": "object",
    "Queue": "array",
    "Stack": "array",
    "Any": "string",
    "String": "string",
    "Bigint": "integer",
}

def _cast_to_openai_type(properties, mapping, test_category):
    for key, value in properties.items():
        if "type" not in value:
            properties[key]["type"] = "string"
        else:
            var_type = value["type"]
            if mapping == GORILLA_TO_OPENAPI and var_type == "float":
                properties[key]["format"] = "float"
                properties[key]["description"] += " This is a float type value."
            if var_type in mapping:
                properties[key]["type"] = mapping[var_type]
            else:
                properties[key]["type"] = "string"

        # Currently support:
        # - list of any
        # - list of list of any
        # - list of dict
        # - list of list of dict
        # - dict of any

        if properties[key]["type"] == "array" or properties[key]["type"] == "object":
            if "properties" in properties[key]:
                properties[key]["properties"] = _cast_to_openai_type(
                    properties[key]["properties"], mapping, test_category
                )
            elif "items" in properties[key]:
                properties[key]["items"]["type"] = mapping[
                    properties[key]["items"]["type"]
                ]
                if (
                    properties[key]["items"]["type"] == "array"
                    and "items" in properties[key]["items"]
                ):
                    properties[key]["items"]["items"]["type"] = mapping[
                        properties[key]["items"]["items"]["type"]
                    ]
                elif (
                    properties[key]["items"]["type"] == "object"
                    and "properties" in properties[key]["items"]
                ):
                    properties[key]["items"]["properties"] = _cast_to_openai_type(
                        properties[key]["items"]["properties"], mapping, test_category
                    )
    return properties

In [4]:
# For every dataset entry: build a JSON schema from its function spec, run
# datamodel-codegen on it to produce a Pydantic model module, and record which
# model file belongs to which question.
result_columns = ['iterator', 'category', 'question', 'pydantic_model_file']
result_rows = []

# Make sure the output directory exists before the generator writes into it.
pydantic_models_dir.mkdir(parents=True, exist_ok=True)

for category in categories:
    df = pd.read_json(f'../data/raw/gorilla_openfunctions_v1_test_{category}.json', lines=True)

    for i, row in df.iterrows():
        function = row["function"]
        parameters = function["parameters"]

        schema = json.dumps({
            "title": function["name"],
            "type": "object",
            "description": function["description"],
            "properties": _cast_to_openai_type(parameters["properties"], GORILLA_TO_OPENAPI, category),
            # Treat a spec without a "required" list as having no required parameters.
            "required": parameters.get("required", []),
        }, indent=2)

        with open("json_schema.json", "w", encoding="utf-8") as f:
            f.write(schema)

        # The row index restarts at 0 for each category, so the category has to
        # be part of the file name; otherwise later categories overwrite the
        # model files of earlier ones.
        model_file = pydantic_models_dir / f"model_{category}_{i:03d}.py"

        # An argument list (no shell) keeps paths containing spaces intact.
        completed = subprocess.run(
            [
                "datamodel-codegen",
                "--input", "json_schema.json",
                "--input-file-type", "jsonschema",
                "--output", str(model_file),
            ],
            capture_output=True,
            text=True,
        )
        if completed.returncode != 0:
            # Keep going, but make the failure visible instead of ignoring it.
            print(f"datamodel-codegen failed for {category} entry {i}:\n{completed.stderr}")

        # Collect plain rows and build the DataFrame once after the loop,
        # rather than concatenating a one-row frame on every iteration.
        result_rows.append({
            'iterator': i,
            'category': category,
            'question': row["question"],
            'pydantic_model_file': str(model_file),
        })

df_results = pd.DataFrame(result_rows, columns=result_columns)




    iterator    category                                           question  \
0          0      simple  Find the area of a triangle with a base of 10 ...   
1          1      simple  Calculate the factorial of 5 using math functi...   
2          2      simple  Calculate the hypotenuse of a right triangle g...   
3          3      simple  Find the roots of a quadratic equation with co...   
4          4      simple  Solve a quadratic equation where a=2, b=6, and...   
..       ...         ...                                                ...   
545       45  javascript  How can I asynchronously retrieve a map of rem...   
546       46  javascript  How can I update the property 'version' of an ...   
547       47  javascript  How can I calculate the difference in days bet...   
548       48  javascript  How can I update the DOM event listeners from ...   
549       49  javascript  How can I determine the appropriate boolean st...   

                                   pydantic_model_f

In [5]:
# Write the collected results to a CSV file, leaving out the DataFrame index.

df_results.to_csv(path_or_buf='../data/processed/pydantic_models.csv', index=False)