In [None]:
import warnings
import requests
import json
import pandas as pd
from tqdm import tqdm
from openai import OpenAI


# Suppress every warning for the rest of the session (no category or module
# filter is given, so nothing is shown).
warnings.filterwarnings("ignore")

# Data

In [None]:
# CSV files that are loaded with pandas.read_csv into train_df / test_df.
TRAIN_DATA_FILEPATH = "../resources/train_df.csv"
TEST_DATA_FILEPATH = "../resources/test_df.csv"
# Text file whose content is passed to OpenAI(api_key=...).
OPENAI_FILEPATH = "../openai_key.txt"
# Model name sent with every client.responses.create request.
LLM = "gpt-4o-mini"

In [None]:
# Read both splits from the CSV paths configured above.
train_df = pd.read_csv(TRAIN_DATA_FILEPATH)
test_df = pd.read_csv(TEST_DATA_FILEPATH)

# Utils

In [None]:
def ask_gpt_for_category(client: OpenAI, prompt: str, categories: list[str]) -> dict[str, object]:
    """Ask the model to pick one of ``categories`` for the product described in ``prompt``.

    The request exposes a single function tool named ``result`` and forces the
    model to call it, so the answer always comes back as structured arguments
    rather than free text.

    Args:
        client: OpenAI client used to send the request.
        prompt: Fully formatted prompt, sent as a single user message.
        categories: Allowed values for the ``category`` field of the answer.

    Returns:
        The parsed tool-call arguments, with the keys ``confidence`` (number),
        ``explanation`` (string) and ``category`` (string).

    Raises:
        ValueError: If the response contains no function call.
    """
    response = client.responses.create(
        model=LLM,
        input=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        max_tool_calls=1,
        # Without this the model is free to reply with a plain message, in which
        # case there would be no tool arguments to parse.
        tool_choice={"type": "function", "name": "result"},
        tools=[{
            "name": "result",
            "type": "function",
            "description": "Use this tool to format answer",
            "parameters": {
                "type": "object",
                "properties": {
                    "confidence": {"type": "number", "minimum": 0, "maximum": 100},
                    "explanation": {"type": "string"},
                    "category": {"type": "string", "enum": categories},
                },
                "required": ["confidence", "explanation", "category"],
                # Belongs to the JSON schema of the arguments, not to the tool
                # definition itself.
                "additionalProperties": False,
            },
        }]
    )

    # Do not assume the function call is the first output item; pick it by type.
    for item in response.output:
        if getattr(item, "type", None) == "function_call":
            return json.loads(item.arguments)

    raise ValueError("The model response does not contain a function call.")

In [None]:
def row_to_text_input(df: pd.DataFrame, i: int) -> str:
    """Render the row at position ``i`` of ``df`` as the product description used in prompts.

    Args:
        df: Frame with the columns ``supplier_name``,
            ``supplier_reference_description``, ``purchase_price`` and ``materials``.
        i: Positional (``iloc``) index of the row to render.

    Returns:
        A multi-line string with one ``label = value`` line per field. Every
        line is indented by four spaces, and the text starts with a newline and
        ends with a newline followed by four spaces.
    """
    labelled_columns = (
        ("Supplier name", "supplier_name"),
        ("Product name", "supplier_reference_description"),
        ("Product price", "purchase_price"),
        ("Materials", "materials"),
    )
    lines = [
        f"    {label} = {df[column].iloc[i]}"
        for label, column in labelled_columns
    ]
    return "\n" + "\n".join(lines) + "\n    "

# Prompt templates

In [None]:
# Prompt used for every level of the classification hierarchy. It is filled with
# str.format and expects three placeholders: {higher_class}, {product} and
# {categories}. The field names in <answer-schema> match the properties of the
# "result" tool (confidence, explanation, category).
prompt_0_template = """
<system>
    You are an assistant that helps with product categorization.
    Products are typically related to house, home, garden items.
</system>


<task>
    1. Analyze the given product described in a "product" tag.
    2. Try to find the best category in the "categories" tag.
</task>


<higher-class>
    {higher_class}
</higher-class>


<product>
    {product}
</product>


<categories>
    {categories}
</categories>


<answer-schema>
    1. confidence: how much are you sure of your answer? 0 means totally not sure, near 50 means you are bit sure and bit not sure, 100 means that you are totally sure.
    2. explanation: a very short explanation of your choice
    3. category: the chosen category
</answer-schema>
"""

# Classification tree

In [None]:
# Build the category hierarchy: one row per distinct
# (main, sub, detail, level4) combination found in the train or test split.
# The per-group item count is only a vehicle for the group-by and is dropped.
hierarchy_columns = ["main", "sub", "detail", "level4"]
all_items = pd.concat([test_df, train_df])
items_per_path = all_items.groupby(hierarchy_columns)["item_id"].nunique()
tree = items_per_path.reset_index().drop(columns="item_id")
tree

# Demo

In [None]:
# Read the key through a context manager so the file handle is closed, and strip
# surrounding whitespace: a trailing newline in the key file would otherwise be
# sent as part of the API key.
with open(OPENAI_FILEPATH, "r") as key_file:
    openai_client = OpenAI(api_key=key_file.read().strip())

In [None]:
# Seed for the shuffle below, so the same rows are evaluated on every run.
RANDOM_SEED = 42
# Hierarchy levels, ordered from the most general to the most specific.
LEVELS = ["main", "sub", "detail", "level4"]

results = []
# Shuffle the test split reproducibly. The name `test_df` is kept on purpose:
# the scoring cell compares `results` with `test_df` position by position.
test_df = test_df.sample(frac=1, random_state=RANDOM_SEED)
# Never ask for more rows than the test split contains.
n = min(20, len(test_df))


for i in tqdm(range(n)):
    test_row = row_to_text_input(test_df, i)

    # Walk down the hierarchy, narrowing the tree to the branch chosen so far.
    subtree = tree
    chosen = {}

    for level in LEVELS:
        if chosen:
            path_so_far = "/".join(str(value) for value in chosen.values())
            higher_class = f"The general classification is {path_so_far}"
        else:
            higher_class = "Now, you generate the highest general level classification."

        categories = list(subtree[level].unique())
        if len(categories) > 1:
            prompt = prompt_0_template.format(product=test_row, categories=categories, higher_class=higher_class)
            choice = ask_gpt_for_category(openai_client, prompt, categories)['category']
            # A category outside the candidates would empty the subtree and
            # surface as an opaque IndexError at the next level.
            if choice not in categories:
                raise ValueError(
                    f"Model returned category {choice!r} for level {level!r}, "
                    f"which is not one of the candidates {categories}"
                )
        else:
            # Only one candidate: no need to spend an API call.
            choice = categories[0]

        chosen[level] = choice
        subtree = subtree[subtree[level] == choice]

    results.append({
        "item_id": test_df["item_id"].iloc[i],
        **chosen,
    })


In [None]:
# One row per classified item: item_id plus the predicted category at each level.
results_df = pd.DataFrame(data=results)
results_df

In [None]:
# Count, position by position, how often the predicted category equals the one
# in `test_df` at each level. `total_score` counts rows where all four match.
level_names = ("main", "sub", "detail", "level4")
hits_by_level = dict.fromkeys(level_names, 0)
total_score = 0

for position in range(len(results_df)):
    matches = [
        bool(results_df[name].iloc[position] == test_df[name].iloc[position])
        for name in level_names
    ]

    for name, matched in zip(level_names, matches):
        hits_by_level[name] += matched

    total_score += all(matches)

main_score, sub_score, detail_score, level4_score = (
    hits_by_level[name] for name in level_names
)

In [None]:
# Report each counter as a fraction of the evaluated rows, rounded to 3 decimals.
score_table = (
    ("Main", main_score),
    ("Sub", sub_score),
    ("Detail", detail_score),
    ("Level4", level4_score),
    ("Total", total_score),
)
for label, hits in score_table:
    print(f"{label} score = ", round(hits / len(results_df), 3))