In [1]:
import warnings
import requests
import json
import os
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import pprint
import numpy as np
from pydantic import BaseModel, Field
from typing import Literal

In [2]:
os.chdir("..")
warnings.filterwarnings("ignore")
load_dotenv(".env")

True

In [3]:
from src.ai.describe_image import describe_image, AnswerSchema
from src.ai.openai_connector import OpenAIConnector
from src.services import BlobService

# Data

In [4]:
TRAIN_DATA_FILEPATH = "resources/train_df.csv"
TEST_DATA_FILEPATH = "resources/test_df.csv"
LLM = "gpt-4o-mini"

In [5]:
test_df = pd.read_csv(TEST_DATA_FILEPATH)
train_df = pd.read_csv(TRAIN_DATA_FILEPATH)

# Utils

In [34]:
def row_to_text_input(df: pd.DataFrame, i: int, img_desc: AnswerSchema) -> str:
    text = f"""
    Supplier name = {df["supplier_name"].iloc[i]}
    Product name = {df["supplier_reference_description"].iloc[i]}
    Product price = {df["purchase_price"].iloc[i]}
    Materials = {df["materials"].iloc[i]}
    Possible weight (grams): = {df["weight"].iloc[i]}
    Lenght (cm) = {df["length"].iloc[i]}
    Height (cm) = {df["height"].iloc[i]}
    Width (cm) = {df["width"].iloc[i]}

    Image description:
    1. Look: {img_desc.look}
    2. Potential usage: {img_desc.potential_usage}
    3. Materials:{img_desc.materials}
    """
    return text

In [7]:
def get_classification_schema(categories: list[str]):
    class ClassificationSchema(BaseModel):
        """Answer schema for the classification"""
        confidence: int = Field(description="How much are you sure for your answer?")
        explanation: str = Field(description="Explain your reasoning here")
        category: Literal[*categories] = Field(description="The category you has choosen")
    return ClassificationSchema

# Classification tree

In [8]:
tree = (
    pd.concat([test_df, train_df])
    .groupby(["main", "sub", "detail", "level4"])
    .item_id
    .nunique()
    .reset_index()
    .drop("item_id", axis=1)
)
tree

Unnamed: 0,main,sub,detail,level4
0,Decoration,Candles & Candle Holders,Candle Snuffers,Unspecified
1,Decoration,Candles & Candle Holders,Candles,Unspecified
2,Decoration,Candles & Candle Holders,Candlesticks,Unspecified
3,Decoration,Candles & Candle Holders,Hurricane Lights & Lanterns,Unspecified
4,Decoration,Candles & Candle Holders,LED Candles,Unspecified
...,...,...,...,...
148,Tableware,Wine & Bar Accessories,Decanters & Bottles,Unspecified
149,Tableware,Wine & Bar Accessories,Drink Dispensers,Unspecified
150,Tableware,Wine & Bar Accessories,Ice Buckets,Unspecified
151,Tableware,Wine & Bar Accessories,Wine Accessories,Unspecified


# Demo

In [9]:
prompt_template = """
<system>
    You are an assitant that helps with product categorization. 
    Products are typically realted with house, home, garden items.
</system>


<task>
    1. Analyze the given product described in a "product" tag. 
    2. Try to find the best category in the "categories" tag.
</task>


<higher-class>
    {higher_class}
</higher-class>


<product>
    {product}
</product>


<categories>
    {categories}
</categories>


<answer-schema>
    {answer_schema}
</answer-schema>
"""

In [10]:
openai_conn = OpenAIConnector()
blob_service = BlobService()

In [11]:
blobs = blob_service.get_blobs("images")
names_sizes = {x["name"]: x["size"] for x in blobs}

## Match test records to images on Azure

In [12]:
urls = []
for i in tqdm(range(len(test_df))):
    item_id = test_df.iloc[i]["item_id"]
    names = [x for x in names_sizes.keys() if x.startswith(str(item_id))]
    if names:
        max_size_idx = np.argmax([names_sizes[x] for x in names])
        name = names[max_size_idx]
        url = blob_service.get_image_url(name)
    else:
        url = None

    urls.append(url)

100%|██████████| 1000/1000 [00:31<00:00, 31.72it/s]


In [13]:
len(urls)

1000

In [14]:
test_df["image_url"] = urls
test_df.dropna(subset="image_url", inplace=True)
test_df.reset_index(drop=True, inplace=True)

## Run classification

In [45]:
results = []
n = 100
test_df = test_df.sample(len(test_df))


for i in tqdm(range(n)):
    image_b64 = blob_service.get_image_string(str(test_df.iloc[i]["item_id"])+".jpg")
    img_desc = describe_image(f"data:image/jpeg;base64,{image_b64}")
    test_row = row_to_text_input(test_df, i, img_desc)

    # main
    higher_class = "Now, you generate the highest general level classification."
    categories = [x for x in list(tree.main.unique()) if x != "Unspecified"]
    if len(categories) == 0:
        main_class == "Unspecified"
    elif len(categories) == 1:
        main_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema()
        )
        main_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]
    
    # sub
    higher_class = f"The general classification is {main_class}"
    current_tree = tree[tree.main == main_class]
    categories = [x for x in list(current_tree["sub"].unique()) if x != "Unspecified"]
    if len(categories) == 0:
        sub_class = "Unspecified"
    elif len(categories) == 1:
        sub_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema()
        )
        sub_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]

    # detail
    higher_class = f"The general classification is {main_class}/{sub_class}"
    current_tree = tree[(tree.main == main_class) & (tree["sub"] == sub_class)]
    categories = [x for x in list(current_tree["detail"].unique()) if x != "Unspecified"]
    if len(categories) == 0:
        detail_class = "Unspecified"
    elif len(categories) == 1:
        detail_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema()
        )
        detail_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]

    # level4
    higher_class = f"The general classification is {main_class}/{sub_class}/{detail_class}"
    current_tree = tree[(tree.main == main_class) & (tree["sub"] == sub_class) & (tree["detail"] == detail_class)]
    categories = [x for x in list(current_tree["level4"].unique()) if x != "Unspecified"]
    if len(categories) == 0:
        level4_class = "Unspecified"
    elif len(categories) == 1:
        level4_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema()
        )
        level4_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]

            
    results.append({
        "item_id": test_df["item_id"].iloc[i],
        "pred_main": main_class,
        "pred_sub": sub_class,
        "pred_detail": detail_class,
        "pred_level4": level4_class,
        "product_representation": test_row
    })


100%|██████████| 100/100 [16:20<00:00,  9.81s/it]


In [51]:
results_df = pd.merge(
    left=test_df,
    right=pd.DataFrame(results),
    on="item_id",
    how="right"
)
results_df.drop(columns=["image_url"], axis=1).to_csv("resources/results_gpt_4o_mini.csv", index=False)

In [50]:
results_df

Unnamed: 0,item_id,supplier_id,supplier_name,season,supplier_reference,supplier_reference_description,purchase_price,length,width,height,...,level4,item_type,materials,dataset,image_url,pred_main,pred_sub,pred_detail,pred_level4,product_representation
0,73210,1312,Umiss,2026_AW,CS-13_60cm_Pantone 5625U,120gsm paper with magnet ， no metallic gold pa...,8.42,42.5,42.5,60.0,...,Decorative Trees,Tree,"PAPER (95.00%), MAGNETS (5.00%)",test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Decorative Materials,Unspecified,Unspecified,\n Supplier name = Umiss \n Product name...
1,64401,431,Elegant Garden,2026_AW,E1064A13-13,ceramic pear 11.5x11.5x18.5,2.19,11.5,11.5,18.5,...,Flowers & Plants,Pear,"STONEWARE (90.00%), GLAZE (10.00%)",test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Decorative Materials,Unspecified,Unspecified,\n Supplier name = Elegant Garden\n Prod...
2,66020,722,Rich Home Design,2026_AW,FSM2525-2071-LS-B-50*170CM-H122,mirror,16.50,50.0,4.0,170.0,...,Unspecified,Standing mirror,"POLYSTYRENE (40.00%), MIRROR (45.00%), MDF (10...",test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Mirrors,Standing Mirrors,Unspecified,\n Supplier name = Rich Home Design\n Pr...
3,49836,1219,Wayart,2026_SS,WAJ-22043,"Big flower vase, red 19.5*19.5*28",8.90,19.5,19.5,28.0,...,Unspecified,Vase,DOLOMITE (100.00%),test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Flower Pots & Vases,Vases,Unspecified,\n Supplier name = Wayart\n Product name...
4,49842,1219,Wayart,2026_SS,WAP-24111 blue white,Dolomite fish deco vase 21.6*13.4*17,4.10,21.6,13.4,17.0,...,Unspecified,Vase,DOLOMITE (100.00%),test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Flower Pots & Vases,Vases,Unspecified,\n Supplier name = Wayart\n Product name...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,62680,837,New Land Co Ltd,2026_AW,XT-1616BGD,Glass candle holder,2.89,16.0,16.0,16.0,...,Unspecified,Candle holder,GLASS (100.00%),test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Candles & Candle Holders,Candlesticks,Unspecified,\n Supplier name = New Land Co Ltd\n Pro...
96,70218,425,Linshu Ronghua,2026_AW,2505101-1-S,D30 H7 wicker - willow,0.80,30.0,30.0,7.0,...,Wreath,Wreath,WICKER (100.00%),test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Decorative Materials,Unspecified,Unspecified,\n Supplier name = Linshu Ronghua\n Prod...
97,49743,654,Hongguang,2026_SS,HG04515D0-722-G2475a,"11.5""VASE",17.02,29.0,29.0,27.0,...,Unspecified,Vase,"CLAY (95.00%), GLAZE (5.00%)",test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Flower Pots & Vases,Vases,Unspecified,\n Supplier name = Hongguang\n Product n...
98,67912,16,Meizhou Jinfang,2026_AW,R6817-1/WH,Deer's father with GS-IP44-4.5V-9W transformer...,20.60,79.0,22.0,120.0,...,Unspecified,Reindeer,"IRON (50.00%), PLASTIC (20.00%), COPPER (20.00...",test,https://softwaremindprojectawdo.blob.core.wind...,Decoration,Wall Decoration,Wall Objects,Unspecified,\n Supplier name = Meizhou Jinfang\n Pro...


In [48]:
main_score, sub_score, detail_score, level4_score, total_score = 0, 0, 0, 0, 0

for i in range(len(results_df)):
    success = 0

    if results_df["main"].iloc[i] == test_df["main"].iloc[i]:
        main_score += 1
        success += 1
    
    if results_df["sub"].iloc[i] == test_df["sub"].iloc[i]:
        sub_score += 1
        success += 1

    if results_df["detail"].iloc[i] == test_df["detail"].iloc[i]:
        detail_score += 1
        success += 1

    if results_df["level4"].iloc[i] == test_df["level4"].iloc[i]:
        level4_score += 1
        success += 1
    
    if success == 4:
        total_score += 1

In [49]:
print("Main score = ", round(main_score/len(results_df), 3))
print("Sub score = ", round(sub_score/len(results_df), 3))
print("Detail score = ", round(detail_score/len(results_df), 3))
print("Level4 score = ", round(level4_score/len(results_df), 3))
print("Total score = ", round(total_score/len(results_df), 3))

Main score =  1.0
Sub score =  1.0
Detail score =  1.0
Level4 score =  1.0
Total score =  1.0
