In [1]:
import warnings
import os
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import pprint
import numpy as np
from pydantic import BaseModel, Field
from typing import Literal
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("..")
warnings.filterwarnings("ignore")
load_dotenv(".env")

True

In [3]:
from src.ai.describe_image import describe_image, AnswerSchema
from src.ai.openai_connector import OpenAIConnector
from src.services import BlobService

# Data

In [4]:
TRAIN_DATA_FILEPATH = "resources/train_df.csv"
TEST_DATA_FILEPATH = "resources/test_df.csv"
LLM = "gpt-4o-mini"

In [5]:
test_df = pd.read_csv(TEST_DATA_FILEPATH)
train_df = pd.read_csv(TRAIN_DATA_FILEPATH)

# Utils

In [6]:
def cosine_similarity(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [7]:
def example_input(df: pd.DataFrame, i: int) -> str:
    return f"""
    Supplier name = {df["supplier_name"].iloc[i]}
    Product name = {df["supplier_reference_description"].iloc[i]}
    Product price = {df["purchase_price"].iloc[i]}
    Materials = {df["materials"].iloc[i]}
    Possible weight (grams): = {df["weight"].iloc[i]}
    Lenght (cm) = {df["length"].iloc[i]}
    Height (cm) = {df["height"].iloc[i]}
    Width (cm) = {df["width"].iloc[i]}
    """

In [8]:
def row_to_text_input(df: pd.DataFrame, i: int, img_desc: AnswerSchema) -> str:
    text = f"""
    Supplier name = {df["supplier_name"].iloc[i]}
    Product name = {df["supplier_reference_description"].iloc[i]}
    Product price = {df["purchase_price"].iloc[i]}
    Materials = {df["materials"].iloc[i]}
    Possible weight (grams): = {df["weight"].iloc[i]}
    Lenght (cm) = {df["length"].iloc[i]}
    Height (cm) = {df["height"].iloc[i]}
    Width (cm) = {df["width"].iloc[i]}

    Image description:
    1. Look: {img_desc.look}
    2. Potential usage: {img_desc.potential_usage}
    3. Materials:{img_desc.materials}
    """
    return text

In [9]:
def get_classification_schema(categories: list[str]):
    class ClassificationSchema(BaseModel):
        """Answer schema for the classification"""
        confidence: int = Field(description="How much are you sure for your answer?")
        explanation: str = Field(description="Explain your reasoning here")
        category: Literal[*categories] = Field(description="The category you has choosen")
    return ClassificationSchema

# Classification tree

In [10]:
tree = (
    train_df
    .groupby(["main", "sub", "detail", "level4"])
    .item_id
    .nunique()
    .reset_index()
    .drop("item_id", axis=1)
)
tree

Unnamed: 0,main,sub,detail,level4
0,Decoration,Candles & Candle Holders,Candle Snuffers,Unspecified
1,Decoration,Candles & Candle Holders,Candles,Unspecified
2,Decoration,Candles & Candle Holders,Candlesticks,Unspecified
3,Decoration,Candles & Candle Holders,Hurricane Lights & Lanterns,Unspecified
4,Decoration,Candles & Candle Holders,LED Candles,Unspecified
...,...,...,...,...
147,Tableware,Wine & Bar Accessories,Decanters & Bottles,Unspecified
148,Tableware,Wine & Bar Accessories,Drink Dispensers,Unspecified
149,Tableware,Wine & Bar Accessories,Ice Buckets,Unspecified
150,Tableware,Wine & Bar Accessories,Wine Accessories,Unspecified


# Demo

In [11]:
prompt_template = """
<system>
    You are an assitant that helps with product categorization. 
    Products are typically realted with house, home, garden items.
</system>


<task>
    1. Analyze the given product described in a "product" tag. 
    2. Try to find the best category in the "categories" tag.
</task>


<higher-class>
    {higher_class}
</higher-class>


<product>
    {product}
</product>


<categories>
    {categories}
</categories>


<examples-of-similair-products>
    {examples}
</examples-of-similair-products>


<answer-schema>
    {answer_schema}
</answer-schema>
"""

In [12]:
openai_conn = OpenAIConnector()
blob_service = BlobService()
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

In [13]:
blobs = blob_service.get_blobs("images")
names_sizes = {x["name"]: x["size"] for x in blobs}

# Refrence data - built from train_df

In [14]:
embeddings = [
    embedder.encode(example_input(train_df, i))
    for i in tqdm(range(len(train_df)))
]

100%|██████████| 6823/6823 [07:57<00:00, 14.28it/s]


In [15]:
embeddings_data = train_df[["item_id"]]
embeddings_data["emb"] = embeddings

In [16]:
reference_data = train_df[["item_id", "supplier_name", "supplier_reference_description", "main", "sub", "detail", "level4"]]

In [17]:
def find_n_nearest_examples(
    test_row: str, 
    reference_data: pd.DataFrame, 
    embeddings_data: pd.DataFrame, 
    embedder: SentenceTransformer,
    n: int = 5
):
    current_emb = embedder.encode(test_row)
    embeddings_data["score"] = embeddings_data["emb"].apply(lambda x: cosine_similarity(current_emb, x))
    embeddings_data.sort_values("score", ascending=False, inplace=True)
    items_ids = embeddings_data["item_id"].to_list()
    items_ids = items_ids[:min([n, len(items_ids)])]

    examples = ""
    for x in items_ids:
        row = reference_data[reference_data["item_id"] == x]

        examples += f"""
        Supplier name = {row["supplier_name"].iloc[0]}
        Product name = {row["supplier_reference_description"].iloc[0]}
        Classification = {row["main"].iloc[0]} / {row["sub"].iloc[0]} / {row["detail"].iloc[0]} / {row["level4"].iloc[0]}
        """
        
    return examples

# Run classification - on the test_df

In [34]:
results = []
test_df = test_df.sample(len(test_df))
records = 0

for i in tqdm(range(len(test_df))):
    if records == 100:
        break

    try:
        image_b64 = blob_service.get_image_string(str(test_df.iloc[i]["item_id"])+".jpg")
        records += 1
    except:
        continue

    image_url = f"data:image/jpeg;base64,{image_b64}"
    img_desc = describe_image(image_url)
    test_row = row_to_text_input(test_df, i, img_desc)
    test_row_short = example_input(test_df, i)
    examples = find_n_nearest_examples(
        test_row=test_row_short, 
        reference_data=reference_data, 
        embeddings_data=embeddings_data, 
        embedder=embedder,
        n=10
    )

    # main
    higher_class = "Now, you generate the highest general level classification."
    categories = [x for x in list(tree.main.unique()) if x != "Unspecified"]
    if len(categories) == 0:
        main_class == "Unspecified"
    elif len(categories) == 1:
        main_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema(),
            examples=examples
        )
        main_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]
    

    # sub
    higher_class = f"The general classification is {main_class}"
    current_tree = tree[tree.main == main_class]
    categories = [x for x in list(current_tree["sub"].unique()) if x != "Unspecified"]
    if len(categories) == 0:
        sub_class = "Unspecified"
    elif len(categories) == 1:
        sub_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema(),
            examples=examples
        )
        sub_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]


    # detail
    higher_class = f"The general classification is {main_class}/{sub_class}"
    current_tree = tree[(tree.main == main_class) & (tree["sub"] == sub_class)]
    categories = [x for x in list(current_tree["detail"].unique()) if x != "Unspecified"]
    if len(categories) == 0:
        detail_class = "Unspecified"
    elif len(categories) == 1:
        detail_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema(),
            examples=examples
        )
        detail_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]


    # level4
    higher_class = f"The general classification is {main_class}/{sub_class}/{detail_class}"
    current_tree = tree[(tree.main == main_class) & (tree["sub"] == sub_class) & (tree["detail"] == detail_class)]
    categories = [x for x in list(current_tree["level4"].unique()) if x != "Unspecified"]
    if len(categories) == 0:
        level4_class = "Unspecified"
    elif len(categories) == 1:
        level4_class = categories[0]
    else:
        ClassificationSchema = get_classification_schema(categories)
        prompt = prompt_template.format(
            higher_class=higher_class,
            categories=categories,
            product=test_row,
            answer_schema=ClassificationSchema.model_json_schema(),
            examples=examples
        )
        level4_class = openai_conn.request_wih_function_calling(
            input_messages=[OpenAIConnector.create_human_message(prompt)],
            schema=ClassificationSchema,
            llm=LLM
        )["category"]

            
    results.append({
        "item_id": test_df["item_id"].iloc[i],
        "pred_main": main_class,
        "pred_sub": sub_class,
        "pred_detail": detail_class,
        "pred_level4": level4_class,
        "product_representation": test_row
    })


 12%|█▏        | 123/1000 [15:45<1:52:18,  7.68s/it]


In [35]:
results_df = pd.merge(
    left=test_df,
    right=pd.DataFrame(results),
    on="item_id",
    how="right"
)

In [36]:
results_df

Unnamed: 0,item_id,supplier_id,supplier_name,season,supplier_reference,supplier_reference_description,purchase_price,length,width,height,...,detail,level4,item_type,materials,dataset,pred_main,pred_sub,pred_detail,pred_level4,product_representation
0,57186,463,"Linshu Lingyun Arts and Crafts Co., Ltd.",2026_SS,LYST2407 natural-large,L: 38.5x21.5xH7cm,0.10,38.5,21.5,7.0,...,Decorative Objects,Decorative Trays,Decorative tray,"PAPER (70.00%), IRON (30.00%)",test,Decoration,Decoration Storage,Storage Baskets,Unspecified,\n Supplier name = Linshu Lingyun Arts and ...
1,65608,381,Langhao,2026_AW,J11525-1_brown,"Owl shaped glass jar - spray brown(173U),#L, 1...",2.46,12.5,11.0,14.2,...,Storage Jars,Unspecified,Decorative Jar,"GLASS (99.00%), PAINT (1.00%)",test,Decoration,Decoration Storage,Storage Jars,Unspecified,\n Supplier name = Langhao\n Product nam...
2,70401,89,Gmb Trade Linkers,2026_SS,64409-1,FLOWER VASE,10.40,17.0,17.0,24.0,...,Unspecified,Unspecified,,(%),test,Decoration,Flower Pots & Vases,Vases,Unspecified,\n Supplier name = Gmb Trade Linkers\n P...
3,70218,425,Linshu Ronghua,2026_AW,2505101-1-S,D30 H7 wicker - willow,0.80,30.0,30.0,7.0,...,Other,Wreath,Wreath,WICKER (100.00%),test,Decoration,Home Accessories,Decorative Objects,Objects,\n Supplier name = Linshu Ronghua\n Prod...
4,59741,353,Evergreat International Co,2026_SS,EG034000 medium,"D32XH22,BD27cm",4.58,32.0,32.0,22.0,...,Storage Baskets,Unspecified,Storage Basket,"PAPER (50.00%), IRON (50.00%)",test,Decoration,Decoration Storage,Storage Baskets,Unspecified,\n Supplier name = Evergreat International ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,72184,809,Shenyang New Seasons Arts&craf,2026_AW,XH-F0014,37CM RED GLASS CHRISTMAS TREE SIZE:D17X37,5.22,17.0,17.0,37.0,...,Decorative Objects,Decorative Trees,Tree,GLASS (100.00%),test,Decoration,Home Accessories,Decorative Objects,Decorative Trees,\n Supplier name = Shenyang New Seasons Art...
96,57236,912,Top Pound,2026_SS,67-19369L,white plant stand + D27*H57.5,6.00,27.0,27.0,57.5,...,Side Tables,Unspecified,Side table,IRON (100.00%),test,Furniture,Tables,Side Tables,Unspecified,\n Supplier name = Top Pound\n Product n...
97,63965,26,"Vidrios San Miguel, S.L.L.",2026_SS,4860DB22,JARRON DAROCA 70 CM,18.74,27.0,27.0,72.0,...,Floor Vases,Unspecified,Floor vase,GLASS (1.00%),test,Decoration,Flower Pots & Vases,Floor Vases,Unspecified,"\n Supplier name = Vidrios San Miguel, S.L...."
98,62210,759,Xingcheng Arts And Crafts,2026_AW,XCC252478Y,FRAMED WALL ART WITH PLASTER 60X60X3CM,6.98,60.0,60.0,3.0,...,Paintings,Unspecified,,"MDF (30.00%), PAPER (10.00%), POLYSTYRENE (30....",test,Decoration,Wall Decoration,Framed Pictures,Unspecified,\n Supplier name = Xingcheng Arts And Craft...


In [41]:
main_score, sub_score, detail_score, level4_score, total_score = 0, 0, 0, 0, 0

for i in range(len(results_df)):
    success = 0

    if results_df["main"].iloc[i] == results_df["pred_main"].iloc[i]:
        main_score += 1
        success += 1
    
    if results_df["sub"].iloc[i] == results_df["pred_sub"].iloc[i]:
        sub_score += 1
        success += 1

    if results_df["detail"].iloc[i] == results_df["pred_detail"].iloc[i]:
        detail_score += 1
        success += 1

    if results_df["level4"].iloc[i] == results_df["pred_level4"].iloc[i]:
        level4_score += 1
        success += 1
    
    if success == 4:
        total_score += 1

In [42]:
print("Main score = ", round(main_score/len(results_df), 3))
print("Sub score = ", round(sub_score/len(results_df), 3))
print("Detail score = ", round(detail_score/len(results_df), 3))
print("Level4 score = ", round(level4_score/len(results_df), 3))
print("Total score = ", round(total_score/len(results_df), 3))

Main score =  0.83
Sub score =  0.73
Detail score =  0.59
Level4 score =  0.84
Total score =  0.58


In [44]:
results_df.to_csv("resources/results_few_shot_gpt4omini.csv", index=False)