In [None]:
!pip install openai pandas

In [52]:
import json
import os
import re

import openai
import pandas as pd
from openai import OpenAI

### Define Taxonomy and Vendor Catalog Data

In [21]:
# Reference taxonomy: the i-th category pairs with the i-th subcategory.
taxonomy_data = dict(
    Category=["Beverages", "Snacks", "Household"],
    Subcategory=["Soft Drinks", "Chips", "Cleaning Supplies"],
)
taxonomy_df = pd.DataFrame(data=taxonomy_data)

# Vendor catalog: free-text product descriptions that still need a category.
product_descriptions = [
    "Coca Cola 12oz can",
    "Potato Chips 150g bag",
    "Multi-surface cleaner 500ml",
    "Energy drink 500ml",
    "Corn chips 200g bag",
]
vendor_catalog_data = {"Product Description": product_descriptions}
vendor_catalog_df = pd.DataFrame(data=vendor_catalog_data)

In [8]:
# Read the key from the OPENAI_API_KEY environment variable instead of pasting
# it into the notebook, so the secret is never saved or committed with the file.
# When the variable is unset this is None and the OpenAI client is left to
# resolve (or reject) the missing credential itself.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

### Define Classification Function

In [54]:
def classify_product(product_description, taxonomy):
    """Classify one product description against a taxonomy using the chat model.

    Args:
        product_description: Free-text description of the product to classify.
        taxonomy: Taxonomy rendered as text; it is interpolated verbatim into
            the prompt.

    Returns:
        A ``(category, subcategory)`` tuple read from the model's JSON reply.
        ``(None, None)`` is returned when the reply is empty, is not valid
        JSON, or is valid JSON but not an object. A key missing from the
        object yields ``None`` for that element.
    """
    prompt = (
        f"Classify the following product into the correct category and subcategory based on this taxonomy: {taxonomy}. "
        f"Return the result as a JSON object with 'category' and 'subcategory' keys.\n\nProduct: {product_description}"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ],
        # Ask for JSON mode so the reply is a bare JSON object rather than
        # prose or a fenced snippet (the prompt already mentions JSON).
        response_format={"type": "json_object"},
        # Lower sampling temperature to reduce run-to-run variation in labels.
        temperature=0,
    )

    # The message content may be None; fall back to an empty string so the
    # failure is reported through the JSON error path below instead of raising
    # AttributeError on .strip().
    raw_content = (response.choices[0].message.content or "").strip()

    try:
        result = json.loads(extract_response(raw_content))
    except json.JSONDecodeError as e:
        # Handle parsing error
        print(f"JSON parsing error: {e}")
        print("Returning None for both category and subcategory due to parsing failure.\n")
        return None, None

    # Valid JSON that is not an object (e.g. a list or string) has no .get().
    if not isinstance(result, dict):
        print(f"Unexpected JSON type: {type(result).__name__}")
        print("Returning None for both category and subcategory due to parsing failure.\n")
        return None, None

    return result.get("category"), result.get("subcategory")

In [55]:
def extract_response(input_string: str) -> str:
    """Return the payload of a model reply, unwrapping a Markdown code fence.

    The first triple-backtick fence found is unwrapped. The fence may carry a
    ``json`` language tag in any letter case or no tag at all. Surrounding
    whitespace inside the fence is removed.

    Args:
        input_string: Raw text returned by the model.

    Returns:
        The stripped text inside the first fence, or ``input_string``
        unchanged when it contains no fence.
    """
    # (?:json)? makes the language tag optional; IGNORECASE also accepts "JSON".
    fence = re.search(
        r'```(?:json)?\s*([\s\S]*?)\s*```',
        input_string,
        flags=re.IGNORECASE,
    )

    if fence:
        return fence.group(1).strip()

    return input_string

### Classify Products in Vendor Catalog

In [56]:
# Serialise the taxonomy once so every prompt embeds the same JSON text
taxonomy_json = taxonomy_df.to_dict(orient="records")
taxonomy_json_str = json.dumps(taxonomy_json, indent=2)

# Run the classifier on each description; wrapping each (category, subcategory)
# result in a Series makes apply() expand it into two columns
descriptions = vendor_catalog_df['Product Description']
labels = descriptions.apply(
    lambda description: pd.Series(classify_product(description, taxonomy_json_str))
)
vendor_catalog_df[['Category', 'Subcategory']] = labels

### View Results

In [57]:
# Preview the first rows of the catalog with the assigned labels
vendor_catalog_df.head(n=5)

Unnamed: 0,Product Description,Category,Subcategory
0,Coca Cola 12oz can,Beverages,Soft Drinks
1,Potato Chips 150g bag,Snacks,Chips
2,Multi-surface cleaner 500ml,Household,Cleaning Supplies
3,Energy drink 500ml,Beverages,Soft Drinks
4,Corn chips 200g bag,Snacks,Chips
