## Read data

In [3]:
import pandas as pd
df_products_translated=pd.read_csv('../data/brazilian_e-commerce/olist_products_dataset_translated.csv')
df_products_translated.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0,Perfumes
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,Arts
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0,Sports and Leisure
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0,Baby Products
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0,Household Essentials


In [4]:
df_products_translated['product_category_name_english'].value_counts()

product_category_name_english
Bed and Bathroom Products             3029
Sports and Leisure                    2867
Furniture and Decoration              2657
Beauty and Health Products            2444
Household Essentials                  2335
                                      ... 
Home Comfort (2)                         5
Children's and Teenagers' Clothing       5
PC Gamer                                 3
Insurance and Services                   2
Music CDs and DVDs                       1
Name: count, Length: 72, dtype: int64

In [5]:
df_products_translated['product_category_name_english'].unique()

array(['Perfumes', 'Arts', 'Sports and Leisure', 'Baby Products',
       'Household Essentials', 'Musical Instruments', 'Cool Stuff',
       'Furniture and Decoration', 'Electrical Appliances', 'Toys',
       'Bed and Bathroom Products',
       'Construction Tools and Safety Equipment', 'Computer Accessories',
       'Beauty and Health Products', 'Luggage and Travel Accessories',
       'Garden Tools', 'Office Furniture', 'Automotive Products',
       'Electronics', 'Footwear and Fashion', 'Telephony',
       'Paper Products', 'Handbags and Fashion Accessories',
       'Personal Care Products', 'Home Construction',
       'Watches and Gift Items', 'Construction Tools for Construction',
       'Pet Shop', 'Electrical Portables',
       'Agriculture, Industry, and Commerce', nan,
       'Living Room Furniture', 'Signage and Security Equipment',
       'Climate Control', 'Consoles and Games',
       'Books and General Interest Products',
       'Underwear and Beach Fashion', "Men's Clothi

In [6]:
df_products_translated['product_category_name_english'].nunique()

72

## Taxonomy reduction: consolidated the product taxonomy from 72 detailed categories to 7 high-level groups

In [7]:
import requests
import json

def ollama_chat(prompt):
    url = "http://localhost:11434/api/chat"
    payload = {
        "model": "llama3:8b",
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post(url, json=payload, stream=True)

    full_output = ""

    for line in response.iter_lines():
        if not line:
            continue

        try:
            data = json.loads(line.decode("utf-8"))
        except json.JSONDecodeError:
            print("Skipping invalid line:", line)
            continue

        # Collect streamed content
        if "message" in data and "content" in data["message"]:
            full_output += data["message"]["content"]

    return full_output


In [27]:
categories = df_products_translated['product_category_name_english'].unique().tolist()

prompt = f"""
You are a categorization expert.

Below is a list of product categories ({len(categories)} items):
{categories}

Please:
1. Compress them into a smaller number of broader categories.
2. Each broad category must be logical and semantically consistent.
3. Return ONLY JSON in this format:

{{
  "clusters": [
    {{
      "cluster_name": "string",
      "items": ["item1", "item2", ...]
    }}
  ]
}}
"""

response = ollama_chat(prompt)
print(response)


After analyzing the list of product categories, I was able to compress them into 14 broader categories. Here are the results in JSON format:

{
  "clusters": [
    {
      "cluster_name": "Beauty and Health",
      "items": ["Perfumes", "Baby Products", "Personal Care Products", "Underwear and Beach Fashion", "Men's Clothing", "Women's Fashion Clothing", "Diapers and Hygiene"]
    },
    {
      "cluster_name": "Electronics",
      "items": ["Musical Instruments", "Computer Accessories", "Electronics", "Telephony", "Tablets and Image Printing", "PC Gamer"]
    },
    {
      "cluster_name": "Home and Living",
      "items": ["Furniture and Decoration", "Bed and Bathroom Products", "Living Room Furniture", "Kitchen and Dining Room Furniture", "Bedding and Upholstery", "Home Comfort", "Home Construction", "Household Appliances", "La Cuisine"]
    },
    {
      "cluster_name": "Leisure and Entertainment",
      "items": ["Arts", "Sports and Leisure", "Consoles and Games", "DVDs and Blu-r

In [19]:
print(response)


After analyzing the list of product categories, I've compressed them into a smaller number of broader categories that meet the requirements. Here is the result in JSON format:

{
  "clusters": [
    {
      "cluster_name": "Home and Living",
      "items": ["Household Essentials", "Furniture and Decoration", "Bed and Bathroom Products", "Kitchenware and Food Preparation Tools", "Living Room Furniture", "Bedroom Furniture", "Home Comfort", "Home Construction"]
    },
    {
      "cluster_name": "Electronics and Entertainment",
      "items": ["Electronics", "Musical Instruments", "Consoles and Games", "Audio", "Tablets and Image Printing", "PC Gamer", "Music CDs and DVDs"]
    },
    {
      "cluster_name": "Fashion and Apparel",
      "items": ["Footwear and Fashion", "Handbags and Fashion Accessories", "Men's Clothing", "Women's Fashion Clothing", "Children's and Teenagers' Clothing", "Sports Fashion", "Underwear and Beach Fashion"]
    },
    {
      "cluster_name": "Health and Beaut

## Category Remapping: map the original categories to this consolidated taxonomy

In [20]:
import re
import json

def extract_json(text):
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError("No JSON found.")
    return json.loads(match.group(0))


In [21]:
clusters_json = extract_json(response)


In [22]:
mapping = {}

for cluster in clusters_json["clusters"]:
    name = cluster["cluster_name"]
    for item in cluster["items"]:
        mapping[item] = name

mapping

{'Household Essentials': 'Home and Living',
 'Furniture and Decoration': 'Work and Industry',
 'Bed and Bathroom Products': 'Home and Living',
 'Kitchenware and Food Preparation Tools': 'Home and Living',
 'Living Room Furniture': 'Home and Living',
 'Bedroom Furniture': 'Home and Living',
 'Home Comfort': 'Home and Living',
 'Home Construction': 'Home and Living',
 'Electronics': 'Electronics and Entertainment',
 'Musical Instruments': 'Electronics and Entertainment',
 'Consoles and Games': 'Leisure and Recreation',
 'Audio': 'Electronics and Entertainment',
 'Tablets and Image Printing': 'Electronics and Entertainment',
 'PC Gamer': 'Electronics and Entertainment',
 'Music CDs and DVDs': 'Electronics and Entertainment',
 'Footwear and Fashion': 'Fashion and Apparel',
 'Handbags and Fashion Accessories': 'Fashion and Apparel',
 "Men's Clothing": 'Fashion and Apparel',
 "Women's Fashion Clothing": 'Fashion and Apparel',
 "Children's and Teenagers' Clothing": 'Fashion and Apparel',
 'Sp

In [23]:
df_products_translated["compressed_category"] = (
    df_products_translated["product_category_name_english"].map(mapping)
)
df_products_translated

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,compressed_category
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0,Perfumes,Health and Beauty
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,Arts,Leisure and Recreation
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0,Sports and Leisure,Leisure and Recreation
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0,Baby Products,Health and Beauty
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0,Household Essentials,Home and Living
...,...,...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0,Furniture and Decoration,Work and Industry
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0,Building Tools and Lighting,Work and Industry
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0,Bed and Bathroom Products,Home and Living
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0,Computer Accessories,Work and Industry


In [24]:
df_products_translated['compressed_category'].value_counts()

compressed_category
Home and Living                  5911
Work and Industry                5318
Health and Beauty                4297
Leisure and Recreation           4109
Travel and Transportation        3499
Fashion and Apparel              1221
Electronics and Entertainment     877
Name: count, dtype: int64

In [25]:
df_products_translated['compressed_category'].unique()

array(['Health and Beauty', 'Leisure and Recreation', 'Home and Living',
       'Electronics and Entertainment', nan, 'Work and Industry',
       'Travel and Transportation', 'Fashion and Apparel'], dtype=object)

In [26]:
df_products_translated['compressed_category'].nunique()

7