## Read data

In [2]:
import pandas as pd
df_products_translated=pd.read_csv('../data/brazilian_e-commerce/olist_products_dataset_translated.csv')
df_products_translated.head()

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0,Perfumery
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,Arts
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0,Sports and Leisure
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0,Baby Products
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0,Household Essentials


## Data inspection

In [3]:
df_products_translated['product_category_name_english'].value_counts()

product_category_name_english
Bed and Bathroom                3029
Sports and Leisure              2867
Furniture and Decoration        2657
Beauty and Health               2444
Household Essentials            2335
                                ... 
Home Comfort (2)                   5
Children's and Teen Clothing       5
PC Gaming                          3
Insurance and Services             2
CDs and Music DVDs                 1
Name: count, Length: 71, dtype: int64

In [4]:
df_products_translated['product_category_name_english'].unique()

array(['Perfumery', 'Arts', 'Sports and Leisure', 'Baby Products',
       'Household Essentials', 'Musical Instruments', 'Cool Stuff',
       'Furniture and Decoration', 'Electrical Appliances', 'Toys',
       'Bed and Bathroom', 'Construction Tools and Safety',
       'Computer Accessories', 'Beauty and Health',
       'Travel Bags and Accessories', 'Garden Tools', 'Office Furniture',
       'Automotive', 'Electronics', 'Fashion and Footwear', 'Telephony',
       'Stationery', 'Fashion Accessories and Bags', 'PCs',
       'Home Construction', 'Watches and Gifts',
       'Construction Tools for Construction', 'Pet Shop',
       'Agriculture, Industry, and Commerce', nan,
       'Living Room Furniture', 'Signage and Security',
       'Air Conditioning', 'Consoles and Games',
       'Books and General Interest', 'Fashion Underwear and Beach Wear',
       "Men's Clothing", 'Kitchen Furniture and Outdoor Spaces',
       'Industry, Commerce, and Business', 'Fixed Phone',
       'Building To

In [5]:
df_products_translated['product_category_name_english'].nunique()

71

In [6]:
import sys
print(sys.executable)


/opt/miniconda3/bin/python


In [7]:
import sys
!{sys.executable} -m pip install sentence-transformers



In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

categories = df_products_translated['product_category_name_english'].unique().tolist()
embeddings = model.encode(categories)
X = np.array(embeddings)
print(f"Embeddings shape: {X.shape}")  # (72, embedding_dim)


  from .autonotebook import tqdm as notebook_tqdm


Embeddings shape: (72, 384)


In [9]:
categories = (
    df_products_translated['product_category_name_english']
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
    .tolist()
)

In [10]:
embeddings = model.encode(categories)
X = np.array(embeddings)
print(X.shape)


(71, 384)


In [11]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

scores = {}
for k in range(8, 26):
    kmeans_tmp = KMeans(n_clusters=k, random_state=42, n_init=100).fit(X)
    score = silhouette_score(X, kmeans_tmp.labels_)
    scores[k] = score
    print(f"k={k}, silhouette score={score:.3f}")

best_k = max(scores, key=scores.get)
print(f"Optimal number of clusters: {best_k}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


k=8, silhouette score=0.094
k=9, silhouette score=0.094
k=10, silhouette score=0.108
k=11, silhouette score=0.103
k=12, silhouette score=0.116
k=13, silhouette score=0.121
k=14, silhouette score=0.122
k=15, silhouette score=0.134
k=16, silhouette score=0.141
k=17, silhouette score=0.150
k=18, silhouette score=0.141
k=19, silhouette score=0.141
k=20, silhouette score=0.151
k=21, silhouette score=0.150
k=22, silhouette score=0.154
k=23, silhouette score=0.157
k=24, silhouette score=0.154
k=25, silhouette score=0.150
Optimal number of clusters: 23


In [12]:
from sklearn.cluster import KMeans

# Fit KMeans with n_init=100 to prevent empty clusters
kmeans = KMeans(
    n_clusters=best_k,
    random_state=42,
    n_init=100
).fit(X)

labels = kmeans.labels_

# Map each category to its numeric cluster
category_map = {
    cat: f"Cluster {label}" 
    for cat, label in zip(categories, labels)
}

# Add numeric cluster column to the DataFrame
df_products_translated["compressed_category"] = (
    df_products_translated["product_category_name_english"]
    .map(category_map)
)
df_products_translated

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,compressed_category
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0,Perfumery,Cluster 19
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,Arts,Cluster 14
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0,Sports and Leisure,Cluster 13
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0,Baby Products,Cluster 7
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0,Household Essentials,Cluster 4
...,...,...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0,Furniture and Decoration,Cluster 9
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0,Building Tools and Lighting,Cluster 8
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0,Bed and Bathroom,Cluster 9
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0,Computer Accessories,Cluster 11


In [94]:
import requests, json

def ollama_chat(prompt, model_name="llama3:8b"):
    url = "http://localhost:11434/api/chat"
    payload = {"model": model_name, "messages": [{"role": "user", "content": prompt}]}
    response = requests.post(url, json=payload, stream=True)
    full_output = ""
    for line in response.iter_lines():
        if not line: 
            continue
        try:
            data = json.loads(line.decode("utf-8"))
        except json.JSONDecodeError:
            continue
        if "message" in data and "content" in data["message"]:
            full_output += data["message"]["content"]
    return full_output.strip()


In [96]:
# Generate descriptive names for each numeric cluster
cluster_names = {}

for i in range(best_k):
    items = [cat for cat, label in category_map.items() if label == f"Cluster {i}"]
    prompt = (
        f"Cluster {i}: Name a meaningful category for these items:\n{items}\n"
        "Return only a single short descriptive name without quotes or punctuation."
    )
    cluster_names[f"Cluster {i}"] = ollama_chat(prompt)


In [97]:
seen = set()
unique_names = {}

for k, name in cluster_names.items():
    if name in seen:
        name = f"{name} ({k})"  # append cluster number if duplicate
    seen.add(name)
    unique_names[k] = name


In [98]:
df_products_translated["compressed_category"] = (
    df_products_translated["compressed_category"]
    .map(unique_names)
)


In [99]:
df_products_translated['compressed_category'].unique()

array(['Gift Shops', 'Creative Pursuits', 'Economic Activities',
       'Baby Essentials', 'Home Furnishings', 'Sound', 'Toys',
       'Household Furnishings', 'Building Supplies', 'Gaming Equipment',
       'Wellness', 'Luggage', 'Consumer Goods', 'Apparel Categories',
       'Phone Systems', 'Office Supplies', '"Home Essentials"', nan,
       '"Publications"', 'Holiday Supplies', 'Edibles',
       'Cooking Essentials', 'Visual Media', 'Home Entertainment'],
      dtype=object)

In [100]:
df_products_translated['compressed_category'].nunique()

23

In [101]:
# Check how many numeric clusters exist
print(len(category_map.values()), len(set(category_map.values())))

# Check cluster name counts
from collections import Counter
Counter(df_products_translated['compressed_category'])


71 23


Counter({'Household Furnishings': 6300,
         'Baby Essentials': 3671,
         'Home Furnishings': 3150,
         'Economic Activities': 3115,
         'Wellness': 2444,
         'Consumer Goods': 2417,
         'Gaming Equipment': 1989,
         'Gift Shops': 1601,
         'Building Supplies': 1449,
         'Phone Systems': 1250,
         'Luggage': 1198,
         'Office Supplies': 942,
         'Toys': 789,
         nan: 610,
         'Sound': 374,
         'Apparel Categories': 372,
         '"Publications"': 370,
         '"Home Essentials"': 341,
         'Edibles': 267,
         'Holiday Supplies': 91,
         'Creative Pursuits': 74,
         'Cooking Essentials': 51,
         'Home Entertainment': 49,
         'Visual Media': 37})

In [102]:
df_products_translated.to_csv("products_with_compressed_categories.csv", index=False)


In [103]:
mapping = df_products_translated[['product_category_name_english', 'compressed_category']].drop_duplicates()
print(mapping.sort_values('compressed_category'))


     product_category_name_english compressed_category
577                   Home Comfort   "Home Essentials"
70               Home Construction   "Home Essentials"
4713              Home Comfort (2)   "Home Essentials"
329                Technical Books      "Publications"
176     Books and General Interest      "Publications"
...                            ...                 ...
6                       Cool Stuff                Toys
854     Tablets and Image Printing        Visual Media
3195          Film and Photography        Visual Media
22               Beauty and Health            Wellness
105                            NaN                 NaN

[72 rows x 2 columns]


In [104]:
import pandas as pd

# Group original categories by compressed category
category_mapping_table = (
    df_products_translated
    .groupby('compressed_category')['product_category_name_english']
    .apply(list)
    .reset_index()
)

# Optional: sort by compressed category
category_mapping_table = category_mapping_table.sort_values('compressed_category')

# Display the table
print(category_mapping_table)


      compressed_category                      product_category_name_english
0       "Home Essentials"  [Home Construction, Home Construction, Home Co...
1          "Publications"  [Books and General Interest, Technical Books, ...
2      Apparel Categories  [Fashion and Footwear, Fashion Underwear and B...
3         Baby Essentials  [Baby Products, Toys, Baby Products, Baby Prod...
4       Building Supplies  [Construction Tools and Safety, Garden Tools, ...
5          Consumer Goods  [Automotive, Electronics, Automotive, Automoti...
6      Cooking Essentials  [Kitchenware, Kitchenware, Oven, and Coffee Pr...
7       Creative Pursuits  [Arts, Arts, Arts, Arts, Arts and Handicrafts,...
8     Economic Activities  [Sports and Leisure, Sports and Leisure, Sport...
9                 Edibles  [Beverages, Beverages, Food and Beverages, Bev...
10       Gaming Equipment  [Computer Accessories, Computer Accessories, C...
11             Gift Shops  [Perfumery, Perfumery, Pet Shop, Perfumery, Pe...