## Read data

In [434]:
import pandas as pd

df_closed_deals=pd.read_csv('../data/brazilian_e-commerce/olist_closed_deals_dataset.csv')
df_closed_deals.head()

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue
0,5420aad7fec3549a85876ba1c529bd84,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26 19:58:54,pet,online_medium,cat,,,,reseller,,0.0
1,a555fb36b9368110ede0f043dfc3b9a0,bbb7d7893a450660432ea6652310ebb7,09285259593c61296eef10c734121d5b,d3d1e91a157ea7f90548eef82f1955e3,2018-05-08 20:17:59,car_accessories,industry,eagle,,,,reseller,,0.0
2,327174d3648a2d047e8940d7d15204ca,612170e34b97004b3ba37eae81836b4c,b90f87164b5f8c2cfa5c8572834dbe3f,6565aa9ce3178a5caf6171827af3a9ba,2018-06-05 17:27:23,home_appliances,online_big,cat,,,,reseller,,0.0
3,f5fee8f7da74f4887f5bcae2bafb6dd6,21e1781e36faf92725dde4730a88ca0f,56bf83c4bb35763a51c2baab501b4c67,d3d1e91a157ea7f90548eef82f1955e3,2018-01-17 13:51:03,food_drink,online_small,,,,,reseller,,0.0
4,ffe640179b554e295c167a2f6be528e0,ed8cb7b190ceb6067227478e48cf8dde,4b339f9567d060bcea4f5136b9f5949e,d3d1e91a157ea7f90548eef82f1955e3,2018-07-03 20:17:45,home_appliances,industry,wolf,,,,manufacturer,,0.0


## Data inspection

In [435]:
df_closed_deals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 842 entries, 0 to 841
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   mql_id                         842 non-null    object 
 1   seller_id                      842 non-null    object 
 2   sdr_id                         842 non-null    object 
 3   sr_id                          842 non-null    object 
 4   won_date                       842 non-null    object 
 5   business_segment               841 non-null    object 
 6   lead_type                      836 non-null    object 
 7   lead_behaviour_profile         665 non-null    object 
 8   has_company                    63 non-null     object 
 9   has_gtin                       64 non-null     object 
 10  average_stock                  66 non-null     object 
 11  business_type                  832 non-null    object 
 12  declared_product_catalog_size  69 non-null     flo

In [436]:
df_closed_deals.isnull().sum()

mql_id                             0
seller_id                          0
sdr_id                             0
sr_id                              0
won_date                           0
business_segment                   1
lead_type                          6
lead_behaviour_profile           177
has_company                      779
has_gtin                         778
average_stock                    776
business_type                     10
declared_product_catalog_size    773
declared_monthly_revenue           0
dtype: int64

In [437]:
df_closed_deals.duplicated().sum()

np.int64(0)

In [438]:
df_closed_deals['seller_id'].nunique()

842

In [439]:
df_closed_deals['mql_id'].nunique()

842

In [440]:
df_closed_deals['business_segment'].nunique()

33

In [441]:
df_closed_deals['business_segment'].value_counts()

business_segment
home_decor                         105
health_beauty                       93
car_accessories                     77
household_utilities                 71
construction_tools_house_garden     69
audio_video_electronics             64
computers                           34
pet                                 30
food_supplement                     28
food_drink                          26
sports_leisure                      25
bed_bath_table                      22
bags_backpacks                      22
toys                                20
fashion_accessories                 19
home_office_furniture               14
stationery                          13
phone_mobile                        13
small_appliances                    12
handcrafted                         12
baby                                10
music_instruments                    9
books                                9
jewerly                              8
watches                              8
home_app

In [442]:
df_closed_deals['business_segment'].isnull().sum()

np.int64(1)

In [443]:
df_closed_deals[df_closed_deals['business_segment'].isnull()]


Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue
186,a93dc621a446eb77129989e557dd50d0,ec5b3cd9d6bf0a880edfda73562a7cea,9d12ef1a7eca3ec58c545c678af7869c,2695de1affa7750089c0455f8ce27021,2018-06-11 14:29:08,,industry,,,,,reseller,,0.0


## Business segment standardization & mapping

In [444]:
import sys
print(sys.executable)

/opt/miniconda3/bin/python


In [445]:
import sys
!{sys.executable} -m pip install sentence-transformers

python(75108) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [446]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("all-MiniLM-L6-v2")

business_segments = df_closed_deals['business_segment'].unique().tolist()
embeddings = model.encode(business_segments)
X = np.array(embeddings)
print(f"Embeddings shape: {X.shape}")  

Embeddings shape: (34, 384)


In [447]:
business_segments = (
    df_closed_deals['business_segment']
    .dropna()
    .astype(str)
    .str.strip()
    .unique()
    .tolist()
)

In [448]:
embeddings = model.encode(business_segments)
X = np.array(embeddings)
print(X.shape)   

(33, 384)


In [449]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

scores = {}
for k in range(5, 16):
    kmeans_tmp = KMeans(n_clusters=k, random_state=42, n_init=100).fit(X)
    score = silhouette_score(X, kmeans_tmp.labels_)
    scores[k] = score
    print(f"k={k}, silhouette score={score:.3f}")

best_k = max(scores, key=scores.get)
print(f"Optimal number of clusters: {best_k}")

k=5, silhouette score=0.063
k=6, silhouette score=0.064
k=7, silhouette score=0.064
k=8, silhouette score=0.068
k=9, silhouette score=0.077
k=10, silhouette score=0.072
k=11, silhouette score=0.070
k=12, silhouette score=0.072
k=13, silhouette score=0.067
k=14, silhouette score=0.071
k=15, silhouette score=0.067
Optimal number of clusters: 9


In [450]:
from sklearn.cluster import KMeans

# Fit KMeans with n_init=20 to prevent empty clusters
kmeans = KMeans(
    n_clusters=best_k,
    random_state=42,
    n_init=100
).fit(X)

labels = kmeans.labels_

# Map each category to its numeric cluster
business_segment_map = {
    cat: f"Cluster {label}" 
    for cat, label in zip(business_segments, labels)
}

# Add numeric cluster column to the DataFrame
df_closed_deals['compressed_business_segment'] = (
    df_closed_deals['business_segment']
    .map(business_segment_map)
)
df_closed_deals.head()

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue,compressed_business_segment
0,5420aad7fec3549a85876ba1c529bd84,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26 19:58:54,pet,online_medium,cat,,,,reseller,,0.0,Cluster 6
1,a555fb36b9368110ede0f043dfc3b9a0,bbb7d7893a450660432ea6652310ebb7,09285259593c61296eef10c734121d5b,d3d1e91a157ea7f90548eef82f1955e3,2018-05-08 20:17:59,car_accessories,industry,eagle,,,,reseller,,0.0,Cluster 1
2,327174d3648a2d047e8940d7d15204ca,612170e34b97004b3ba37eae81836b4c,b90f87164b5f8c2cfa5c8572834dbe3f,6565aa9ce3178a5caf6171827af3a9ba,2018-06-05 17:27:23,home_appliances,online_big,cat,,,,reseller,,0.0,Cluster 3
3,f5fee8f7da74f4887f5bcae2bafb6dd6,21e1781e36faf92725dde4730a88ca0f,56bf83c4bb35763a51c2baab501b4c67,d3d1e91a157ea7f90548eef82f1955e3,2018-01-17 13:51:03,food_drink,online_small,,,,,reseller,,0.0,Cluster 5
4,ffe640179b554e295c167a2f6be528e0,ed8cb7b190ceb6067227478e48cf8dde,4b339f9567d060bcea4f5136b9f5949e,d3d1e91a157ea7f90548eef82f1955e3,2018-07-03 20:17:45,home_appliances,industry,wolf,,,,manufacturer,,0.0,Cluster 3


In [451]:
import requests, json

def ollama_chat(prompt, model_name="llama3:8b"):
    url = "http://localhost:11434/api/chat"
    payload = {"model": model_name, "messages": [{"role": "user", "content": prompt}]}
    response = requests.post(url, json=payload, stream=True)
    full_output = ""
    for line in response.iter_lines():
        if not line: 
            continue
        try:
            data = json.loads(line.decode("utf-8"))
        except json.JSONDecodeError:
            continue
        if "message" in data and "content" in data["message"]:
            full_output += data["message"]["content"]
    return full_output.strip()


In [452]:
# Generate descriptive names for each numeric cluster
cluster_names = {}

for i in range(best_k):
    items = [cat for cat, label in business_segment_map.items() if label == f"Cluster {i}"]
    prompt = (
        f"Cluster {i}: Name a meaningful category for these items:\n{items}\n"
        "Return only a single short descriptive name without quotes or punctuation."
    )
    cluster_names[f"Cluster {i}"] = ollama_chat(prompt)

In [453]:
seen = set()
unique_names = {}

for k, name in cluster_names.items():
    if name in seen:
        name = f"{name} ({k})"  # append cluster number if duplicate
    seen.add(name)
    unique_names[k] = name


In [454]:
df_closed_deals["compressed_business_segment"] = (
    df_closed_deals["compressed_business_segment"]
    .map(unique_names)
)
df_closed_deals.head()

Unnamed: 0,mql_id,seller_id,sdr_id,sr_id,won_date,business_segment,lead_type,lead_behaviour_profile,has_company,has_gtin,average_stock,business_type,declared_product_catalog_size,declared_monthly_revenue,compressed_business_segment
0,5420aad7fec3549a85876ba1c529bd84,2c43fb513632d29b3b58df74816f1b06,a8387c01a09e99ce014107505b92388c,4ef15afb4b2723d8f3d81e51ec7afefe,2018-02-26 19:58:54,pet,online_medium,cat,,,,reseller,,0.0,Occasions
1,a555fb36b9368110ede0f043dfc3b9a0,bbb7d7893a450660432ea6652310ebb7,09285259593c61296eef10c734121d5b,d3d1e91a157ea7f90548eef82f1955e3,2018-05-08 20:17:59,car_accessories,industry,eagle,,,,reseller,,0.0,Accessories
2,327174d3648a2d047e8940d7d15204ca,612170e34b97004b3ba37eae81836b4c,b90f87164b5f8c2cfa5c8572834dbe3f,6565aa9ce3178a5caf6171827af3a9ba,2018-06-05 17:27:23,home_appliances,online_big,cat,,,,reseller,,0.0,Home Furnishings
3,f5fee8f7da74f4887f5bcae2bafb6dd6,21e1781e36faf92725dde4730a88ca0f,56bf83c4bb35763a51c2baab501b4c67,d3d1e91a157ea7f90548eef82f1955e3,2018-01-17 13:51:03,food_drink,online_small,,,,,reseller,,0.0,Wellness
4,ffe640179b554e295c167a2f6be528e0,ed8cb7b190ceb6067227478e48cf8dde,4b339f9567d060bcea4f5136b9f5949e,d3d1e91a157ea7f90548eef82f1955e3,2018-07-03 20:17:45,home_appliances,industry,wolf,,,,manufacturer,,0.0,Home Furnishings


In [455]:
df_closed_deals["compressed_business_segment"].unique()

array(['Occasions', 'Accessories', 'Home Furnishings', 'Wellness',
       'Consumer Goods', 'Home Entertainment', 'CraftedGoods',
       'Accessories (Cluster 7)', 'Electronics', nan], dtype=object)

In [456]:
df_closed_deals["compressed_business_segment"].nunique()

9

In [457]:
# Check how many numeric clusters exist
print(len(business_segment_map.values()), len(set(business_segment_map.values())))

# Check cluster name counts
from collections import Counter
Counter(df_closed_deals['compressed_business_segment'])

33 9


Counter({'Home Furnishings': 300,
         'Wellness': 147,
         'Accessories': 118,
         'Electronics': 86,
         'Consumer Goods': 78,
         'Occasions': 49,
         'Home Entertainment': 30,
         'CraftedGoods': 25,
         'Accessories (Cluster 7)': 8,
         nan: 1})

In [458]:
mapping = df_closed_deals[['business_segment', 'compressed_business_segment']].drop_duplicates()
print(mapping.sort_values('compressed_business_segment'))

                    business_segment compressed_business_segment
1                    car_accessories                 Accessories
53                    bags_backpacks                 Accessories
23               fashion_accessories                 Accessories
24                           jewerly     Accessories (Cluster 7)
348                            gifts              Consumer Goods
237                          perfume              Consumer Goods
6                          computers              Consumer Goods
13                              toys              Consumer Goods
94                             books              Consumer Goods
22                           watches              Consumer Goods
199                      handcrafted                CraftedGoods
15                        stationery                CraftedGoods
161                     phone_mobile                 Electronics
69                 music_instruments                 Electronics
33           audio_video_

In [459]:
import pandas as pd

business_segment_mapping_table = (
    df_closed_deals
    .groupby('compressed_business_segment')['business_segment']
    .apply(lambda x: sorted(set(x.dropna().astype(str))))   # FIX: drop NaN + convert to str + sort
    .reset_index()
)

business_segment_mapping_table = business_segment_mapping_table.sort_values('compressed_business_segment')

print(business_segment_mapping_table)



  compressed_business_segment  \
0                 Accessories   
1     Accessories (Cluster 7)   
2              Consumer Goods   
3                CraftedGoods   
4                 Electronics   
5          Home Entertainment   
6            Home Furnishings   
7                   Occasions   
8                    Wellness   

                                    business_segment  
0  [bags_backpacks, car_accessories, fashion_acce...  
1                                          [jewerly]  
2  [books, computers, gifts, perfume, toys, watches]  
3                          [handcrafted, stationery]  
4  [audio_video_electronics, music_instruments, p...  
5  [air_conditioning, games_consoles, sports_leis...  
6  [bed_bath_table, construction_tools_house_gard...  
7               [baby, other, party, pet, religious]  
8       [food_drink, food_supplement, health_beauty]  


In [460]:
df_closed_deals.to_csv('prep_closed_deals_LLM_business_category_mapping.csv', index=False)