In [43]:
from transformers import AutoModel , AutoTokenizer
import pandas as pd
import torch
from sklearn.cluster import KMeans
from tqdm import tqdm

In [44]:
# If gpu is available set the device onto gpu otherwise cpu
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# read data
df = pd.read_csv("/content/drive/MyDrive/BA Project/productSampled.csv",encoding='utf-8')
# print a sample of data
df.head()

Unnamed: 0,id,category_name,titles,min_price,max_price,avg_price,min_num_shops,max_num_shops,avg_num_shops
0,1867826,میکروسکوپ,میکروسکوپ اپتیکی سلسترون مدل 44121 cgl,1900000.0,2082000.0,1928314.0,1,2,1.655303
1,9418195,پرینتر لیزری و چند کاره,پرینتر حرارتی canon selphy cp1000 پرينتر چاپ ...,,,,0,0,0.0
2,527022,منگنه و پانچ,دستگاه منگنه فلزی مپد مدل essentials کد e 352...,50000.0,65000.0,63575.42,1,1,1.0
3,3190482,فلاسک، قمقمه و ظروف سفر,فلاسک هنری مدل roxana 450 کد sm 259 فلاسک هنر...,359000.0,359000.0,359000.0,0,1,0.054545
4,5297025,مخلوط کن برقی,مخلوط کن پاناسونیک mx n800g آسیاب مخلوط کن پا...,1280000.0,1449000.0,1332843.0,6,12,9.434912


In [27]:
# print number of each category sample
df['category_name'].value_counts()

Unnamed: 0_level_0,count
category_name,Unnamed: 1_level_1
کتاب و مجلات,1410
کیف و کاور گوشی,979
ساعت مچی عقربه‌ ای و دیجیتالی,699
فایل‌های دانلودی,679
کیف و کوله,376
...,...
بارهنگ,1
طلق و شیرازه,1
چراغ اسپورت خودرو,1
سبزی خشک کن,1


In [28]:
number_of_categories = len(df['category_name'].value_counts().index)
number_of_categories

2667

In [47]:
tokenizer = AutoTokenizer.from_pretrained("sbunlp/fabert")
embedder = AutoModel.from_pretrained("sbunlp/fabert").to(DEVICE)

Some weights of BertModel were not initialized from the model checkpoint at sbunlp/fabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
def format_instruction(sample):
    return f"""
        {sample['category_name']}
        [SEP]
        {sample['titles']}
    """.strip()



In [49]:
prompts = []
for idx in tqdm(range(len(df))):
    prompts.append(
        format_instruction(df.iloc[idx])
    )

100%|██████████| 36123/36123 [00:01<00:00, 18683.83it/s]


In [50]:
import torch
from tqdm import tqdm

# Define your batch size
BATCH_SIZE = 64
MAX_SENTENCE_LENGTH = 128

embeds = []
num_batches = len(prompts) // BATCH_SIZE + (1 if len(prompts) % BATCH_SIZE != 0 else 0)

for batch_idx in tqdm(range(num_batches)):
    # Determine the start and end indices of the current batch
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min((batch_idx + 1) * BATCH_SIZE, len(prompts))

    # Get the batch of prompts
    batch_prompts = prompts[start_idx:end_idx]

    # Tokenize the batch of prompts
    tokenized_texts = tokenizer(batch_prompts, return_tensors="pt", truncation=True, padding=True, max_length=MAX_SENTENCE_LENGTH)

    # Move tokenized tensors to the GPU (if available)
    input_ids = tokenized_texts['input_ids'][:,:MAX_SENTENCE_LENGTH].to(DEVICE)
    attention_mask = tokenized_texts['attention_mask'][:,:MAX_SENTENCE_LENGTH].to(DEVICE)

    # Get embeddings for the batch
    with torch.no_grad():  # Ensure gradients are not computed for efficiency
        out = embedder(input_ids, attention_mask).last_hidden_state.mean(dim=1).cpu().detach().numpy()

    # Append embeddings to the list
    embeds.extend(out)



100%|██████████| 565/565 [03:00<00:00,  3.13it/s]


In [51]:
class KMeans:
    def __init__(self, n_clusters, max_iter=100, tol=1e-4):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.centroids = None

    def initialize_centroids(self, X):
        # Randomly choose `n_clusters` points from X as initial centroids
        indices = torch.randperm(X.size(0))[:self.n_clusters]
        self.centroids = X[indices]

    def compute_distances(self, X):
        # Compute the distance between each point and each centroid
        distances = torch.cdist(X, self.centroids, p=2)
        return distances

    def fit(self, X):
        # Move data to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        X = X.to(device)

        # Initialize centroids
        self.initialize_centroids(X)

        for i in tqdm(range(self.max_iter)):
            old_centroids = self.centroids.clone()

            # Step 1: Assign clusters based on the closest centroid
            distances = self.compute_distances(X)
            cluster_labels = torch.argmin(distances, dim=1)

            # Step 2: Recompute centroids
            new_centroids = torch.stack([X[cluster_labels == k].mean(dim=0) for k in range(self.n_clusters)])

            # Check for convergence (using L2 norm)
            if torch.norm(new_centroids - old_centroids, p=2) < self.tol:
                print(f'Converged at iteration {i}')
                break

            self.centroids = new_centroids

        # Store the final cluster assignments
        return cluster_labels.cpu()

    def predict(self, X):
        X = X.to(self.centroids.device)  # Make sure data is on the same device as centroids
        distances = self.compute_distances(X)
        return torch.argmin(distances, dim=1).cpu()

In [56]:
import torch
import numpy as np

embeds_array = np.vstack(embeds)  # Stack the list of numpy arrays into one 2D array
embeds_tensor = torch.tensor(embeds_array)  # Convert to PyTorch tensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embeds_tensor = embeds_tensor.to(device)
kmeans = KMeans(n_clusters=1000, max_iter=10000,tol=1e-4)
cluster_labels = kmeans.fit(embeds_tensor)
print(cluster_labels)

  0%|          | 46/10000 [00:04<18:00,  9.21it/s]

Converged at iteration 46
tensor([854, 312, 746,  ..., 815, 685, 359])





In [57]:
cluster_labels = np.array(cluster_labels)
print("Cluster assignments:", cluster_labels)

Cluster assignments: [854 312 746 ... 815 685 359]


In [58]:
df = pd.read_csv("/content/drive/MyDrive/BA Project/productSampled.csv",encoding='utf-8')
df['cluster_label'] = cluster_labels

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36123 entries, 0 to 36122
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             36123 non-null  int64  
 1   category_name  36123 non-null  object 
 2   titles         36123 non-null  object 
 3   min_price      31172 non-null  float64
 4   max_price      31172 non-null  float64
 5   avg_price      31172 non-null  float64
 6   min_num_shops  36123 non-null  int64  
 7   max_num_shops  36123 non-null  int64  
 8   avg_num_shops  36123 non-null  float64
 9   cluster_label  36123 non-null  int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 2.8+ MB


In [59]:
df.to_csv('/content/drive/MyDrive/BA Project/productClustered3.csv', index=False,encoding='utf-8-sig')

In [60]:
df = df.groupby("cluster_label")
for cluster_label, group in df:
    print(f"Cluster {cluster_label}:")
    print(list(set(group['category_name'].tolist())))  # List of category names in this cluster
    print("-" * 50)  # Separator between clusters

Cluster 0:
['تبر، بیل و کلنگ', 'سوهان و سنباده کارگاهی', 'قیچی و ابزار باغبانی', 'لوازم جانبی ابزارآلات', 'سیم لخت\u200cکن و پرس کابل', 'چمن زن و داس موتوری', 'پشم چین دام', 'قلم تراش و غلاف', 'قاشق، چنگال و کارد', 'سنبه و لوازم نشانه گذاری', 'قیچی ورق و مفتول بر', 'سایر لوازم صافکاری', 'سنباده دیسکی، نواری و لرزان', 'شیار زن، کاشی و مرمر بر', 'فرز و مینی فرز', 'شانه و برس', 'میخ، پیچ و رول پلاک', 'سایر ابزار دستی و تجهیزات کارگاهی', 'سوسیس و کالباس']
--------------------------------------------------
Cluster 1:
['موکت', 'کف پوش، دیوار پوش و پارکت']
--------------------------------------------------
Cluster 2:
['سایر لوازم تزئینی خودرو', 'طبق خودرو', 'آرم خودرو', 'سایر لوازم خودرو']
--------------------------------------------------
Cluster 3:
['مکرومه بافی', 'پولک و منجوق']
--------------------------------------------------
Cluster 4:
['سه چرخه', 'کامیون و تریلی اسباب بازی', 'ماشین، قطار، کشتی اسباب بازی', 'موتور اسباب بازی', 'ماکت ماشین', 'اتوبوس و مینی بوس اسباب بازی', 'ماشین اسباب 