In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [2]:
# Load the processed question dataset. The file is decoded as latin1, and
# low_memory=False makes pandas parse the file in a single pass so column
# dtypes are inferred from the whole column rather than chunk by chunk.
df = pd.read_csv(
    "D:/Projects/nlp_qa_platform/data/processed/final_dataset.csv",
    encoding="latin1",
    low_memory=False
)

# Load the precomputed question embeddings saved as a NumPy array.
embeddings = np.load(
    "D:/Projects/nlp_qa_platform/data/embeddings/question_embeddings.npy"
)

print(df.shape)
print(embeddings.shape)

# Fail fast if the two inputs are not row-aligned: the cells that follow pair
# row i of `df` with row i of `embeddings`, and a mismatch would otherwise only
# surface after the expensive clustering step has already run.
if len(df) != embeddings.shape[0]:
    raise ValueError(
        f"Row count mismatch: dataset has {len(df)} rows but embeddings "
        f"has {embeddings.shape[0]} rows"
    )


(25000, 6)
(25000, 384)


In [3]:
# Number of clusters requested from KMeans (passed as `n_clusters` when the
# model is constructed).
# NOTE(review): the value appears to be hand-picked; no selection procedure
# (e.g. a sweep over k) is visible in this notebook — confirm.
NUM_CLUSTERS = 30



In [4]:
# Fit k-means on the question embeddings. The fixed random_state and the
# explicit n_init keep the run reproducible.
kmeans = KMeans(n_clusters=NUM_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(embeddings)

# One cluster index per row of `embeddings`, as assigned during fitting.
cluster_labels = kmeans.labels_


In [5]:
# Attach each question's cluster assignment as the `cluster_id` column.
df = df.assign(cluster_id=cluster_labels)

# Preview the processed text next to its assigned cluster.
df.loc[:, ["Processed_Text", "cluster_id"]].head()


Unnamed: 0,Processed_Text,cluster_id
0,compare excel file ssis foreach loop container...,1
1,black screen appear randomly take image tap us...,4
2,java include file jar file intellij idea devel...,5
3,add summary column reporting services matrix s...,6
4,function loop call return value dict understan...,12


In [6]:
# Mean silhouette coefficient over all samples for the fitted clustering.
# The coefficient ranges from -1 to 1; values near 0 indicate overlapping
# clusters.
score = silhouette_score(X=embeddings, labels=cluster_labels)
print("Silhouette Score:", score)


Silhouette Score: 0.02770848199725151


In [7]:
# Print up to three example texts (truncated to 120 characters) for each of
# the first five cluster ids, as a quick qualitative look at the clusters.
for cid in range(5):
    print(f"\n--- Cluster {cid} ---")
    in_cluster = df["cluster_id"] == cid
    for text in df.loc[in_cluster, "Processed_Text"].head(3):
        print("-", text[:120])




--- Cluster 0 ---
- possible count code execution step prossible report calculation execute section code know benchmark gem report runtime b
- linux program run exit ssh non root user search google sit irc week talk friend devoutly align linux receive solid answe
- immunity debugger give name variable address work immunity debugger deal lot address form get tricky call nest way choos

--- Cluster 1 ---
- compare excel file ssis foreach loop container introduction multiple excel file loop foreach loop container ssis package
- pass object python powershell want integration python script powershell script need option pass object class powershell 
- file string manipulation prolog new prolog know technique use want file name want want function return list string mean 

--- Cluster 2 ---
- have trouble select item drop value dynamic webdriver webdriver drop form value dynamic number change time form open sel
- issue post ajax mysql data base update pretty sure code good test issue async et

In [8]:
# Persist the dataset, now including the `cluster_id` column, without writing
# the DataFrame index as an extra column.
clustered_csv_path = (
    "D:/Projects/nlp_qa_platform/data/processed/final_dataset_with_clusters.csv"
)
df.to_csv(clustered_csv_path, index=False)
