In [None]:
import json
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_distances
from collections import defaultdict

# --- Configuration -----------------------------------------------------------
# JSON file that holds the questions to cluster.
INPUT_FILE = "user_queries.json"
# NOTE(review): absolute, machine-specific path. Kept unchanged so the existing
# model cache keeps being used; adjust it when running on another machine.
MODEL_CACHE_DIR = "/home/ana/ACS/rag/models"
# Name of the sentence-embedding model handed to SentenceTransformer.
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
# Linkage distance at or above which two clusters are no longer merged.
# Distances are cosine distances, combined with average linkage (see below).
DISTANCE_THRESHOLD = 0.3

# --- Load the questions ------------------------------------------------------
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    questions = json.load(f)

# Fail early with a readable message instead of an opaque error raised from
# inside the embedding model or scikit-learn.
if not isinstance(questions, list):
    raise TypeError(
        f"{INPUT_FILE} must contain a JSON array of questions, "
        f"got {type(questions).__name__}"
    )
if len(questions) < 2:
    # Clustering needs at least two samples to build a distance matrix from.
    raise ValueError(
        f"{INPUT_FILE} must contain at least 2 questions to cluster, "
        f"got {len(questions)}"
    )

# --- Embed -------------------------------------------------------------------
# load embedding model
model = SentenceTransformer(EMBEDDING_MODEL_NAME, cache_folder=MODEL_CACHE_DIR)

# generate embeddings (convert_to_tensor=False so scikit-learn receives
# non-tensor output)
embeddings = model.encode(questions, convert_to_tensor=False)

# --- Cluster -----------------------------------------------------------------
# agglomerative clustering using the pairwise cosine distance matrix;
# n_clusters=None lets distance_threshold decide how many clusters come out.
cosine_dist_matrix = cosine_distances(embeddings)
clustering_model = AgglomerativeClustering(
    metric='precomputed',
    linkage='average',
    distance_threshold=DISTANCE_THRESHOLD,
    n_clusters=None,
)

# One cluster label per question, in the same order as `questions`.
labels = clustering_model.fit_predict(cosine_dist_matrix)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Bucket every question under the integer label the clustering assigned to it.
# Labels are cast to plain int so they can be used as JSON object keys.
clusters = defaultdict(list)
for text, cluster_id in zip(questions, map(int, labels)):
    clusters[cluster_id].append(text)

# Serialise the label -> questions mapping, keys in sorted order, keeping
# non-ASCII characters readable in the output file.
serialized = json.dumps(clusters, ensure_ascii=False, indent=2, sort_keys=True)
with open("clusters.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print(f"Clustered {len(questions)} questions into {len(clusters)} clusters.")

Clustered 941 questions into 331 clusters.


In [3]:
OUTPUT_FILE = "clusters.json"

# Rebuild the label -> questions mapping so this cell does not rely on the
# previous one having been executed.
clusters = defaultdict(list)
for text, cluster_id in zip(questions, map(int, labels)):
    clusters[cluster_id].append(text)

# Bucket whole clusters by how many questions they contain:
# size -> list of clusters (each cluster is a list of questions).
length_clusters = defaultdict(list)
for members in clusters.values():
    length_clusters[len(members)].append(members)

# Plain dict ordered from the largest cluster size down to the smallest.
sorted_clusters = {
    size: length_clusters[size]
    for size in sorted(length_clusters, reverse=True)
}

# Write the size-grouped clusters, keeping non-ASCII characters readable.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(sorted_clusters, ensure_ascii=False, indent=2))

print(f"Created {len(clusters)} clusters with {len(length_clusters)} unique lengths.")
print(f"Clusters are grouped by length and saved to {OUTPUT_FILE}")

Created 331 clusters with 17 unique lengths.
Clusters are grouped by length and saved to clusters.json
