In [1]:
import json
import numpy as np
from sklearn.cluster import KMeans

# Source files: ten category files plus the prompt embeddings, each a JSON list of
# records that carry an 'embedding' vector and a 'file_name'.
embedding_files = [
    'embeddings/animals.json',
    'embeddings/companies.json',
    'embeddings/computer_components.json',
    'embeddings/events.json',
    'embeddings/landmarks.json',
    'embeddings/movies.json',
    'embeddings/people.json',
    'embeddings/research_fields.json',
    'embeddings/university.json',
    'embeddings/vehicles.json',
    'embeddings/prompts.json',
]

per_file_matrices = []  # one array of embeddings per source file, in load order
file_names = []         # 'file_name' of every record, in the same order as the stacked rows

# Read each file once and collect its vectors and names side by side so that
# row i of the stacked matrix always corresponds to file_names[i].
for json_path in embedding_files:
    with open(json_path, 'r') as handle:
        records = json.load(handle)
    per_file_matrices.append(np.array([record['embedding'] for record in records]))
    file_names += [record['file_name'] for record in records]

# Stack the per-file arrays row-wise into a single matrix.
all_embeddings = np.concatenate(per_file_matrices, axis=0)

# Sanity check: report the shape of the combined matrix.
print("Combined embeddings shape:", all_embeddings.shape)

Combined embeddings shape: (210, 1536)


In [2]:
from sklearn.decomposition import PCA

# Reduce the embeddings to 9 principal components before clustering.
# PCA is seeded explicitly: with svd_solver='auto' and n_components much smaller
# than the matrix dimensions, scikit-learn may select the randomized SVD solver,
# in which case an unseeded projection (and every label / metric derived from it)
# can differ between runs. Re-run the downstream cells after changing the seed.
pca = PCA(n_components=9, random_state=42)
reduced_embeddings = pca.fit_transform(all_embeddings)

# 40-cluster K-Means on the reduced embeddings; 20 seeded restarts, best run kept.
kmeans40 = KMeans(n_clusters=40, init='k-means++', n_init=20, max_iter=300, random_state=42)
labels40 = kmeans40.fit_predict(reduced_embeddings)

print(labels40)

[31 31 31 31 31  5  5  5  5  5 29 29 29 29 29 29 29 29 29 29  6  6  6  6
  6 33 33 33 33 33 16 16 16 16 16 36 36 36 36 36  9  9  9  9  9 30 30 30
 30 30  9  9  9  9  9 35 35 35 35 35 18 18 18 18 18  1  1  1  1  1 38 38
 38 38 38  7  7  7  7  7 19 19 19 19 19 12 12 12 12 12 20 20 20 20 20  2
  2  2  2  2 25 25 25 25 25 17 17 17 17 17 34 34 34 34 34 10 10 10 10 10
 13 13 13 13 13 22 22 22 22 22  3  3  3  3  3 15 15 15 15 15 14 14 14 14
 14 23 23 23 23 23 28 28 28 28 28  0  0  0  0  0  8  8  8  8  8 21 21 21
 21 21 11 11 11 11 11 27 27 27 27 27 24 24 24 24 24 32 32 32 32 32  4  4
  4  4  4 26 26 26 26 26 37 20 39 30 38 28 16  1 25 27]


In [3]:
import os, re

def extract_entity(fn):
    """Derive the entity name from a file name.

    The final extension is dropped, every occurrence of the markers
    'prompt_' and '_prompt' is removed, and any run of digits at the very
    end of what remains is stripped
    (e.g. 'boston_tea1.txt' -> 'boston_tea', 'prompt_havel.txt' -> 'havel').

    Args:
        fn: File name, with or without an extension.

    Returns:
        The entity name as a string.
    """
    stem, _extension = os.path.splitext(fn)
    for marker in ('prompt_', '_prompt'):
        stem = stem.replace(marker, '')
    # Only a trailing run of digits is removed; digits elsewhere are kept.
    return re.sub(r'\d+$', '', stem)

# Entity name for every loaded file, in the same order as file_names / labels40.
true_entities = [extract_entity(fn) for fn in file_names]

# H1: define 10 "domain" groups
expected_groups = {
    0: ['havel','klaus','einstein','mandela'],
    1: ['chemistry','economics','mathematics','physics'],
    2: ['amazon','google','microsoft','tencent'],
    3: ['airplane','boat','car','train'],
    4: ['boston_tea','moon_landing','velvet_revolution','world_war'],
    5: ['buckingham_palace','charles_bridge','eiffel_tower','fuji'],
    6: ['cat','dolphin','elephant','giraffe'],
    7: ['cpu','gpu','motherboard','ram'],
    8: ['cuni','mit','oxford','vse'],
    9: ['lord_rings','matrix','pirates','star_wars']
}
# Invert the mapping: entity name -> H1 group id.
entity_to_group = {ent: grp for grp, ents in expected_groups.items() for ent in ents}

# Validate BEFORE any lookup: checking after building the H1 labels would let an
# unmapped entity surface as a bare KeyError and never reach this message.
invalid = [e for e in true_entities if e not in entity_to_group]
if invalid:
    raise ValueError(f"Unmapped in H1 groups: {set(invalid)}")

# H2: one cluster per entity
unique_entities = sorted(set(true_entities))
entity_to_idx    = {ent: idx for idx, ent in enumerate(unique_entities)}

# Build ground-truth labels
groundtruth10_prompt = [entity_to_group[ent]   for ent in true_entities]
groundtruth40_prompt = [entity_to_idx[ent]     for ent in true_entities]

print(groundtruth10_prompt)
print(groundtruth40_prompt)

# Print the clusters with the corresponding filenames.
# The labels must line up one-to-one with the file names for the listing to be valid.
assert len(file_names) == len(labels40), "labels40 and file_names are misaligned"
for cluster_num in range(40):
    cluster_members = [name for name, label in zip(file_names, labels40) if label == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 6, 7, 0, 1, 2, 4, 9, 8]
[6, 6, 6, 6, 6, 11, 11, 11, 11, 11, 15, 15, 15, 15, 15, 17, 17, 17, 17, 17, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 26, 26, 26, 26, 26, 35, 35, 35, 35, 35, 9, 9, 9, 9, 9, 19, 19, 19, 19, 19, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 3, 3, 3, 3, 3, 28, 28, 28, 28, 28, 37, 37, 37, 37, 37, 39, 39, 39, 39, 39, 4, 4, 4, 4, 4, 7, 7, 7, 7, 7, 13, 13, 13, 13, 13, 16, 16, 16, 16, 16

In [4]:
from sklearn.metrics import rand_score
# Rand index of the 40-cluster K-Means labels against the per-entity ground truth (H2).
rand_index = rand_score(labels_true=groundtruth40_prompt, labels_pred=labels40)

print(f"Rand Index: {rand_index}")

Rand Index: 0.9963545226703121


In [5]:
from sklearn.metrics import adjusted_rand_score

# Adjusted Rand index for the same H2 comparison (per-entity ground truth vs. labels40).
ari40 = adjusted_rand_score(labels_true=groundtruth40_prompt, labels_pred=labels40)
print(f"Adjusted Rand Index: {ari40:.4f}")

Adjusted Rand Index: 0.9130


In [6]:
# Apply k-means clustering with 10 clusters (H1: one cluster per domain group).
# init / n_init / max_iter are pinned to the same values as the 40-cluster run:
# the library default for n_init differs between scikit-learn releases, so leaving
# it implicit makes the labels version-dependent and the H1/H2 comparison uneven.
kmeans10 = KMeans(n_clusters=10, init='k-means++', n_init=20, max_iter=300, random_state=42)
labels10 = kmeans10.fit_predict(reduced_embeddings)

# Check the cluster assignments
print(labels10)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 0 9 9 9 9 9 3 3 3 3 3 9 9 9 9 9 9 9 9 9 9 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 9 2 6 1 5 4 0 8 3]


In [7]:
# List, for each of the 10 cluster ids, the file names whose label equals that id.
for cluster_num in range(10):
    cluster_members = [
        file_names[idx]
        for idx, label in enumerate(labels10)
        if label == cluster_num
    ]
    print(f"Cluster {cluster_num}: {cluster_members}")

Cluster 0: ['boston_tea1.txt', 'boston_tea2.txt', 'boston_tea3.txt', 'boston_tea4.txt', 'boston_tea5.txt', 'moon_landing1.txt', 'moon_landing2.txt', 'moon_landing3.txt', 'moon_landing4.txt', 'moon_landing5.txt', 'world_war1.txt', 'world_war2.txt', 'world_war3.txt', 'world_war4.txt', 'world_war5.txt', 'einstein1.txt', 'einstein2.txt', 'einstein3.txt', 'einstein4.txt', 'einstein5.txt', 'mandela1.txt', 'mandela2.txt', 'mandela3.txt', 'mandela4.txt', 'mandela5.txt', 'mit1.txt', 'mit2.txt', 'mit3.txt', 'mit4.txt', 'mit5.txt', 'prompt_moon_landing.txt']
Cluster 1: ['velvet_revolution1.txt', 'velvet_revolution2.txt', 'velvet_revolution3.txt', 'velvet_revolution4.txt', 'velvet_revolution5.txt', 'havel1.txt', 'havel2.txt', 'havel3.txt', 'havel4.txt', 'havel5.txt', 'klaus1.txt', 'klaus2.txt', 'klaus3.txt', 'klaus4.txt', 'klaus5.txt', 'prompt_havel.txt']
Cluster 2: ['cat1.txt', 'cat2.txt', 'cat3.txt', 'cat4.txt', 'cat5.txt', 'dolphin1.txt', 'dolphin2.txt', 'dolphin3.txt', 'dolphin4.txt', 'dolphin

In [8]:
from sklearn.metrics import rand_score
# Rand index of the 10-cluster K-Means labels against the domain-group ground truth (H1).
# Note: this rebinds `rand_index`, replacing the value computed for the 40-cluster run.
rand_index = rand_score(labels_true=groundtruth10_prompt, labels_pred=labels10)

print(f"Rand Index: {rand_index}")

Rand Index: 0.9646844383686489


In [9]:
from sklearn.metrics import adjusted_rand_score

# Adjusted Rand index for the H1 comparison (domain-group ground truth vs. labels10).
ari10 = adjusted_rand_score(labels_true=groundtruth10_prompt, labels_pred=labels10)
print(f"Adjusted Rand Index: {ari10:.4f}")

Adjusted Rand Index: 0.7992


In [10]:
from sklearn.metrics import pair_confusion_matrix
def pairwise_prf(y_true, y_pred):
    """Pair-counting precision, recall and F1 of a clustering against reference labels.

    The four counts are taken from the 2x2 matrix returned by
    ``pair_confusion_matrix`` in row-major order: tn, fp, fn, tp.

    Args:
        y_true: Reference labels, one per sample.
        y_pred: Predicted cluster labels, one per sample.

    Returns:
        dict with keys "precision", "recall" and "f1". A score whose
        denominator is zero is reported as 0.0.
    """
    (_tn, fp), (fn, tp) = pair_confusion_matrix(y_true, y_pred)

    predicted_pairs = tp + fp  # denominator of precision
    reference_pairs = tp + fn  # denominator of recall

    prec = tp / predicted_pairs if predicted_pairs > 0 else 0.0
    rec = tp / reference_pairs if reference_pairs > 0 else 0.0

    if prec + rec > 0:
        f1 = 2 * prec * rec / (prec + rec)
    else:
        f1 = 0.0

    return {"precision": prec, "recall": rec, "f1": f1}

In [11]:
# Pairwise precision / recall / F1 for the H1 comparison (domain groups vs. labels10).
metrics10 = pairwise_prf(groundtruth10_prompt, labels10)
precision10, recall10, f1_10 = (metrics10[key] for key in ("precision", "recall", "f1"))
print(f"Pairwise Precision: {precision10:.4f}")
print(f"Pairwise Recall:    {recall10:.4f}")
print(f"Pairwise F1-score:  {f1_10:.4f}")

Pairwise Precision: 0.8046
Pairwise Recall:    0.8333
Pairwise F1-score:  0.8187


In [12]:
# Pairwise precision / recall / F1 for the H2 comparison (per-entity labels vs. labels40).
metrics40 = pairwise_prf(groundtruth40_prompt, labels40)
precision40, recall40, f1_40 = (metrics40[key] for key in ("precision", "recall", "f1"))
print(f"Pairwise Precision: {precision40:.4f}")
print(f"Pairwise Recall:    {recall40:.4f}")
print(f"Pairwise F1-score:  {f1_40:.4f}")

Pairwise Precision: 0.8776
Pairwise Recall:    0.9556
Pairwise F1-score:  0.9149
