In [1]:
import json
import numpy as np
from sklearn.cluster import KMeans

# Load embeddings from the JSON files
embedding_files = ['embeddings/animals.json', 'embeddings/companies.json', 'embeddings/computer_components.json', 'embeddings/events.json',
                  'embeddings/landmarks.json','embeddings/movies.json','embeddings/people.json',
                  'embeddings/research_fields.json','embeddings/university.json','embeddings/vehicles.json', 'embeddings/prompts.json']

all_embeddings = []
file_names = []  # To store the corresponding file names

# Iterate through each file and load its content
for file in embedding_files:
    with open(file, 'r') as f:
        data = json.load(f)  # Load the data from each file
        
        # Add embeddings and file names
        embeddings = np.array([item['embedding'] for item in data])  # Extract 'embedding' key
        all_embeddings.append(embeddings)  # Add the embeddings
        file_names.extend([item['file_name'] for item in data])  # Store file names

# Combine all embeddings into a single array
all_embeddings = np.concatenate(all_embeddings, axis=0)

# Check the shape of the combined embeddings
print("Combined embeddings shape:", all_embeddings.shape)

Combined embeddings shape: (210, 1536)


In [2]:
from sklearn.decomposition import PCA

pca = PCA(n_components=9)  # Reduce to 9 dimensions (adjust based on your data)
reduced_embeddings = pca.fit_transform(all_embeddings)

kmeans40 = KMeans(n_clusters=40, init='k-means++', n_init=10, max_iter=300, random_state=42)
clusters = kmeans40.fit_predict(reduced_embeddings)

print(clusters)

[ 2  2  2  2  2 28 28 28 28 28 13 13 13 13 13 13 13 13 13 13 21 21 21 21
 21  6  6  6  6  6 35 35 35 35 35 21 21 21 21 21  3  3  3  3  3 30 30 30
 30 30 37 37 37 37 37 34 34 34 34 34 19 19 19 19 19  8  8  8  8  8 12 12
 12 12 12 18 18 18 18 18  4  4  4  4  4 15 15 15 15 15 10 10 10 10 10 27
 27 27 27 27 24 24 24 24 24 23 23 23 23 23 31 31 31 31 31  1  1  1  1  1
 14 14 14 14 14 39 39 39 39 39  5  5  5  5  5 17 17 17 17 17 25 25 25 25
 25 26 26 26 26 26  0  0  0  0  0 33 33 33 33 33  9  9  9  9  9 22 22 22
 22 22 11 11 11 11 11 32 32 32 32 32  7  7  7  7  7 20 20 20 20 20 16 16
 16 16 16 29 29 29 29 29 38 10 36 30 12  0 35  8 24 32]


In [3]:
# Now print the clusters with the corresponding filenames
for cluster_num in range(40):  # Assuming there are 4 clusters
    cluster_members = [file_names[idx] for idx in range(len(clusters)) if clusters[idx] == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")

Cluster 0: ['mathematics1.txt', 'mathematics2.txt', 'mathematics3.txt', 'mathematics4.txt', 'mathematics5.txt', 'prompt_mathematics.txt']
Cluster 1: ['star_wars1.txt', 'star_wars2.txt', 'star_wars3.txt', 'star_wars4.txt', 'star_wars5.txt']
Cluster 2: ['cat1.txt', 'cat2.txt', 'cat3.txt', 'cat4.txt', 'cat5.txt']
Cluster 3: ['cpu1.txt', 'cpu2.txt', 'cpu3.txt', 'cpu4.txt', 'cpu5.txt']
Cluster 4: ['buckingham_palace1.txt', 'buckingham_palace2.txt', 'buckingham_palace3.txt', 'buckingham_palace4.txt', 'buckingham_palace5.txt']
Cluster 5: ['klaus1.txt', 'klaus2.txt', 'klaus3.txt', 'klaus4.txt', 'klaus5.txt']
Cluster 6: ['google1.txt', 'google2.txt', 'google3.txt', 'google4.txt', 'google5.txt']
Cluster 7: ['airplane1.txt', 'airplane2.txt', 'airplane3.txt', 'airplane4.txt', 'airplane5.txt']
Cluster 8: ['moon_landing1.txt', 'moon_landing2.txt', 'moon_landing3.txt', 'moon_landing4.txt', 'moon_landing5.txt', 'prompt_moon_landing.txt']
Cluster 9: ['cuni1.txt', 'cuni2.txt', 'cuni3.txt', 'cuni4.txt', 

In [4]:
y_pred40_prompt = [
    0, 0, 0, 0, 0,0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,5, 6, 6, 6, 6, 6,6, 7, 7, 7, 7, 7,7, 8, 8, 8, 8, 8,8, 
    9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15,  
    16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22,  
    23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
    30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 
    36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39 
]

In [5]:
from sklearn.metrics import rand_score
rand_index = rand_score(clusters, y_pred40_prompt)

print(f"Rand Index: {rand_index}")

Rand Index: 0.9907951697425381


In [6]:
# Apply k-means clustering with 10 clusters
kmeans10 = KMeans(n_clusters=10, random_state=42)
clusters = kmeans10.fit_predict(reduced_embeddings)

# Check the cluster assignments
print(clusters)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 0 9 9 9 9 9 3 3 3 3 3 9 9 9 9 9 9 9 9 9 9 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 9 2 6 1 5 4 0 8 3]


In [7]:
# Now print the clusters with the corresponding filenames
for cluster_num in range(10):  # Assuming there are 4 clusters
    cluster_members = [file_names[idx] for idx in range(len(clusters)) if clusters[idx] == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")

Cluster 0: ['boston_tea1.txt', 'boston_tea2.txt', 'boston_tea3.txt', 'boston_tea4.txt', 'boston_tea5.txt', 'moon_landing1.txt', 'moon_landing2.txt', 'moon_landing3.txt', 'moon_landing4.txt', 'moon_landing5.txt', 'world_war1.txt', 'world_war2.txt', 'world_war3.txt', 'world_war4.txt', 'world_war5.txt', 'einstein1.txt', 'einstein2.txt', 'einstein3.txt', 'einstein4.txt', 'einstein5.txt', 'mandela1.txt', 'mandela2.txt', 'mandela3.txt', 'mandela4.txt', 'mandela5.txt', 'mit1.txt', 'mit2.txt', 'mit3.txt', 'mit4.txt', 'mit5.txt', 'prompt_moon_landing.txt']
Cluster 1: ['velvet_revolution1.txt', 'velvet_revolution2.txt', 'velvet_revolution3.txt', 'velvet_revolution4.txt', 'velvet_revolution5.txt', 'havel1.txt', 'havel2.txt', 'havel3.txt', 'havel4.txt', 'havel5.txt', 'klaus1.txt', 'klaus2.txt', 'klaus3.txt', 'klaus4.txt', 'klaus5.txt', 'prompt_havel.txt']
Cluster 2: ['cat1.txt', 'cat2.txt', 'cat3.txt', 'cat4.txt', 'cat5.txt', 'dolphin1.txt', 'dolphin2.txt', 'dolphin3.txt', 'dolphin4.txt', 'dolphin

In [8]:
y_pred10_prompt = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,4,
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,5,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,7,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,8,
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,9
]

In [9]:
from sklearn.metrics import rand_score
rand_index = rand_score(clusters, y_pred10_prompt)

print(f"Rand Index: {rand_index}")

Rand Index: 0.9050808840282525
