In [1]:
import json
import numpy as np
from sklearn.cluster import KMeans

# Load embeddings from the JSON files
embedding_files = ['embeddings/animals.json', 'embeddings/companies.json', 'embeddings/computer_components.json', 'embeddings/events.json',
                  'embeddings/landmarks.json','embeddings/movies.json','embeddings/people.json',
                  'embeddings/research_fields.json','embeddings/university.json','embeddings/vehicles.json', 'embeddings/prompts.json']

all_embeddings = []
file_names = []  # To store the corresponding file names

# Iterate through each file and load its content
for file in embedding_files:
    with open(file, 'r') as f:
        data = json.load(f)  # Load the data from each file
        
        # Add embeddings and file names
        embeddings = np.array([item['embedding'] for item in data])  # Extract 'embedding' key
        all_embeddings.append(embeddings)  # Add the embeddings
        file_names.extend([item['file_name'] for item in data])  # Store file names

# Combine all embeddings into a single array
all_embeddings = np.concatenate(all_embeddings, axis=0)

# Check the shape of the combined embeddings
print("Combined embeddings shape:", all_embeddings.shape)

Combined embeddings shape: (210, 1536)


In [2]:
from sklearn.cluster import KMeans

# Apply k-means clustering with 40 clusters
kmeans40 = KMeans(n_clusters=40, random_state=42)
clusters = kmeans40.fit_predict(all_embeddings)

# Check the cluster assignments
print(clusters)

[30 30 30 30 30  6  6  6  6  6 34 34 34 34 34 34 34 34 34 34 29 29 29 29
 29 14 14 14 14 14  3  3  3  3  3 25 25 25 25 25 37 37 37 37 37 27 27 27
 27 27 21 21 21 21 21  9  9  9  9  9 17 17 17 17 17 12 12 12 12 12 36 36
 36 36 36  0  0  0  0  0 22 22 22 22 22 35 35 35 35 35  4  4  4  4  4 19
 19 19 19 19 20 20 20 20 20 13 13 13 13 13 10 10 10 10 10 24 24 24 24 24
 26 26 26 26 26  1  1  1  1  1 31 31 31 31 31 23 23 23 23 23  8  8  8  8
  8 15 15 15 15 15 18 18 18 18 18 11 11 11 11 11  5  5  5  5  5  7  7  7
  7  7 16 16 16 16 16 33 33 33 33 33  2  2  2  2  2 28 28 28 28 28 32 32
 32 32 32 38 38 38 38 38  2  4 34 27  1 18 39 12 24 33]


In [3]:
# Now print the clusters with the corresponding filenames
for cluster_num in range(40):  # Assuming there are 4 clusters
    cluster_members = [file_names[idx] for idx in range(len(clusters)) if clusters[idx] == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")


Cluster 0: ['world_war1.txt', 'world_war2.txt', 'world_war3.txt', 'world_war4.txt', 'world_war5.txt']
Cluster 1: ['havel1.txt', 'havel2.txt', 'havel3.txt', 'havel4.txt', 'havel5.txt', 'prompt_havel.txt']
Cluster 2: ['airplane1.txt', 'airplane2.txt', 'airplane3.txt', 'airplane4.txt', 'airplane5.txt', 'prompt_airplane.txt']
Cluster 3: ['microsoft1.txt', 'microsoft2.txt', 'microsoft3.txt', 'microsoft4.txt', 'microsoft5.txt']
Cluster 4: ['eiffel_tower1.txt', 'eiffel_tower2.txt', 'eiffel_tower3.txt', 'eiffel_tower4.txt', 'eiffel_tower5.txt', 'prompt_eiffel_tower.txt']
Cluster 5: ['cuni1.txt', 'cuni2.txt', 'cuni3.txt', 'cuni4.txt', 'cuni5.txt']
Cluster 6: ['dolphin1.txt', 'dolphin2.txt', 'dolphin3.txt', 'dolphin4.txt', 'dolphin5.txt']
Cluster 7: ['mit1.txt', 'mit2.txt', 'mit3.txt', 'mit4.txt', 'mit5.txt']
Cluster 8: ['chemistry1.txt', 'chemistry2.txt', 'chemistry3.txt', 'chemistry4.txt', 'chemistry5.txt']
Cluster 9: ['ram1.txt', 'ram2.txt', 'ram3.txt', 'ram4.txt', 'ram5.txt']
Cluster 10: ['p

In [4]:
y_pred40_prompt = [
    0, 0, 0, 0, 0,0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,5, 6, 6, 6, 6, 6,6, 7, 7, 7, 7, 7,7, 8, 8, 8, 8, 8,8, 
    9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11,12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15,  
    16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22,  
    23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29,
    30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 
    36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39 
]

In [5]:
from sklearn.metrics import rand_score
rand_index = rand_score(clusters, y_pred40_prompt)

print(f"Rand Index: {rand_index}")

Rand Index: 0.9914786967418546


In [6]:
# Apply k-means clustering with 10 clusters
kmeans10 = KMeans(n_clusters=10, random_state=42)
clusters = kmeans10.fit_predict(all_embeddings)

# Check the cluster assignments
print(clusters)

[3 3 3 3 3 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 7 7 7 7 7 1 1 1 1
 1 0 0 0 0 0 8 8 8 8 8 1 1 1 1 1 5 5 5 5 5 5 5 5 5 5 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 5 8 4 1 2 6 7 7 9]


In [7]:
# Now print the clusters with the corresponding filenames
for cluster_num in range(10):  # Assuming there are 4 clusters
    cluster_members = [file_names[idx] for idx in range(len(clusters)) if clusters[idx] == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")

Cluster 0: ['boston_tea1.txt', 'boston_tea2.txt', 'boston_tea3.txt', 'boston_tea4.txt', 'boston_tea5.txt', 'world_war1.txt', 'world_war2.txt', 'world_war3.txt', 'world_war4.txt', 'world_war5.txt']
Cluster 1: ['velvet_revolution1.txt', 'velvet_revolution2.txt', 'velvet_revolution3.txt', 'velvet_revolution4.txt', 'velvet_revolution5.txt', 'charles_bridge1.txt', 'charles_bridge2.txt', 'charles_bridge3.txt', 'charles_bridge4.txt', 'charles_bridge5.txt', 'einstein1.txt', 'einstein2.txt', 'einstein3.txt', 'einstein4.txt', 'einstein5.txt', 'havel1.txt', 'havel2.txt', 'havel3.txt', 'havel4.txt', 'havel5.txt', 'klaus1.txt', 'klaus2.txt', 'klaus3.txt', 'klaus4.txt', 'klaus5.txt', 'mandela1.txt', 'mandela2.txt', 'mandela3.txt', 'mandela4.txt', 'mandela5.txt', 'prompt_havel.txt']
Cluster 2: ['economics1.txt', 'economics2.txt', 'economics3.txt', 'economics4.txt', 'economics5.txt', 'mathematics1.txt', 'mathematics2.txt', 'mathematics3.txt', 'mathematics4.txt', 'mathematics5.txt', 'physics1.txt', 'ph

In [8]:
y_pred10_prompt = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,3,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,4,
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,5,
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,6,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,7,
    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,8,
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,9
]

In [10]:
from sklearn.metrics import rand_score
rand_index = rand_score(clusters, y_pred10_prompt)

print(f"Rand Index: {rand_index}")

Rand Index: 0.8888585099111415
