In [1]:
import json
import numpy as np
from sklearn.cluster import KMeans

# Load the pre-computed embeddings; each JSON file holds a list of records
# with an 'embedding' vector and the 'file_name' it was computed from.
embedding_files = [
    'embeddings/animals.json',
    'embeddings/companies.json',
    'embeddings/computer_components.json',
    'embeddings/events.json',
    'embeddings/landmarks.json',
    'embeddings/movies.json',
    'embeddings/people.json',
    'embeddings/research_fields.json',
    'embeddings/university.json',
    'embeddings/vehicles.json',
]

all_embeddings = []
file_names = []  # file_names[i] is the source document of row i in all_embeddings

for path in embedding_files:
    # Pin the encoding so the load does not depend on the platform's default
    # locale encoding; the file is only needed while parsing.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_embeddings.append(np.array([item['embedding'] for item in data]))
    file_names.extend(item['file_name'] for item in data)

# Stack the per-file arrays row-wise into a single matrix
all_embeddings = np.concatenate(all_embeddings, axis=0)

# Later cells pair cluster labels with file_names by position, so the two
# containers must stay aligned.
assert all_embeddings.shape[0] == len(file_names), "embeddings and file names are misaligned"

# Check the shape of the combined embeddings
print("Combined embeddings shape:", all_embeddings.shape)


Combined embeddings shape: (200, 1536)


In [2]:
from sklearn.decomposition import PCA

# Project the embeddings onto 9 principal components before clustering
# (adjust n_components based on your data).
# random_state is pinned because, for a matrix of this shape, svd_solver='auto'
# can select the randomized SVD solver, whose output otherwise varies between
# runs and would make every downstream cluster assignment non-reproducible.
pca = PCA(n_components=9, random_state=42)
reduced_embeddings = pca.fit_transform(all_embeddings)

# Fine-grained clustering: 40 clusters on the reduced embeddings.
kmeans40 = KMeans(n_clusters=40, init='k-means++', n_init=10, max_iter=300, random_state=42)
clusters = kmeans40.fit_predict(reduced_embeddings)

print(clusters)



[ 3  3  3  3  3 32 32 32 32 32 18 18 18 18 18 39 39 39 39 39 19 19 19 19
 19 36 36 36 36 36  9  9  9  9  9 37 37 37 37 37 38 38 38 38 38 33 33 33
 33 33  4  4  4  4  4 30 30 30 30 30 15 15 15 15 15 23 23 23 23 23 34 34
 34 34 34 10 10 10 10 10 11 11 11 11 11 20 20 20 20 20  8  8  8  8  8 29
 29 29 29 29 22 22 22 22 22 17 17 17 17 17 25 25 25 25 25  1  1  1  1  1
 12 12 12 12 12  5  5  5  5  5 21 21 21 21 21 13 13 13 13 13 14 14 14 14
 14  0  0  0  0  0 27 27 27 27 27 24 24 24 24 24 35 35 35 35 35  7  7  7
  7  7 16 16 16 16 16  2  2  2  2  2 28 28 28 28 28 31 31 31 31 31 26 26
 26 26 26  6  6  6  6  6]


In [3]:
# Print the members of each of the 40 clusters by file name.
# Labels are read from the fitted kmeans40 model instead of the shared
# `clusters` variable, which a later cell rebinds to the 10-cluster result;
# this keeps the cell correct when it is re-run out of order.
for cluster_num in range(kmeans40.n_clusters):
    cluster_members = [name for name, label in zip(file_names, kmeans40.labels_) if label == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")

Cluster 0: ['economics1.txt', 'economics2.txt', 'economics3.txt', 'economics4.txt', 'economics5.txt']
Cluster 1: ['star_wars1.txt', 'star_wars2.txt', 'star_wars3.txt', 'star_wars4.txt', 'star_wars5.txt']
Cluster 2: ['vse1.txt', 'vse2.txt', 'vse3.txt', 'vse4.txt', 'vse5.txt']
Cluster 3: ['cat1.txt', 'cat2.txt', 'cat3.txt', 'cat4.txt', 'cat5.txt']
Cluster 4: ['motherboard1.txt', 'motherboard2.txt', 'motherboard3.txt', 'motherboard4.txt', 'motherboard5.txt']
Cluster 5: ['havel1.txt', 'havel2.txt', 'havel3.txt', 'havel4.txt', 'havel5.txt']
Cluster 6: ['train1.txt', 'train2.txt', 'train3.txt', 'train4.txt', 'train5.txt']
Cluster 7: ['mit1.txt', 'mit2.txt', 'mit3.txt', 'mit4.txt', 'mit5.txt']
Cluster 8: ['eiffel_tower1.txt', 'eiffel_tower2.txt', 'eiffel_tower3.txt', 'eiffel_tower4.txt', 'eiffel_tower5.txt']
Cluster 9: ['microsoft1.txt', 'microsoft2.txt', 'microsoft3.txt', 'microsoft4.txt', 'microsoft5.txt']
Cluster 10: ['world_war1.txt', 'world_war2.txt', 'world_war3.txt', 'world_war4.txt', 

In [4]:
# Reference labels compared against the 40-cluster run: 200 entries made of
# 40 consecutive groups of 5, numbered 0..39.
# NOTE(review): presumably each group of 5 matches one topic in file_names
# order -- confirm against the embedding files.
y_pred40 = [group for group in range(40) for _ in range(5)]

In [5]:
from sklearn.metrics import rand_score
# Rand index between the reference grouping and the 40-cluster k-means labels.
# The labels are taken from the fitted kmeans40 model so that this cell stays
# correct even after `clusters` has been rebound to the 10-cluster result in a
# later cell. Arguments follow rand_score(labels_true, labels_pred); the score
# is symmetric, so the value is unchanged.
rand_index = rand_score(y_pred40, kmeans40.labels_)

print(f"Rand Index: {rand_index}")

Rand Index: 1.0


In [6]:
# Apply k-means clustering with 10 clusters on the same PCA-reduced embeddings.
# init / n_init / max_iter are spelled out to mirror kmeans40; in particular the
# default for n_init differs between scikit-learn releases, so leaving it
# implicit makes the result depend on the installed version.
kmeans10 = KMeans(n_clusters=10, init='k-means++', n_init=10, max_iter=300, random_state=42)
# NOTE: this rebinds `clusters`, replacing the 40-cluster labels assigned earlier.
clusters = kmeans10.fit_predict(reduced_embeddings)

# Check the cluster assignments
print(clusters)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 4 4 4 4 4 4 4 4 4 4 0 0 0 0
 0 4 4 4 4 4 5 5 5 5 5 7 7 7 7 7 5 5 5 5 5 5 5 5 5 5 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 8 8 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 4 4 4 4 4 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 7 7 7 7 7 4 4 4 4 4 7 7 7 7 7 7 7 7 7 7 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [7]:
# Print the members of each of the 10 clusters by file name.
# The cluster count and labels come from the fitted kmeans10 model, so the
# cell does not depend on which run last assigned the shared `clusters` name.
for cluster_num in range(kmeans10.n_clusters):
    cluster_members = [name for name, label in zip(file_names, kmeans10.labels_) if label == cluster_num]
    print(f"Cluster {cluster_num}: {cluster_members}")

Cluster 0: ['velvet_revolution1.txt', 'velvet_revolution2.txt', 'velvet_revolution3.txt', 'velvet_revolution4.txt', 'velvet_revolution5.txt', 'havel1.txt', 'havel2.txt', 'havel3.txt', 'havel4.txt', 'havel5.txt', 'klaus1.txt', 'klaus2.txt', 'klaus3.txt', 'klaus4.txt', 'klaus5.txt']
Cluster 1: ['einstein1.txt', 'einstein2.txt', 'einstein3.txt', 'einstein4.txt', 'einstein5.txt', 'chemistry1.txt', 'chemistry2.txt', 'chemistry3.txt', 'chemistry4.txt', 'chemistry5.txt', 'economics1.txt', 'economics2.txt', 'economics3.txt', 'economics4.txt', 'economics5.txt', 'mathematics1.txt', 'mathematics2.txt', 'mathematics3.txt', 'mathematics4.txt', 'mathematics5.txt', 'physics1.txt', 'physics2.txt', 'physics3.txt', 'physics4.txt', 'physics5.txt']
Cluster 2: ['cat1.txt', 'cat2.txt', 'cat3.txt', 'cat4.txt', 'cat5.txt', 'dolphin1.txt', 'dolphin2.txt', 'dolphin3.txt', 'dolphin4.txt', 'dolphin5.txt', 'elephant1.txt', 'elephant2.txt', 'elephant3.txt', 'elephant4.txt', 'elephant5.txt', 'giraffe1.txt', 'giraffe

In [8]:
# Reference labels compared against the 10-cluster run: 200 entries made of
# 10 consecutive groups of 20, numbered 0..9.
# NOTE(review): presumably each group of 20 matches one embedding file in
# load order -- confirm against the embedding files.
y_pred10 = [category for category in range(10) for _ in range(20)]


In [9]:
from sklearn.metrics import rand_score
# Rand index between the reference grouping and the 10-cluster k-means labels.
# The labels are taken from the fitted kmeans10 model rather than the shared
# `clusters` variable, which holds whichever clustering was run last.
# Arguments follow rand_score(labels_true, labels_pred); the score is
# symmetric, so the value is unchanged.
rand_index = rand_score(y_pred10, kmeans10.labels_)

print(f"Rand Index: {rand_index}")

Rand Index: 0.9623115577889447
