In [9]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json

from sklearn.cluster import *
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

In [10]:
def cluster_embeddings(embeddings):
    """
    Clusters embeddings using agglomerative clustering.

    Args:
        embeddings (dict): A dictionary of embeddings, where the keys are file names and the values are the embeddings.

    Returns:
        The resulting clustering model, linkage matrix, and a dictionary that maps each embedding to the name of the file it represents.
    """
    vectors_to_file = {}
    vectors = []

    for f in embeddings:
        vectors_to_file[tuple(embeddings[f])] = f
        vectors.append(embeddings[f])

    clustering = AgglomerativeClustering().fit(vectors)
    linked = linkage(vectors, 'single')

    return clustering, linked, vectors_to_file



def plot_dendrogram(linked, vector_to_file):
    """
    Plots a dendrogram of the clustering results.

    Args:
        linked (numpy.ndarray): The linkage matrix.
        vector_to_file (dict): A dictionary that maps each embedding to the name of the file it represents.
    """
    vectors_names = [vector_to_file[tuple(x)] for x in vectors]
    plt.figure(figsize=(10, 7))
    dendrogram(linked,
               orientation='top',
               labels=vectors_names,
               distance_sort='descending',
               show_leaf_counts=True)
    plt.show()


In [None]:
# Function to load in the embeddings and turn them into a vector array
def load_embeddings(path):
    with open(path, 'r') as f:
        embeddings = json.load(f)
    return embeddings


In [20]:
import numpy as np
import scipy.cluster.hierarchy as hierarchy

# Generate some random data
X = np.random.rand(10, 2)

# Compute the hierarchical clustering
Z = hierarchy.linkage(X, method='ward')

# Extract the centroids of each level of the hierarchy
centroids = []
for k in range(1, len(X)):
    labels = hierarchy.fcluster(Z, k, criterion='maxclust')
    unique_labels = np.unique(labels)
    level_centroids = []
    for label in unique_labels:
        level_centroids.append(np.mean(X[labels == label], axis=0))
    centroids.append(level_centroids)


In [22]:
X

array([[0.80655508, 0.45545351],
       [0.70211636, 0.2110738 ],
       [0.34586884, 0.6060408 ],
       [0.43556986, 0.43936679],
       [0.83630713, 0.28320299],
       [0.19527305, 0.65594397],
       [0.14625262, 0.21947993],
       [0.93919713, 0.55684005],
       [0.8296766 , 0.23581856],
       [0.74171588, 0.71356386]])

In [11]:
file_path = "embeddings.json"
with open(file_path, "r") as f:
    embeddings = json.load(f)
    
clustering, linked, vectors_to_file = cluster_embeddings(embeddings)
plot_dendrogram(linked, vectors_to_file)


NameError: name 'vectors' is not defined

In [13]:
def reduce_dimensions(vec_embeddings):
    pca = PCA(n_components=2)
    pca.fit(vec_embeddings)
    pca_encodings = pca.transform(vec_embeddings)
    return pca_encodings

def plot_space(labels, vectors):
    fig, ax = plt.subplots()
    x = vectors[:,0]
    y = vectors[:,1]
    ax.scatter(x, y)
    plt.rc('font', size=7)
    for i, label in enumerate(labels):
        ax.annotate(label, (x[i], y[i]))
    plt.show()

def runReduction(embeddings_f = "embeddings.json", out_f="reduced_embeddings_2d.csv"):
    with open(embeddings_f,'r') as infile:
        data = json.load(infile)
    embedding_labels = list(data.keys())
    # embedding_size = len(data[embedding_labels[0]])
    embeddings = np.array([data[label] for label in embedding_labels])
    latent_space = reduce_dimensions(embeddings)
    # plot_space(embedding_labels, latent_space)
    # Stores the reduction as a dataframe of filenames and their corresponding coordinates
    df = pd.DataFrame(latent_space, columns=['x','y'])
    df['filename'] = embedding_labels
    df.to_csv('reduced_embeddings_2d.csv', index=False)

runReduction()