# Sketchformer Embedding UMAP Viz

In [None]:
import numpy as np
import os
from quickdraw import QuickDrawData
import cv2 as cv2
from scipy.spatial import distance as dstnc
import xlsxwriter

Read the Sketchformer embeddings and compute the centroid (mean embedding) of each class.

In [None]:
# Group the continuous Sketchformer embeddings by class, then reduce every
# class to one centroid vector.
#
#   reference_dict: {class_name: [[key_id, ...], [embed_vector, ...]]}
#   centroids:      {class_name: centroid vector}
#
# No `global` statement is needed here: this code already runs at module
# (notebook) scope, and declaring a name global after assigning it is a
# SyntaxError.
reference_dict = dict()
centroids = dict()
directory = "./deep-sketch/data-qd/cont_embed/"
filename = "QD_150_samples_embeddings_cont.npz"

# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load
# archives that come from a trusted source.
# The context manager closes the underlying .npz file handle once the
# records have been read.
with np.load(os.path.join(directory, filename),
             allow_pickle=True, encoding="latin1") as embeddings:
    for embedding in embeddings["embeddings"]:
        embed_vector = embedding[0]
        key_id = embedding[1]
        class_name_raw = embedding[2]
        # The class name is whatever precedes the key id in the raw label.
        class_name = class_name_raw.split(str(key_id))[0]
        # Create the [[key_ids], [embed_vectors]] pair on first sight of a
        # class, then append to both lists in place.
        key_ids, embed_vectors = reference_dict.setdefault(
            class_name, [[], []])
        key_ids.append(key_id)
        embed_vectors.append(embed_vector)

for key, (_, category_wise_embeddings) in reference_dict.items():
    # Element-wise mean over all embeddings collected for this class.
    category_centroid = np.average(category_wise_embeddings, axis=0)
    # Keep the first entry along the leading axis of the averaged array
    # (presumably each embed_vector has a leading axis of size 1 -- TODO
    # confirm).
    centroids[key] = category_centroid[0]
    # To use a flattened array instead, comment out the line above and
    # uncomment the line below.
    # centroids[key] = np.array(category_centroid[0]).flatten()

In [None]:
# Split the centroid mapping into two parallel arrays: class names and
# their centroid vectors, both in the dict's insertion order.
keys = np.array(list(centroids))
values = np.array([centroids[name] for name in centroids])

In [None]:
from sklearn.decomposition import PCA
import altair as alt
import pandas as pd

# Dimension reduction and clustering libraries
import umap
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

In [None]:
# Project the class centroids down to two dimensions with UMAP; the fixed
# random_state is passed so the reducer is seeded explicitly.
reducer = umap.UMAP(n_neighbors=5, min_dist=0.25, n_components=2,
                    random_state=42)
clusterable_embedding = reducer.fit_transform(values)

# One row per class: 2-D coordinates plus the class name for plotting.
df = pd.DataFrame(clusterable_embedding, columns=["x", "y"]).assign(key=keys)

In [None]:
# Scatter plot of the 2-D projection, one coloured point per class, with
# the class name shown on hover. The final expression is the cell output.
scatter = alt.Chart(df).mark_circle(size=60)
scatter = scatter.encode(x="x", y="y", color="key", tooltip=["key"])
scatter.interactive()

In [None]:
# Write the centroid matrix to 'cont-out.zip' as a zipped CSV whose
# archive member is named 'out.csv'; the row index is not written.
embed_df = pd.DataFrame(values)
compression_opts = {"method": "zip", "archive_name": "out.csv"}
embed_df.to_csv("cont-out.zip", index=False, compression=compression_opts)

Now read the Sketchformer embeddings from the preprocessed tok-dict files.

In [None]:
# Group the tok-dict Sketchformer embeddings from every .npz file in
# `directory` by class, then reduce every class to one centroid vector.
#
#   reference_dict: {class_name: [[key_id, ...], [embed_vector, ...]]}
#   centroids:      {class_name: centroid vector}
#
# No `global` statement is needed here: this code already runs at module
# (notebook) scope, and declaring a name global after assigning it is a
# SyntaxError.
reference_dict = dict()
centroids = dict()
directory = "./deep-sketch/data-qd/tok_dict/"

# os.listdir returns entries in arbitrary order; sorting keeps the
# insertion order of reference_dict (and everything derived from it)
# reproducible between runs and machines.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".npz"):
        continue
    # NOTE(review): allow_pickle=True unpickles arbitrary objects -- only
    # load archives that come from a trusted source.
    # The context manager closes each .npz file handle once it is read.
    with np.load(os.path.join(directory, filename),
                 allow_pickle=True, encoding="latin1") as embeddings:
        for embedding in embeddings["embeddings"]:
            embed_vector = embedding[0]
            key_id = embedding[1]
            class_name_raw = embedding[2]
            # The class name is whatever precedes the key id in the raw
            # label.
            class_name = class_name_raw.split(str(key_id))[0]
            # Create the [[key_ids], [embed_vectors]] pair on first sight
            # of a class, then append to both lists in place.
            key_ids, embed_vectors = reference_dict.setdefault(
                class_name, [[], []])
            key_ids.append(key_id)
            embed_vectors.append(embed_vector)

for key, (_, category_wise_embeddings) in reference_dict.items():
    # Element-wise mean over all embeddings collected for this class.
    category_centroid = np.average(category_wise_embeddings, axis=0)
    # Keep the first entry along the leading axis of the averaged array
    # (presumably each embed_vector has a leading axis of size 1 -- TODO
    # confirm).
    centroids[key] = category_centroid[0]      # {class_name: centroid vector, ... }

In [None]:
# Parallel arrays built from the centroid mapping: `keys` holds the class
# names and `values` the matching centroid vectors, in insertion order.
class_names = list(centroids)
keys = np.array(class_names)
values = np.array([centroids[name] for name in class_names])

In [None]:
# Reduce the tok-dict class centroids to two dimensions with UMAP, using
# an explicitly seeded reducer.
umap_params = dict(n_neighbors=15, min_dist=0.25, n_components=2,
                   random_state=42)
clusterable_embedding = umap.UMAP(**umap_params).fit_transform(values)

# One row per class: 2-D coordinates plus the class name for plotting.
df = pd.DataFrame(clusterable_embedding, columns=["x", "y"]).assign(key=keys)

In [None]:
# Interactive scatter of the projected centroids: colour and tooltip both
# come from the class name. The final expression is the cell output.
point_encoding = dict(x="x", y="y", color="key", tooltip=["key"])
alt.Chart(df).mark_circle(size=60).encode(**point_encoding).interactive()

In [None]:
# Write the centroid matrix to 'tok-dict-out.zip' as a zipped CSV whose
# archive member is named 'out.csv'; the row index is not written.
embed_df = pd.DataFrame(values)
compression_opts = {"method": "zip", "archive_name": "out.csv"}
embed_df.to_csv(
    "tok-dict-out.zip",
    index=False,
    compression=compression_opts,
)