In [None]:
import sys
import os
from pathlib import Path
sys.path.append(os.path.abspath('..'))

import matplotlib.pyplot as plt
import umap
import numpy as np

from utils.data import download_data, get_image_np_arrays

# Load images

In [None]:
# Fetch the dataset with the project's download helper (imported from utils.data).
# NOTE(review): presumably this must complete before the image-loading cell can run — confirm.
download_data()

In [None]:
# WARNING: the dataset is very large (1.7 GB), so loading it takes a very long time.
# For reference: loading 10% of the images and computing their UMAP embeddings
# took about 1 hour (measured with a UMAP embedding_dim of 2).
#
# PROCESS_SIZE is the value passed as `process_size`; 0.1 is the 10% run described above.
# NOTE(review): assumes `process_size` is the fraction of images to load — confirm in utils.data.
PROCESS_SIZE = 0.1
images = get_image_np_arrays(process_size=PROCESS_SIZE)

# UMAP embeddings

In [None]:
# Reduce every image to a low-dimensional UMAP embedding.
embedding_dim = 64  # number of UMAP output components per image

# UMAP is stochastic: without a fixed seed each run yields different embeddings,
# so the saved file and the plot below would not be reproducible across kernel restarts.
# Trade-off: a fixed random_state makes UMAP run single-threaded, which is slower.
UMAP_RANDOM_STATE = 42

# Collapse each image into a single feature row: (n_images, everything else flattened).
flat_images = images.reshape(images.shape[0], -1)
reducer = umap.UMAP(n_components=embedding_dim, random_state=UMAP_RANDOM_STATE)
embeddings = reducer.fit_transform(flat_images)

In [None]:
# Save the embeddings as a .npy file under <project_dir>/data/umap_embeddings/.
# `project_dir` is the parent of the current working directory
# (assumes the kernel runs one level below the project root — confirm).
project_dir = Path("..").resolve()

# Take the sample count and dimensionality from the array itself rather than from
# the `embedding_dim` global, so the filename always describes the data actually
# saved, even if cells were re-run out of order with a different setting.
n_samples, n_components = embeddings.shape
embeddings_file = project_dir / "data" / "umap_embeddings" / f"embeddings_d{n_components}_n{n_samples}.npy"

# Create the output directory (and any missing parents) before writing.
embeddings_file.parent.mkdir(parents=True, exist_ok=True)
np.save(embeddings_file, embeddings)

In [None]:
# Scatter plot of the first two embedding components (columns 0 and 1).
fig, ax = plt.subplots(figsize=(10, 10))
first_component = embeddings[:, 0]
second_component = embeddings[:, 1]
ax.scatter(first_component, second_component, s=1, alpha=0.5)
ax.set_title('UMAP Embeddings of Dataset')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.show()