# Demo with CIFAR10 data and image preview of embedded data points
For this, we need to have CIFAR10 data organized in subfolders (one for each class). We can then use the standard Aggregator, which returns an image and the corresponding filename.
We use the image data to extract embeddings from a pretrained ResNet-18, and then reduce their dimensionality further with UMAP down to just 2 dimensions.
Then, we use a scatter plot that additionally plots the corresponding images when we hover over a data point with the mouse pointer.
We do this only with the test data, since it is smaller and the notebook will execute faster, but you can easily do the same with the training data. Note that the variables below keep their `train_` prefix even though they hold the test split.

# Prerequisites

In [41]:
import numpy as np
from pathlib import Path
from collections import Counter

import torch
#from torch.utils.data import DataLoader

#import torchvision
from torchvision.datasets import CIFAR10
from torchvision.transforms import Compose, ToTensor, Grayscale, Normalize, Resize
from torchvision.models import resnet18


from hyperpyper.utils import DataSetDumper
from hyperpyper.utils import FolderScanner as fs
from hyperpyper.utils import Pickler
from hyperpyper.utils import EmbeddingPlotter
from hyperpyper.utils import PipelineCache
from hyperpyper.utils import PathList
from hyperpyper.transforms import (FileToPIL,
                            DummyPIL,
                            PILToNumpy,
                            FlattenArray,
                            DebugTransform,
                            ProjectTransform,
                            PyTorchOutput,
                            PyTorchEmbedding,
                            ToDevice,
                            FlattenTensor,
                            CachingTransform)

from hyperpyper.aggregator import DataAggregator, DataSetAggregator

import matplotlib.pyplot as plt

import plotly.graph_objs as go
import plotly.express as px
import ipywidgets as widgets

from IPython.display import display


# Seed handed to the UMAP reducer further down so that the 2-D projection is repeatable.
random_state = 23

In [42]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Running on device:", DEVICE.upper())

Running on device: CPU


In [43]:
# All data lives below ~/Downloads/data/CIFAR10, with one folder per split.
ROOT_PATH = Path.home().joinpath("Downloads", "data")
DATA_PATH = ROOT_PATH.joinpath("CIFAR10")

DATA_PATH_TEST, DATA_PATH_TRAIN = (DATA_PATH / split for split in ("test", "train"))
DATA_PATH_TRAIN

PosixPath('/home/bernhard/Downloads/data/CIFAR10/train')

In [44]:
# Scratch folder inside the dataset directory; the UMAP cache is placed below it later on.
CACHE_PATH = Path(DATA_PATH, "tmp")
CACHE_PATH

PosixPath('/home/bernhard/Downloads/data/CIFAR10/tmp')

In [45]:
# Passed as `cache_file` to the aggregator's transform() call below. It is None here
# (presumably: no embedding cache -- confirm against DataAggregator.transform); the
# commented-out Path shows the pickle location that could be used instead.
CIFAR10_train_embedding_resnet18_file = None#Path(CACHE_PATH, "CIFAR10_train_embedding_resnet18.pkl")

In [46]:
#import ssl
# this prevents the following error when trying to download the dataset:
# SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)
#ssl._create_default_https_context = ssl._create_unverified_context

# TODO: sort/sort_order args for Aggregator?

## Create CIFAR10 dataset organized in subfolders indicating class

In [47]:
import ssl
#from torch.utils.data import Dataset
from torchvision.datasets import VisionDataset
from torchvision.transforms import ToTensor

def prepare_VisionDataset(dataset: VisionDataset, root: Path, dst: Path, train: bool=True, download: bool=True) -> VisionDataset:
    """
    Instantiate a torchvision dataset and dump its images into class subfolders.

    The dataset class is instantiated with a ``ToTensor`` transform. If ``dst``
    does not exist yet, the instantiated dataset is handed to ``DataSetDumper``
    to be written below ``dst``; an already existing ``dst`` is left untouched.

    Args:
        dataset: The dataset *class* (not an instance), e.g.
            ``torchvision.datasets.CIFAR10``. It must be a subclass of
            ``torchvision.datasets.VisionDataset``.
        root (pathlib.Path): Root directory where the dataset is located or will be downloaded.
        dst (pathlib.Path): Destination directory for the dumped images.
        train (bool, optional): Whether to load the training split. Defaults to True.
        download (bool, optional): Whether to download the dataset if not found locally. Defaults to True.

    Returns:
        torchvision.datasets.VisionDataset: The instantiated dataset.

    Raises:
        ValueError: If dataset is not a VisionDataset subclass.
    """
    # issubclass() raises TypeError when handed a non-class (e.g. an instance),
    # so check for a class first to honour the documented ValueError.
    if not (isinstance(dataset, type) and issubclass(dataset, VisionDataset)):
        raise ValueError("The dataset must be from torchvision.datasets (e.g. torchvision.datasets.CIFAR10).")

    saved_https_context = ssl._create_default_https_context
    if download:
        # this prevents the following error when trying to download the dataset:
        # SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)
        # NOTE(security): this turns off HTTPS certificate verification, so it is
        # limited to the download below and undone again afterwards.
        ssl._create_default_https_context = ssl._create_unverified_context

    try:
        ds = dataset(root=root, train=train, transform=ToTensor(), download=download)
    finally:
        ssl._create_default_https_context = saved_https_context

    # Dump the dataset that was just created into the requested destination
    # (not whatever happens to be bound to notebook globals).
    if not dst.exists():
        DataSetDumper(ds, dst).dump()

    return ds

In [48]:
#train_dataset = prepare_VisionDataset(CIFAR10, root=DATA_PATH, dst=DATA_PATH_TRAIN)
# NOTE: train=False selects the CIFAR10 *test* split and dumps it to DATA_PATH_TEST;
# the variable nevertheless keeps its `train_` name.
train_dataset = prepare_VisionDataset(CIFAR10, root=DATA_PATH, dst=DATA_PATH_TEST, train=False)

Files already downloaded and verified


In [49]:

#train_dataset = CIFAR10(root=DATA_PATH, train=True, transform=ToTensor(), download=True)

#if not DATA_PATH_TRAIN.exists():
#    DataSetDumper(train_dataset, DATA_PATH_TRAIN).dump()

In [50]:
# Read the state dict from disk, remapping its tensors onto the active device.
# NOTE(security): torch.load unpickles the file -- only load checkpoints from a trusted source.
weights_pretrained = torch.load("weights_resnet18_cifar10.pth", map_location=DEVICE)

# load model with pre-trained weights
model = resnet18(num_classes=10)
# The model is only used for inference (embedding extraction). In training mode
# BatchNorm normalises with per-batch statistics, so each embedding would depend
# on the other images in its batch; evaluation mode uses the stored statistics.
model.eval()
# Kept as the last expression so the notebook still displays the key-matching report.
model.load_state_dict(weights_pretrained)

<All keys matched successfully>

In [51]:
#train_files = fs.get_files(DATA_PATH_TRAIN, extensions='.png', recursive=True, relative_to=DATA_PATH)
# Recursively collect the .png files below the test split folder
# (the variable keeps its `train_` name even though it lists test images).
train_files = fs.get_files(DATA_PATH_TEST, extensions='.png', recursive=True)#, relative_to=DATA_PATH)

# Displayed as the cell output: number of files found.
len(train_files)

10000

In [52]:
# Peek at the first entry to check the <split>/<class index>/<file>.png layout.
train_files[0]

PosixPath('/home/bernhard/Downloads/data/CIFAR10/test/5/8378.png')

## Define Transformation pipeline
Notice that the pipeline starts with a FileToPIL transformation, which handles the loading of the image. This enables us to use the standard Aggregator, so we don't need to take care of instantiating a Dataset or DataLoader ourselves.
All we need to pass as arguments are a file list and the transformation pipeline, and optionally a batch size.

In [53]:
# Create the transformation pipeline
# Each entry receives the output of the previous one, starting from a file path.
transform_pipeline = Compose([
    FileToPIL(),  # load the image file
    ToTensor(),  # PIL image -> float tensor
    # Per-channel (mean, std) normalisation; the constants are presumably the
    # CIFAR10 channel statistics the weights were trained with -- TODO confirm.
    Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ToDevice(DEVICE),  # move the input to the device the model runs on
    PyTorchEmbedding(model, device=DEVICE),  # extract the embedding from the ResNet
    ToDevice('cpu'),  # bring the result back to the CPU for the later NumPy/UMAP steps
    FlattenTensor(),  # presumably flattens each embedding to 1-D -- TODO confirm
])

## Instantiate Aggregator and extract embeddings
The call to `transform` returns the transformed items (here, the embeddings) together with the corresponding filenames, which we unpack into `train_X` and `train_y_files`.

In [54]:
#agg = DataAggregator(root=DATA_PATH, files=train_files, transforms=transform_pipeline, batch_size=9)
# Only a file list, the pipeline and a batch size are needed; FileToPIL does the loading.
agg = DataAggregator(files=train_files, transforms=transform_pipeline, batch_size=9)

#train_embedding_result = agg.transform(cache_file=CIFAR10_train_embedding_resnet18_file)
# The result is unpacked into two values: the pipeline outputs (embeddings) and
# the file paths they belong to.
train_X, train_y_files = agg.transform(cache_file=CIFAR10_train_embedding_resnet18_file)

In [55]:
# Display the file paths returned alongside the embeddings.
train_y_files

['/home/bernhard/Downloads/data/CIFAR10/test/5/8378.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/1698.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/460.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/7446.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/639.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/2057.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/4988.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/1000.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/9582.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/2259.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/8009.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/3026.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/230.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/8424.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/7528.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/7887.png',
 '/home/bernhard/Downloads/data/CIFAR10/test/5/2051.png',
 '/home/bernhard/

In [56]:
#train_embedding_result['file'] = DATA_PATH / PathList(train_embedding_result['file'])
#train_y_files = DATA_PATH / PathList(train_y_files)

In [57]:
#train_y_files

In [58]:
import hashlib
import pickle

In [59]:
from umap import UMAP

# Create the UMAP reducer instance
# Every parameter is spelled out together with its default for reference. According
# to the inline notes, only n_epochs and random_state deviate from the defaults.
reducer = UMAP(n_neighbors=15, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=2, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings.
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=random_state, # default None, If int, random_state is the seed used by the random number generator.
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different.
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded.
              )

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Input for the dimensionality reduction: the embeddings extracted above.
X = train_X

# scikit-learn pipeline whose only stage is the UMAP reducer.
pipeline = Pipeline(steps=[('umap', reducer)])

In [61]:
# Wrap the pipeline in a PipelineCache rooted at <CACHE_PATH>/umap2d.
# NOTE: `pipeline` is rebound here, so re-running this cell wraps the wrapper again.
UMAP_2D_CACHE_PATH = Path(CACHE_PATH, "umap2d")
pipeline = PipelineCache(pipeline, cache_path=UMAP_2D_CACHE_PATH)

In [62]:
# Fit the (cached) UMAP pipeline on the embeddings and project them to 2-D.
train_reduced_embedding = pipeline.fit_transform(X)
# Displayed as the cell output: (number of samples, 2).
train_reduced_embedding.shape

(10000, 2)

In [63]:
# List what the PipelineCache wrote, as paths relative to the cache folder.
# Despite the singular name, get_files returns a list (see the output below).
cache_file = fs.get_files(UMAP_2D_CACHE_PATH, recursive=True, relative_to=UMAP_2D_CACHE_PATH)

cache_file

[PosixPath('fit_transform/d62e1954e4c029a8533c54a68edde136c1382974e81460576bb38562e6b4025e.pkl')]

In [64]:
# Human-readable class names, indexed by the numeric class label.
labels = train_dataset.classes

# The folder directly above each image file is named after its class index.
train_y = np.array([Path(path).parts[-2] for path in train_y_files]).astype('int')

# Translate every numeric label into its class name.
train_y_str = [labels[idx] for idx in train_y]

In [65]:
# Count the samples per class index as a quick check of the class balance.
ctr = Counter(train_y)
ctr

Counter({5: 1000,
         7: 1000,
         6: 1000,
         2: 1000,
         1: 1000,
         8: 1000,
         4: 1000,
         9: 1000,
         3: 1000,
         0: 1000})

In [66]:
# Scatter plot of the 2-D embedding, coloured by class name; the file list lets
# the plotter show the matching image when hovering over a point.
plotter = EmbeddingPlotter(data=train_reduced_embedding,
                           color=train_y_str,
                           file_list=train_y_files,
                           width=1000)
# plot() returns the widget, which is rendered explicitly via display().
display(plotter.plot())

Box(children=(FigureWidget({
    'data': [{'hovertemplate': '<b>%{hovertext}</b><br><br>color=dog<br>x=%{x}<br…