# Demo with CIFAR10 data and image preview of embedded data points
For this, we need to have CIFAR10 data organized in subfolders (one for each class). We can then use the standard Aggregator, which returns an image and the corresponding filename.
We use the image data to extract the embeddings from a pretrained ResNet, and reduce the dimensionality further with UMAP down to just 2 dimensions.
Then, we use a scatter plot that additionally plots the corresponding images when we hover over a data point with the mouse pointer.
We do this only with the test data, since it is smaller and the notebook will execute faster, but you can easily do the same with the training data.

# Prerequisites

In [None]:
import numpy as np
from pathlib import Path

import torch
from torch.utils.data import DataLoader

import torchvision
from torchvision.datasets import CIFAR10
from torchvision.transforms import Compose, ToTensor, Grayscale, Normalize, Resize
from torchvision.models import resnet18


from transights.utils import DataSetDumper
from transights.utils import FolderScanner as fs
from transights.utils import Pickler
from transights.utils import EmbeddingPlotter
from transights.transforms import (FileToPIL,
                            DummyPIL,
                            PILToNumpy,
                            FlattenArray,
                            DebugTransform,
                            ProjectTransform,
                            PyTorchOutput,
                            PyTorchEmbedding,
                            ToDevice,
                            FlattenTensor,
                            CachingTransform)

from transights.aggregator import DataAggregator, DataSetAggregator

import matplotlib.pyplot as plt

import plotly.graph_objs as go
import plotly.express as px
import ipywidgets as widgets

from IPython.display import display


# Seed handed to the UMAP reducers further down (random_state=random_state).
random_state = 23

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("Running on device:", DEVICE.upper())

In [None]:
# All data for this demo lives below the user's Downloads folder.
ROOT_PATH = Path(Path.home(), "Downloads")

# Root folder handed to torchvision's CIFAR10 dataset.
DATA_PATH = Path(ROOT_PATH, "data", "CIFAR10")

# Folders that receive the per-class image dumps of the two splits.
DATA_PATH_TRAIN = DATA_PATH / "train"
DATA_PATH_TEST = DATA_PATH / "test"

# Last expression of the cell: show the training folder for a quick check.
DATA_PATH_TRAIN

In [None]:
import ssl
# Workaround for the following error when trying to download the dataset:
# SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)
# NOTE(review): this replaces ssl's default HTTPS context factory with the
# unverified one, so certificate verification is switched off for every
# default HTTPS context created afterwards in this process, not only for the
# CIFAR10 download. Keep this to demo use.
ssl._create_default_https_context = ssl._create_unverified_context

## Create CIFAR10 dataset organized in subfolders indicating class

In [None]:
# Transform applied to the raw CIFAR10 samples: conversion to a tensor only.
transform = Compose([ToTensor()])

In [None]:
# Open the CIFAR10 training split below DATA_PATH (download=True is requested).
train_dataset = CIFAR10(
    root=DATA_PATH,
    train=True,
    transform=transform,
    download=True,
)

# Dump the split into DATA_PATH_TRAIN only if that folder does not exist yet.
if not DATA_PATH_TRAIN.exists():
    DataSetDumper(train_dataset, DATA_PATH_TRAIN).dump()

In [None]:
# Open the CIFAR10 test split below DATA_PATH (download=True is requested).
test_dataset = CIFAR10(
    root=DATA_PATH,
    train=False,
    transform=transform,
    download=True,
)

# Dump the split into DATA_PATH_TEST only if that folder does not exist yet.
if not DATA_PATH_TEST.exists():
    DataSetDumper(test_dataset, DATA_PATH_TEST).dump()

In [None]:
# Read the pre-trained weights, mapping their tensors onto the selected device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
weights_pretrained = torch.load("weights_resnet18_cifar10.pth", map_location=DEVICE)

# Build a ResNet-18 with a 10-class output layer.
model = resnet18(num_classes=10)

# The model is only used for embedding extraction, never for training: switch
# it to evaluation mode so that BatchNorm layers use their stored running
# statistics instead of per-batch statistics.
model.eval()

# Kept as the last expression so the notebook still displays the key-matching
# report returned by load_state_dict.
model.load_state_dict(weights_pretrained)

In [None]:
# Collect the '.png' files below the training dump folder (recursive search).
train_files = fs.get_files(
    DATA_PATH_TRAIN,
    extensions='.png',
    recursive=True,
)

# Number of files found.
len(train_files)

In [None]:
# Collect the '.png' files below the test dump folder (recursive search).
test_files = fs.get_files(
    DATA_PATH_TEST,
    extensions='.png',
    recursive=True,
)

# Number of files found.
len(test_files)

## Define Transformation pipeline
Notice that the pipeline starts with a FileToPIL transformation, which handles the loading of the image. This enables us to use the standard Aggregator, where we don't need to take care of a DataSet or DataLoader instantiation.
All we need to pass as arguments are a file list and the transformation pipeline, and optionally a batch size.

In [None]:
# Create the transformation pipeline
transform_pipeline = Compose([
    FileToPIL(),
    ToTensor(),
    Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ToDevice(DEVICE),
    PyTorchEmbedding(model, device=DEVICE),
    ToDevice('cpu'),
    FlattenTensor(),
])

## Instantiate Aggregator and extract embeddings
The outputs of the transformation pipeline (here: the embeddings) are stored in the resulting dict under 'item', and the filenames under 'file'.

In [None]:
# Aggregator over the test files, using the pipeline above with batch size 32.
agg = DataAggregator(
    test_files,
    transforms=transform_pipeline,
    batch_size=32,
)

# Result dict; its 'item' and 'file' entries are used in the cells below.
test_embedding_result = agg.transform()

In [None]:
# Show the shape of the aggregated pipeline outputs.
test_embedding_result['item'].shape

In [None]:
# Show the first entry of the aggregated file list.
test_embedding_result['file'][0]

# Reduce the embeddings to 2D with UMAP

In [None]:
from umap import UMAP

# Create the UMAP reducer instance that embeds the data into 2 dimensions.
# Most arguments are spelled out with the default noted next to them for
# reference; n_epochs and random_state are set to non-default values.
reducer = UMAP(n_neighbors=15, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=2, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings.
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=random_state, # default None, If int, random_state is the seed used by the random number generator.
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different.
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded.
              )

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step scikit-learn pipeline wrapping the UMAP reducer.
pipeline = Pipeline(steps=[('umap', reducer)])

# Input matrix for the dimensionality reduction: the aggregated pipeline outputs.
X = test_embedding_result['item']

In [None]:
# Fit the pipeline (i.e. the UMAP reducer) on the embeddings.
pipeline.fit(X)

In [None]:
# Project the same data with the fitted pipeline and show the resulting shape.
test_reduced_embedding = pipeline.transform(X)
test_reduced_embedding.shape

In [None]:
# The second-to-last path component of every file (its parent folder) is taken
# as the class index and converted to an integer array.
test_y = np.array(
    [Path(file_path).parts[-2] for file_path in test_embedding_result['file']]
).astype('int')

# Map class index to label
labels = test_dataset.classes
test_y_str = [labels[class_idx] for class_idx in test_y]

In [None]:
# Show the integer class indices derived from the folder names.
test_y

# Create 2D Plot
Just pass a 2D array to the EmbeddingPlotter; it will automatically create the matching plot.

In [None]:
# Plot of the reduced embedding, colored by class name; the file paths are
# passed both as file list and as hover name.
plotter = EmbeddingPlotter(
    data=test_reduced_embedding,
    color=test_y_str,
    file_list=test_embedding_result['file'],
    hover_name=test_embedding_result['file'],
    width=1000,
)
display(plotter.plot())

# Create 2D Density Plot
Just pass 'kde' as the color parameter.

In [None]:
# Same plot as before, but with color='kde' instead of the class names.
plotter = EmbeddingPlotter(
    data=test_reduced_embedding,
    color='kde',
    file_list=test_embedding_result['file'],
    hover_name=test_embedding_result['file'],
    width=1000,
)
display(plotter.plot())

# Create 3D Plot
Just pass a 3D array to the EmbeddingPlotter; it will automatically create the matching plot.

In [None]:
from umap import UMAP

# Create the UMAP reducer instance that embeds the data into 3 dimensions.
# Most arguments are spelled out with the default noted next to them for
# reference; n_components, n_epochs and random_state are set to non-default values.
reducer = UMAP(n_neighbors=15, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=3, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings.
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=random_state, # default None, If int, random_state is the seed used by the random number generator.
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different.
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded.
              )

In [None]:
# Define the pipeline
# Single-step scikit-learn pipeline wrapping the 3-component UMAP reducer.
pipeline = Pipeline(steps=[('umap', reducer)])

In [None]:
# Fit the 3-component pipeline on the same embeddings X as before.
pipeline.fit(X)

In [None]:
# Overwrite the earlier reduced embedding with the new projection and show its shape.
test_reduced_embedding = pipeline.transform(X)
test_reduced_embedding.shape

In [None]:
# Plot of the newly reduced embedding, colored by class name; the file paths
# are passed both as file list and as hover name.
plotter = EmbeddingPlotter(
    data=test_reduced_embedding,
    color=test_y_str,
    file_list=test_embedding_result['file'],
    hover_name=test_embedding_result['file'],
    width=1000,
)
display(plotter.plot())

In [None]:
# Same plot of the newly reduced embedding, but with color='kde'.
plotter = EmbeddingPlotter(
    data=test_reduced_embedding,
    color='kde',
    file_list=test_embedding_result['file'],
    hover_name=test_embedding_result['file'],
    width=1000,
)
display(plotter.plot())