# Demo with CIFAR10 data and image preview of embedded data points
For this, we need to have CIFAR10 data organized in subfolders (one for each class). We can then use the standard Aggregator, which returns an image and the corresponding filename.
We use the image data to extract embeddings from a pretrained ResNet and then reduce their dimensionality with UMAP down to just 2 dimensions.
Then, we use a scatter plot that additionally plots the corresponding images when we hover over a data point with the mouse pointer.
We do this only with the test data, since it is smaller and the notebook will execute faster, but you can easily do the same with the training data.

# Prerequisites

In [1]:
import numpy as np
from pathlib import Path

import torch
from torch.utils.data import DataLoader

import torchvision
from torchvision.transforms import Compose, ToTensor, Grayscale, Normalize, Resize
from torchvision import transforms
import torchvision.transforms as transforms
from torchvision.models import resnet18




from transights.utils import FolderScanner as fs
from transights.utils import Pickler
from transights.transforms import (FileToPIL,
                             PILToNumpy,
                             PILtoHist,
                             FlattenArray,
                             DebugTransform,
                             ProjectTransform,
                             PyTorchOutput,
                             PyTorchEmbedding,
                             ToDevice,
                             FlattenTensor,
                             CachingTransform)

from transights.aggregator import DataAggregator

import matplotlib.pyplot as plt

import plotly.graph_objs as go
import plotly.express as px
import ipywidgets as widgets


random_state = 23

In [2]:
def load_image(filename: str):
    """Return the raw, undecoded bytes stored in the file at ``filename``.

    The file is opened in binary mode and read in one go; the handle is
    closed again before the bytes are handed back to the caller.
    """
    with open(filename, mode="rb") as handle:
        return handle.read()

def plot_2d(data, color=None, hover_name=None):
    """Build a 2-D scatter plot with an image preview next to it.

    Args:
        data: 2-D array-like indexed as ``data[:, 0]`` (x) and ``data[:, 1]`` (y).
        color: Optional per-point values forwarded to ``px.scatter(color=...)``.
        hover_name: Optional sequence of image file paths, one per point. It is
            used as the hover title and as the source of the preview image.
            When it is ``None`` or empty, the plot is returned with an empty
            preview and no hover callback (previously this raised a
            ``TypeError`` because ``hover_name[0]`` was read unconditionally).

    Returns:
        An ``ipywidgets.Box`` holding the ``FigureWidget`` and the preview image.
    """
    fig = px.scatter(x=data[:, 0],
                     y=data[:, 1],
                     color=color,
                     hover_name=hover_name)
    fig.update_traces(marker=dict(size=5,
                                  line=dict(color='black',
                                            width=0.1)))

    fig.update_layout(width=1000, height=800)

    img = widgets.Image(format='png', width=128)

    # Explicit None/len check: truthiness of e.g. a numpy array is ambiguous.
    has_files = hover_name is not None and len(hover_name) > 0
    if has_files:
        # Show the first image until the user hovers over a point.
        img.value = load_image(hover_name[0])

    def update(trace, points, state):
        # Hover events without any point carry nothing to preview.
        if not points.point_inds:
            return

        ind = points.point_inds[0]
        img.value = load_image(hover_name[ind])

    fig = go.FigureWidget(fig)
    if has_files:
        # Only the first trace is wired up, and the hovered index is used
        # directly as an index into hover_name.
        # NOTE(review): presumably this relies on all points living in a
        # single trace (e.g. numeric rather than categorical color) -- confirm.
        fig.data[0].on_hover(update)

    layout = widgets.Layout(
        width='100%',
        height='',
        flex_flow='row',
        display='flex'
    )

    # The empty labels above and below the image act as spacers in the column.
    return widgets.Box([fig, widgets.VBox([widgets.Label(), img, widgets.Label()])], layout=layout)


In [3]:
import ssl
# this prevents the following error when trying to download the dataset:
# SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)
# NOTE(review): this swaps the module-level default HTTPS context factory for
# the unverified one, so it is not limited to the dataset download -- every
# later HTTPS request in this kernel that relies on the default context will
# skip certificate verification as well. Consider restoring it afterwards.
ssl._create_default_https_context = ssl._create_unverified_context

In [4]:
# Root folder of the CIFAR10 data. Alternative inside the working directory:
# DATA_PATH = Path.cwd() / "data" / "CIFAR10"
DATA_PATH = Path.home().joinpath("Documents", "data", "CIFAR10")

# Per-split folders that receive the images sorted into class subfolders.
DATA_PATH_TRAIN = DATA_PATH / "train"
DATA_PATH_TEST = DATA_PATH / "test"

In [5]:
DATA_PATH

WindowsPath('C:/Users/Bernhard/Documents/data/CIFAR10')

## Create CIFAR10 dataset organized in subfolders indicating class

In [6]:
def prepare_CIFAR10(dataset, root):
    """Write every sample of ``dataset`` to ``root/<label>/<index>.png``.

    Args:
        dataset: Iterable yielding ``(image, label)`` pairs; the image is
            passed on to ``torchvision.utils.save_image``.
        root: Target folder; one subfolder per class label is created in it.
    """
    root_dir = Path(root)

    # Create the ten class folders up front, so they exist even for a class
    # that ends up without any image.
    for class_id in range(10):
        (root_dir / str(class_id)).mkdir(parents=True, exist_ok=True)

    # The running position in the dataset doubles as the file name.
    for index, sample in enumerate(dataset):
        picture, target = sample
        destination = root_dir / str(target) / f"{index}.png"
        torchvision.utils.save_image(picture, destination)


In [7]:
# ToTensor() is the only transform applied to the downloaded images.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
    ]
)

train_dataset = torchvision.datasets.CIFAR10(root=DATA_PATH, train=True, transform=transform, download=True)

# Export into class subfolders only when the target folder does not exist yet.
# NOTE(review): a folder left behind by an interrupted export also counts as
# "exists" and would be skipped here -- delete it to force a fresh export.
if not DATA_PATH_TRAIN.exists():
    prepare_CIFAR10(train_dataset, DATA_PATH_TRAIN)

Files already downloaded and verified


In [8]:
# Same as for the training split, but with train=False.
test_dataset = torchvision.datasets.CIFAR10(root=DATA_PATH, train=False, transform=transform, download=True)

# Export into class subfolders only when the target folder does not exist yet.
# NOTE(review): a partially written folder would be skipped here as well.
if not DATA_PATH_TEST.exists():
    prepare_CIFAR10(test_dataset, DATA_PATH_TEST)

Files already downloaded and verified


In [9]:
# Use the GPU whenever PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on device: {DEVICE.upper()}")

Running on device: CUDA


In [10]:
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
weights_pretrained = torch.load("weights_resnet18_cifar10.pth", map_location=DEVICE)

# load model with pre-trained weights
# NOTE(review): the model is neither moved to DEVICE nor switched to eval mode
# in this cell -- do both before running inference with it.
model = resnet18(num_classes=10)
model.load_state_dict(weights_pretrained)

<All keys matched successfully>

In [11]:
train_files = fs.get_files(DATA_PATH_TRAIN, extensions='.png', recursive=True)
len(train_files)

50000

In [12]:
test_files = fs.get_files(DATA_PATH_TEST, extensions='.png', recursive=True)
len(test_files)

10000

## Define Transformation pipeline
Note that the pipeline starts with a FileToPIL transformation that handles the loading of the image. This enables us to use the standard Aggregator, so we don't need to take care of instantiating a Dataset or DataLoader ourselves.
All we need to pass as arguments is a file list and the transformation pipeline, and optionally a batch size.

In [13]:
# Create the transformation pipeline
# Create the transformation pipeline:
# load the file as a PIL image, turn it into a histogram with 64 bins,
# then flatten the result into a single feature vector per image.
transform_pipeline = Compose([
    FileToPIL(),
    PILtoHist(bins=64),
    FlattenArray(),
])

## Instantiate Aggregator and extract embeddings
The transformed data (here the flattened features of each image) are stored in the resulting dict under 'item', and the corresponding filenames under 'file'.

In [14]:
# Run the histogram pipeline over all test files, using a batch size of 32.
agg = DataAggregator(test_files, transforms=transform_pipeline, batch_size=32)

# The result is accessed via its 'item' and 'file' entries in the cells below.
test_hist_result = agg.transform()

In [15]:
# Create the transformation pipeline
# Second pipeline, reusing the name transform_pipeline: instead of a
# histogram, the PIL image is converted to a numpy array and flattened.
transform_pipeline = Compose([
    FileToPIL(),
    PILToNumpy(),
    FlattenArray(),
])

# Replaces the previous aggregator; same files and batch size as before.
agg = DataAggregator(test_files, transforms=transform_pipeline, batch_size=32)

test_PIL_result = agg.transform()

In [16]:
test_PIL_result['item'].shape

torch.Size([10000, 3072])

In [17]:
test_hist_result['item'].shape

torch.Size([10000, 192])

In [78]:
import plotly.express as px

# Bar chart of the flattened histogram features stored at index 4.
fig = px.bar(test_hist_result['item'][4])
fig.show()

In [18]:
# NOTE(review): always fails -- presumably a deliberate stop so that "Run All"
# halts before the exploratory cells below; confirm and remove when obsolete.
assert False

AssertionError: 

In [23]:
from sklearn.neighbors import NearestNeighbors

def compute_k_nearest_neighbors(histograms, K):
    """Return the indices of the K nearest neighbours of every row.

    The neighbour search is fitted on ``histograms`` and queried with the
    same data, using the Euclidean distance.

    Args:
        histograms: Array-like of feature vectors, one row per sample.
        K: Number of neighbours to return per row.

    Returns:
        Index array with one row per sample and K columns.
    """
    # Convert histograms to a numpy array for easier processing
    histograms = np.array(histograms)

    # Initialize Nearest Neighbors model
    nn_model = NearestNeighbors(n_neighbors=K, metric='euclidean')

    # Fit the model to the histograms
    nn_model.fit(histograms)

    # With return_distance=False, kneighbors returns the index array only,
    # so it must not be unpacked into a (distances, indices) pair.
    neighbors_indices = nn_model.kneighbors(histograms, n_neighbors=K, return_distance=False)

    return neighbors_indices


import numpy as np
from sklearn.neighbors import NearestNeighbors

def compute_nearest_neighbors_between_datasets(reference_histograms, query_histograms, K):
    """Look up, for each query row, its K nearest rows in the reference set.

    Args:
        reference_histograms: Array-like the neighbour search is fitted on.
        query_histograms: Array-like whose rows are looked up in the reference.
        K: Number of neighbours to return per query row.

    Returns:
        Array of indices into the reference set, one row per query row
        (Euclidean metric, distances are not returned).
    """
    # Both inputs are turned into numpy arrays before they reach sklearn.
    reference = np.array(reference_histograms)
    queries = np.array(query_histograms)

    # The search structure is built on the reference data only.
    searcher = NearestNeighbors(n_neighbors=K, metric='euclidean').fit(reference)

    return searcher.kneighbors(queries, n_neighbors=K, return_distance=False)

In [31]:
def get_nearest_neighbour_idx(X):
    """Return the indices of the two nearest neighbours of every row of ``X``.

    The search is fitted on ``X`` and queried with ``X`` again; only the index
    part of the ``kneighbors`` result is returned, the distances are dropped.
    """
    model = NearestNeighbors(n_neighbors=2)
    model.fit(X)

    distances_and_indices = model.kneighbors(X)
    return distances_and_indices[1]

In [32]:
hist_neighbors_indices = get_nearest_neighbour_idx(test_hist_result['item'])
hist_neighbors_indices

In [41]:
test = np.unique(hist_neighbors_indices[:,0])
len(test)

10000

In [66]:
test = np.unique(hist_neighbors_indices[:,1])
len(test)

4886

In [67]:
test

array([   0,    3,    4, ..., 9995, 9997, 9999], dtype=int64)

In [68]:
unique_hist_nns = hist_neighbors_indices[test,:]

In [69]:
unique_hist_nns[:,0]

array([   0,    3,    4, ..., 9995, 9997, 9999], dtype=int64)

In [74]:
unique_hist_nns[3531]

array([6967,    4], dtype=int64)

In [75]:
unique_hist_nns[2]

array([   4, 6967], dtype=int64)

In [70]:
common_values = np.intersect1d(unique_hist_nns[:,0], unique_hist_nns[:,1])
common_values

array([   4,    5,    7, ..., 9995, 9997, 9999], dtype=int64)

In [51]:
test_PIL_result['file']

['C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\10.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1001.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1010.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1018.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1022.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1023.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1026.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1027.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1036.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1052.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1067.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1072.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1076.png',
 'C:\\Users\\Bernhard\\Documents\\data\\CIFAR10\\test\\0\\1077.png',
 'C:\\Users\\Bernhard\\Documents\\da

In [61]:
# Pair up files by nearest neighbour: column 0 of unique_hist_nns selects the
# "train" file, column 1 the neighbouring "test" file of the same row.
# NOTE(review): the two columns share indices (see common_values above), so
# the same file can end up in both lists -- confirm that this is intended.
simsplit_train = list(np.array(test_PIL_result['file'])[unique_hist_nns[:,0]])
simsplit_test = list(np.array(test_PIL_result['file'])[unique_hist_nns[:,1]])

In [62]:
len(simsplit_train)

4886

In [63]:
len(simsplit_test)

4886

In [None]:
PIL_neighbors_indices = compute_k_nearest_neighbors(test_PIL_result['item'], K=2)
PIL_neighbors_indices

array([[   0,  591],
       [   1, 4128],
       [   2,  909],
       ...,
       [9997, 7431],
       [9998, 4806],
       [9999, 4629]], dtype=int64)

In [None]:
# Element-wise comparison of the neighbours found via histograms with those
# found via raw pixels. The histogram result is named hist_neighbors_indices;
# the previously used name neighbors_indices is not defined at notebook level.
test = hist_neighbors_indices == PIL_neighbors_indices
# Keep only column 1 of each row, i.e. whether the second neighbour agrees.
test = [i[1] for i in test]
test

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,


In [None]:
np.sum(test)

97

In [None]:
import hashlib
import pickle

In [None]:
from umap import UMAP

# Create the UMAP reducer instance
# Every argument is spelled out with its documented default next to it;
# n_epochs and random_state are the ones set to a non-default value here.
reducer = UMAP(n_neighbors=15, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=2, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings.
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=random_state, # default: None, If int, random_state is the seed used by the random number generator;
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different.
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded.
              )

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the pipeline
# It consists of the UMAP step only; StandardScaler is imported above but is
# not part of the pipeline.
pipeline = Pipeline([
    ('umap', reducer),
])


# NOTE(review): test_embedding_result is not assigned in any of the visible
# cells (only test_hist_result and test_PIL_result are) -- confirm which
# result is meant, otherwise this line raises a NameError in a fresh kernel.
X = test_embedding_result['item']

In [None]:
pipeline.fit(X)

In [None]:
test_reduced_embedding = pipeline.transform(X)
test_reduced_embedding

In [None]:
# Take the second-to-last path component of every file, i.e. the name of the
# folder the image is stored in, as its label.
test_result_y = [Path(file).parts[-2] for file in test_embedding_result['file']]
# Convert the folder names to floats.
# NOTE(review): presumably done so that the plot gets a numeric color value
# instead of one category per class -- confirm.
test_result_y = np.array(test_result_y).astype('float')

In [None]:
embedding_2d_fig = plot_2d(data=test_reduced_embedding,
                           color=test_result_y,
                           hover_name=test_embedding_result['file'])
embedding_2d_fig