# ImageNet, CIFAR10, and SVHN data and image preview of embedded data points
If you see unexpected results, first check that the caching is used correctly and that no cache file has been overwritten or misplaced,
e.g. because a cache path is shared with other notebooks.

# Prerequisites

In [25]:
import numpy as np
from pathlib import Path
from collections import Counter

import torch
from torch.utils.data import DataLoader

import torchvision
from torchvision.datasets import CIFAR10, SVHN, ImageNet
from torchvision.transforms import Compose, ToTensor, Grayscale, Normalize, Resize, CenterCrop

from transights.utils import DataSetDumper
from transights.utils import FolderScanner as fs
from transights.utils import Pickler
from transights.utils import EmbeddingPlotter
from transights.utils import PipelineCache
from transights.utils import PathList
from transights.transforms import (FileToPIL,
                            DummyPIL,
                            PILToNumpy,
                            FlattenArray,
                            FlattenList,
                            DebugTransform,
                            ProjectTransform,
                            PyTorchOutput,
                            PyTorchEmbedding,
                            ToDevice,
                            FlattenTensor,
                            CachingTransform,
                            TensorToNumpy,
                            ToArgMax,
                            ToLabel,
                            )

from transights.aggregator import DataAggregator, DataSetAggregator

import matplotlib.pyplot as plt

import plotly.graph_objs as go
import plotly.express as px
import ipywidgets as widgets

from IPython.display import display


random_state = 23

In [2]:
from collections import OrderedDict

def weights_to_openood_model(weights, model, prefix='module.'):
    """Load checkpoint weights into ``model`` after stripping a leading key prefix.

    Checkpoints saved from a wrapped model (for example the ``'module.'``
    prefix added by ``torch.nn.DataParallel``) carry that prefix on every key,
    while the bare model expects keys without it.

    Only a *leading* ``prefix`` is removed. Keys that merely contain the text
    somewhere in the middle (e.g. ``'submodule.weight'``) are left untouched.

    Args:
        weights: Mapping of parameter name to value, as stored in the checkpoint.
        model: Object exposing ``load_state_dict``; it receives the renamed weights.
        prefix: Key prefix to strip. An empty string disables renaming.

    Returns:
        The same ``model`` object, after ``load_state_dict`` has been called on it.
    """
    prefix_len = len(prefix)
    # Preserve the checkpoint's key order while renaming.
    new_state_dict = OrderedDict(
        (key[prefix_len:] if prefix and key.startswith(prefix) else key, value)
        for key, value in weights.items()
    )
    model.load_state_dict(new_state_dict)

    return model

In [3]:
import ssl
# this prevents the following error when trying to download the dataset:
# SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)
# WARNING: this swaps the default HTTPS context factory for the *unverified* one, so certificate
# verification is skipped for every HTTPS request that uses the default context in this kernel,
# not only for the dataset download. It also relies on private (underscore-prefixed) ssl names.
ssl._create_default_https_context = ssl._create_unverified_context

In [4]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Running on device:", DEVICE.upper())

Running on device: CUDA


In [5]:
# Base folders: everything lives under ROOT_PATH / "data"; cached results go to its "newtmp" sub-folder.
ROOT_PATH = Path('D:/')
#ROOT_PATH = Path.cwd()
DATA_PATH = ROOT_PATH / "data"
CACHE_PATH = DATA_PATH / "newtmp"

# CIFAR10: dataset root plus one sub-folder per split
CIFAR10_DATA_PATH = DATA_PATH / "CIFAR10"
CIFAR10_DATA_PATH_TRAIN = CIFAR10_DATA_PATH / "train"
CIFAR10_DATA_PATH_TEST = CIFAR10_DATA_PATH / "test"

# SVHN: dataset root plus one sub-folder per split
SVHN_DATA_PATH = DATA_PATH / "SVHN"
SVHN_DATA_PATH_TRAIN = SVHN_DATA_PATH / "train"
SVHN_DATA_PATH_TEST = SVHN_DATA_PATH / "test"

# ImageNet-1k: dataset root and its train sub-folder
IMAGENET1K_DATA_PATH = DATA_PATH / "ImageNet"
IMAGENET1K_DATA_PATH_TRAIN = IMAGENET1K_DATA_PATH / "train"
# id->class mapping
#IMAGENET1K_MAPPING_FILE = Path("D:\data\imagenet\imagenet-object-localization-challenge\LOC_synset_mapping.txt")

In [6]:
def load_text_file_into_dict(filename, encoding=None):
    """Read a two-column text file into a dictionary.

    Every non-blank line is stripped of surrounding whitespace and split at its
    first space: the text before the space becomes the key, the remainder the
    value. If a key occurs more than once, the last occurrence wins. Blank
    lines (including a trailing empty line) are skipped.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Returns:
        dict mapping each line's first token to the rest of that line.

    Raises:
        ValueError: If a non-blank line has no second entry after the first space.
    """
    result_dict = {}
    with open(filename, 'r', encoding=encoding) as file:
        # Line numbers are 1-based and count blank lines, so messages match an editor's view.
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                continue

            # Split only once so that values may themselves contain spaces
            parts = stripped.split(' ', 1)
            if len(parts) != 2:
                raise ValueError(f"Error in line {line_number}: Each line must contain exactly two entries.")

            key, value = parts
            result_dict[key] = value

    return result_dict

## Create CIFAR10 dataset organized in subfolders indicating class

In [7]:
# Load the checkpoint "model.ckpt" (path relative to the working directory), mapping its tensors onto DEVICE.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source;
# consider weights_only=True if the installed torch version supports it.
weights_pretrained = torch.load("model.ckpt", map_location=DEVICE)

In [8]:
from resnet18_32x32 import ResNet18_32x32 as resnet18

# load model with pre-trained weights
# NOTE(review): the model is neither switched to eval mode nor moved to DEVICE in this cell —
# confirm that PyTorchEmbedding / PyTorchOutput take care of both before trusting the embeddings.
oodresnet18_model = weights_to_openood_model(weights_pretrained, resnet18(num_classes=10))

In [9]:
IMAGENET1K_train_files = fs.get_files(IMAGENET1K_DATA_PATH_TRAIN, extensions='.png', recursive=True, relative_to=IMAGENET1K_DATA_PATH)
len(IMAGENET1K_train_files)

1281167

In [10]:
IMAGENET1K_train_embedding_pickle_file = Path(CACHE_PATH, "IMAGENET1K_train__oodresnet18__embedding.pkl")

In [11]:
CIFAR10_train_embedding_pickle_file = Path(CACHE_PATH, "CIFAR10_train__oodresnet18__embedding.pkl")
CIFAR10_test_embedding_pickle_file = Path(CACHE_PATH, "CIFAR10_test__oodresnet18__embedding.pkl")
CIFAR10_test_output_pickle_file = Path(CACHE_PATH, "CIFAR10_test__oodresnet18__output.pkl")

In [12]:
SVHN_test_embedding_pickle_file = Path(CACHE_PATH, "SVHN_test__oodresnet18__embedding.pkl")

In [13]:
transform = Compose(
    [
        ToTensor(),
    ]
)

In [14]:
# Train split: torchvision downloads it into CIFAR10_DATA_PATH if necessary (download=True).
CIFAR10_train_dataset = CIFAR10(root=CIFAR10_DATA_PATH, train=True, transform=transform, download=True)

# Dump the split into its folder only when that folder does not exist yet.
# Note: an existing but incomplete folder is therefore NOT re-dumped — delete it to force a fresh dump.
if not CIFAR10_DATA_PATH_TRAIN.exists():
    DataSetDumper(CIFAR10_train_dataset, CIFAR10_DATA_PATH_TRAIN).dump()

# Test split, handled the same way as the train split.
CIFAR10_test_dataset = CIFAR10(root=CIFAR10_DATA_PATH, train=False, transform=transform, download=True)

if not CIFAR10_DATA_PATH_TEST.exists():
    DataSetDumper(CIFAR10_test_dataset, CIFAR10_DATA_PATH_TEST).dump()

Files already downloaded and verified
Files already downloaded and verified


In [15]:
CIFAR10_train_files = fs.get_files(CIFAR10_DATA_PATH_TRAIN, extensions='.png', recursive=True, relative_to=CIFAR10_DATA_PATH)
len(CIFAR10_train_files)

50000

In [16]:
CIFAR10_test_files = fs.get_files(CIFAR10_DATA_PATH_TEST, extensions='.png', recursive=True, relative_to=CIFAR10_DATA_PATH)
len(CIFAR10_test_files)

10000

In [17]:
# SVHN test split: torchvision downloads it into SVHN_DATA_PATH if necessary (download=True).
SVHN_test_dataset = SVHN(root=SVHN_DATA_PATH, split='test', transform=transform, download=True)

# Dump only when the target folder does not exist yet (an incomplete folder is not re-dumped).
# The distinct label values are passed explicitly as `targets`
# (presumably because the SVHN dataset object offers no class list — TODO confirm against DataSetDumper).
if not SVHN_DATA_PATH_TEST.exists():
    DataSetDumper(SVHN_test_dataset, SVHN_DATA_PATH_TEST).dump(targets=np.unique(SVHN_test_dataset.labels))

Using downloaded and verified file: D:\data\SVHN\test_32x32.mat


In [18]:
SVHN_test_files = fs.get_files(SVHN_DATA_PATH_TEST, extensions='.png', recursive=True, relative_to=SVHN_DATA_PATH)
len(SVHN_test_files)

26032

## Define Transformation pipeline
Note that the pipeline starts with a FileToPIL transformation, which handles loading the image from disk. This lets us use the standard Aggregator without having to instantiate a DataSet or DataLoader ourselves.
All we need to pass as arguments are a file list and the transformation pipeline, and optionally a batch size.

In [19]:
# Create the transformation pipeline
# File -> PIL image -> tensor -> 32x32 center crop -> normalization -> model embedding -> flat CPU tensor.
# The same pipeline is applied to the ImageNet, CIFAR10 and SVHN file lists below.
embedding_pipeline = Compose([
    FileToPIL(),
    ToTensor(),
    # NOTE(review): inputs larger than 32x32 are cropped, not rescaled (the Resize alternative is disabled below).
    CenterCrop(32),
    #Resize((32, 32)),
    # presumably the CIFAR10 per-channel mean / std — TODO confirm
    Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ToDevice(DEVICE),
    PyTorchEmbedding(oodresnet18_model, device=DEVICE),
    # Move results back to the CPU before flattening
    ToDevice('cpu'),
    FlattenTensor(),
])

## Instantiate Aggregator and extract embeddings
The images are stored in the resulting dict as 'item', and the filenames as 'file'.

In [20]:
def dump_result_info(result):
    """Print a three-line summary of an aggregator result.

    ``result`` is indexed with the keys ``'item'`` and ``'file'``. The summary
    shows ``result['item'].shape``, the number of entries in ``result['file']``
    and the first of those entries.
    """
    # Each value is computed right before its line is printed.
    summary = (
        ("data shape", lambda: result['item'].shape),
        ("target files", lambda: len(result['file'])),
        ("target sample", lambda: result['file'][0]),
    )
    for label, compute in summary:
        print(f"{label}: {compute()}")

In [21]:
# Run the embedding pipeline over all ImageNet train files, in batches of 320.
agg = DataAggregator(root=IMAGENET1K_DATA_PATH, files=IMAGENET1K_train_files, transforms=embedding_pipeline, batch_size=320)

# cache_file points at a pickle in CACHE_PATH (presumably reused on later runs instead of recomputing — TODO confirm).
IMAGENET1K_train_embedding = agg.transform(cache_file=IMAGENET1K_train_embedding_pickle_file)

dump_result_info(IMAGENET1K_train_embedding)

data shape: torch.Size([1281167, 512])
target files: 1281167
target sample: train\0\1000137.png


In [22]:
# Run the embedding pipeline over all CIFAR10 train files, in batches of 320 (rebinds `agg`).
agg = DataAggregator(root=CIFAR10_DATA_PATH, files=CIFAR10_train_files, transforms=embedding_pipeline, batch_size=320)

# cache_file points at a pickle in CACHE_PATH (presumably reused on later runs instead of recomputing — TODO confirm).
CIFAR10_train_embedding = agg.transform(cache_file=CIFAR10_train_embedding_pickle_file)

dump_result_info(CIFAR10_train_embedding)

data shape: torch.Size([50000, 512])
target files: 50000
target sample: train\0\10008.png


In [23]:
# Run the embedding pipeline over all SVHN test files, in batches of 320 (rebinds `agg`).
agg = DataAggregator(root=SVHN_DATA_PATH, files=SVHN_test_files, transforms=embedding_pipeline, batch_size=320)

# cache_file points at a pickle in CACHE_PATH (presumably reused on later runs instead of recomputing — TODO confirm).
SVHN_test_embedding = agg.transform(cache_file=SVHN_test_embedding_pickle_file)

dump_result_info(SVHN_test_embedding)

data shape: torch.Size([26032, 512])
target files: 26032
target sample: test\0\10021.png


In [26]:
# Replace each result's 'file' entry (paths relative to the dataset root, see the samples printed above)
# with `<dataset root> / PathList(...)`.
# NOTE(review): this rebinds the 'file' entries in place; re-running the cell applies the root to the
# already-prefixed entries — confirm PathList makes that harmless.
IMAGENET1K_train_embedding['file'] = IMAGENET1K_DATA_PATH / PathList(IMAGENET1K_train_embedding['file'])
CIFAR10_train_embedding['file'] = CIFAR10_DATA_PATH / PathList(CIFAR10_train_embedding['file'])
SVHN_test_embedding['file'] = SVHN_DATA_PATH / PathList(SVHN_test_embedding['file'])

## Instantiate Aggregator and extract outputs
We want to know the performance of the model on CIFAR10 test data.

In [None]:
# Create the transformation pipeline
# File -> PIL image -> tensor -> normalization -> model output -> argmax on the CPU.
# Unlike embedding_pipeline there is no CenterCrop(32) step here.
output_pipeline = Compose([
    FileToPIL(),
    ToTensor(),
    # presumably the CIFAR10 per-channel mean / std — TODO confirm
    Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ToDevice(DEVICE),
    PyTorchOutput(oodresnet18_model, device=DEVICE),
    ToDevice('cpu'),
    # presumably reduces the model output to the predicted class index — TODO confirm
    ToArgMax(),
])

In [None]:
# Run the output pipeline over all CIFAR10 test files, in batches of 320 (rebinds `agg`).
agg = DataAggregator(root=CIFAR10_DATA_PATH, files=CIFAR10_test_files, transforms=output_pipeline, batch_size=320)

# cache_file points at a pickle in CACHE_PATH (presumably reused on later runs instead of recomputing — TODO confirm).
CIFAR10_test_output = agg.transform(cache_file=CIFAR10_test_output_pickle_file)

dump_result_info(CIFAR10_test_output)

In [None]:
from sklearn.metrics import accuracy_score

# Ground-truth label of each test file = name of its parent folder (the second-to-last path component).
CIFAR10_test_y = [Path(file).parts[-2] for file in CIFAR10_test_output['file']]
# Folder names are strings; convert them to integers so they can be compared with the predictions.
CIFAR10_test_y = np.array(CIFAR10_test_y).astype('int64')
# Display the distinct labels found as a sanity check
np.unique(CIFAR10_test_y)


In [None]:
CIFAR10_test_output['item']

In [None]:
# Predictions are the 'item' entry of the output-pipeline result
CIFAR10_test_pred_y = CIFAR10_test_output['item']
# Fraction of test files whose prediction equals the folder-derived label
CIFAR10_test_accuracy = accuracy_score(CIFAR10_test_y, CIFAR10_test_pred_y)

print(f"Test Accuracy: {CIFAR10_test_accuracy:.3f}")

In [None]:
np.unique(CIFAR10_test_pred_y)

In [None]:
from umap import UMAP

# Create the UMAP reducer instance
# Going by the per-argument notes below, everything is left at its documented default except
# n_epochs (1000 instead of None) and random_state (the notebook-wide seed instead of None).
reducer = UMAP(n_neighbors=15, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=2, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings.
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=random_state, # default None, If int, random_state is the seed used by the random number generator.
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different.
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded.
              )

In [None]:
IMAGENET1K_train_CIFAR_train_X = np.vstack((IMAGENET1K_train_embedding['item'], CIFAR10_train_embedding['item']))
IMAGENET1K_train_CIFAR_train_X.shape

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the pipeline
# Its only step is the UMAP reducer; StandardScaler is imported in this cell but not used.
pipeline = Pipeline([
    ('umap', reducer),
])

# Wrap the pipeline in a PipelineCache that uses a 2D-specific folder below CACHE_PATH.
# From here on `pipeline` refers to the PipelineCache wrapper, not the sklearn Pipeline itself.
UMAP_2D_CACHE_PATH = CACHE_PATH / "umap2d"
pipeline = PipelineCache(pipeline, cache_path=UMAP_2D_CACHE_PATH)


In [None]:
pipeline.fit(IMAGENET1K_train_CIFAR_train_X)

In [None]:
IMAGENET1K_train_reduced_embedding = pipeline.transform(IMAGENET1K_train_embedding['item'])
IMAGENET1K_train_reduced_embedding.shape

In [None]:
CIFAR10_train_reduced_embedding = pipeline.transform(CIFAR10_train_embedding['item'])
CIFAR10_train_reduced_embedding.shape

In [None]:
SVHN_test_reduced_embedding = pipeline.transform(SVHN_test_embedding['item'])
SVHN_test_reduced_embedding.shape

In [None]:
IMAGENET1K_train_CIFAR_train_SVHN_test_reduced_embedding = np.vstack((IMAGENET1K_train_reduced_embedding, CIFAR10_train_reduced_embedding, SVHN_test_reduced_embedding))
IMAGENET1K_train_CIFAR_train_SVHN_test_reduced_embedding.shape

In [None]:
IMAGENET1K_train_CIFAR_train_SVHN_test__file_list = IMAGENET1K_train_embedding['file'].paths + CIFAR10_train_embedding['file'].paths + SVHN_test_embedding['file'].paths
len(IMAGENET1K_train_CIFAR_train_SVHN_test__file_list)

In [None]:
# One dataset-name label per embedding row, used to colour points by their dataset of origin.
IMAGENET1K_train_y_str = ["IMAGENET1K"] * len(IMAGENET1K_train_embedding['item'])
# Note: CIFAR10_train_y_str is reassigned with per-class labels in the following cell.
CIFAR10_train_y_str = ["CIFAR10"] * len(CIFAR10_train_embedding['item'])
SVHN_test_y_str = ["SVHN"] * len(SVHN_test_embedding['item'])

#IMAGENET1K_train__CIFAR10_train_y_str = IMAGENET1K_train_y_str + CIFAR10_train_y_str
#len(IMAGENET1K_train__CIFAR10_train_y_str)

In [None]:
# Class index of each train file = name of its parent folder, converted to int.
CIFAR10_train_y = [Path(file).parts[-2] for file in CIFAR10_train_embedding['file']]
CIFAR10_train_y = np.array(CIFAR10_train_y).astype('int')

# Map class index to label
CIFAR10_labels = CIFAR10_train_dataset.classes

# Per-class labels prefixed with the split name; this replaces the plain "CIFAR10" labels assigned earlier.
CIFAR10_train_y_str = ["CIFAR10_train_" + CIFAR10_labels[i] for i in CIFAR10_train_y]
# CIFAR10_test_y comes from the accuracy section above.
CIFAR10_test_y_str = ["CIFAR10_test_" + CIFAR10_labels[i] for i in CIFAR10_test_y]

In [None]:
#IMAGENET1K_train__CIFAR10_train__SVHN_test__y_str = IMAGENET1K_train_y_str + CIFAR10_train_y_str + SVHN_test_y_str
IMAGENET1K_train__CIFAR10_train__SVHN_test__y_str = IMAGENET1K_train_y_str + CIFAR10_train_y_str + SVHN_test_y_str
len(IMAGENET1K_train__CIFAR10_train__SVHN_test__y_str)

In [None]:
plotter = EmbeddingPlotter(data=IMAGENET1K_train_CIFAR_train_SVHN_test_reduced_embedding,
                           color=IMAGENET1K_train__CIFAR10_train__SVHN_test__y_str,
                           #color=CIFAR10_SVHN_full_str,
                           file_list=IMAGENET1K_train_CIFAR_train_SVHN_test__file_list,
                           width=1000)

display(plotter.plot())

In [None]:
# Hard stop: this always raises AssertionError, so "Run All" halts here.
# NOTE(review): several cells below use names that are not defined in the visible part of this notebook
# (IMAGENET1K_MAPPING_FILE, CIFAR10_test_embedding, CIFAR10_test_str, SVHN_train_embedding).
assert False

In [None]:
IMAGENET1K_train_y = [Path(file).parts[-2] for file in IMAGENET1K_train_embedding['file']]
#IMAGENET1K_train_y = np.array(IMAGENET1K_train_y).astype('int')



In [None]:
IMAGENET1K_train_y

In [None]:
imagenet1k_mapping = load_text_file_into_dict(IMAGENET1K_MAPPING_FILE)
IMAGENET1K_mapped_train_y = [imagenet1k_mapping[k] for k in IMAGENET1K_train_y]

In [None]:
IMAGENET1K_train_str = ["IMAGENET1K_train"] * len(IMAGENET1K_train_y)

In [None]:
# NOTE(review): this cell looks like an unfinished copy-paste:
#  - the first assignment reads CIFAR10_test_embedding, which is not defined in the visible part of this notebook;
#  - the second assignment discards that list and converts CIFAR10_test_y instead.
# As written, IMAGENET1K_test_y would end up as an integer copy of CIFAR10_test_y — confirm the intent.
IMAGENET1K_test_y = [Path(file).parts[-2] for file in CIFAR10_test_embedding['file']]
IMAGENET1K_test_y = np.array(CIFAR10_test_y).astype('int')

In [None]:
CIFAR10_train_y = [Path(file).parts[-2] for file in CIFAR10_train_embedding['file']]
CIFAR10_train_y = np.array(CIFAR10_train_y).astype('int')

# Map class index to label
CIFAR10_labels = CIFAR10_train_dataset.classes

CIFAR10_train_y_str = ["CIFAR10_train_" + CIFAR10_labels[i] for i in CIFAR10_train_y]
#CIFAR10_test_y_str = ["CIFAR10_test_" + CIFAR10_labels[i] for i in CIFAR10_test_y]

CIFAR10_train_str = ["CIFAR10_train"] * len(CIFAR10_train_y)
#CIFAR10_test_str = ["CIFAR10_test"] * len(CIFAR10_test_y)

In [None]:
CIFAR10_full_y_str = CIFAR10_train_y_str + CIFAR10_test_y_str
len(CIFAR10_full_y_str)

In [None]:
CIFAR10_full_str = CIFAR10_train_str + CIFAR10_test_str
CIFAR10_full_str

In [None]:
CIFAR10_full_file_list = CIFAR10_train_embedding['file'] + CIFAR10_test_embedding['file']
len(CIFAR10_full_file_list)

In [None]:
CIFAR10_test_y

In [None]:
# Class index of each file = name of its parent folder, converted to int.
# NOTE(review): SVHN_train_embedding is not defined in the visible part of this notebook.
SVHN_train_y = [Path(file).parts[-2] for file in SVHN_train_embedding['file']]
SVHN_train_y = np.array(SVHN_train_y).astype('int')

SVHN_test_y = [Path(file).parts[-2] for file in SVHN_test_embedding['file']]
SVHN_test_y = np.array(SVHN_test_y).astype('int')

# Map class index to label
# Sorted, so the list no longer depends on the order in which classes first appear in the file list.
ctr = Counter(SVHN_train_y)
SVHN_labels = [str(key) for key in sorted(ctr)]

# A label is simply the class index as text. Build it from the value itself rather than by
# position in SVHN_labels, which would mislabel (or raise IndexError) if a class were missing.
SVHN_train_y_str = ["SVHN_train_" + str(i) for i in SVHN_train_y]
SVHN_test_y_str = ["SVHN_test_" + str(i) for i in SVHN_test_y]

# Split-only labels, one per sample
SVHN_train_str = ["SVHN_train"] * len(SVHN_train_y)
SVHN_test_str = ["SVHN_test"] * len(SVHN_test_y)

In [None]:
SVHN_full_y_str = SVHN_train_y_str + SVHN_test_y_str
len(SVHN_full_y_str)

In [None]:
SVHN_full_str = SVHN_train_str + SVHN_test_str
SVHN_full_str

In [None]:
SVHN_full_file_list = SVHN_train_embedding['file'] + SVHN_test_embedding['file']
len(SVHN_full_file_list)

In [None]:
CIFAR10_SVHN_full_str = CIFAR10_full_str + SVHN_full_str

In [None]:
CIFAR10_SVHN_full_file_list = CIFAR10_full_file_list + SVHN_full_file_list

In [None]:
IMAGENET1K_train_CIFAR_train_y_str = IMAGENET1K_train_str + CIFAR10_train_y_str
np.unique(IMAGENET1K_train_CIFAR_train_y_str)

In [None]:
# Hard stop: this always raises AssertionError, so "Run All" halts here.
# NOTE(review): cells below use names that are not defined in the visible part of this notebook
# (CIFAR10_test_X, CIFAR10_test_embedding).
assert False

In [None]:
from umap import UMAP

# Create the UMAP reducer instance
# Same settings as the 2D reducer earlier in this notebook except n_components=3; this rebinds `reducer`.
# Going by the per-argument notes below, the non-default arguments are n_components, n_epochs and random_state.
reducer = UMAP(n_neighbors=15, # default 15, The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation.
               n_components=3, # default 2, The dimension of the space to embed into.
               metric='euclidean', # default 'euclidean', The metric to use to compute distances in high dimensional space.
               n_epochs=1000, # default None, The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings.
               learning_rate=1.0, # default 1.0, The initial learning rate for the embedding optimization.
               init='spectral', # default 'spectral', How to initialize the low dimensional embedding. Options are: {'spectral', 'random', A numpy array of initial embedding positions}.
               min_dist=0.1, # default 0.1, The effective minimum distance between embedded points.
               spread=1.0, # default 1.0, The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are.
               low_memory=False, # default False, For some datasets the nearest neighbor computation can consume a lot of memory. If you find that UMAP is failing due to memory constraints consider setting this option to True.
               set_op_mix_ratio=1.0, # default 1.0, The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
               local_connectivity=1, # default 1, The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level.
               repulsion_strength=1.0, # default 1.0, Weighting applied to negative samples in low dimensional embedding optimization.
               negative_sample_rate=5, # default 5, Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
               transform_queue_size=4.0, # default 4.0, Larger values will result in slower performance but more accurate nearest neighbor evaluation.
               a=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               b=None, # default None, More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``.
               random_state=random_state, # default None, If int, random_state is the seed used by the random number generator.
               metric_kwds=None, # default None, Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance.
               angular_rp_forest=False, # default False, Whether to use an angular random projection forest to initialise the approximate nearest neighbor search.
               target_n_neighbors=-1, # default -1, The number of nearest neighbors to use to construct the target simplicial set. If set to -1 use the ``n_neighbors`` value.
               #target_metric='categorical', # default 'categorical', The metric used to measure distance for a target array when using supervised dimension reduction. By default this is 'categorical' which will measure distance in terms of whether categories match or are different.
               #target_metric_kwds=None, # dict, default None, Keyword argument to pass to the target metric when performing supervised dimension reduction. If None then no arguments are passed on.
               #target_weight=0.5, # default 0.5, weighting factor between data topology and target topology.
               transform_seed=42, # default 42, Random seed used for the stochastic aspects of the transform operation.
               verbose=False, # default False, Controls verbosity of logging.
               unique=False, # default False, Controls if the rows of your data should be uniqued before being embedded.
              )

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define the pipeline
# Its only step is the UMAP reducer; StandardScaler is imported in this cell but not used.
# This rebinds `pipeline`, which earlier in the notebook held the 2D pipeline.
pipeline = Pipeline([
    ('umap', reducer),
])

# Wrap the pipeline in a PipelineCache that uses a 3D-specific folder below CACHE_PATH.
UMAP_3D_CACHE_PATH = CACHE_PATH / "umap3d"
pipeline = PipelineCache(pipeline, cache_path=UMAP_3D_CACHE_PATH)


In [None]:
pipeline.fit(IMAGENET1K_train_CIFAR_train_X)

In [None]:
# Project the CIFAR10 test features with the fitted pipeline and display the result.
# NOTE(review): CIFAR10_test_X is not defined in the visible part of this notebook.
CIFAR10_test_reduced_embedding = pipeline.transform(CIFAR10_test_X)
CIFAR10_test_reduced_embedding

In [None]:
plotter = EmbeddingPlotter(data=CIFAR10_test_reduced_embedding,
                           #color=CIFAR10_test_y,
                           color=CIFAR10_test_y_str,
                           file_list=CIFAR10_test_embedding['file'],
                           hover_name=CIFAR10_test_embedding['file'],
                           width=1000)

In [None]:
display(plotter.plot())