In [1]:
# Colab-only bootstrap: the install below runs only if `google.colab` imports
# successfully (presumably only true inside a Google Colab runtime).
try:
    import google.colab  # noqa: F401

    %pip install -q dataeval
# Best-effort by design: any failure in the block above is ignored so the rest of the
# notebook still runs (presumably against an already-installed dataeval).
except Exception:
    pass

In [2]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets, models

# Drift
from dataeval.detectors.drift import DriftCVM, DriftKS, DriftMMD
from dataeval.metrics.bias import label_parity

# Set a random seed for NumPy-based sampling
rng = np.random.default_rng(213)
# Seed torch too: the embedding head (nn.Linear) and torch.normal both draw from
# torch's global RNG, so without this the notebook is not reproducible across runs.
# Kept away from the end of the cell so its return value is not echoed as output.
torch.manual_seed(213)

# Set default torch device for notebook (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.set_default_device(device)

In [3]:
# ResNet-18 backbone repurposed as a 128-dimensional embedding extractor
class EmbeddingNet(nn.Module):
    """Pretrained ResNet-18 whose final fully connected layer is replaced by a 128-d linear layer."""

    def __init__(self):
        super().__init__()
        # Start from torchvision's default pretrained ResNet-18 weights
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        # Replace (not append to) the final fully connected layer with a newly
        # created linear projection down to a 128-dimensional embedding
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, 128)
        self.model = backbone

    def forward(self, x):
        """Return the output of the wrapped model for input ``x``."""
        embeddings = self.model(x)
        return embeddings

In [4]:
# Instantiate the embedding model; constructing it loads the pretrained ResNet-18
# weights (the captured output below shows them being downloaded on this run)
embedding_net = EmbeddingNet()

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /home/dataeval/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0% 0.00/44.7M [00:00<?, ?B/s]

 29% 12.9M/44.7M [00:00<00:00, 134MB/s]

 77% 34.5M/44.7M [00:00<00:00, 188MB/s]

100% 44.7M/44.7M [00:00<00:00, 192MB/s]




In [5]:
# Preprocessing pipeline that ships with the default ResNet-18 weights
preprocess = models.ResNet18_Weights.DEFAULT.transforms()

# VOC 2011 "train" split is used as the training dataset
train_ds = datasets.VOCDetection(
    root="./data",
    year="2011",
    image_set="train",
    download=False,
    transform=preprocess,
)
# VOC 2011 "val" split stands in for the "operational" dataset
operational_ds = datasets.VOCDetection(
    root="./data",
    year="2011",
    image_set="val",
    download=False,
    transform=preprocess,
)


def print_dataset_info(dataset, split_name):
    """Print the split name, a dashed underline of equal length, the dataset and a blank line."""
    underline = "-" * len(split_name)
    # Joining the pieces with newlines (plus a trailing empty piece) yields the
    # heading, the underline, the dataset repr and one empty line
    print(split_name, underline, f"{dataset}", "", sep="\n")


# Summarise both splits so their sizes and transforms can be compared at a glance
print_dataset_info(dataset=train_ds, split_name="Training")
print_dataset_info(dataset=operational_ds, split_name="Operational")

Training
--------
Dataset VOCDetection
    Number of datapoints: 5717
    Root location: ./data
    StandardTransform
Transform: ImageClassification(
               crop_size=[224]
               resize_size=[256]
               mean=[0.485, 0.456, 0.406]
               std=[0.229, 0.224, 0.225]
               interpolation=InterpolationMode.BILINEAR
           )

Operational
-----------
Dataset VOCDetection
    Number of datapoints: 5823
    Root location: ./data
    StandardTransform
Transform: ImageClassification(
               crop_size=[224]
               resize_size=[256]
               mean=[0.485, 0.456, 0.406]
               std=[0.229, 0.224, 0.225]
               interpolation=InterpolationMode.BILINEAR
           )



In [6]:
# Reduce overhead cost by not tracking tensor gradients
@torch.no_grad()
def custom_batch(dataset: datasets.VOCDetection, model: nn.Module) -> tuple[torch.Tensor, list[str]]:
    """
    Iterates through the dataset to generate model embeddings and stores labels into a flat list

    Parameters
    ----------
    dataset : datasets.VOCDetection
        Iterable of ``(image, targets)`` pairs where
        ``targets["annotation"]["object"]`` is a list of dicts with a ``"name"`` key
    model : nn.Module
        Model used to embed each stacked batch of images; it is switched to eval mode

    Returns
    -------
    tuple[torch.Tensor, list[str]]
        The embeddings of all images stacked along the first dimension (moved to CPU)
        and the object names of every image flattened into one list

    Raises
    ------
    ValueError
        If the dataset yields no samples

    Note
    ----
    Due to a bug with the VOCDetection dataset and DataLoaders,
    the batching is done manually
    """

    model.eval()
    BATCH_SIZE = 64  # Can be adjusted depending on available GPU memory
    embeddings: list[torch.Tensor] = []
    images: list[torch.Tensor] = []
    labels: list[str] = []

    for image, targets in dataset:  # type: ignore
        # Aggregate images
        images.append(image)
        # Aggregate labels only
        labels.extend(obj["name"] for obj in targets["annotation"]["object"])

        # Embed and reset as soon as a full batch has been collected
        if len(images) == BATCH_SIZE:
            embeddings.append(model(torch.stack(images)))
            images = []

    # Add last batch even if not full batch size. The guard matters when the dataset
    # size is an exact multiple of BATCH_SIZE: nothing is left over then, and
    # torch.stack would fail on an empty list.
    if images:
        embeddings.append(model(torch.stack(images)))

    if not embeddings:
        raise ValueError("custom_batch received an empty dataset; there is nothing to embed")

    return torch.vstack(embeddings).cpu(), labels


In [7]:
# Embedding both splits can take ~1 minute depending on hardware

# Training split: embeddings plus the flat list of object labels
train_embs, train_labels = custom_batch(dataset=train_ds, model=embedding_net)

# Operational split: embeddings plus the flat list of object labels
operational_embs, operational_labels = custom_batch(dataset=operational_ds, model=embedding_net)

In [8]:
# Show the shape of each embedding tensor, one per line (training first)
print(train_embs.shape, operational_embs.shape, sep="\n")

torch.Size([5717, 128])
torch.Size([5823, 128])


In [9]:
# Union type covering every drift detector class used in this notebook
DriftDetector = DriftMMD | DriftCVM | DriftKS

# One detector of each kind, constructed from the training embeddings and keyed
# by a short display name so they can be looped over
detectors: dict[str, DriftDetector] = dict(
    MMD=DriftMMD(train_embs),
    CVM=DriftCVM(train_embs),
    KS=DriftKS(train_embs),
)

In [10]:
# Run every detector on the operational embeddings and report its boolean drift verdict
for name, detector in detectors.items():
    drift_result = detector.predict(operational_embs)
    print(f"{name} detected drift? {drift_result.is_drift}")

MMD detected drift? False
CVM detected drift? False


KS detected drift? False


In [11]:
# Sample a noisy copy of the operational embeddings: every value is drawn from a
# normal distribution centred on the original value with a standard deviation of 1
noisy_embs = torch.normal(mean=operational_embs, std=1.0)

In [12]:
# Run every detector on the noise-perturbed embeddings and report its boolean drift verdict
for name, detector in detectors.items():
    drift_result = detector.predict(noisy_embs)
    print(f"{name} detected drift? {drift_result.is_drift}")

MMD detected drift? True


CVM detected drift? True


KS detected drift? True


In [13]:
from sklearn import preprocessing

# Encoder that maps string class names to integer ids; it is fitted on the
# training labels only
label_encoder = preprocessing.LabelEncoder().fit(train_labels)

# Encode training and operational labels with that same fitted mapping
train_encoded = label_encoder.transform(train_labels)
operational_encoded = label_encoder.transform(operational_labels)

In [14]:
# Compare the encoded training labels against the encoded operational labels.
# The VOC dataset has 20 classes, hence num_classes=20.
# NOTE(review): 20 is hard-coded; presumably it should match the number of classes
# learned by the label encoder — confirm before reusing with another dataset.
# Left as the cell's last expression so the result is displayed.
label_parity(train_encoded, operational_encoded, num_classes=20)

ParityOutput(score=np.float64(10.122611673242247), p_value=np.float64(0.949856067521638), metadata_names=None, insufficient_data={})