In [1]:
# Colab-only setup: the install below runs only if `google.colab` imports successfully.
try:
    import google.colab  # noqa: F401

    # To test a DataEval release other than the latest, append a version pin (==X.XX.X)
    # to the package spec in the statement below.
    %pip install -q dataeval[torch]
except Exception:
    # Any failure above (e.g. the import failing outside Colab) is deliberately ignored.
    pass

In [2]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets, models

# Drift
from dataeval.detectors.drift import DriftCVM, DriftKS, DriftMMD
from dataeval.metrics.bias import label_parity

# Set the random value
rng = np.random.default_rng(213)

In [3]:
# Define the embedding network
class EmbeddingNet(nn.Module):
    """ResNet-18 backbone whose classification head is replaced by a linear projection.

    The backbone is loaded with ``models.ResNet18_Weights.DEFAULT``. The replacement
    ``fc`` layer is a newly constructed ``nn.Linear``, so its weights are freshly
    initialised rather than pretrained.

    Parameters
    ----------
    embedding_dim : int, default 128
        Number of output features produced for each input image.
    """

    def __init__(self, embedding_dim=128):
        super().__init__()
        self.model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        # Swap the stock classifier head for a projection to the embedding space.
        self.model.fc = nn.Linear(self.model.fc.in_features, embedding_dim)

    def forward(self, x):
        """Run the image batch ``x`` through the backbone and the projection head."""
        return self.model(x)


# Seed torch before building the network: the replacement `fc` layer is randomly
# initialised, so without a fixed seed the embeddings (and every drift result
# derived from them) change on each kernel restart.
torch.manual_seed(213)
embedding_net = EmbeddingNet()
# Prefer the GPU when one is visible; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_net.to(device)


# Extract embeddings
def extract_embeddings(dataset, model, batch_size=64):
    """Embed every sample of ``dataset`` with ``model`` and return a NumPy array.

    Parameters
    ----------
    dataset : iterable of (image, target) pairs
        Each ``image`` must be a tensor; all images must be stackable with
        ``torch.stack``. The ``target`` element is ignored.
    model : torch.nn.Module
        Network mapping a stacked image batch to a 2-D ``(batch, features)`` tensor.
        It is switched to eval mode and called without gradient tracking.
    batch_size : int, default 64
        Number of images forwarded through the model at once.

    Returns
    -------
    numpy.ndarray
        One row per sample, in dataset order. An empty dataset yields an array of
        shape ``(0, 0)`` because the feature width cannot be known without a
        forward pass.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()

    # Run on whatever device the model already lives on, instead of relying on a
    # module-level `device` variable; parameter-free models fall back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    def _embed(batch_images):
        # Stack the buffered images, forward them, and park the result on the CPU.
        inputs = torch.stack(batch_images, dim=0).to(target_device)
        return model(inputs).detach().cpu()

    # Collect per-batch outputs and concatenate once at the end; growing a tensor
    # with vstack inside the loop re-copies all previous rows on every batch.
    chunks = []
    pending = []
    with torch.no_grad():
        for img, _ in dataset:
            pending.append(img)
            if len(pending) == batch_size:
                chunks.append(_embed(pending))
                pending = []
        # Only flush a remainder if there is one: torch.stack rejects an empty
        # list, which happens whenever len(dataset) is a multiple of batch_size.
        if pending:
            chunks.append(_embed(pending))

    if not chunks:
        return torch.empty((0, 0)).numpy()
    return torch.cat(chunks, dim=0).numpy()

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

 13%|█▎        | 6.00M/44.7M [00:00<00:00, 61.2MB/s]

 27%|██▋       | 12.0M/44.7M [00:00<00:00, 60.2MB/s]

 40%|████      | 18.0M/44.7M [00:00<00:00, 61.3MB/s]

 54%|█████▎    | 24.0M/44.7M [00:00<00:00, 61.2MB/s]

 67%|██████▋   | 30.0M/44.7M [00:00<00:00, 61.6MB/s]

 81%|████████  | 36.0M/44.7M [00:00<00:00, 61.2MB/s]

 94%|█████████▍| 42.0M/44.7M [00:00<00:00, 61.6MB/s]

100%|██████████| 44.7M/44.7M [00:00<00:00, 61.7MB/s]




In [4]:
# Reuse the preprocessing pipeline bundled with the pretrained ResNet-18 weights
preprocess = models.ResNet18_Weights.DEFAULT.transforms()

# Reference data: VOC 2011 "train" split, read from ./data (download is disabled)
dataset = datasets.VOCDetection(
    "./data",
    year="2011",
    image_set="train",
    download=False,
    transform=preprocess,
)

# Embed every reference image
embeddings = extract_embeddings(dataset, embedding_net)

In [5]:
# (number of reference images, embedding width)
embeddings.shape

(5717, 128)

In [6]:
# 'Operational' data: the VOC 2011 "val" split, preprocessed exactly like the reference set
op_dataset = datasets.VOCDetection(
    "./data",
    year="2011",
    image_set="val",
    download=False,
    transform=preprocess,
)

# Embed every operational image with the same network
op_embeddings = extract_embeddings(op_dataset, embedding_net)

In [7]:
# (number of operational images, embedding width)
op_embeddings.shape

(5823, 128)

In [8]:
# Build the three drift detectors (MMD, CVM, KS in that order), each using the
# training-split embeddings as its reference data.
d1, d2, d3 = (detector(embeddings) for detector in (DriftMMD, DriftCVM, DriftKS))

In [9]:
# DriftMMD verdict on the unmodified operational (val split) embeddings.
d1.predict(op_embeddings).is_drift

False

In [10]:
# DriftCVM verdict on the unmodified operational (val split) embeddings.
d2.predict(op_embeddings).is_drift

False

In [11]:
# DriftKS verdict on the unmodified operational (val split) embeddings.
d3.predict(op_embeddings).is_drift

False

In [12]:
# Add standard-normal noise drawn from the seeded generator `rng` so the perturbed
# set is identical on every run; the global `np.random` state is never seeded here.
perturbed_op_embeddings = (op_embeddings + rng.normal(size=op_embeddings.shape)).astype(np.float32)

In [13]:
# DriftMMD verdict on the noise-perturbed operational embeddings.
d1.predict(perturbed_op_embeddings).is_drift

True

In [14]:
# DriftCVM verdict on the noise-perturbed operational embeddings.
d2.predict(perturbed_op_embeddings).is_drift

True

In [15]:
# DriftKS verdict on the noise-perturbed operational embeddings.
d3.predict(perturbed_op_embeddings).is_drift

True

In [16]:
# One entry per operational sample: the list of object class names found in that
# sample's annotation (the second element of each dataset item).
# Note: `labels` is rebound to flattened training labels further down the notebook.
labels = [
    [obj["name"] for obj in sample[1]["annotation"]["object"]]
    for sample in op_dataset
]

In [17]:
# Keep only the operational embeddings whose image is annotated with at least one chair.
# `labels` must still be the per-image name lists of the operational set at this point.
has_chair = np.array(["chair" in names for names in labels], dtype=bool)
chair_embeddings = op_embeddings[has_chair, :]

In [18]:
# DriftMMD verdict on the chair-only subset of the operational embeddings.
d1.predict(chair_embeddings).is_drift

True

In [19]:
# DriftCVM verdict on the chair-only subset of the operational embeddings.
d2.predict(chair_embeddings).is_drift

True

In [20]:
# DriftKS verdict on the chair-only subset of the operational embeddings.
d3.predict(chair_embeddings).is_drift

True

In [21]:
# Flat list of every annotated object name in the operational split
# (one entry per object, not per image).
op_labels = [
    obj["name"]
    for sample in op_dataset
    for obj in sample[1]["annotation"]["object"]
]

# Same flattening for the training split. This rebinds `labels`, which earlier
# held per-image name lists for the operational split.
labels = [
    obj["name"]
    for sample in dataset
    for obj in sample[1]["annotation"]["object"]
]

In [22]:
from sklearn import preprocessing

# The DataEval parity function is fed integer class ids, so fit a single encoder on the
# training label names and reuse it for the operational names to keep the ids aligned.
le = preprocessing.LabelEncoder().fit(labels)
label_int, op_label_int = le.transform(labels), le.transform(op_labels)

In [23]:
# Compare the class-frequency distributions of the training and operational labels.
# The class count comes from the fitted encoder instead of a hard-coded 20, so it
# stays correct if the label set changes.
label_parity(label_int, op_label_int, len(le.classes_)).p_value

0.949856067521638