In [1]:
# Google Colab Only
try:
    import google.colab  # noqa: F401

    # specify the version of DataEval (==X.XX.X) for versions other than the latest
    %pip install -q dataeval
except Exception:
    pass

In [2]:
from dataeval.metrics.estimators import divergence
from dataeval.utils.data import DataProcessor
from dataeval.utils.data.datasets import MNIST

In [3]:
# Reference data: 4000 samples of the MNIST training split, rooted at ./data/.
# (unit_interval=True presumably rescales pixel values to [0, 1] — confirm.)
train_ds = MNIST(
    root="./data/",
    train=True,
    size=4000,
    unit_interval=True,
)

# Wrap the dataset in a DataProcessor (batches of 64) and materialise its
# embeddings as a tensor; these feed the divergence computations.
train_dp = DataProcessor(train_ds, batch_size=64)
embeddings = train_dp.embeddings.to_tensor()

Determining if data needs to be downloaded
Loaded data successfully
Running data preprocessing steps


In [4]:
# Sanity check: how many embeddings were extracted and the shape of each one.
# The leading dimension of the tensor is the sample count; the rest is per-sample.
print("Number of samples: ", embeddings.shape[0])
print("Image shape:", embeddings.shape[1:])

Number of samples:  4000
Image shape: torch.Size([784])


In [5]:
# Split the embeddings at index 2000 into two sets that come from the same
# dataset; they serve as the "no shift" pair for the baseline comparison.
data_a, data_b = embeddings[:2000], embeddings[2000:]

In [6]:
# Baseline: both inputs are halves of the same MNIST training sample, so this
# shows what divergence looks like when no shift is present.
div = divergence(
    data_a,
    data_b,
)
print(div)

DivergenceOutput: {'divergence': 0.0, 'errors': np.int64(2026)}


In [7]:
# Shifted data: 2000 MNIST training samples loaded with the "translate"
# corruption (presumably a spatial shift of each digit — confirm in the
# dataset documentation).
corrupted_ds = MNIST(
    root="./data",
    train=True,
    size=2000,
    unit_interval=True,
    corruption="translate",
)

# Same processing as the clean data: batches of 64, embeddings as a tensor.
corrupted_dp = DataProcessor(corrupted_ds, batch_size=64)
corrupted_emb = corrupted_dp.embeddings.to_tensor()

Determining if data needs to be downloaded
Loaded data successfully
Running data preprocessing steps


In [8]:
# Sanity check on the corrupted embeddings: sample count and per-sample shape.
# The leading dimension of the tensor is the sample count; the rest is per-sample.
print("Number of corrupted samples: ", corrupted_emb.shape[0])
print("Corrupted image shape:", corrupted_emb.shape[1:])

Number of corrupted samples:  2000
Corrupted image shape: torch.Size([784])


In [9]:
# Compare the clean first half against the translated images; unlike the
# baseline, these two sets are expected to differ.
div = divergence(
    data_a,
    corrupted_emb,
)
print(div)

DivergenceOutput: {'divergence': np.float64(0.97), 'errors': np.int64(60)}


In [10]:
### TEST ASSERTION CELL ###
# Guards the tutorial's conclusion: the clean-vs-translated divergence stored in
# `div` must stay above 0.95. The message reports the observed value so a
# failing run is diagnosable without re-executing the notebook.
assert div.divergence > 0.95, f"expected divergence > 0.95 for translated data, got {div.divergence}"