In [1]:
try:
    import google.colab  # noqa: F401

    # specify the version of DataEval (==X.XX.X) for versions other than the latest
    %pip install -q dataeval
except Exception:
    pass

In [2]:
import numpy as np
import torch
import torchvision.datasets as datasets
import torchvision.transforms.v2 as v2

from dataeval.detectors.linters import Outliers

In [3]:
# Load the CIFAR-10 test split (train=False) through torchvision, downloading it
# into ./data when it is not already present.
to_tensor = v2.Compose(
    [
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
    ]
)
testing_dataset = datasets.CIFAR10(root="./data", train=False, download=True, transform=to_tensor)

# NOTE(review): `.data` looks like the dataset's raw stored array, so the
# `to_tensor` transform is presumably not applied to it -- confirm that the
# detector is meant to receive the untransformed values.
test_data = np.asarray(testing_dataset.data).astype(float)

Files already downloaded and verified


In [4]:
# Initialize the Outliers class, configured with the "zscore" method and a threshold of 3
outliers = Outliers(outlier_method="zscore", outlier_threshold=3)

# Evaluate the data; `results.issues` is what the following cells inspect
results = outliers.evaluate(test_data)

In [5]:
# Report the number of entries in `results.issues`, i.e. the images flagged with an issue
print(f"Total number of images with an issue: {len(results.issues)}")

Total number of images with an issue: 640


In [6]:
# Show a count of issues by type
# Each value in `results.issues` is a per-image mapping keyed by issue type;
# tally how many images were flagged for each type.
issue_count_by_type = {}
for image_issues in results.issues.values():
    for issue_type in image_issues:
        issue_count_by_type[issue_type] = issue_count_by_type.get(issue_type, 0) + 1

# Right-align the type name and left-align the count so the rows line up.
for issue_type, count in issue_count_by_type.items():
    print(f"{issue_type:>10}: {count:<5}")

       var: 84   
      mean: 50   
      skew: 101  
  kurtosis: 109  
   entropy: 206  
brightness: 98   
  contrast: 95   
 sharpness: 196  
       std: 18   
     zeros: 9    
  darkness: 14   


In [7]:
### TEST ASSERTION CELL ###
# Regression checks against the values printed by the earlier cells. The expected
# numbers are tied to this dataset and detector configuration; the messages report
# the actual values so a drift is easy to diagnose.
assert len(results.issues) == 640, f"expected 640 flagged images, got {len(results.issues)}"
assert {
    "var",
    "mean",
    "skew",
    "kurtosis",
    "entropy",
    "brightness",
    "contrast",
    "sharpness",
    "std",
    "zeros",
    "darkness",
} == set(issue_count_by_type), f"unexpected issue types: {sorted(issue_count_by_type)}"