In [1]:
# Environment setup: dataeval is installed only if `google.colab` imports successfully.
try:
    import google.colab  # noqa: F401

    # specify the version of DataEval (==X.XX.X) for versions other than the latest
    %pip install -q dataeval
except Exception:
    # Any exception raised above (for example, google.colab not being importable)
    # is ignored, so the install step is skipped and the notebook continues.
    # NOTE(review): presumably dataeval is already installed in that case - confirm.
    pass

In [2]:
from dataeval.detectors.linters import Outliers
from dataeval.utils.datasets import CIFAR10

In [3]:
# Build the CIFAR10 "test" image set, rooted at ./data and requested with download=True
testing_dataset = CIFAR10(
    "./data",
    image_set="test",
    download=True,
)

In [4]:
# Initialize the Outliers class, selecting the "zscore" outlier method
# with an outlier threshold of 3.5
outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)

# Evaluate the data; `results` is used by the cells below via `results.issues`
results = outliers.evaluate(testing_dataset)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [5]:
# Report the number of entries in results.issues (one entry per flagged image, per the message)
print(f"Total number of images with an issue: {len(results.issues)}")

Total number of images with an issue: 305


In [6]:
# Show a count of issues by type
# Each value in results.issues is a mapping keyed by issue type; only the keys
# matter for the tally, so the per-issue values are not unpacked.
issue_count_by_type = {}
for issue in results.issues.values():
    for issue_type in issue:
        issue_count_by_type[issue_type] = issue_count_by_type.get(issue_type, 0) + 1

# Print the most frequent issue types first. sorted() is stable, so types with
# equal counts keep the order in which they were first encountered.
for issue_type, count in sorted(issue_count_by_type.items(), key=lambda item: item[1], reverse=True):
    print(f"{issue_type:>10}: {count:<5}")

   entropy: 148  
  kurtosis: 78   
     zeros: 58   
      skew: 51   
brightness: 49   
  contrast: 41   
       var: 36   
      mean: 10   
       std: 3    
 sharpness: 2    
  darkness: 2    


In [7]:
### TEST ASSERTION CELL ###
# Regression checks against the values computed in the cells above:
# the number of flagged images and the exact set of issue types seen.
expected_issue_types = {
    "brightness",
    "contrast",
    "darkness",
    "entropy",
    "kurtosis",
    "mean",
    "sharpness",
    "skew",
    "std",
    "var",
    "zeros",
}
assert len(results.issues) == 305
assert expected_issue_types == set(issue_count_by_type)