In [1]:
try:
    import google.colab  # noqa: F401

    # specify the version of DataEval (==X.XX.X) for versions other than the latest
    %pip install -q dataeval maite-datasets
except Exception:
    pass

In [2]:
import polars as pl
from maite_datasets.image_classification import CIFAR10

from dataeval import Metadata
from dataeval.quality import Outliers

# Show every row when polars tables are displayed (a negative row count
# disables truncation). Assigning to `_` keeps the returned value from being
# echoed as the cell's output.
_ = pl.Config.set_tbl_rows(-1)

In [3]:
# Load the "test" image set of the CIFAR10 dataset, using ./data as the
# dataset directory. download=True requests a download — presumably only
# when the files are not already present (verify against maite-datasets docs).
testing_dataset = CIFAR10("./data", image_set="test", download=True)

In [4]:
# Initialize the Outliers class, configured with a z-score outlier threshold of 3.5
outliers = Outliers(outlier_threshold=("zscore", 3.5))

# Evaluate the data; `results` is reused by the cells below
results = outliers.evaluate(testing_dataset)

In [5]:
# Report how many entries aggregate_by_item() returns, i.e. the number of
# images flagged for at least one metric (presumably one entry per image).
print(f"Total number of images with an issue: {len(results.aggregate_by_item())}")

Total number of images with an issue: 319


In [6]:
# View issues by metric: one row per metric with its total count of flagged issues.
# Left as the cell's last expression so the table is rendered as output.
results.aggregate_by_metric()

metric_name,Total
cat,u32
"""entropy""",148
"""zeros""",79
"""kurtosis""",78
"""skew""",50
"""brightness""",49
"""contrast""",41
"""var""",37
"""mean""",10
"""std""",4
"""darkness""",2


In [7]:
# View issues by class: one row per class, one column per metric, plus a Total.
# The dataset is wrapped in Metadata — presumably to supply the class labels
# used for grouping.
results.aggregate_by_class(Metadata(testing_dataset))

class_name,brightness,contrast,darkness,entropy,kurtosis,mean,sharpness,skew,std,var,zeros,Total
cat,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""airplane""",32,2,0,64,40,7,1,31,1,4,7,189
"""bird""",6,10,0,24,16,2,0,8,1,4,7,78
"""cat""",2,4,1,8,5,0,0,4,0,5,13,42
"""automobile""",0,5,0,14,0,0,0,0,0,5,11,35
"""deer""",1,7,0,10,7,0,0,3,0,0,7,35
"""frog""",0,5,0,7,3,0,0,1,0,4,14,34
"""horse""",0,4,1,6,1,0,0,0,1,6,10,29
"""ship""",5,0,0,4,5,1,1,2,1,3,1,23
"""dog""",2,2,0,4,0,0,0,0,0,3,7,18
"""truck""",1,2,0,7,1,0,0,1,0,3,2,17


In [8]:
### TEST ASSERTION CELL ###
# Regression check: the evaluation above is expected to yield exactly 500 rows
# in `results.issues`. This equals the sum of the "Total" column in the
# per-class table shown earlier in this notebook.
assert results.issues.shape[0] == 500