In [1]:
# Google Colab Only
# The install step below is reached only when `import google.colab` succeeds;
# if that import (or anything else in the try body) raises, the cell does nothing.
try:
    import google.colab  # noqa: F401

    # specify the version of DataEval (==X.XX.X) for versions other than the latest
    %pip install -q dataeval maite-datasets
except Exception:
    # Deliberate best-effort: swallow the error so the notebook keeps running.
    pass

In [2]:
from dataclasses import asdict

import numpy as np
from maite_datasets.image_classification import MNIST

from dataeval import Metadata
from dataeval.quality import Duplicates
from dataeval.selection import Indices, Select

In [3]:
# Build the MNIST "test" image set, rooted at ./data/ with downloading enabled
testing_dataset = MNIST(
    image_set="test",
    root="./data/",
    download=True,
)

# Wrap the dataset in Metadata and read its class labels
labels = Metadata(testing_dataset).class_labels

In [4]:
# For a handful of digit classes, take the 24th and 79th matching indices and
# record them as a pair that will be used to introduce an exact duplicate.
print("Exact duplicates")
duplicates = {}
for digit in (1, 2, 5, 9):
    # Positions in `labels` whose value equals this digit
    class_positions = np.nonzero(labels == digit)[0]
    source_idx, replaced_idx = class_positions[23], class_positions[78]
    print(f"\t{digit} - ({source_idx}, {replaced_idx})")
    # Key: the index to be replaced; value: the index that takes its place
    duplicates[int(replaced_idx)] = int(source_idx)

Exact duplicates
	1 - (180, 663)
	2 - (249, 728)
	5 - (219, 866)
	9 - (212, 773)


In [5]:
# Build an index list covering the whole dataset in which every index that is a
# key of `duplicates` is replaced by its mapped value; all others stay as-is.
indices_with_duplicates = [
    duplicates[idx] if idx in duplicates else idx
    for idx in range(len(testing_dataset))
]

# Select the dataset through that index list
duplicates_ds = Select(
    testing_dataset,
    Indices(indices_with_duplicates),
)

In [6]:
# Initialize the Duplicates class to begin to identify duplicate images.
# No arguments are passed, so whatever defaults Duplicates defines are used.
identifyDuplicates = Duplicates()

# Evaluate the data
# The input is the selection built with swapped indices; the returned object is
# kept in `results` for the reporting and assertion cells.
results = identifyDuplicates.evaluate(duplicates_ds)

In [7]:
# Report every populated duplicate grouping: one header line with the number of
# groups, followed by one tab-indented line per group. Empty (None) entries are skipped.
for level, typed_groups in results.data().items():
    for kind, found in asdict(typed_groups).items():
        if found is None:
            continue
        print(f"{kind} duplicate {level} : {len(found)}")
        for entry in found:
            print(f"\t{entry}")

exact duplicate items : 4
	[180, 663]
	[212, 773]
	[219, 866]
	[249, 728]
near duplicate items : 129
	{'indices': (14, 8575), 'methods': frozenset({'dhash'}), 'orientation': None}
	{'indices': (39, 2912, 3673, 5534, 5544, 6397, 6809, 6848), 'methods': frozenset({'phash', 'dhash'}), 'orientation': None}
	{'indices': (57, 4039), 'methods': frozenset({'phash'}), 'orientation': None}
	{'indices': (135, 2786, 2997, 3900, 4035, 4104, 5590, 6159, 8488), 'methods': frozenset({'phash', 'dhash'}), 'orientation': None}
	{'indices': (145, 2171), 'methods': frozenset({'dhash'}), 'orientation': None}
	{'indices': (154, 180, 196, 203, 239, 272, 279, 314, 330, 419, 430, 652, 663, 783, 918, 920, 948, 984, 1011, 1075, 1083, 1137, 1238, 1254, 1424, 1657, 1728, 1760, 1830, 2034, 2041, 2164, 2379, 2444, 2473, 2541, 2599, 2822, 2827, 2867, 2878, 3003, 3070, 3152, 3430, 3455, 3480, 3546, 3699, 3919, 4014, 4050, 4069, 4085, 4179, 4190, 4191, 4264, 4386, 4524, 4525, 4651, 4871, 4953, 5211, 5524, 5535, 5553, 56

In [8]:
### TEST ASSERTION CELL ###
# The exact-duplicate groups must exist, number the same as the planted pairs,
# and contain each planted pair as [kept index, replaced index].
exact_groups = results.items.exact
assert exact_groups is not None
assert len(exact_groups) == len(duplicates)
for replaced_idx, source_idx in duplicates.items():
    assert [source_idx, replaced_idx] in exact_groups