In [1]:
# Environment setup: the install below is attempted only when the
# `google.colab` package can be imported (presumably meaning the notebook is
# running on Google Colab); elsewhere the import raises and the cell is a no-op.
try:
    import google.colab  # noqa: F401

    # specify the version of DataEval (==X.XX.X) for versions other than the latest
    %pip install -q dataeval
except Exception:
    # Deliberate best-effort: any exception raised above is swallowed so the
    # remaining cells run against whatever dataeval is already installed.
    pass

In [2]:
import numpy as np

from dataeval.detectors.linters import Duplicates
from dataeval.utils.data import collate
from dataeval.utils.data.datasets import MNIST

In [3]:
# Build the MNIST dataset with train=False, stored under ./data/ and
# downloaded on demand (download=True).
testing_dataset = MNIST(
    root="./data/",
    train=False,
    download=True,
    unit_interval=True,
)

# Collate the dataset into image data and targets; the third returned value
# is not needed in this notebook and is discarded.
test_data, test_targets, _ = collate(testing_dataset)

Files already downloaded and verified


In [4]:
# Creating some duplicates
print("Exact duplicates")
duplicates = {}
for i in [1, 2, 5, 9]:
    matching_indices = np.where(test_targets.labels == i)[0]
    test_data[matching_indices[78]] = test_data[matching_indices[23]]
    print(f"\t{i} - ({matching_indices[23]}, {matching_indices[78]})")
    duplicates[i] = (matching_indices[23], matching_indices[78], matching_indices[2])

Exact duplicates
	1 - (231, 781)
	2 - (232, 782)
	5 - (235, 785)
	9 - (239, 789)


In [5]:
# Report how many samples are in the collated image data.
sample_count = len(test_data)
print("Number of samples: ", sample_count)

Number of samples:  8920


In [6]:
# Create the duplicate detector; no arguments are passed, so it runs with
# whatever defaults the Duplicates class defines.
identifyDuplicates = Duplicates()

# Run the detector over the image data (including the duplicates planted
# earlier). `results` is read by the following cells via `results.dict()`
# and `results.exact`.
results = identifyDuplicates.evaluate(test_data)

In [7]:
# For each category in the results, print the number of groups found and
# then the groups themselves on an indented line.
results_by_category = results.dict()
for category, images in results_by_category.items():
    print(f"{category} - {len(images)}")
    print(f"\t{images}")

exact - 4
	[[231, 781], [232, 782], [235, 785], [239, 789]]
near - 69
	[[1, 1571, 6631], [31, 7221], [141, 4771, 4851, 5881, 6191, 7791], [147, 3987], [151, 5131], [161, 8141], [171, 2251], [241, 7161], [270, 2910], [281, 751, 951, 2721, 2801], [291, 701, 3911], [311, 5281], [341, 1741], [451, 8591], [711, 3341], [821, 5221], [911, 5421], [921, 4951], [961, 8021], [1019, 4629], [1021, 2931, 8481], [1051, 4101], [1104, 3414], [1281, 2781], [1415, 5635], [1511, 3861], [1531, 1731, 1821, 5811, 6261, 8031], [1670, 2710], [1687, 5827], [1791, 3761], [1831, 5021], [1891, 3791], [2021, 2141, 3661, 8051], [2051, 4471, 4871, 5031, 8191], [2057, 6947], [2101, 4781], [2191, 4741], [2211, 7601, 8421], [2377, 6657], [2407, 3417], [2481, 3541, 7501, 8521, 8571], [2631, 4261], [2791, 5561, 7371, 7841, 8261], [2911, 7551], [3061, 3551], [3151, 6551, 8631], [3177, 6887], [3249, 3367], [3301, 6021, 8361], [3361, 3711, 4051, 4831, 7381, 8641], [3441, 8811], [3571, 5371, 5651, 5761, 6891, 7061, 8231, 8401

In [8]:
### TEST ASSERTION CELL ###
# The index pairs below are the ones printed when the duplicates were planted;
# the detector must report exactly these four groups as exact duplicates.
expected_exact_pairs = [[231, 781], [232, 782], [235, 785], [239, 789]]

assert len(results.exact) == 4
for expected_pair in expected_exact_pairs:
    assert expected_pair in results.exact