# Duplicate Images & Class Collisions

**Authors:**

- [Angus Mackenzie](https://github.com/AngusTheMack) ([1106817](mailto:1106817@students.wits.ac.za))
- [Nathan Michlo](https://github.com/nmichlo) ([1386161](mailto:1386161@students.wits.ac.za))

**Achievement:** Detecting which images are duplicates of each other and, of those duplicates, which are labelled with different classes.

----------------------

In [1]:
# Utilities
import sys
import os
from tqdm.notebook import tqdm
from pprint import pprint

# Make the project root (the parent directory) importable, unless it is already on sys.path
project_root = os.path.abspath('../')
if all(os.path.abspath(path) != project_root for path in sys.path):
    sys.path.insert(0, project_root)
    
# Import SSIC common stuffs
from ssic.ssic import SSIC
from ssic.util import set_random_seed, cache_data

# If you don't have a .env file, set the dataset location here.
# setdefault only applies when DATASET_DIR is not already present in the environment.
os.environ.setdefault('DATASET_DIR', '~/downloads/datasets/ssic')

# Initialise SSIC paths, data and other shared state. Per the original author's note, this searches
# the project for a .env file specifying these variables and also checkpoints os.environ and sys.path.
SSIC.init()

def get_info_by_class(image_info=None):
    """ Group image info by classes - returns a dict of lists

    Args:
        image_info: optional mapping of image name -> info dict, where every
            info dict has a 'class_id' entry. When omitted, the mapping
            returned by SSIC.get_train_image_info() is used.

    Returns:
        A plain dict mapping each class_id to the list of info dicts that
        carry it, in the iteration order of `image_info`.
    """
    # TODO: this should be in SSIC
    # Imported here so the function works even when no cell has imported
    # defaultdict into the notebook namespace yet.
    from collections import defaultdict

    if image_info is None:
        image_info = SSIC.get_train_image_info()

    images_by_class = defaultdict(list)
    for info in image_info.values():
        images_by_class[info['class_id']].append(info)
    # Return a plain dict so missing classes raise KeyError instead of silently creating entries
    return dict(images_by_class)

[[92mLOADED[0m]: 
[[95mSTORAGE_DIR[0m]: [90m/home/nmichlo/workspace/snake-id-old/notebooks/out[0m
[[95mDATASET_DIR[0m]: [90m/home/nmichlo/downloads/datasets/ssic[0m
[[95mDATASET_CLASS_CSV[0m]: [90m/home/nmichlo/downloads/datasets/ssic/class_idx_mapping.csv[0m
[[95mDATASET_TRAIN_DIR[0m]: [90m/home/nmichlo/downloads/datasets/ssic/train[0m
[[95mDATASET_TEST_DIR[0m]: [90m/home/nmichlo/downloads/datasets/ssic/round1[0m


## MD5 Collisions

In [2]:
# TODO: this should be moved into class SSIC

def hash_collisions():
    """ Group the training images by the MD5 digest of their file contents.

    The digest is computed in-process with hashlib rather than by spawning
    `md5sum` through a shell, so paths containing spaces or shell
    metacharacters are handled correctly and no external tool is required.

    Returns:
        A dict mapping hex MD5 digest -> list of image names (info['name'])
        whose files produced that digest. Digests with a single name are
        included as well.

    Raises:
        OSError: if an image file cannot be opened or read. Such files are
            reported instead of all being grouped under one empty digest.
    """
    import hashlib
    from collections import defaultdict

    chunk_size = 1 << 20  # read 1 MiB at a time so large images are never fully held in memory

    hashes = defaultdict(list)
    for info in tqdm(SSIC.get_train_image_info().values()):
        digest = hashlib.md5()
        # expanduser keeps a leading '~' working, which a shell would have expanded
        with open(os.path.expanduser(info['path']), 'rb') as image_file:
            for chunk in iter(lambda: image_file.read(chunk_size), b''):
                digest.update(chunk)
        hashes[digest.hexdigest()].append(info['name'])
    return dict(hashes)

# cache_data is given the JSON path and the generator function; presumably it only calls the
# generator when the cache file does not exist yet - confirm in ssic.util.
# NOTE: this rebinds `hash_collisions` from the generator function to its result (md5 -> names).
hash_collisions = cache_data(path=os.path.join(SSIC.STORAGE_DIR, 'duplicates.json'), generator=hash_collisions)
# Only digests shared by more than one image are duplicates
matching_hashes = {k: v for k, v in hash_collisions.items() if len(v) > 1}

img_info = SSIC.get_train_image_info()
# Restrict every duplicate group to the images flagged as valid
valid_matching_hashes = {k: [name for name in v if img_info[name]['valid']] for k, v in matching_hashes.items()}
# A group is only still a collision if at least two valid images remain in it
valid_matching_hashes = {k: v for k, v in valid_matching_hashes.items() if len(v) > 1}

[[92mLOADED[0m]: /home/nmichlo/workspace/snake-id-old/notebooks/out/duplicates.json
[[92mLOADED[0m]: /home/nmichlo/workspace/snake-id-old/notebooks/out/img_info.json


In [63]:

# Number of images that take part in any duplicate group (all images, then valid images only)
print('COLLISIONS: ', sum(map(len, matching_hashes.values())))
print('\033[92mVALID COLLISIONS\033[0m: ', sum(map(len, valid_matching_hashes.values())))
print()

# A duplicate group is "conflicting" when its images are not all labelled with the same class id
conflicting_matching_hashes = {
    md5: list(names)
    for md5, names in matching_hashes.items()
    if any(img_info[name]['class_id'] != img_info[names[0]]['class_id'] for name in names)
}

# Same check, restricted to the groups of valid images
conflicting_valid_matching_hashes = {
    md5: list(names)
    for md5, names in valid_matching_hashes.items()
    if any(img_info[name]['class_id'] != img_info[names[0]]['class_id'] for name in names)
}

# Number of duplicate groups (one per distinct digest)
print(f'UNIQUE COLLISIONS: {len(matching_hashes)}')
print(f'UNIQUE CONFLICTING COLLISIONS: {len(conflicting_matching_hashes)}')
print(f'UNIQUE VALID COLLISIONS: {len(valid_matching_hashes)}')
print(f'UNIQUE VALID CONFLICTING COLLISIONS: {len(conflicting_valid_matching_hashes)}')
print()

# Number of images inside the conflicting groups
print('\033[91mCONFLICTING COLLISIONS\033[0m:', sum(map(len, conflicting_matching_hashes.values())))
print('\033[91mCONFLICTING VALID COLLISIONS\033[0m:', sum(map(len, conflicting_valid_matching_hashes.values())))

COLLISIONS:  1220
[92mVALID COLLISIONS[0m:  1036

UNIQUE COLLISIONS: 515
UNIQUE CONFLICTING COLLISIONS: 25
UNIQUE VALID COLLISIONS: 514
UNIQUE VALID CONFLICTING COLLISIONS: 24

[91mCONFLICTING COLLISIONS[0m: 232
[91mCONFLICTING VALID COLLISIONS[0m: 48


## Conflicting Classes

In [4]:
# Each line shows one duplicate group whose images disagree on the class, followed by the
# class id of every image in that group (same order as the names)
for names in conflicting_matching_hashes.values():
    class_ids = [img_info[name]['class_id'] for name in names]
    print(names, '->', class_ids)

['84a61a4c635527c06438b75cfa413c9f.jpg', 'a2bd3bc0d0da494b90b080215603373d.jpg'] -> [460, 448]
['c2d95843d73fdc7aa4a027d94b5403d6.jpg', 'dc4bb8ad3cbc5ec7cefa49c5f64bb36a.jpg'] -> [639, 635]
['dc85f0b6d1b2d87f55687337a9c0c3de.jpg', '254bcaec0e6ce0faa29a19f65fcab453.jpg'] -> [639, 654]
['2c73a88777fbff01c2662c3a23fdc573.jpg', '41bf434b469af0fadb39c407b2e1f364.jpg'] -> [639, 654]
['a15b8c9fe546a91bf47ce5df88b22ca7.jpg', '7334e90eaa98f966455223a81caddf49.jpg'] -> [639, 697]
['74bf306b43c717d51cc234de32dd772c.jpg', '27ea8fbe301561320593737a6cd3af4a.jpg'] -> [394, 872]
['e0fc1a5b5866441ba537dd5990c55048.jpg', 'f834f5e0c839ff10f808e796228bd5e1.jpg'] -> [394, 4]
['693b99e3c8c9f00a109eac6bce2e00b4.jpg', 'c8bf322e35c32924ddf73fec7f4ff4d8.jpg'] -> [362, 204]
['e39e6c80cdc1002d5a4830c92d237d0c.jpg', 'e9580d63e47988d33524129c53d896a4.jpg'] -> [362, 543]
['2d13e228b8b52e51553fca64fb7f9986.jpg', 'e8903fdb6110daabcbe1270202a951da.jpg'] -> [362, 543]
['6154d719cce4d2ad6cb2dd3f7dbd82b9.jpg', '20dafa2cdd

In [69]:
train_image_info = SSIC.get_train_image_info()
image_names = set(train_image_info)
valid_image_names = {name for name, info in train_image_info.items() if info['valid']}

# Every name that appears in a duplicate group / in a group with conflicting classes
duplicate_names                   = {name for names in matching_hashes.values() for name in names}
valid_duplicate_names             = {name for names in valid_matching_hashes.values() for name in names}
conflicting_duplicate_names       = {name for names in conflicting_matching_hashes.values() for name in names}
conflicting_valid_duplicate_names = {name for names in conflicting_valid_matching_hashes.values() for name in names}

# One representative per duplicate group: the alphabetically first name
kept_duplicate_names       = {min(names) for names in matching_hashes.values()}
kept_valid_duplicate_names = {min(names) for names in valid_matching_hashes.values()}

# The differences below build new sets, so image_names and valid_image_names keep their full contents.

# Using invalid
kept_a = image_names - (duplicate_names - kept_duplicate_names) - conflicting_duplicate_names
print(len(kept_a))

# Using valid - should match
kept_b = valid_image_names - (valid_duplicate_names - kept_valid_duplicate_names) - conflicting_valid_duplicate_names
print(len(kept_b))

assert kept_a == kept_b, 'kept images differ depending on whether invalid images are included'

# Names of the images that survive de-duplication and conflict removal
kept = kept_b
del kept_a
del kept_b

81871
81871


In [90]:
from collections import defaultdict
import numpy as np

image_info = SSIC.get_train_image_info()

# Image names grouped by class id: first for the kept subset, then for the whole training set
classes_kept = defaultdict(list)
for name in kept:
    classes_kept[image_info[name]['class_id']].append(name)
classes = defaultdict(list)
for name, info in image_info.items():
    classes[info['class_id']].append(name)

class_counts = {k: len(v) for k, v in classes.items()}
# Count over every class so that a class which lost all of its images is reported as 0
# instead of raising a KeyError in the table below (and is included in the statistics).
class_kept_counts = {k: len(classes_kept.get(k, ())) for k in classes}

# One row per class, ordered from the smallest to the largest original class
print('class:\t\torig|kept\n-------------------------')
for k in sorted(class_counts, key=class_counts.__getitem__):
    print(f'class-{k}:\t{class_counts[k]} | {class_kept_counts[k]}')


counts_kept = list(class_kept_counts.values())
counts = list(class_counts.values())

# Summary per row: min, max, mean, standard deviation, median of the per-class image counts
print('-------------------------')
print('kept:', np.min(counts_kept), np.max(counts_kept), np.mean(counts_kept), np.std(counts_kept), np.median(counts_kept))
print('orig:', np.min(counts), np.max(counts), np.mean(counts), np.std(counts), np.median(counts))

class:		orig|kept
-------------------------
class-784:	517 | 507
class-629:	527 | 521
class-561:	531 | 529
class-273:	583 | 580
class-734:	593 | 591
class-957:	605 | 597
class-526:	629 | 610
class-326:	639 | 631
class-239:	677 | 673
class-653:	715 | 704
class-128:	749 | 744
class-72:	852 | 833
class-1059:	890 | 888
class-811:	910 | 906
class-663:	961 | 951
class-450:	1003 | 991
class-597:	1006 | 1001
class-540:	1053 | 1051
class-654:	1071 | 1061
class-857:	1138 | 1121
class-707:	1148 | 1146
class-635:	1186 | 1177
class-394:	1232 | 1228
class-536:	1368 | 1353
class-543:	1383 | 1343
class-460:	1394 | 1380
class-966:	1471 | 1462
class-140:	1498 | 1493
class-4:	1500 | 1490
class-1625:	1677 | 1672
class-854:	1773 | 1767
class-581:	1908 | 1891
class-508:	2068 | 2055
class-78:	2149 | 2125
class-362:	2154 | 2105
class-639:	2183 | 2171
class-390:	2295 | 2290
class-804:	2555 | 2543
class-448:	2565 | 2539
class-67:	3201 | 3187
class-337:	3472 | 3463
class-697:	4619 | 4572
class-872:	5525 | 5473
c

In [91]:
# Number of distinct class ids found in the training image info (one count per class)
print(len(counts))

45
