In [25]:
from PIL import Image
import numpy as np, PIL, pandas as pd, json, re, pickle, os
from hashlib import sha1 as hash_fn

from time import time, localtime, asctime, ctime

from ipywidgets import FloatProgress
from IPython.display import Markdown
from stat import S_ISREG, ST_CTIME, ST_MODE

In [2]:
#sanity check: hash a known string to see what a hex digest of hash_fn looks like
probe_bytes = "test".encode("UTF-8")
hash_fn(probe_bytes).hexdigest()

'a94a8fe5ccb19ba61c4c0873d391e987982fbbd3'

In [17]:
#bounds used to generate the chunk file suffixes: range(start, end, step_size)
start = 20
end = 40
step_size = 20

chunk_file_suffix = list(range(start, end, step_size))

#directories for intermediate results
data_dir = "data/intermediate/"
chunking = "chunking/"
hashes = "hashes/"

#directories of the image tree
base_path = "../"
base_mnist_path = base_path+"mnist/"
by_field_dir = "by_field/"

In [4]:
def write_hash_pickle(img_hash_dict, file_count):
    """
        Writes a pickle of the passed obj, with filename specifying the hash function, the timestamp and the passed file count
        img_hash_dict: object to serialize
        file_count: label put at the end of the filename, shows how many files are processed at a glance (make resuming easier)

        returns the path the pickle is written to (data_dir+hashes+filename)
    """
    #generate a unique filename component: asctime text with " " replaced by "_" and ":" by "."
    timestamp = asctime(localtime(time())).replace(" ", "_").replace(":", ".")

    #create the filename, including the hash algo
    pickle_filename = "by_field_{}_{}_{}.pickle".format(hash_fn.__name__, timestamp, str(file_count))
    pickle_dir = data_dir+hashes
    pickle_path = pickle_dir+pickle_filename

    #make sure the target directory exists, otherwise the very first write fails
    os.makedirs(pickle_dir, exist_ok=True)

    #the context manager closes the file even when pickle.dump raises
    with open(pickle_path, "wb") as img_hashes:
        pickle.dump(img_hash_dict, img_hashes)

    return pickle_path

In [5]:
def read_hash_pickle(filename):
    """
        Read a pickled img hash dictionary
        filename: path of the pickle file to read

        returns the unpickled object (the image hash based dictionary)
    """
    #NOTE: unpickling can execute arbitrary code, only load pickle files from a trusted source
    #the context manager closes the file even when unpickling raises
    with open(filename, "rb") as file:
        img_hash_dict = pickle.load(file)
    return img_hash_dict

In [6]:
#try out the filename pattern: two hex characters followed by "_" at the start of the name
hex_prefix = re.compile("^([0-9A-Fa-f]{2})_")
hex_prefix.split("4A_00000")

['', '4A', '00000']

In [57]:
#get newest filename
path = data_dir+hashes

hash_pickles = os.listdir(path)

pickle_stats = {os.stat(path+pickle)[ST_CTIME]:path+pickle for pickle in hash_pickles}
display(pickle_stats)

{1551842784: 'data/intermediate/hashes/by_field_openssl_sha1_Tue_Mar__5_19.26.24_2019_10000.pickle',
 1551843561: 'data/intermediate/hashes/by_field_openssl_sha1_Tue_Mar__5_19.39.20_2019_20000.pickle',
 1551843619: 'data/intermediate/hashes/by_field_openssl_sha1_Tue_Mar__5_19.40.19_2019_30000.pickle'}

In [59]:
#keys of pickle_stats are the stat times of the pickles, sort them newest first
pickle_key_list = sorted(pickle_stats, reverse=True)

pickle_path = None
reloaded = None

start_file = 0
#value shown in the message below, replaced by the filename suffix when a pickle is found
processed_label = start_file

if pickle_key_list:
    pickle_path = pickle_stats[pickle_key_list[0]]
    #the file count is the last "_" separated component right before ".pickle"
    #anchoring on the suffix avoids picking up the year when the suffix is not numeric (e.g. "all")
    suffix_match = re.search(r"_([^_/\\]+)\.pickle$", pickle_path)
    if suffix_match:
        processed_label = suffix_match.group(1)
        if re.fullmatch(r"[0-9]+", processed_label):
            start_file = int(processed_label)
            processed_label = start_file

display(Markdown("Loading Img Hash Pickle for {} processed files".format(processed_label)))

Loading Img Hash Pickle for 30000 files processed

In [60]:
#load the hash dictionary from the selected pickle, reloaded stays unchanged when no pickle path is set
if pickle_path is not None:
    reloaded = read_hash_pickle(pickle_path)

In [84]:
#create dict by hash values
hash_img_dict = {}

if reloaded:
    hash_img_dict = reloaded
    start
    #reloaded = {}

fp = FloatProgress(min=0, max=100)

files_per_pickle_write = 10000

start_file = 0

file_count = 0

display(fp)
#for (root, dirs, files) in os.walk(base_mnist_path+by_field_dir):
for (root, dirs, files) in [(base_mnist_path+by_field_dir+"hsf_0/digit/30", None, ["30_00000.png"])]:
    clean_path = root.replace("\\", "/")
    
    if len(files) > 0:
        
        for filename in files:
            fp.value = (fp.value+1)%100
            file_path = clean_path+"/"+filename
            filename_pieces = re.split("^([0-9A-Za-z]{2})_", filename)
            
            display(filename_pieces)
            
            if len(filename_pieces) < 2:
                display("passing")
                continue
            
            if start_file > file_count:
                file_count += 1
                display("passing")
                continue
            
            this_hasher = hash_fn()
            with open(file_path, "rb") as im:
                data = im.read()
                if data is None:
                    display(Markdown("# Error reading file"))
                    
                this_hasher.update(data)
                
            #explicitly close the file for better resource management
            im.close()
            
            digest = this_hasher.hexdigest()
            
            if digest in hash_img_dict.keys():
                existing_file_with_hash = hash_img_dict[digest]["file_path"]
                markdown_msg = "# Collision!\n ## Existing file: {}\n ## New File: {}".format(existing_file_with_hash, file_path)
                display(Markdown(markdown_msg))
            
            #display(filename_pieces, digest)
            
                
            code = filename_pieces[1]
            #display(code)
            hash_img_dict[digest] = {"file_path": file_path, "char_label": code}
            
            file_count += 1
            
            if file_count % files_per_pickle_write == 0:
                write_hash_pickle(hash_img_dict, file_count)
                
pickle_path = write_hash_pickle(hash_img_dict, "all")  

FloatProgress(value=0.0)

['', '30', '00000.png']

# Collision!
 ## Existing file: ../mnist/by_field/hsf_0/digit/30/30_00000.png
 ## New File: ../mnist/by_field/hsf_0/digit/30/30_00000.png

In [85]:
#read the pickle at pickle_path back into reloaded
reloaded = read_hash_pickle(pickle_path)

In [64]:
#compare the dictionary read from the pickle with the in-memory dictionary
reloaded == hash_img_dict

True

In [79]:
#hash one image of the by_write tree and look its digest up in both dictionaries
by_writer_dir = "by_write/"

this_hash = hash_fn()

with open(base_mnist_path+by_writer_dir+"hsf_0/f0000_14/c0000_14/c0000_14_00000.png", "rb") as test:
    data = test.read()
    this_hash.update(data)

digest = this_hash.hexdigest()

#first the in-memory dictionary, then the one read from the pickle
for lookup in (hash_img_dict, reloaded):
    if digest not in lookup.keys():
        display("Test image not found!")
        continue

    entry = lookup[digest]
    display(entry)
    #char_label is parsed as a hexadecimal character code
    code = int(entry["char_label"], 16)
    display(Markdown("This is an image of the character with code 0x{:x} which is \"{}\"".format(code, chr(code))))

{'file_path': '../mnist/by_field/hsf_0/const/57/57_00000.png',
 'char_label': '57'}

This is an image of the character with code 0x57 which is "W"

{'file_path': '../mnist/by_field/hsf_0/const/57/57_00000.png',
 'char_label': '57'}

This is an image of the character with code 0x57 which is "W"