In [1]:
import cv2
import numpy as np
import os
import shutil 
import time
from PIL import Image
import pandas as pd
from matplotlib import pyplot as plt
from scipy.fftpack import fft, dct
import json
import pickle

In [2]:
## Hashing with Dhash ##

def dhash(image, hash_size=8):
    """Return a difference hash of ``image`` as a Python integer.

    The image is first reduced by ``prepare_image(image, hash_size,
    hash_size + 1)``. Every row of that grid is then compared with the row
    directly above it; bit ``i`` of the result is set when the i-th
    comparison (in flattened, row-major order) finds the lower pixel
    strictly greater than the upper one.
    """
    gray = prepare_image(image, hash_size, hash_size + 1)
    upper_rows = gray[:-1, :]
    lower_rows = gray[1:, :]
    brighter_below = (lower_rows > upper_rows).flatten()

    value = 0
    for position, is_set in enumerate(brighter_below):
        if is_set:
            value += 2 ** position
    return value

In [3]:
## Hashing with Phash ##

def phash(image, hash_size=32):
    """Return a DCT-based perceptual hash of ``image`` as a Python integer.

    The image is reduced by ``prepare_image(image, hash_size)``, scaled by
    1/255, and transformed with ``scipy.fftpack.dct``. The block
    ``[:8, 1:9]`` of the transform is thresholded against its own mean, and
    bit ``i`` of the result is set when the i-th coefficient (flattened,
    row-major order) exceeds that mean.

    NOTE(review): ``dct`` is called with its default axis, so the transform
    is applied along the last axis only. pHash is usually described with a
    2-D DCT; switching would change every hash value, so it is left as-is.
    """
    pixels = prepare_image(image, hash_size)
    # ``np.float`` was a plain alias of the builtin ``float`` and was removed
    # in NumPy 1.24; ``np.float64`` is the same dtype and works everywhere.
    pixels = pixels.astype(np.float64)
    pixels *= 1./255
    dct_1 = dct(pixels)
    # Skip column 0 (the per-row DC term) and keep an 8x8 low-frequency block.
    dctlowfreq = dct_1[:8, 1:9]
    avg = dctlowfreq.mean()
    diff = dctlowfreq > avg
    return sum(2 ** i for (i, v) in enumerate(diff.flatten()) if v)

In [4]:
## Preparing Data for Hashing ##

def prepare_image(image, size1, size2=None):
    """Convert a BGR image to grayscale and resize it for hashing.

    ``size1`` and ``size2`` are handed to ``cv2.resize`` as the target size
    tuple ``(size1, size2)``; OpenCV documents that tuple as
    ``(width, height)``. When ``size2`` is omitted (or otherwise falsy) it
    falls back to ``size1``.
    """
    second = size2 if size2 else size1
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return cv2.resize(gray, (size1, second), interpolation=cv2.INTER_AREA)

In [5]:
## Calculating the Hamming distance ##

def hamming(h1, h2):
    """Return the number of bit positions in which ``h1`` and ``h2`` differ.

    Both arguments are integer hashes (as produced by ``dhash`` / ``phash``).

    Raises:
        ValueError: if ``h1 ^ h2`` is negative. A negative Python integer has
            no finite bit count, and the previous clear-lowest-bit loop never
            terminated on such input.
    """
    differing = h1 ^ h2
    if differing < 0:
        raise ValueError("hamming() requires hashes whose XOR is non-negative")
    # bin() yields the binary digits of the value; counting the '1'
    # characters is the population count of the XOR.
    return bin(differing).count("1")

In [6]:
## Calculating dhash for all the tops images in our inventory ##
## I used dhash over phash as it was performing better ##

if __name__ == '__main__':

    # Folder whose files are hashed; every entry is treated as an image.
    directory = "/home/ankan/Projects/infilect/images"
    file = []       # file names, aligned index-for-index with `hashcode`
    hashcode = []   # dhash value of each successfully loaded image
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting table reproducible between runs.
    for fn in sorted(os.listdir(directory)):
        image_path = os.path.join(directory, fn)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None for anything it cannot decode
            # (sub-directories, non-image or corrupt files). Passing None on
            # to dhash would abort the whole run inside cv2.cvtColor.
            print("Skipping unreadable file:", image_path)
            continue
        hash_code = dhash(image)
        file.append(fn)
        hashcode.append(hash_code)
        

In [7]:
# One row per hashed image: the file name and its integer dhash value.
hashcode_df = pd.DataFrame(
    [(name, code) for name, code in zip(file, hashcode)],
    columns=['file', 'hashcode'],
)

In [8]:
## Dumping the hashcode dataframe for further comparison ##

# Serialise the table with the newest pickle protocol this interpreter
# supports; the file is written to the current working directory.
with open('hashcode_df.pickle', 'wb') as handle:
    pickle.dump(
        hashcode_df,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [None]:
# Disabled scratch code: everything between the triple quotes is a single
# string literal, so none of the statements inside it execute.
# The text sketches a manual check: hash one named image and display it,
# compare two hard-coded hash values with hamming() and turn the distance
# into a percentage of 64 bits, then walk hashcode_df to collect (and show)
# other files whose hashcode is identical to the chosen image's.
'''
image_name = "TOPE5X8TM6VARFFV.jpg"
image_path =  os.path.join(directory, image_name)
image = cv2.imread(image_path)
hash_code = dhash(image)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image.shape
plt.subplot(111),plt.imshow(image),plt.title('ORIGINAL')
plt.show()
print(hash_code)
dist = hamming(524633760037619760, 5145300589440667696)
simm = (64 - dist) * 100 / 64
print(simm)
data = {}
l = []
name = image_name.split('.')[0]
for index, row in hashcode_df.iterrows():
    
    if row.hashcode == hash_code and row.file != image_name:
        l.append(row.file.split('.')[0])
        image = cv2.imread(os.path.join(directory,row.file))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        plt.subplot(111),plt.imshow(image),plt.title('ORIGINAL')
        plt.show()
data[name] = l
'''