**Finding Similar Images**

In [17]:
# get_image_feature_vectors.py
#################################################
# Imports and function definitions
#################################################
# For running inference on the TF-Hub module with Tensorflow
import tensorflow as tf
import tensorflow_hub as hub
# For saving 'feature vectors' into a txt file
import numpy as np
# Glob for reading file names in a folder
import glob
import os.path
#################################################
#################################################
# This function:
# Loads the JPEG image at the given path
# Decodes the JPEG image to a uint8 W X H X 3 tensor
# Resizes the image to 224 x 224 x 3 tensor
# Returns the pre processed image as 224 x 224 x 3 tensor
#################################################
def load_img(path):
    """Load a JPEG file and prepare it as a one-image batch for the model.

    Args:
        path: Path of the JPEG file to read.

    Returns:
        A float32 tensor of shape 1 x 224 x 224 x 3 whose colour values are
        scaled to the [0, 1] range.
    """
    # Reads the image file and returns data type of string
    img = tf.io.read_file(path)
    # Decodes the image to a 3-channel tensor with type of uint8
    img = tf.io.decode_jpeg(img, channels=3)
    # Scales uint8 [0, 255] to float32 [0, 1]. This must happen BEFORE the
    # resize: resize_with_pad already returns float32 (still in the 0-255
    # range), and convert_image_dtype leaves float32 input untouched, so the
    # previous order passed unscaled 0-255 values to the model.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # Resizes (with zero padding to keep the aspect ratio) to 224 x 224 x 3
    img = tf.image.resize_with_pad(img, 224, 224)
    # Adds the leading batch axis: img becomes 1 x 224 x 224 x 3
    # This is required for the mobilenet model we are using
    return img[tf.newaxis, ...]
#################################################
# This function:
# Loads the mobilenet model in TF.HUB
# Makes an inference for all images stored in a local folder
# Saves each of the feature vectors in a file
#################################################
def get_image_feature_vectors(
        image_glob='/content/drive/MyDrive/ADM/*.jpg',
        output_dir='/content/drive/MyDrive/ADM/OUTPUT_VEC/',
        module_handle="https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4"):
    """Compute a feature vector for every matching image and save it to disk.

    Each image is pre-processed with load_img(), passed through the TF-Hub
    module and the squeezed result is written with np.savetxt to
    '<output_dir>/<image file name>.npz'.

    NOTE: despite the '.npz' suffix the files are plain text (np.savetxt),
    not NumPy archives. The suffix is kept because the clustering step
    globs for '*.npz' and reads the files back with np.loadtxt.

    Args:
        image_glob: Glob pattern selecting the input JPEG files.
        output_dir: Directory that receives one vector file per image.
            It is created when missing.
        module_handle: TF-Hub handle of the feature-vector module to load.
    """
    # Loads the module
    module = hub.load(module_handle)

    # np.savetxt does not create missing directories, so make sure the
    # target exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    # Loops through all images in a local folder; sorted so that runs are
    # reproducible (glob returns files in arbitrary order).
    for filename in sorted(glob.glob(image_glob)):
        print(filename)

        # Loads and pre-process the image
        img = load_img(filename)

        # Calculate the image feature vector of the img
        features = module(img)

        # Remove single-dimensional entries from the 'features' array
        feature_set = np.squeeze(features)

        # Saves the image feature vectors into a file for later use
        outfile_name = os.path.basename(filename) + ".npz"
        out_path = os.path.join(output_dir, outfile_name)

        # Saves the 'feature_set' to a text file
        np.savetxt(out_path, feature_set, delimiter=',')


get_image_feature_vectors()

/content/drive/MyDrive/ADM/img_000000325.jpg
/content/drive/MyDrive/ADM/img_000000319.jpg
/content/drive/MyDrive/ADM/img_0000000274.jpg
/content/drive/MyDrive/ADM/img_000000125.jpg
/content/drive/MyDrive/ADM/img_000000119.jpg
/content/drive/MyDrive/ADM/img_0000000248.jpg
/content/drive/MyDrive/ADM/img_000000290.jpg
/content/drive/MyDrive/ADM/img_0000000314.jpg
/content/drive/MyDrive/ADM/img_000000284.jpg
/content/drive/MyDrive/ADM/img_0000000328.jpg
/content/drive/MyDrive/ADM/img_00000003.jpg
/content/drive/MyDrive/ADM/img_00000017.jpg
/content/drive/MyDrive/ADM/img_00000016.jpg
/content/drive/MyDrive/ADM/img_00000002.jpg
/content/drive/MyDrive/ADM/img_0000000301.jpg
/content/drive/MyDrive/ADM/img_0000000329.jpg
/content/drive/MyDrive/ADM/img_000000285.jpg
/content/drive/MyDrive/ADM/img_000000118.jpg
/content/drive/MyDrive/ADM/img_000000124.jpg
/content/drive/MyDrive/ADM/img_000000291.jpg
/content/drive/MyDrive/ADM/img_0000000249.jpg
/content/drive/MyDrive/ADM/img_0000000114.jpg
/conte

In [None]:
# cluster_image_feature_vectors.py
#################################################
# Imports and function definitions
#################################################
# Numpy for loading image feature vectors from file
import numpy as np
# Time for measuring the process time
import time
# Glob for reading file names in a folder
import glob
import os.path
# json for storing data in json file
import json
# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial
#################################################
#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched
# So it is used to find product id based on the product image name
#################################################
#def match_id(filename):
# with open('/Users/erdemisbilen/Angular/fashionWebScraping/jsonFiles/image_data.json') as json_file:
#  for file in json_file:
#   seen = json.loads(file)
#   for line in seen:
#    if filename==line['imageName']:
#     print(line)
#     return line['productId']
#     break
#################################################
#################################################
# This function:
# Reads all image feature vectors stored in /content/drive/MyDrive/ADM/OUTPUT_VEC/*.npz
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster(id_lookup=None):
    """Index the saved feature vectors with Annoy and export neighbor scores.

    Steps:
      1. Reads every vector file matching
         '/content/drive/MyDrive/ADM/OUTPUT_VEC/*.npz' with np.loadtxt and
         adds it to an angular AnnoyIndex.
      2. For every indexed item, fetches its nearest neighbors and computes
         the cosine similarity to each of them.
      3. Writes the list of {'similarity', 'master_pi', 'similar_pi'}
         records to 'nearest_neighbors.json' in the working directory.

    Args:
        id_lookup: Optional callable mapping an image file name (text before
            the first '.') to a product id. When omitted, a module-level
            'match_id' function is used if one is defined; otherwise the
            file name itself serves as the id.
    """
    start_time = time.time()

    if id_lookup is None:
        # 'match_id' only exists as commented-out code in this file, so
        # calling it unconditionally raised NameError. Use it when the
        # caller has re-enabled it, else fall back to the file name.
        id_lookup = globals().get('match_id', lambda file_name: file_name)

    print("---------------------------------")
    print("Step.1 - ANNOY index generation - Started at %s" % time.ctime())
    print("---------------------------------")

    # Defining data structures as empty dict
    file_index_to_file_name = {}
    file_index_to_file_vector = {}
    file_index_to_product_id = {}

    # Configuring annoy parameters
    # dims must equal the length of the stored feature vectors.
    dims = 1792
    n_nearest_neighbors = 20
    # NOTE(review): 10000 trees looks very high for a small image set and
    # will make the build slow - confirm this value is intended.
    trees = 10000

    # Reads all file names which stores feature vectors; sorted so index
    # numbers are stable between runs.
    allfiles = sorted(glob.glob('/content/drive/MyDrive/ADM/OUTPUT_VEC/*.npz'))

    t = AnnoyIndex(dims, metric='angular')

    for file_index, vector_path in enumerate(allfiles):
        # Reads feature vectors and assigns them into the file_vector
        file_vector = np.loadtxt(vector_path)

        # Assigns file_name, feature_vectors and corresponding product_id
        file_name = os.path.basename(vector_path).split('.')[0]
        file_index_to_file_name[file_index] = file_name
        file_index_to_file_vector[file_index] = file_vector
        file_index_to_product_id[file_index] = id_lookup(file_name)

        # Adds image feature vectors into annoy index
        t.add_item(file_index, file_vector)

        print("---------------------------------")
        print("Annoy index     : %s" % file_index)
        print("Image file name : %s" % file_name)
        print("Product id      : %s" % file_index_to_product_id[file_index])
        print("--- %.2f minutes passed ---------"
              % ((time.time() - start_time) / 60))

    # Builds annoy index
    t.build(trees)

    print("Step.1 - ANNOY index generation - Finished")
    print("Step.2 - Similarity score calculation - Started ")

    named_nearest_neighbors = []

    # Loops through all indexed items
    for i in file_index_to_file_name:
        # Assigns master image feature vectors and product id values
        master_vector = file_index_to_file_vector[i]
        master_product_id = file_index_to_product_id[i]

        # Calculates the nearest neighbors of the master item
        nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

        # Loops through the nearest neighbors of the master item
        for j in nearest_neighbors:
            print(j)

            # Assigns image feature vectors and product id values of the
            # similar item
            neighbor_file_vector = file_index_to_file_vector[j]
            neighbor_product_id = file_index_to_product_id[j]

            # Calculates the similarity score of the similar item
            similarity = 1 - spatial.distance.cosine(master_vector,
                                                     neighbor_file_vector)
            # Truncates (does not round) to four decimal places
            rounded_similarity = int((similarity * 10000)) / 10000.0

            # Appends master product id with the similarity score
            # and the product id of the similar items
            named_nearest_neighbors.append({
                'similarity': rounded_similarity,
                'master_pi': master_product_id,
                'similar_pi': neighbor_product_id})

        print("---------------------------------")
        print("Similarity index       : %s" % i)
        print("Master Image file name : %s" % file_index_to_file_name[i])
        print("Nearest Neighbors.     : %s" % nearest_neighbors)
        print("--- %.2f minutes passed ---------"
              % ((time.time() - start_time) / 60))

    print("Step.2 - Similarity score calculation - Finished ")

    # Writes the 'named_nearest_neighbors' to a json file
    with open('nearest_neighbors.json', 'w') as out:
        json.dump(named_nearest_neighbors, out)

    print("Step.3 - Data stored in 'nearest_neighbors.json' file ")
    print("--- Prosess completed in %.2f minutes ---------"
          % ((time.time() - start_time) / 60))


cluster()