In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [1]:
# For running inference on the TF-Hub module.
import tensorflow as tf
import tensorflow_hub as hub

# For saving 'feature vectors' into a txt file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path

In [2]:
# This function:
# Loads the JPEG image at the given path
# Decodes the JPEG image to a uint8 W X H X 3 tensor
# Resizes the image to 224 x 224 x 3 tensor
# Returns the pre processed image as 224 x 224 x 3 tensor
#################################################
def load_img(path):

  # Reads the image file and returns data type of string
  img = tf.io.read_file(path)

  # Decodes the image to W x H x 3 shape tensor with type of uint8
  img = tf.io.decode_jpeg(img, channels=3)

  # Resize the image to 224 x 244 x 3 shape tensor
  img = tf.image.resize_with_pad(img, 224, 224)

  # Converts the data type of uint8 to float32 by adding a new axis
  # This makes the img 1 x 224 x 224 x 3 tensor with the data type of float32
  # This is required for the mobilenet model we are using
  img  = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]

  return img

In [4]:
import certifi
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
#################################################
# This function:
# Loads the mobilenet model in TF.HUB
# Makes an inference for all images stored in a local folder
# Saves each of the feature vectors in a file
#################################################
def get_image_feature_vectors():

  i = 0

  start_time = time.time()

  print("---------------------------------")
  print ("Step.1 of 2 - mobilenet_v2_140_224 - Loading Started at %s" %time.ctime())
  print("---------------------------------")

  # Definition of module with using tfhub.dev handle
  module_handle = "https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4" 
  
  # Load the module
  module = hub.load(module_handle)

  print("---------------------------------")
  print ("Step.1 of 2 - mobilenet_v2_140_224 - Loading Completed at %s" %time.ctime())
  print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  print("---------------------------------")
  print ("Step.2 of 2 - Generating Feature Vectors -  Started at %s" %time.ctime())
 

  # Loops through all images in a local folder
  for filename in glob.glob('trial/*.jpg'): #assuming gif
    i = i + 1

    print("-----------------------------------------------------------------------------------------")
    print("Image count                     :%s" %i)
    print("Image in process is             :%s" %filename)

    # Loads and pre-process the image
    img = load_img(filename)

    # Calculate the image feature vector of the img
    features = module(img)   
  
    # Remove single-dimensional entries from the 'features' array
    feature_set = np.squeeze(features)  

    # Saves the image feature vectors into a file for later use

    outfile_name = os.path.basename(filename).split('.')[0] + ".npz"
    out_path = os.path.join('npz_files/', outfile_name)

    # Saves the 'feature_set' to a text file
    np.savetxt(out_path, feature_set, delimiter=',')

    print("Image feature vector saved to   :%s" %out_path)
  
  print("---------------------------------")
  print ("Step.2 of 2 - Generating Feature Vectors - Completed at %s" %time.ctime())
  print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))
  print("--- %s images processed ---------" %i)

In [6]:
#################################################
# Imports and function definitions
#################################################

# Numpy for loading image feature vectors from file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path

# json for storing data in json file
import json

# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial
#################################################


In [5]:
pip install annoy

Collecting annoy
  Downloading annoy-1.16.3.tar.gz (644 kB)
[K     |████████████████████████████████| 644 kB 326 kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25ldone
[?25h  Created wheel for annoy: filename=annoy-1.16.3-cp37-cp37m-macosx_10_9_x86_64.whl size=66641 sha256=e18c73855bcf4f21ae3d973fd84a478b448ece46a7058a17e13bdf52e440197d
  Stored in directory: /Users/samarthhadawale/Library/Caches/pip/wheels/39/36/d4/ee348a7240ca3e8d1fcbf04ebe46d45f2879ccb094a40f5706
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.16.3
Note: you may need to restart the kernel to use updated packages.


In [7]:
#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched 
# So it is used to find product id based on the product image name
#################################################
def match_id(filename):
  with open('image_data.json') as json_file:
    
    for file in json_file:
        seen = json.loads(file)

        for line in seen:
          
          if filename==line['imageName']:
            print(line)
            return line['productId']
            break
#################################################

In [10]:
#################################################
# This function; 
# Reads all image feature vectores stored in /feature-vectors/*.npz
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster():

  start_time = time.time()
  
  print("---------------------------------")
  print ("Step.1 - ANNOY index generation - Started at %s" %time.ctime())
  print("---------------------------------")

  # Defining data structures as empty dict
  file_index_to_file_name = {}
  file_index_to_file_vector = {}
  file_index_to_product_id = {}

  # Configuring annoy parameters
  dims = 1792
  n_nearest_neighbors = 20
  trees = 10000

  # Reads all file names which stores feature vectors 
  allfiles = glob.glob('npz_files/*.npz')

  t = AnnoyIndex(dims, metric='angular')

  for file_index, i in enumerate(allfiles):
    
    # Reads feature vectors and assigns them into the file_vector 
    file_vector = np.loadtxt(i)

    # Assigns file_name, feature_vectors and corresponding product_id
    file_name = os.path.basename(i).split('.')[0]
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    #file_index_to_product_id[file_index] = match_id(file_name)

    # Adds image feature vectors into annoy index   
    t.add_item(file_index, file_vector)

    print("---------------------------------")
    print("Annoy index     : %s" %file_index)
    print("Image file name : %s" %file_name)
   #print("Product id      : %s" %file_index_to_product_id[file_index])
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))


  # Builds annoy index
  t.build(trees)

  print ("Step.1 - ANNOY index generation - Finished")
  print ("Step.2 - Similarity score calculation - Started ") 
  
  named_nearest_neighbors = []

  # Loops through all indexed items
  for i in file_index_to_file_name.keys():

    # Assigns master file_name, image feature vectors and product id values
    master_file_name = file_index_to_file_name[i]
    master_vector = file_index_to_file_vector[i]
    #master_product_id = file_index_to_product_id[i]

    # Calculates the nearest neighbors of the master item
    nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

    # Loops through the nearest neighbors of the master item
    for j in nearest_neighbors:

      print(j)

      # Assigns file_name, image feature vectors and product id values of the similar item
      neighbor_file_name = file_index_to_file_name[j]
      neighbor_file_vector = file_index_to_file_vector[j]
      #neighbor_product_id = file_index_to_product_id[j]

      # Calculates the similarity score of the similar item
      similarity = 1 - spatial.distance.cosine(master_vector, neighbor_file_vector)
      rounded_similarity = int((similarity * 10000)) / 10000.0

      # Appends master product id with the similarity score 
      # and the product id of the similar items
      named_nearest_neighbors.append({
        'similarity': rounded_similarity})
        #'master_pi': master_product_id,
        #'similar_pi': neighbor_product_id})

    print("---------------------------------") 
    print("Similarity index       : %s" %i)
    print("Master Image file name : %s" %file_index_to_file_name[i]) 
    print("Nearest Neighbors.     : %s" %nearest_neighbors) 
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  
  print ("Step.2 - Similarity score calculation - Finished ") 

  # Writes the 'named_nearest_neighbors' to a json file
  with open('nearest_neighbors.json', 'w') as out:
    json.dump(named_nearest_neighbors, out)

  print ("Step.3 - Data stored in 'nearest_neighbors.json' file ") 
  print("--- Prosess completed in %.2f minutes ---------" % ((time.time() - start_time)/60))

In [11]:
get_image_feature_vectors()

---------------------------------
Step.1 of 2 - mobilenet_v2_140_224 - Loading Started at Sun Jul  5 22:11:57 2020
---------------------------------
---------------------------------
Step.1 of 2 - mobilenet_v2_140_224 - Loading Completed at Sun Jul  5 22:12:09 2020
--- 0.20 minutes passed ---------
---------------------------------
Step.2 of 2 - Generating Feature Vectors -  Started at Sun Jul  5 22:12:09 2020
-----------------------------------------------------------------------------------------
Image count                     :1
Image in process is             :trial\173443_0.jpg
Image feature vector saved to   :npz_files/173443_0.npz
-----------------------------------------------------------------------------------------
Image count                     :2
Image in process is             :trial\173443_1.jpg
Image feature vector saved to   :npz_files/173443_1.npz
-----------------------------------------------------------------------------------------
Image count                   

In [12]:
cluster()

---------------------------------
Step.1 - ANNOY index generation - Started at Sun Jul  5 22:12:46 2020
---------------------------------
---------------------------------
Annoy index     : 0
Image file name : 173443_0
--- 0.00 minutes passed ---------
---------------------------------
Annoy index     : 1
Image file name : 173443_1
--- 0.00 minutes passed ---------
---------------------------------
Annoy index     : 2
Image file name : 189660_0
--- 0.00 minutes passed ---------
---------------------------------
Annoy index     : 3
Image file name : 189660_1
--- 0.00 minutes passed ---------
---------------------------------
Annoy index     : 4
Image file name : 219104_0
--- 0.00 minutes passed ---------
---------------------------------
Annoy index     : 5
Image file name : 227012_0
--- 0.01 minutes passed ---------
---------------------------------
Annoy index     : 6
Image file name : 227272_0
--- 0.01 minutes passed ---------
---------------------------------
Annoy index     : 7
Ima

In [15]:
cluster()

TypeError: 'function' object is not subscriptable

In [16]:
import json

with open('nearest_neighbors.json') as f:
  data = json.load(f)

# Output: {'name': 'Bob', 'languages': ['English', 'Fench']}
print(data)

[{'similarity': 1.0}, {'similarity': 0.6964}, {'similarity': 0.686}, {'similarity': 0.6445}, {'similarity': 0.6371}, {'similarity': 0.6271}, {'similarity': 0.6242}, {'similarity': 0.6221}, {'similarity': 0.6182}, {'similarity': 0.6174}, {'similarity': 0.6128}, {'similarity': 0.6063}, {'similarity': 0.6053}, {'similarity': 0.599}, {'similarity': 0.5916}, {'similarity': 0.5873}, {'similarity': 0.5804}, {'similarity': 0.5793}, {'similarity': 0.5154}, {'similarity': 0.5071}, {'similarity': 1.0}, {'similarity': 0.6394}, {'similarity': 0.6368}, {'similarity': 0.6322}, {'similarity': 0.6087}, {'similarity': 0.5873}, {'similarity': 0.5712}, {'similarity': 0.5437}, {'similarity': 0.5436}, {'similarity': 0.5361}, {'similarity': 0.5306}, {'similarity': 0.5156}, {'similarity': 0.5139}, {'similarity': 0.5086}, {'similarity': 0.5007}, {'similarity': 0.4708}, {'similarity': 0.4703}, {'similarity': 0.4386}, {'similarity': 0.4299}, {'similarity': 0.3775}, {'similarity': 1.0}, {'similarity': 0.8693}, {'

In [20]:
from numpy import load

data = load('npz_files/272576_0.npz', allow_pickle=True)
lst = data.files
for item in lst:
    print(item)
    print(data[item])

OSError: Failed to interpret file 'npz_files/272576_0.npz' as a pickle

In [22]:
named_nearest_neighbors

NameError: name 'named_nearest_neighbors' is not defined