In [62]:
import cv2
import numpy as np
import scipy
from scipy.misc import imread
import pickle
import random
import os
import matplotlib.pyplot as plt
import json

In [55]:
# Each category is matched against its complementary category.
mapping_categories = {"bras": "panties", "panties": "bras"}
categories = ["bras", "panties"]

# The locations default to the original workstation layout but can be
# overridden through environment variables, so the notebook can run on another
# machine without editing this cell.
data_path = os.environ.get(
    "IMAGE_MATCHING_DATA_PATH",
    "/Users/asingh/Documents/Learning/Projects/Image_Matching/Data/",
)
output_path = os.environ.get(
    "IMAGE_MATCHING_OUTPUT_PATH",
    "/Users/asingh/Documents/Learning/Projects/Image_Matching/Notebooks/Output/",
)
# Input images live in one sub-directory per category; pickled feature
# vectors are written next to them.
image_path = os.path.join(data_path, "images")
features_path = os.path.join(data_path, "features_kaze")

In [38]:
# Feature extractor
def extract_features(image_path, vector_size=32):
    """Build a fixed-length KAZE feature vector for one image.

    Keypoints are detected, the ``vector_size`` keypoints with the strongest
    response are kept, and their descriptors are flattened into one vector
    that is zero-padded to ``vector_size * 64`` values.

    :param image_path: path of the image file to read.
    :param vector_size: maximum number of keypoints to keep.
    :return: 1-D numpy array of length ``vector_size * 64``, or ``None`` when
        OpenCV raises an error or no descriptors could be computed.
    """
    image = imread(image_path, mode="RGB")
    try:
        # Using KAZE because SIFT, ORB and others were moved to an additional
        # module, which adds extra pain during install.
        alg = cv2.KAZE_create()
        # Finding image keypoints.
        kps = alg.detect(image)
        # The number of keypoints varies with image size and colour palette,
        # so keep only the strongest ones (a bigger response is better).
        kps = sorted(kps, key=lambda x: -x.response)[:vector_size]
        # Computing the descriptors of the retained keypoints.
        kps, dsc = alg.compute(image, kps)
    except cv2.error as e:
        print('Error: ', e)
        return None

    if dsc is None:
        # Without any keypoint there is no descriptor array to flatten; report
        # it the same way as an OpenCV failure instead of raising
        # AttributeError on None.
        print('Error: ', 'no descriptors found for {}'.format(image_path))
        return None

    # Flatten all descriptors into one big vector - our feature vector.
    dsc = dsc.flatten()
    # Every descriptor holds 64 values, so a full vector has vector_size * 64.
    needed_size = vector_size * 64
    if dsc.size < needed_size:
        # Fewer than vector_size descriptors: pad the tail with zeros of the
        # descriptor dtype so all feature vectors share one dtype and length.
        padding = np.zeros(needed_size - dsc.size, dtype=dsc.dtype)
        dsc = np.concatenate([dsc, padding])

    return dsc

In [47]:
def feature_extractor(categories):
    """Extract and pickle the feature vectors of every image in each category.

    For each category, every file in ``<image_path>/<category>`` is passed to
    ``extract_features`` and the resulting ``{image name: feature vector}``
    mapping is pickled to ``<features_path>/<category>.pck``. Files whose
    features cannot be extracted are reported and left out of the mapping.

    :param categories: iterable of category (sub-directory) names.
    :return: None
    """
    # Create the destination up front: discovering that it is missing only
    # after a category has been processed would throw that work away.
    os.makedirs(features_path, exist_ok=True)

    for category in categories:
        category_images_path = os.path.join(image_path, category)

        result = {}
        # Sorted so the order of the stored mapping does not depend on the
        # file system's listing order.
        for file_name in sorted(os.listdir(category_images_path)):
            # Strip only the extension; this also works with non-"/" path
            # separators and with names that contain extra dots.
            name = os.path.splitext(file_name)[0]
            try:
                features = extract_features(os.path.join(category_images_path, file_name))
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole
                # run, but the failure should be visible.
                print("Skipping {}: {}".format(file_name, e))
                continue
            if features is None:
                # extract_features signals failure with None; storing it would
                # break the similarity computation on the loaded features.
                continue
            result[name] = features

        # saving all our feature vectors in pickled file
        pickled_db_path = os.path.join(features_path, category + ".pck")
        with open(pickled_db_path, 'wb') as fp:
            pickle.dump(result, fp)

In [48]:
%%time
# Extract and pickle the features of every image in every category.
# This is the expensive step: the recorded run below took about 7.5 hours of wall time.
feature_extractor(categories)

`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 8h 5min 52s, sys: 1h 7min 55s, total: 9h 13min 48s
Wall time: 7h 32min 31s


In [49]:
# Read back the pickled {image name: feature vector} mapping of each category.
with open(os.path.join(features_path, "bras.pck"), "rb") as bras_file:
    bras_features = pickle.load(bras_file)

with open(os.path.join(features_path, "panties.pck"), "rb") as panties_file:
    panties_features = pickle.load(panties_file)

In [57]:
# Feature mappings keyed by category name, for lookup by category.
all_features = dict(bras=bras_features, panties=panties_features)

In [50]:
# Number of stored feature vectors per category: (bras, panties).
tuple(map(len, (bras_features, panties_features)))

(1699, 3314)

In [53]:
# Spot-check the stored feature vector of one bras image.
bras_features['224438_0000006395']

array([ 0.00503823, -0.04584654,  0.01468258, ..., -0.00024455,
        0.00077621,  0.00025226], dtype=float32)

In [51]:
def findCosineSimilarity(feature1, feature2):
    """
    Function to get the cosine similarity between two vectors
    :param feature1: first feature vector (1-D array-like)
    :param feature2: second feature vector, same length as ``feature1``
    :return: cosine of the angle between the two vectors; 0.0 when either
        vector has zero norm, because the angle is undefined in that case
    """
    denominator = np.linalg.norm(feature1) * np.linalg.norm(feature2)
    if denominator == 0:
        # Dividing by a zero norm would produce NaN (plus a runtime warning),
        # and NaN scores make the later ranking of matches unreliable.
        return 0.0
    return np.dot(feature1, feature2) / denominator

In [64]:
def calculate_similarity(features1, features2, category, output_path, top_matches_count=15):
    """Rank, for every image in ``features1``, its most similar images in ``features2``.

    Feature names are expected to look like ``<sku>_<variation>``. For every
    entry of ``features1`` the cosine similarity (rounded to 4 decimals) to
    every differently named entry of ``features2`` is computed and the
    ``top_matches_count`` best candidates are kept. The result is written to
    ``<output_path>/<category>.json`` and also returned.

    :param features1: dict mapping feature name to feature vector (queries).
    :param features2: dict mapping feature name to feature vector (candidates).
    :param category: category name, used as the output file name.
    :param output_path: directory in which the JSON file is written.
    :param top_matches_count: number of best matches kept per query image.
    :return: ``{sku: {variation: [{"sku", "color_code", "similarity_score"}, ...]}}``
    """
    # Split the candidate names once instead of once per (query, candidate)
    # pair, and drop candidates without a feature vector.
    candidates = []
    for feature_name2, feature2 in features2.items():
        if feature2 is None:
            continue
        sku2, image_variation_name2 = feature_name2.split("_", 1)
        candidates.append((feature_name2, sku2, image_variation_name2, feature2))

    output_matches = {}
    for feature_name1, feature1 in features1.items():
        if feature1 is None:
            # Nothing to compare for images whose extraction failed.
            continue
        sku1, image_variation_name1 = feature_name1.split("_", 1)
        matches = [
            (sku2, image_variation_name2, round(findCosineSimilarity(feature1, feature2), 4))
            for feature_name2, sku2, image_variation_name2, feature2 in candidates
            if feature_name1 != feature_name2
        ]
        topmatches = sorted(matches, key=lambda x: x[2], reverse=True)[:top_matches_count]
        output_matches.setdefault(sku1, {})[image_variation_name1] = [
            {"sku": sku2, "color_code": image_variation_name2, "similarity_score": str(score)}
            for sku2, image_variation_name2, score in topmatches
        ]

    try:
        # os.path.join also works when output_path has no trailing separator.
        with open(os.path.join(output_path, "{}.json".format(category)), "w") as output_file_object:
            output_file_object.write(json.dumps(output_matches))
    except Exception as e:
        print("Error in writing the image matches: {}".format(str(e)))

    # Returned so the computed matches are not lost when writing the file fails.
    return output_matches
            

In [66]:
# Match every category against its mapped counterpart and write the results.
for category_name in categories:
    print("Computing for the category : {}".format(category_name))
    counterpart_name = mapping_categories[category_name]
    calculate_similarity(
        all_features[category_name],
        all_features[counterpart_name],
        category_name,
        output_path,
        top_matches_count=15,
    )

Computing for the category : bras
Computing for the category : panties


In [67]:
# Show the full path of the JSON file holding the bras matches.
output_path + "bras.json"

'/Users/asingh/Documents/Learning/Projects/Image_Matching/Notebooks/Output/bras.json'

In [7]:
# Small 2x3 scratch array.
a = np.array([(1, 2, 3), (3, 4, 5)])

In [8]:
# Collapse the 2-D array into a single row; -1 lets numpy infer the row length.
a.reshape(1, -1)

array([[1, 2, 3, 3, 4, 5]])

In [12]:
# Names of all entries in the bras image directory.
files = list(os.listdir("/Users/asingh/Documents/Learning/Projects/Image_Matching/Data/images/bras/"))


In [37]:
%%time
result = {}
result["102472_0000012202"] = extract_features("/Users/asingh/Documents/Learning/Projects/Image_Matching/Data/images/bras/102472_0000012202.jpg")



`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.
  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 5.28 s, sys: 758 ms, total: 6.04 s
Wall time: 2.69 s


In [26]:
import _pickle as pickle

In [28]:
# Write the single-image result to a scratch pickle file.
with open("/Users/asingh/temp.pck", "wb") as temp_out:
    pickle.dump(result, temp_out)

In [30]:
# Read the scratch pickle file back into `t`.
with open("/Users/asingh/temp.pck", "rb") as temp_in:
    t = pickle.load(temp_in)

In [33]:
# Length of the feature vector held in `result` for this image.
len(result["102472_0000012202"])

2048

In [34]:
# Length of the feature vector held in `t` for the same image.
len(t["102472_0000012202"])

2048

In [None]:
class Matcher(object):
    """Nearest-neighbour lookup over a pickled ``{name: feature vector}`` database."""

    def __init__(self, pickled_db_path="features.pck"):
        """Load the feature database and stack its vectors into a matrix.

        :param pickled_db_path: path of a pickle file holding a dict that maps
            image names to feature vectors.
        """
        # NOTE: pickle.load can execute arbitrary code - only open trusted files.
        # Pickles are binary data, so the file must be opened in "rb" mode.
        with open(pickled_db_path, "rb") as fp:
            self.data = pickle.load(fp)
        self.names = []
        self.matrix = []
        # dict.items() works on Python 2 and 3 (iteritems() is Python 2 only).
        for k, v in self.data.items():
            if v is None:
                # An entry without a feature vector cannot be stacked into
                # the matrix, so it is left out of the searchable database.
                continue
            self.names.append(k)
            self.matrix.append(v)
        self.matrix = np.array(self.matrix)
        self.names = np.array(self.names)

    def cos_cdist(self, vector):
        """Return the cosine distance from ``vector`` to every database entry.

        :param vector: 1-D feature vector of the search image.
        :return: 1-D array with one distance per database entry.
        """
        # Imported explicitly: a bare ``import scipy`` is not guaranteed to
        # make the ``scipy.spatial`` sub-package available.
        from scipy.spatial.distance import cdist

        # getting cosine distance between search image and images database
        v = vector.reshape(1, -1)
        return cdist(self.matrix, v, 'cosine').reshape(-1)

    def match(self, image_path, topn=5):
        """Find the database images closest to the image at ``image_path``.

        :param image_path: path of the search image.
        :param topn: number of nearest images to return.
        :return: tuple ``(names, distances)`` of the ``topn`` closest entries,
            nearest first.
        :raises ValueError: if no features could be extracted from the image.
        """
        features = extract_features(image_path)
        if features is None:
            raise ValueError("Could not extract features from {}".format(image_path))
        img_distances = self.cos_cdist(features)
        # getting the topn records with the smallest distance
        nearest_ids = np.argsort(img_distances)[:topn].tolist()
        nearest_img_paths = self.names[nearest_ids].tolist()

        return nearest_img_paths, img_distances[nearest_ids].tolist()