In [67]:
import numpy as np
import json 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Product Extraction and Cleaning
# {
#	accountId: prod.accountId,
#	baseCode: prod.baseCode, 
#	isActive: prod.isActive, 
#	mrpPrice: prod.mrpPrice, 
#	sellingPrice: prod.sellingPrice,
#	name: prod.name,
#	skuCode: prod.skuCode
# }
# Step 1.1 Loading data as numpy array

In [77]:
class ProductAttributeRepresentor():
    """Flatten product records into single-string features and look them up.

    Each product is expected to be a mapping (``.values()`` is called on it).
    Its attribute values are stringified and joined with ``FEATURE_SEPARATOR``
    in the mapping's iteration order, producing one combined feature string
    per product in ``featureList``.

    NOTE(review): lookups re-split the combined string, so they rely on the
    base code being the second attribute of every product and on no attribute
    value containing the separator itself -- confirm against the data set.
    """

    # Token placed between stringified attribute values.
    FEATURE_SEPARATOR = "__"
    # Position of the base code inside a split feature string.
    BASE_CODE_POSITION = 1

    def __init__(self, productList):
        """Store the products; features are built on demand.

        Args:
            productList: iterable of product mappings.
        """
        self.productList = productList
        # Initialised here so a lookup made before feature generation reports
        # "not found" instead of raising AttributeError.
        self.featureList = []

    def generateSingleCombinedFeature(self):
        """(Re)build ``featureList`` with one joined string per product."""
        self.featureList = [
            self.FEATURE_SEPARATOR.join(
                str(attribute_value) for attribute_value in product.values()
            )
            for product in self.productList
        ]

    def getIndexFromFeatureList(self, baseCode):
        """Find the first product whose base-code segment equals ``baseCode``.

        Args:
            baseCode: value compared against the second segment of each
                feature string (segments are always strings).

        Returns:
            ``(index, True)`` for the first match, otherwise ``(-1, False)``.
        """
        for index, feature_product in enumerate(self.featureList):
            feature_segmentList = feature_product.split(self.FEATURE_SEPARATOR)
            # Features with too few segments cannot carry a base code; skip
            # them rather than failing with IndexError.
            if len(feature_segmentList) <= self.BASE_CODE_POSITION:
                continue
            if baseCode == feature_segmentList[self.BASE_CODE_POSITION]:
                return (index, True)
        return (-1, False)


In [92]:
# How many recommendations to print for the ordered product.
MAX_RECOMMENDATIONS = 18
# Position of the product name inside a "__"-split feature string.
NAME_SEGMENT_INDEX = 5

# Load the raw product records. The context manager closes the file handle as
# soon as parsing is done; JSON text is read as UTF-8 regardless of locale.
with open("./data_sets/meomart_products.json", encoding="utf-8") as json_file_pointer:
    product_np_data_set = np.array(json.load(json_file_pointer))

# feature builder
data_set = ProductAttributeRepresentor(product_np_data_set)
data_set.generateSingleCombinedFeature()

# numerical representation of the dataset: data_set.featureList
cv_func = CountVectorizer()
numerical_dataset_matrix = cv_func.fit_transform(data_set.featureList)

# similarity: pairwise cosine similarity between every pair of products
cosine_similar_product = cosine_similarity(numerical_dataset_matrix)

# user preference
product_user_ordered_baseCode = "KHADI-3007"

# get the index of this product from data_set.featureList -> (index, found)
product_user_ordered = data_set.getIndexFromFeatureList(product_user_ordered_baseCode)

if product_user_ordered[1]:
    ordered_index = product_user_ordered[0]
    print("ORDERED PRODUCT INDEX: ", ordered_index)
    print("ORDERD PRODUCT: ", data_set.featureList[ordered_index].split("__")[NAME_SEGMENT_INDEX])
    print("==========================================")

    similar_products = list(enumerate(cosine_similar_product[ordered_index]))
    # Drop the ordered product by index instead of assuming it sorts first:
    # a tie (or float rounding) with another product could otherwise discard
    # a genuine recommendation and list the ordered product itself.
    sorted_similar_products = sorted(
        (candidate for candidate in similar_products if candidate[0] != ordered_index),
        key=lambda candidate: candidate[1],
        reverse=True,
    )

    for counter, products in enumerate(sorted_similar_products[:MAX_RECOMMENDATIONS], start=1):
        print("PRODUCT[{}] : ".format(counter), data_set.featureList[products[0]].split("__")[NAME_SEGMENT_INDEX])



ORDERED PRODUCT INDEX:  5
ORDERD PRODUCT:  Fully Organic Weaving Khadi Cotton Saree - White Black
PRODUCT[1] :  Fully Organic Weaving Khadi Cotton Saree -White Black
PRODUCT[2] :  Fully Organic Weaving Khadi Cotton Saree - White Pink
PRODUCT[3] :  Fully Organic Weaving Khadi Cotton Saree - White Pink
PRODUCT[4] :  Fully Organic Weaving Khadi Cotton Saree -White Red Black
PRODUCT[5] :  Fully Organic Weaving Khadi Cotton Saree - Red Brown
PRODUCT[6] :  Fully Organic Weaving Khadi Cotton Saree - Black Orange Pink
PRODUCT[7] :  Handloom Khadi Cotton Elephant Motive Jamdani Saree -  White Red Black
PRODUCT[8] :  Fully Organic Weaving Khadi Cotton Saree -Black Red
PRODUCT[9] :  Handloom Khadi Cotton Ball Design - White Brown
PRODUCT[10] :  Handloom Khadi Cotton Ball Design - White Green
PRODUCT[11] :  Handloom Khadi Cotton Ball Design - White Blue
PRODUCT[12] :  Handloom Khadi Cotton Ball Design - White Red
PRODUCT[13] :  Handloom Khadi Cotton Tree Design - White Red Black
PRODUCT[14] :  Han