In [48]:
import pandas as pd
import numpy as np
import pickle
import re
import matplotlib.pyplot as plt

# Do we want to classify each user from only one purchase?

In [49]:
# Selects which basket pickle is loaded further down: True reads the
# "*_one_purchase_text.p" file, False reads "user_to_products_text.p".
one_purchase = False

In [50]:
# Alternative similarity scorer from py_stringmatching. In the visible cells
# `gj` is only referenced by a commented-out line in the utility-matrix loop.
from py_stringmatching import GeneralizedJaccard
gj = GeneralizedJaccard(threshold=0.8)

In [51]:
def JaccardSim(customer, recipe):
    """Return the share of a recipe's ingredients found in a customer's basket.

    Despite the name this is not the symmetric Jaccard index: the size of the
    set overlap is divided by ``len(recipe)`` only, so the score measures how
    much of the recipe the customer covers. Repeated entries in ``recipe``
    count towards the denominator but only once towards the overlap.

    Parameters
    ----------
    customer : iterable of hashable
        Products bought by the customer.
    recipe : sized iterable of hashable
        Ingredients of the recipe.

    Returns
    -------
    float or int
        ``len(set(customer) & set(recipe)) / len(recipe)``, or ``0`` when the
        score cannot be computed (empty recipe, ``None``, or otherwise
        non-iterable / unhashable input).
    """
    try:
        return len(set(customer) & set(recipe)) / len(recipe)
    except (TypeError, ZeroDivisionError):
        # Best-effort scoring: unusable input simply scores 0. Only these two
        # errors are caught so that KeyboardInterrupt / SystemExit still stop
        # the long scoring loop instead of being silently turned into zeros.
        return 0

In [52]:
# Recipe table matched to the Instacart product vocabulary.
recipes_csv = "../../data/instacart/recipes_cropped.csv"

# Kaggle groceries alternative:
#recipes_csv = "../../data/groceries/recipes_cropped.csv"

# "Unnamed: 0" is discarded right after loading (presumably an index column
# saved with the CSV -- verify against the file).
recipes = pd.read_csv(recipes_csv).drop(columns=["Unnamed: 0"])

In [53]:
# Randomly discard a fixed number of recipes before building the dense
# user x recipe matrix. The draw uses a local, seeded generator so that
# Restart & Run All always keeps the same subset of recipes.
# NOTE: `recipes` is overwritten here, so re-running this cell without
# reloading the CSV drops a further batch (and fails once fewer than
# N_RECIPES_TO_DROP rows remain).
N_RECIPES_TO_DROP = 38000
SAMPLING_SEED = 0

drop_indices = np.random.RandomState(SAMPLING_SEED).choice(
    recipes.index, N_RECIPES_TO_DROP, replace=False
)
recipes = recipes.drop(drop_indices)

n_recipes = len(recipes)
n_recipes

1757

In [54]:
# Order recipes by cuisine so that each cuisine occupies one contiguous run of
# rows; the per-cuisine slicing of `vh` further down relies on this layout.
recipes = (
    recipes
    .sort_values(by="cuisine")
    .reset_index(drop=True)
)

# groupby sorts its keys, so the counts come out in the same cuisine order as
# the rows above.
cuisine_counts = recipes.groupby("cuisine").size()
n_cuisines = len(cuisine_counts)
print(cuisine_counts)

cuisine
brazilian        32
british          50
cajun_creole     77
chinese         108
filipino         42
french          121
greek            45
indian          126
irish            32
italian         314
jamaican         18
japanese         55
korean           38
mexican         286
moroccan         54
russian          20
southern_us     192
spanish          49
thai             58
vietnamese       40
dtype: int64


In [55]:
# instacart: pick the basket file according to `one_purchase`
if one_purchase:
    products_path = '../../data/instacart/user_to_products_one_purchase_text.p'
else:
    products_path = '../../data/instacart/user_to_products_text.p'

# kaggle
#products_path = '../../data/groceries/user_to_products_one_purchase_text.p'

# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles produced by this project.
with open(products_path, 'rb') as fp:
    user_to_products = pickle.load(fp)

In [58]:
# Number of users scored; the fill loop reads user ids 1..n_users from
# `user_to_products`.
n_users = 5000
# One row per user, one column per recipe (in the current row order of
# `recipes`); filled with similarity scores by the next cell.
utility_matrix = np.zeros((n_users, n_recipes))

In [59]:
# Fill utility_matrix[user, recipe] with the share of the recipe's ingredients
# that appear in the user's products (see JaccardSim).

# Tokenise every recipe once up front: the ingredient lists do not depend on
# the user, so splitting them inside the user loop repeated the same pandas
# lookup and str.split for each of the n_users rows.
recipe_ingredient_lists = [ingredients.split() for ingredients in recipes['ingredients']]

for user in range(n_users):
    # user_to_products is looked up with 1-based user ids.
    user_products = list(user_to_products[user + 1])

    for recipe_number, recipe_ingredients in enumerate(recipe_ingredient_lists):
        # Alternative scorer (inputs must first be converted to lists of str):
        #   gj.get_sim_score(user_products, recipe_ingredients)
        utility_matrix[user, recipe_number] = JaccardSim(user_products, recipe_ingredients)

In [60]:
# Reduced SVD of the user x recipe matrix: rows of `u` correspond to users,
# columns of `vh` to recipes, and `s` holds the singular values.
u, s, vh = np.linalg.svd(utility_matrix, full_matrices=False)

In [61]:
# Number of latent concepts to keep. 50 was selected experimentally for the
# situation where the basket is created from one purchase.
k = 50

# Truncate the decomposition to its first k components: k singular values,
# k columns of `u` and k rows of `vh`.
u, s, vh = u[:, :k], s[:k], vh[:k]

In [62]:
def normalize(v):
    """Scale a vector so that its L1 norm (sum of absolute values) is 1.

    Parameters
    ----------
    v : array_like
        1-D vector. Integer and other non-floating inputs (including plain
        lists) are converted to float64 first; floating inputs keep their
        dtype.

    Returns
    -------
    numpy.ndarray
        ``v / ||v||_1``. An all-zero vector is divided by the machine epsilon
        of its dtype instead of by zero, so it comes back as all zeros.
    """
    v = np.asarray(v)
    if not np.issubdtype(v.dtype, np.inexact):
        # np.finfo below only accepts inexact dtypes; without this an all-zero
        # integer vector raised ValueError instead of returning zeros.
        v = v.astype(float)

    norm = np.linalg.norm(v, ord=1)
    if norm == 0:
        # Avoid 0/0 -> nan for the zero vector.
        norm = np.finfo(v.dtype).eps
    return v / norm

In [63]:
# Build one concept profile per cuisine: sum the columns of `vh` belonging to
# that cuisine's recipes, then L1-normalize the result.
# Cuisines are keyed by their position in `cuisine_counts`; their recipes sit
# in consecutive columns because `recipes` was sorted by cuisine.
counter = 0
cuisine_to_concepts = {}

for cuisine in range(n_cuisines):
    # `cuisine_counts` is indexed by cuisine name, so the position must be
    # resolved with .iloc; plain [int] relied on a positional fallback that
    # pandas has deprecated.
    n_recipes_in_cuisine = cuisine_counts.iloc[cuisine]

    vh_fragment = vh[:, counter: counter + n_recipes_in_cuisine]
    counter += n_recipes_in_cuisine

    cuisine_to_concepts[cuisine] = normalize(np.sum(vh_fragment, axis=1))

In [64]:
# For every user, measure the L1 distance between the user's normalized
# concept vector and each cuisine's concept profile, then keep the three
# closest cuisines. The commented-out filters below print the products of
# users whose nearest cuisine is clearly ahead of the runner-up; the cuisine
# positions follow the sorted listing printed earlier (7 = indian,
# 9 = italian, 13 = mexican).
for user in range(n_users):
    user_concepts = normalize(u[user])

    distance_to_cusines = np.array(
        [
            np.abs(user_concepts - cuisine_to_concepts[cuisine]).sum()
            for cuisine in range(n_cuisines)
        ],
        dtype=float,
    )

    # Positions of the three smallest distances, nearest first.
    top_cuisines = np.argsort(distance_to_cusines)[:3]

    first_dist = distance_to_cusines[top_cuisines[0]]
    second_dist = distance_to_cusines[top_cuisines[1]]
    third_dist = distance_to_cusines[top_cuisines[2]]

    # INSTACART

    # ONE PURCHASE
    # print italian products
    #if top_cuisines[0] == 9 and first_dist < 0.9 * second_dist:
    #    print(user_to_products[user+1])

    # print mexican products
    #if top_cuisines[0] == 13 and first_dist < 0.85 * second_dist:
    #    print(user_to_products[user+1])

    # print indian products
    #if top_cuisines[0] == 7 and first_dist < 0.8 * second_dist:
    #    print(user_to_products[user+1])

    # MANY PURCHASES
    # print italian products
    #if top_cuisines[0] == 9 and first_dist < 0.87 * second_dist and len(user_to_products[user+1])<40:
    #    print( user_to_products[user+1])

    # print mexican products
    #if top_cuisines[0] == 13 and first_dist < 0.9 * second_dist and len(user_to_products[user+1])<40:
    #    print(user_to_products[user+1])

    # print indian products
    #if top_cuisines[0] == 7 and first_dist < 0.9 * second_dist and len(user_to_products[user+1])<40:
    #    print(user_to_products[user+1])

    # KAGGLE

    # print italian products
    #if top_cuisines[0] == 9 and first_dist < 0.9 * second_dist:
    #    print(user_to_products[user+1])

    # print mexican products
    #if top_cuisines[0] == 13 and first_dist < 0.9 * second_dist:
    #    print(user_to_products[user+1])

    # print indian products
    #if top_cuisines[0] == 7 and first_dist < 0.9 * second_dist:
    #    print(user_to_products[user+1])
    