In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import scipy.spatial

def openDataSet(data_folder):
    """Load the three pickled artefacts of one artificial data set.

    Parameters
    ----------
    data_folder : str
        Name of the sub-folder below ``../../data/artificial_data/`` that
        holds the pickle files.

    Returns
    -------
    tuple
        ``(products, cuisine, recipes)`` -- the unpickled contents of
        ``bought_products.p``, ``dominant_cusine_for_user.p`` and
        ``recipes_of_cuisine.p``, in that order.
    """
    # Paths are listed in the order the results are returned.
    pickle_paths = (
        f'../../data/artificial_data/{data_folder}/bought_products.p',
        f'../../data/artificial_data/{data_folder}/dominant_cusine_for_user.p',
        f'../../data/artificial_data/{data_folder}/recipes_of_cuisine.p',
    )

    loaded = []
    for pickle_path in pickle_paths:
        with open(pickle_path, 'rb') as filehandle:
            # NOTE(review): pickle.load executes arbitrary code embedded in the
            # file -- only point this at data files you generated yourself.
            loaded.append(pickle.load(filehandle))

    products, cuisine, recipes = loaded
    return products, cuisine, recipes

In [2]:
# Global sizes shared by the cells below.
# presumably these mirror the settings used to generate the artificial data -- TODO confirm
n_products = 1_000  # not referenced by the code visible in this notebook
n_cusines = 20      # cuisine ids are iterated as range(n_cusines)
n_users = 1_000     # user ids are iterated as range(n_users)

def JaccardSim(customer, recipe):
    """Return the share of ``recipe`` that is covered by ``customer``.

    Despite the name this is not the symmetric Jaccard index
    (``|A & B| / |A | B|``): the number of distinct items common to both
    collections is divided by ``len(recipe)`` only.

    Parameters
    ----------
    customer : iterable of hashable
        Items associated with the customer.
    recipe : sized iterable of hashable
        Items of the recipe. Duplicates count towards the denominator
        (``len(recipe)``) but only once towards the numerator.

    Returns
    -------
    float
        Overlap count divided by ``len(recipe)``; ``0.0`` when ``recipe`` is
        empty (an empty recipe cannot be covered, and this avoids a
        ``ZeroDivisionError``).
    """
    recipe_size = len(recipe)
    if recipe_size == 0:
        return 0.0

    return len(set(customer) & set(recipe)) / recipe_size

def normalize(v):
    """Scale ``v`` so that its L1 norm equals one.

    Parameters
    ----------
    v : array_like
        Vector to scale. Lists and integer/boolean arrays are accepted and
        promoted to float; floating-point arrays keep their dtype.
        For a 2-D input ``np.linalg.norm(..., ord=1)`` is the matrix 1-norm
        (maximum absolute column sum), not the sum of all absolute entries.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L1 norm. When the norm is zero the divisor is the
        machine epsilon of the array's dtype, so an all-zero input is returned
        as zeros instead of NaNs.
    """
    v = np.asarray(v)
    if not np.issubdtype(v.dtype, np.inexact):
        # np.finfo below only accepts inexact dtypes; promote ints/bools so a
        # zero vector of integers no longer raises ValueError.
        v = v.astype(float)

    norm = np.linalg.norm(v, ord=1)
    if norm == 0:
        # Avoid 0/0: dividing zeros by eps still yields zeros.
        norm = np.finfo(v.dtype).eps
    return v / norm

In [3]:
def accuracy_of_SVD_method(cut_values, users_set, products, cuisine, recipes):
    """Score cuisine prediction based on a truncated SVD of the user x recipe matrix.

    A utility matrix with one row per user (``range(n_users)``) and one column
    per recipe is filled with ``JaccardSim(bought products, recipe)``. Its SVD
    is truncated to the ``cut_values`` leading components. Each cuisine is
    represented by the L1-normalised sum of the ``vh`` columns of its recipes,
    each user by the L1-normalised row of ``u``. Cuisines are ranked per user
    by cosine distance between the two representations.

    Relies on the module-level names ``n_users``, ``n_cusines``,
    ``JaccardSim`` and ``normalize``.

    Parameters
    ----------
    cut_values : int
        Number of leading singular values/vectors that are kept.
    users_set : iterable of int
        User indices to evaluate (used to index ``u`` and ``cuisine``).
    products : indexable
        ``products[user]`` is passed to ``JaccardSim`` as the customer's items
        for every ``user`` in ``range(n_users)``.
    cuisine : indexable
        ``cuisine[user]`` is the label compared against the predicted cuisine
        index.
    recipes : indexable
        ``recipes[c]`` is the sized collection of recipes of cuisine ``c`` for
        every ``c`` in ``range(n_cusines)``.

    Returns
    -------
    tuple of float
        ``(perfect_match_rate, approximate_match_rate)``: the fraction of
        evaluated users whose label is the closest cuisine, and the fraction
        whose label is among the three closest cuisines. ``(0.0, 0.0)`` is
        returned when ``users_set`` is empty.
    """
    bought_products, dominant_cusine_for_user, recipes_of_cuisine = products, cuisine, recipes

    # Flatten the recipes cuisine by cuisine. The resulting column order is
    # what lets us slice `vh` into contiguous per-cuisine fragments below.
    all_recipes = []
    recipes_per_cuisine = []
    for cuisine_id in range(n_cusines):
        cuisine_recipes = recipes_of_cuisine[cuisine_id]
        recipes_per_cuisine.append(len(cuisine_recipes))
        all_recipes.extend(cuisine_recipes)

    utility_matrix = np.zeros((n_users, len(all_recipes)))
    for user in range(n_users):
        user_products = bought_products[user]
        for column, recipe in enumerate(all_recipes):
            utility_matrix[user, column] = JaccardSim(user_products, recipe)

    u, s, vh = np.linalg.svd(utility_matrix, full_matrices=False)

    # Keep only the leading `cut_values` concepts.
    s = s[:cut_values]
    u = u[:, :cut_values]
    vh = vh[:cut_values]

    # Concept profile of a cuisine: sum of its recipes' columns in `vh`.
    cuisine_to_concepts = {}
    first_column = 0
    for cuisine_id, n_recipes in enumerate(recipes_per_cuisine):
        last_column = first_column + n_recipes
        vh_fragment = vh[:, first_column:last_column]
        cuisine_to_concepts[cuisine_id] = normalize(np.sum(vh_fragment, axis=1))
        first_column = last_column

    perfect_matches = 0
    approximate_matches = 0
    evaluated_users = 0

    for user in users_set:
        user_concepts = normalize(u[user])
        true_cuisine = dominant_cusine_for_user[user]

        distance_to_cusines = np.zeros(n_cusines)
        for cuisine_id in range(n_cusines):
            distance_to_cusines[cuisine_id] = scipy.spatial.distance.cosine(
                user_concepts, cuisine_to_concepts[cuisine_id]
            )

        evaluated_users += 1

        # Three closest cuisines, nearest first.
        top_cuisines = np.argsort(distance_to_cusines)[:3]

        if top_cuisines[0] == true_cuisine:
            perfect_matches += 1

        if true_cuisine in top_cuisines:
            approximate_matches += 1

    if evaluated_users == 0:
        # Nothing to score; avoid dividing by zero.
        return (0.0, 0.0)

    return (perfect_matches / evaluated_users, approximate_matches / evaluated_users)

In [4]:
# Sub-folders of the artificial data directory that are evaluated in turn.
data_folders = [
    "3 recipes from preferred cuisine",
    "3 recipes from preferred cuisine not full",
    "2 from preferred cuisine 1 from other",
    "3 from preferred cuisine 2 from other",
    "2 from preferred cuisine 1 from other not full",
    "3 from preferred cuisine 2 from other not full",
    "3 from preferred cuisine 6 from others not full",
]

# The first half of the users picks k, the second half reports the score.
validation_set = range(0, n_users // 2)
test_set = range(n_users // 2, n_users)

# Candidate numbers of retained singular values: 2, 4, ..., 512.
ks = [2 ** exponent for exponent in range(1, 10)]

for folder in data_folders:
    print(folder)
    bought_products, dominant_cusine_for_user, recipes_of_cuisine = openDataSet(folder)

    # Model selection uses the second returned rate (label among the top three).
    accuracies = [
        accuracy_of_SVD_method(
            k, validation_set, bought_products, dominant_cusine_for_user, recipes_of_cuisine
        )[1]
        for k in ks
    ]

    # np.argmax returns the first maximum, so ties go to the smallest k.
    best_k = ks[np.argmax(accuracies)]

    print(
        accuracy_of_SVD_method(
            best_k, test_set, bought_products, dominant_cusine_for_user, recipes_of_cuisine
        )
    )

3 recipes from preferred cuisine
(0.968, 0.998)
3 recipes from preferred cuisine not full
(0.922, 0.984)
2 from preferred cuisine 1 from other
(0.608, 0.972)
3 from preferred cuisine 2 from other
(0.484, 0.978)
2 from preferred cuisine 1 from other not full
(0.56, 0.908)
3 from preferred cuisine 2 from other not full
(0.492, 0.96)
3 from preferred cuisine 6 from others not full
(0.444, 0.734)
