# Synthetic data 

In [18]:
import numpy as np
np.set_printoptions(suppress=True)
import matplotlib.pyplot as plt
import pickle
import os

In [19]:
# Configuration for the synthetic data set.
# SEED fixes the global NumPy random state so that "Restart & Run All"
# regenerates exactly the same products, recipes and purchases.
SEED = 42
np.random.seed(SEED)

# Sizes of the synthetic universe. `n_cusines` keeps its historical spelling
# because the rest of the notebook refers to it by that name.
n_products = 1000
n_cusines = 20

# Products and cuisines are identified by consecutive integer ids.
products = list(range(n_products))
cuisines = list(range(n_cusines))

In [20]:
# Each product gets a global relative frequency: a standard-normal draw
# shifted by +3.5 and clipped at 0. Some products are therefore globally
# infrequent and some globally frequent; e.g. products with a measure > 5
# can be regarded as frequent.
products_frequency = np.random.normal(size=n_products)
products_frequency = np.maximum(0, products_frequency + 3.5)

# Each cuisine has between 2 and 9 frequent products used in its recipes
# (the upper bound of np.random.randint is exclusive).
n_frequent_products_of_cusines = np.random.randint(2, 10, n_cusines)

# Ids of the frequent products of every cuisine, keyed by cuisine id.
# They are drawn WITHOUT replacement: drawing with replacement could list the
# same product twice for one cuisine, leaving it with fewer distinct frequent
# products than n_frequent_products_of_cusines says.
frequent_products_of_cuisines = {}

for cuisine in range(n_cusines):
    frequent_products_of_cuisines[cuisine] = np.random.choice(
        n_products,
        size=n_frequent_products_of_cusines[cuisine],
        replace=False,
    )

In [21]:
# Per-cuisine product frequencies, keyed by cuisine id.
# Every cuisine starts from the global frequencies; its frequent products are
# then overwritten so that together they weigh 0.2 x the summed frequency of
# the cuisine's non-frequent products (i.e. about one sixth of the cuisine's
# total, before the two boosts applied below).

frequency_of_products_in_cuisines = {}

for cuisine in cuisines:
    frequent = frequent_products_of_cuisines[cuisine]
    non_frequent = list(set(products) - set(frequent))

    non_frequent_sum_frequency = products_frequency[non_frequent].sum()
    sum_freq_of_freq_products = 0.2 * non_frequent_sum_frequency

    # Spread the target weight over the frequent products with +/-20% jitter.
    mean_of_freq = sum_freq_of_freq_products / len(frequent)
    freqent_items_frequency = mean_of_freq + np.random.uniform(
        -mean_of_freq / 5, mean_of_freq / 5, len(frequent)
    )

    # Make the first two frequent products stand out even more.
    # (Relies on every cuisine having at least two frequent products.)
    freqent_items_frequency[0] *= 2
    freqent_items_frequency[1] *= 1.5

    # Work on a COPY of the global frequencies. Assigning the array itself
    # would alias it, so the in-place update below would leak this cuisine's
    # frequent products into `products_frequency` and into every later cuisine.
    cuisine_frequency = products_frequency.copy()
    cuisine_frequency[frequent] = freqent_items_frequency

    # Frequencies are used as integer repetition counts later on;
    # astype(int) truncates towards zero.
    frequency_of_products_in_cuisines[cuisine] = cuisine_frequency.astype(int)
    

In [22]:
# Number of recipes of every cuisine: one integer per cuisine, drawn uniformly
# from 10 to 49 (the `high` bound of np.random.randint is exclusive).
number_of_recipes_per_cuisine = np.random.randint(low=10, high=50, size=n_cusines)
# number_of_recipes_per_cuisine

In [23]:
# Recipes of every cuisine.
# key: cuisine id; value: list of recipes, each recipe being a sorted 1-D array
# of distinct product ids (recipes may differ in length).
recipes_of_cuisine = {}

for cuisine in range(n_cusines):
    product_counts = frequency_of_products_in_cuisines[cuisine]

    # Sampling pool in which product `i` appears `product_counts[i]` times, so
    # a uniform draw from the pool picks products proportionally to frequency.
    helper = [
        product
        for product, count in enumerate(product_counts)
        for _ in range(int(count))
    ]

    # Each recipe draws 5 to 14 products from the pool (with replacement) and
    # keeps the distinct ones, so a recipe can end up shorter than the draw.
    recipes_of_cuisine[cuisine] = [
        np.unique(np.random.choice(helper, np.random.randint(5, 15)))
        for _ in range(number_of_recipes_per_cuisine[cuisine])
    ]


# Creating user purchases
### Each user has some dominating cuisine

In [24]:
# Users are identified by consecutive integer ids.
n_users = 1000
users = [*range(n_users)]

# Every user gets one dominant cuisine, drawn uniformly from all cuisines;
# the scenarios below pick most of a user's recipes from that cuisine.
dominant_cusine_for_user = np.random.randint(low=0, high=n_cusines, size=n_users)

## Each customer: 3 full recipes 

In [30]:
# Scenario: every user buys all products of 3 recipes from the dominant cuisine.
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    available_recipes = recipes_of_cuisine[dominant_cusine_for_user[user]]

    # np.random.choice samples with replacement, so the same recipe may be
    # picked more than once.
    chosen_recipes = np.random.choice(len(available_recipes), 3)

    needed_products = np.concatenate([available_recipes[el] for el in chosen_recipes])
    bought_products[user] = np.unique(needed_products)

data_folder = "../../data/artificial_data/3 recipes from preferred cuisine"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Each customer: 3 not full recipes  

In [31]:
# Scenario: every user buys only part of 3 recipes from the dominant cuisine.
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    available_recipes = recipes_of_cuisine[dominant_cusine_for_user[user]]

    # np.random.choice samples with replacement, so the same recipe may be
    # picked more than once.
    chosen_recipes = np.random.choice(len(available_recipes), 3)

    rec = []
    for el in chosen_recipes:
        recipe = available_recipes[el]
        # Keep the first 70% of the recipe (rounded down). Recipes are stored
        # sorted, so these are the products with the lowest ids.
        rec.append(recipe[: int(0.7 * len(recipe))])

    bought_products[user] = np.unique(np.concatenate(rec))

data_folder = "../../data/artificial_data/3 recipes from preferred cuisine not full"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Each customer: 2 recipes from his cuisine and 1 from other  

In [32]:
# Scenario: every user buys all products of 2 recipes from the dominant cuisine
# and of 1 recipe from another cuisine.
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    home_cuisine = dominant_cusine_for_user[user]

    # (cuisine id, number of recipes), in the order the recipes are drawn.
    # The "other" cuisine is the next cuisine id, wrapping around at the end.
    shopping_plan = (
        (home_cuisine, 2),
        ((home_cuisine + 1) % n_cusines, 1),
    )

    rec = []
    for source_cuisine, n_recipes in shopping_plan:
        source_recipes = recipes_of_cuisine[source_cuisine]
        # np.random.choice samples with replacement, so the same recipe may
        # be picked more than once.
        for num in np.random.choice(len(source_recipes), n_recipes):
            rec.append(source_recipes[num])

    bought_products[user] = np.unique(np.concatenate(rec))

data_folder = "../../data/artificial_data/2 from preferred cuisine 1 from other"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Each customer: 3 recipes from his cuisine and 2 from other  

In [33]:
# Scenario: every user buys all products of 3 recipes from the dominant cuisine
# and of 2 recipes from another cuisine.
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    home_cuisine = dominant_cusine_for_user[user]

    # (cuisine id, number of recipes), in the order the recipes are drawn.
    # The "other" cuisine is the next cuisine id, wrapping around at the end.
    shopping_plan = (
        (home_cuisine, 3),
        ((home_cuisine + 1) % n_cusines, 2),
    )

    rec = []
    for source_cuisine, n_recipes in shopping_plan:
        source_recipes = recipes_of_cuisine[source_cuisine]
        # np.random.choice samples with replacement, so the same recipe may
        # be picked more than once.
        for num in np.random.choice(len(source_recipes), n_recipes):
            rec.append(source_recipes[num])

    bought_products[user] = np.unique(np.concatenate(rec))

data_folder = "../../data/artificial_data/3 from preferred cuisine 2 from other"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Each customer: 2 recipes from his cuisine and 1 from other - not full

In [34]:
# Scenario: every user buys only part of 2 recipes from the dominant cuisine
# and of 1 recipe from another cuisine.
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    home_cuisine = dominant_cusine_for_user[user]

    # (cuisine id, number of recipes), in the order the recipes are drawn.
    # The "other" cuisine is the next cuisine id, wrapping around at the end.
    shopping_plan = (
        (home_cuisine, 2),
        ((home_cuisine + 1) % n_cusines, 1),
    )

    rec = []
    for source_cuisine, n_recipes in shopping_plan:
        source_recipes = recipes_of_cuisine[source_cuisine]
        # np.random.choice samples with replacement, so the same recipe may
        # be picked more than once.
        for num in np.random.choice(len(source_recipes), n_recipes):
            recipe = source_recipes[num]
            # Keep the first 70% of the recipe (rounded down). Recipes are
            # stored sorted, so these are the products with the lowest ids.
            rec.append(recipe[: int(0.7 * len(recipe))])

    bought_products[user] = np.unique(np.concatenate(rec))

data_folder = "../../data/artificial_data/2 from preferred cuisine 1 from other not full"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Each customer: 3 recipes from his cuisine and 2 from other - not full

In [35]:
# Scenario: every user buys only part of 3 recipes from the dominant cuisine
# and of 2 recipes from another cuisine.
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    home_cuisine = dominant_cusine_for_user[user]

    # (cuisine id, number of recipes), in the order the recipes are drawn.
    # The "other" cuisine is the next cuisine id, wrapping around at the end.
    shopping_plan = (
        (home_cuisine, 3),
        ((home_cuisine + 1) % n_cusines, 2),
    )

    rec = []
    for source_cuisine, n_recipes in shopping_plan:
        source_recipes = recipes_of_cuisine[source_cuisine]
        # np.random.choice samples with replacement, so the same recipe may
        # be picked more than once.
        for num in np.random.choice(len(source_recipes), n_recipes):
            recipe = source_recipes[num]
            # Keep the first 70% of the recipe (rounded down). Recipes are
            # stored sorted, so these are the products with the lowest ids.
            rec.append(recipe[: int(0.7 * len(recipe))])

    bought_products[user] = np.unique(np.concatenate(rec))

data_folder = "../../data/artificial_data/3 from preferred cuisine 2 from other not full"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)

## Each customer: 3 recipes from his cuisine and 6 from others - not full

In [36]:
# Scenario: every user buys only part of 3 recipes from the dominant cuisine
# and of 6 recipes from other cuisines (2 from each of the next 3 cuisines).
# key: user id; value: sorted array of distinct product ids bought by the user.
bought_products = {}

for user in users:
    home_cuisine = dominant_cusine_for_user[user]

    # (cuisine id, number of recipes, kept fraction of each recipe), in the
    # order the recipes are drawn. "Other" cuisines are the next cuisine ids,
    # wrapping around at the end.
    shopping_plan = (
        (home_cuisine, 3, 0.7),
        ((home_cuisine + 1) % n_cusines, 2, 0.7),
        ((home_cuisine + 2) % n_cusines, 2, 0.5),
        ((home_cuisine + 3) % n_cusines, 2, 0.5),
    )

    # Each cuisine contributes exactly once. The previous version appended the
    # (home + 2) cuisine's recipes twice, which had no effect on the result
    # because np.unique below drops the repeated products.
    rec = []
    for source_cuisine, n_recipes, kept_fraction in shopping_plan:
        source_recipes = recipes_of_cuisine[source_cuisine]
        # np.random.choice samples with replacement, so the same recipe may
        # be picked more than once.
        for num in np.random.choice(len(source_recipes), n_recipes):
            recipe = source_recipes[num]
            # Keep the leading part of the recipe (rounded down). Recipes are
            # stored sorted, so these are the products with the lowest ids.
            rec.append(recipe[: int(kept_fraction * len(recipe))])

    bought_products[user] = np.unique(np.concatenate(rec))

data_folder = "../../data/artificial_data/3 from preferred cuisine 6 from others not full"
# makedirs(exist_ok=True) creates missing parent folders and does not fail when
# the cell is re-run (os.mkdir raised FileExistsError on the second run).
os.makedirs(data_folder, exist_ok=True)

# Persist the purchases together with the ground truth needed to evaluate them.
for artifact_name, artifact in (
    ("bought_products", bought_products),
    ("dominant_cusine_for_user", dominant_cusine_for_user),
    ("recipes_of_cuisine", recipes_of_cuisine),
):
    with open(f'{data_folder}/{artifact_name}.p', 'wb') as fp:
        pickle.dump(artifact, fp, protocol=pickle.HIGHEST_PROTOCOL)