In [1]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.sparse import lil_matrix

We first load the data and process it to form the graphs.

In [2]:
# Read the full recipe dataset (a JSON document) into memory
data_file = 'data/recipes_with_nutritional_info_fixed_qty.json'
with open(data_file) as f:
    data = json.load(f)

In [3]:
# Display the first recipe record to inspect the fields available per recipe
data[0]

{'fsa_lights_per100g': {'fat': 'green',
  'salt': 'green',
  'saturates': 'green',
  'sugars': 'orange'},
 'id': '000095fc1d',
 'ingredients': [{'text': 'yogurt, greek, plain, nonfat'},
  {'text': 'strawberries, raw'},
  {'text': 'cereals ready-to-eat, granola, homemade'}],
 'instructions': [{'text': 'Layer all ingredients in a serving dish.'}],
 'nutr_per_ingredient': [{'fat': 0.8845044000000001,
   'nrg': 133.80964,
   'pro': 23.110512399999998,
   'sat': 0.26535132,
   'sod': 81.64656,
   'sug': 7.348190400000001},
  {'fat': 0.46,
   'nrg': 49.0,
   'pro': 1.02,
   'sat': 0.023,
   'sod': 2.0,
   'sug': 7.43},
  {'fat': 7.415,
   'nrg': 149.25,
   'pro': 4.17,
   'sat': 1.207,
   'sod': 8.0,
   'sug': 6.04}],
 'nutr_values_per100g': {'energy': 81.12946131894766,
  'fat': 2.140139263515891,
  'protein': 6.914436593565536,
  'salt': 0.05597816738985967,
  'saturates': 0.36534716195613937,
  'sugars': 5.08634103436144},
 'partition': 'train',
 'quantity': [{'text': '8'}, {'text': '1'},

In [4]:
# Count the occurrences of every full ingredient name, grouped by the first
# comma-separated component of the name (the ingredient class).
full_ingredient_names = defaultdict(dict)
for recipe in data:
    for ing in recipe['ingredients']:
        # Split once; the tuple of components is the key inside the class
        name_parts = tuple(ing['text'].split(','))
        variants = full_ingredient_names[name_parts[0]]
        variants[name_parts] = variants.get(name_parts, 0) + 1

To form the list of ingredients, we look for ingredient names that are specific enough. This means that we do not want a single ingredient name to be associated with too many recipes, nor with too few.

In [5]:
# Build trie for each ingredient class
class Trie:
    """Prefix tree over the comma-separated components of ingredient names.

    Each node stores the component it represents (``val``), the summed count
    of all names passing through it (``num``) and its child nodes keyed by the
    next name component (``next``).
    """

    def __init__(self, val, d):
        """Create the node ``val`` from ``d``, a mapping of name tuples to counts.

        The first element of every key in ``d`` is this node's own component;
        the remaining elements describe the path below the node.
        """
        self._build_trie(val, d)

    def _build_trie(self, val, d):
        """Populate ``val``, ``num`` and ``next`` (recursively) from ``d``.

        Returns the node itself.
        """
        # Set every attribute up front so that a node built from an empty
        # mapping is a valid leaf instead of a half-initialised object.
        self.val = val
        self.next = {}
        if len(d) == 0:
            self.num = 0
            return self
        self.num = np.sum([d[k] for k in d])
        # Form next level: group the longer names by their second component,
        # dropping the first component from each key.
        children = defaultdict(dict)
        for k in d:
            if len(k) > 1:
                children[k[1]][k[1:]] = d[k]
        for k in children:
            self.next[k] = Trie(k, children[k])
        return self

    def relevant_ingredients(self, thresh, name, res):
        """Collect the ingredient names selected from this subtree into ``res``.

        ``name`` is the list of components leading to this node; it is extended
        in place with this node's component. A ``(name, num)`` pair is appended
        to ``res`` for this node when it has at most one child or when none of
        its children has a count above ``thresh``; otherwise the search
        continues in every child.
        """
        name.append(self.val)
        if len(self.next) <= 1:
            res.append((name, self.num))
            return
        # If all of the children have value below thresh, stop
        children_num = [self.next[child].num for child in self.next]
        if np.max(children_num) <= thresh:
            res.append((name, self.num))
            return
        # NOTE: names that end exactly at this node belong to no child, so they
        # are not reported once the search descends.
        for child in self.next:
            # Each branch extends its own copy of the prefix
            new_name = name.copy()
            self.next[child].relevant_ingredients(thresh, new_name, res)

In [6]:
# Build one trie per ingredient class and keep the names it selects
thresh = len(data) / 10
ingredients_list = []
for ing, variants in full_ingredient_names.items():
    t = Trie(ing, variants)
    res = []
    t.relevant_ingredients(thresh, [], res)
    ingredients_list += res
# Map each selected name (its components concatenated) to its count
ingredients = {''.join(name_parts): count for name_parts, count in ingredients_list}

When associating ingredients to the recipes, the ones that appear in more than 10% of the recipes are considered common ingredients. These will not be taken into account when constructing the graph, but will be kept as features.

In [7]:
# Associate ids to recipes
# An ingredient counted in more than this many recipes is treated as common
# and is left out of the per-recipe ingredient list.
recipe_frequency_thresh = len(data) / 10
recipe_names = [recipe['title'] for recipe in data]
recipe_ingredients = []
for recipe in data:
    ings_list = [ing['text'].split(',') for ing in recipe['ingredients']]
    ings = []
    for ing in ings_list:
        # Grow the name one component at a time and stop at the shortest
        # prefix that is a known ingredient. Iterating over the components
        # themselves (rather than indexing one past the end) means a name with
        # no known prefix is skipped instead of raising IndexError.
        ing_name = ''
        for part in ing:
            ing_name += part
            if ing_name in ingredients:
                if ingredients[ing_name] <= recipe_frequency_thresh:
                    ings.append(ing_name)
                break
    recipe_ingredients.append(ings)
recipe_df = pd.DataFrame({'name': recipe_names, 'ingredients': recipe_ingredients})

In [8]:
# Group recipes by ingredients
# Maps each ingredient name to the list of recipe ids that use it.
recipes_by_ingredients = defaultdict(list)
# Enumerate the column positionally: the recipe id is the row position, so
# this avoids feeding index labels to a positional indexer and avoids
# materialising a whole row for every recipe.
for ind, ings in enumerate(recipe_df['ingredients']):
    for ing in ings:
        recipes_by_ingredients[ing].append(ind)

The graph generation will be done in C++ due to memory and time constraints.

In [10]:
# Serialise the ingredient -> recipes mapping as text.
# First line: "<number of recipes> <number of ingredients>"; then, for every
# ingredient, one line with the number of recipes followed by one line with
# the space-separated recipe ids.
chunks = [f'{len(data)} {len(recipes_by_ingredients)}\n']
for ing, recipe_ids in recipes_by_ingredients.items():
    chunks.append(f"{len(recipe_ids)}\n{' '.join(map(str, recipe_ids))}\n")
str_data = ''.join(chunks)

with open('data/recipes_by_ingredients.txt', 'w') as f:
    f.write(str_data)

We obtained adjacency lists, which we would like to convert to a sparse matrix. However, the amount of data is very large (a 2 GB text file of edges) and it may not fit into memory. We therefore need to perform some sampling of the edges.

In [11]:
recipe_graph_file = 'data/recipe_graph.txt'

# Gather the value of every edge in the graph file, reading one line per
# recipe in `data`.
edge_values = []
with open(recipe_graph_file, 'r') as f:
    for _ in range(len(data)):
        # The first and last space-separated tokens of a line are not edges
        tokens = f.readline().split(' ')
        # Strip the enclosing characters of each edge token and keep its
        # second comma-separated field as the edge value.
        edge_values.extend(
            [float(token[1:-1].split(',')[1]) for token in tokens[1:-1]]
        )

In [17]:
# Summarise the edge values. `perc10` holds the 90th percentile, i.e. the
# value that only the top 10% of edges exceed.
perc10 = np.percentile(edge_values, 90)
n_edges = len(edge_values)
smallest_edge, largest_edge = np.min(edge_values), np.max(edge_values)
print(f'The number of edges is {n_edges}.')
print(f'The smallest edge value is {smallest_edge}.')
print(f'The largest edge value is {largest_edge}.')
print(f'The 90th percentile of the edge values is {perc10}.')

The number of edges is 126549982.
The smallest edge value is 0.151142.
The largest edge value is 15.6836.
The 90th percentile of the edge values is 1.64314.


Let's then sample the 10% most significant edges.