# SimSim

In [1]:
import json
import math
import networkx as nx
import community
from collections import defaultdict, Counter
from ipysigma import Sigma
from fog.metrics import sparse_dot_product
from tqdm import tqdm_notebook

## Loading the recipes

In [2]:
# Load the recipe collection. JSON text is UTF-8 by specification, so the
# encoding is given explicitly instead of relying on the platform default,
# which is not UTF-8 everywhere and would mis-decode non-ASCII characters.
with open('./recipes.json', 'r', encoding='utf-8') as f:
    RECIPES = json.load(f)

# Preview: print the ingredient list of the first two recipes.
for number, recipe in enumerate(RECIPES[:2], start=1):
    title = 'Recipe n°%i' % number
    print(title)
    print('-' * len(title))

    for ingredient in recipe['ingredients']:
        print(ingredient)

    print()
print('...')

Recipe n°1
----------
romaine lettuce
black olives
grape tomatoes
garlic
pepper
purple onion
seasoning
garbanzo beans
feta cheese crumbles

Recipe n°2
----------
plain flour
ground pepper
salt
tomatoes
ground black pepper
thyme
eggs
green tomatoes
yellow corn meal
milk
vegetable oil

...


## Computing the similarity matrix

In [3]:
# Co-occurrence counting.
#   VECTORS[a][b]  -> how many times a and b were paired within a recipe
#   OCCURRENCES[a] -> how many times a was listed across all recipes
VECTORS = defaultdict(Counter)
OCCURRENCES = Counter()

for recipe in RECIPES:
    ingredients = recipe['ingredients']

    for position, A in enumerate(ingredients):
        OCCURRENCES[A] += 1

        # Pair A with every ingredient listed after it, so each unordered pair
        # of positions is visited once, then mirror the count in both vectors.
        for B in ingredients[position + 1:]:
            VECTORS[A][B] += 1
            VECTORS[B][A] += 1

print(VECTORS['milk'].most_common(5))
print(OCCURRENCES['milk'])
print(len(VECTORS))

[('salt', 1354), ('butter', 835), ('all-purpose flour', 791), ('eggs', 734), ('sugar', 598)]
2263
6714


In [4]:
# Euclidean (L2) norm of each ingredient's co-occurrence vector.
NORMS = {
    ingredient: math.sqrt(sum(count * count for count in vector.values()))
    for ingredient, vector in VECTORS.items()
}

NORMS['milk']

2372.3218584332103

In [5]:
# Cosine similarity between every pair of ingredients that co-occur at least
# once: MATRIX[a][b] == MATRIX[b][a] == dot(VECTORS[a], VECTORS[b]) / (|a| * |b|).
MATRIX = defaultdict(dict)

for kA, A in tqdm_notebook(VECTORS.items()):
    nA = NORMS[kA]
    row = MATRIX[kA]

    for kB in A:
        # Co-occurrence counts are mirrored, so each pair is reached from both
        # of its endpoints. The score stored on the first visit already fills
        # both directions; skip the redundant second computation.
        if kB in row:
            continue

        B = VECTORS[kB]
        nB = NORMS[kB]
        s = sparse_dot_product(A, B) / (nA * nB)

        row[kB] = s
        MATRIX[kB][kA] = s

HBox(children=(IntProgress(value=0, max=6714), HTML(value='')))




In [6]:
# Sanity check: display the similarity row stored for 'milk', i.e. the mapping
# from each ingredient paired with 'milk' in MATRIX to its cosine score.
MATRIX['milk']

{'romaine lettuce': 0.5134933486688922,
 'black olives': 0.533048360905154,
 'grape tomatoes': 0.5611005584053758,
 'garlic': 0.683404109215867,
 'pepper': 0.7637984053377127,
 'purple onion': 0.5922018304105658,
 'seasoning': 0.7089089720897498,
 'garbanzo beans': 0.5659175741891418,
 'feta cheese crumbles': 0.4926559481988714,
 'plain flour': 0.8179936460180105,
 'ground pepper': 0.6819179548190523,
 'salt': 0.6285361460756537,
 'tomatoes': 0.6565334785735949,
 'ground black pepper': 0.720209028576445,
 'thyme': 0.6817052055189198,
 'eggs': 0.8716422724926906,
 'green tomatoes': 0.7749376761979361,
 'yellow corn meal': 0.8513965277198132,
 'vegetable oil': 0.7594391139084048,
 'black pepper': 0.7400020800190426,
 'shallots': 0.685053065968898,
 'cornflour': 0.48359074820275616,
 'cayenne pepper': 0.7056420388327814,
 'onions': 0.6955679894301807,
 'garlic paste': 0.4861823467246021,
 'butter': 0.8663688396858132,
 'lemon juice': 0.7940761667055567,
 'water': 0.7867581626842027,
 'chi

## Computing the macro similarity graph

In [7]:
# Minimum cosine similarity for two ingredients to be linked in the macro graph
STARTING_THRESHOLD = 0.83

# Iwanthue palette: one color for each of the ten first communities
PALETTE = [
    '#c9587a', '#5dba5a',
    '#b65cbf', '#a3af44',
    '#747bc8', '#d49d43',
    '#4fbab7', '#d35238',
    '#588042', '#ac6f40',
]

In [8]:
# Macro graph: link two ingredients when their cosine similarity reaches
# STARTING_THRESHOLD; the score is kept on the edge as the `cosine` attribute.
similarity_graph = nx.Graph()

similarity_graph.add_edges_from(
    (source, target, {'cosine': score})
    for source, row in MATRIX.items()
    for target, score in row.items()
    if score >= STARTING_THRESHOLD
)

len(similarity_graph)

1319

In [9]:
# Restrict the graph to its largest connected component
largest_component = set(max(nx.connected_components(similarity_graph), key=len))

# Collect the nodes to drop first so the graph is not mutated while iterated
outsiders = [node for node in similarity_graph.nodes if node not in largest_component]
similarity_graph.remove_nodes_from(outsiders)

In [10]:
# Community detection using the Louvain method
# NOTE(review): no `weight` / `random_state` argument is passed, so the call
# relies on the library defaults — confirm that ignoring the `cosine` edge
# attribute and possibly getting a different partition per run is intended.
partition = community.best_partition(similarity_graph)

# Number of nodes in each community
communities_count = Counter(partition.values())

# The (at most) ten largest communities each get a palette color, biggest first
communities_colors = {
    label: color
    for (label, _), color in zip(communities_count.most_common(10), PALETTE)
}

In [62]:
# Styling: node size grows with the ingredient's occurrence count (clamped to
# the [2, 12] range); node color follows the community, with a grey fallback
# for communities that received no palette color.
ingredient_count = len(OCCURRENCES)

for node, attrs in similarity_graph.nodes(data=True):
    scaled = OCCURRENCES[node] / ingredient_count * 6
    attrs['size'] = min(12, max(2, scaled))
    attrs['color'] = communities_colors.get(partition[node], '#BBB')

# Display
Sigma(similarity_graph, height=1000)

Sigma(data={'nodes': [('romaine lettuce', {'size': 2, 'color': '#c9587a'}), ('grape tomatoes', {'size': 2, 'co…

## Computing the micro similarity graphs

In [17]:
# One set of ingredients per detected community, stored at the position given
# by the community id (assumes ids are consecutive integers starting at 0 —
# TODO confirm against the Louvain implementation in use).
#
# The loop variable is deliberately not called `community`: that name is the
# imported Louvain module, and rebinding it here would break any later re-run
# of `community.best_partition(...)` in the same kernel.
communities = [set() for _ in range(len(communities_count))]

for node, community_id in partition.items():
    communities[community_id].add(node)

In [19]:
# Restrict every co-occurrence vector to the ingredients that belong to the
# same community as its owner.
#
# The loop variable is named `members` rather than `community` so that the
# imported `community` (Louvain) module is not overwritten by a set.
SUBVECTORS = {}

for members in communities:
    for item in members:
        SUBVECTORS[item] = Counter({
            key: count
            for key, count in VECTORS[item].items()
            if key in members
        })

SUBVECTORS['milk']

Counter({'plain flour': 41,
         'eggs': 734,
         'green tomatoes': 13,
         'yellow corn meal': 58,
         'butter': 835,
         'sugar': 598,
         'ground cinnamon': 142,
         'vanilla extract': 247,
         'powdered sugar': 91,
         'baking powder': 389,
         'melted butter': 79,
         'white sugar': 180,
         'all-purpose flour': 791,
         'chopped walnuts': 16,
         'vanilla': 77,
         'large egg yolks': 59,
         'confectioners sugar': 68,
         'softened butter': 15,
         'buttermilk': 63,
         'granulated sugar': 98,
         'vegetable shortening': 22,
         'egg yolks': 139,
         'flour': 308,
         'lard': 14,
         'cream cheese': 70,
         'shortening': 68,
         'unsalted butter': 322,
         'jam': 7,
         'baking soda': 122,
         'dried currants': 8,
         'large eggs': 365,
         'slivered almonds': 13,
         'orange zest': 15,
         'boiling water': 20,
       

In [20]:
# Euclidean (L2) norm of each community-restricted co-occurrence vector.
SUBNORMS = {
    ingredient: math.sqrt(sum(count * count for count in vector.values()))
    for ingredient, vector in SUBVECTORS.items()
}

SUBNORMS['milk']

1729.1422729203055

In [24]:
# Per-community cosine similarity: MATRICES[k][a][b] is the cosine between the
# community-restricted vectors of a and b, for every pair of members of
# community k.
#
# The loop variable is named `members` rather than `community` so that the
# imported `community` (Louvain) module is not overwritten.
MATRICES = []

for members in tqdm_notebook(communities):
    m = defaultdict(dict)
    MATRICES.append(m)

    # Sets are not indexable: freeze the members into a list so that every
    # unordered pair can be enumerated exactly once by position.
    ordered = list(members)

    for i, kA in enumerate(ordered):
        nA = SUBNORMS[kA]
        A = SUBVECTORS[kA]

        for j in range(i + 1, len(ordered)):
            kB = ordered[j]
            nB = SUBNORMS[kB]
            B = SUBVECTORS[kB]

            denominator = nA * nB

            # An ingredient whose restricted vector is empty has a zero norm;
            # its cosine is undefined, so score the pair 0.0 instead of
            # raising ZeroDivisionError.
            s = sparse_dot_product(A, B) / denominator if denominator else 0.0

            m[kA][kB] = s
            m[kB][kA] = s

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [45]:
# Minimum cosine similarity for an edge inside a community graph
SUB_THRESHOLD = 0.85

# One graph per community, built from the entry of MATRICES at the same index
similarity_graphs = []

for index in range(len(communities)):
    sub_matrix = MATRICES[index]
    g = nx.Graph()
    similarity_graphs.append(g)

    g.add_edges_from(
        (source, target, {'cosine': score})
        for source, row in sub_matrix.items()
        for target, score in row.items()
        if score >= SUB_THRESHOLD
    )

In [60]:
# Inspect the micro graph stored at index 1
graph = similarity_graphs[1]

# Styling: node size grows with the ingredient's occurrence count, clamped to
# the [2, 12] range. Nodes are left uncolored in this view.
ingredient_count = len(OCCURRENCES)

for node, attrs in graph.nodes(data=True):
    scaled = OCCURRENCES[node] / ingredient_count * 6
    attrs['size'] = min(12, max(2, scaled))

# Display
Sigma(graph)

Sigma(data={'nodes': [('shallots', {'size': 2}), ('lemon wedge', {'size': 2}), ('fresh thyme leaves', {'size':…

In [63]:
# Display the macro similarity graph again (size/color were set by an earlier cell)
Sigma(similarity_graph, height=1000)

Sigma(data={'nodes': [('romaine lettuce', {'size': 2, 'color': '#c9587a'}), ('grape tomatoes', {'size': 2, 'co…