In [18]:
import pandas as pd
import numpy as np
import os
if os.path.basename(os.getcwd()) != 'food-pairing':
    os.chdir(os.path.dirname(os.getcwd()))

from utils.data_loading import read_foods, read_molecules

from itertools import combinations
import networkx as nx
from tqdm import tqdm

from collections import Counter
from more_itertools import collapse

In [19]:
# Add edges for shared molecules
def find_shared_molecules(food1, food2, molecules_to_include, fm_dict):
    molecules1 = set(fm_dict[food1])
    molecules2 = set(fm_dict[food2])
    shared_molecules = molecules1.intersection(molecules2)
    shared_molecules_to_include = shared_molecules.intersection(molecules_to_include)
    return shared_molecules_to_include

In [20]:
def generate_food_graph(nodes,df, molecules_to_include, fm_dict, shared=20):
    """Function to generate NetworkX graphs with selection of foods&molecules"""
    # Create an empty graph
    G = nx.Graph()

    # Add nodes for each food
    for i, food in enumerate(nodes):
        food_cat =  df.loc[df['food'] == food, 'category'].values[0]
        # food_cat = nodes_cats[i]
        G.add_node(food, nodes_cats = food_cat)

    # Add edges for shared molecules
    for food1, food2 in tqdm(combinations(nodes, 2)):
        shared_molecules = find_shared_molecules(food1, food2, molecules_to_include, fm_dict)
        if len(shared_molecules) > shared:
            G.add_edge(food1, food2, weight=np.int64(len(shared_molecules)))
    
    return G

In [21]:
def find_graph_statistics(G):
    nnodes = G.number_of_nodes()
    avg_deegre = sum(d for n, d in G.degree()) / float(nnodes)
    diameter = nx.diameter(G)
    density = nx.density(G)
    print(f"Average deegre: {avg_deegre}")
    print(f"Network diameter: {diameter}")
    print(f"Network density: {density}")

In [22]:
flavor_molecules = read_molecules()
food_df = read_foods()
# food_df = food_df.dropna(subset='category')

In [23]:
molecules = collapse(food_df['foodb_ids'].values.tolist())
c = Counter(molecules)
molecules_to_include = [k for k, v in c.items() if 50 > v]

In [24]:
food_df['category'] = food_df['category'].fillna('none').astype(str)

## Flavor Graph (molecule based)
___

In [25]:
# food_df = food_df[food_df['molecules'].apply(len) >= 20]
len(food_df)

1146

In [26]:
foods = food_df['food'].values.tolist()
food_molecule_dict = dict(zip(food_df['food'], food_df['foodb_ids']))

In [27]:
G = generate_food_graph(foods,  
                        food_df,
                        molecules_to_include,
                        food_molecule_dict,
                        shared=0)

656085it [00:06, 96922.12it/s] 


In [28]:
G.remove_nodes_from(list(nx.isolates(G)))

In [29]:
print(G)

Graph with 578 nodes and 45403 edges


In [43]:
nx.write_gexf(G, "networks/food-cut.gexf")

In [32]:
target = 'tomato'
if target in G:
    neighbors = {k:v['weight'] for k, v in G[target].items()}
    neighbors = dict(sorted(neighbors.items(), key=lambda item: item[1], reverse=True))
    print(neighbors)


{'garden tomato': 25, 'tea': 20, 'beef processed': 16, 'rum': 16, 'fruits': 15, 'garden tomato (var.)': 15, 'herbs and spices': 15, 'angelica': 14, 'blackberry': 14, 'coffee': 14, 'potato': 14, 'alcoholic beverages': 13, 'beer': 13, 'black tea': 13, 'corn': 13, 'rose wine': 12, 'soybean': 12, 'achilleas': 11, 'evergreen blackberry': 11, 'green tea': 11, 'hops oil': 11, 'chicken': 10, 'cocoa': 10, 'herbal tea': 10, 'red tea': 10, 'sheep milk': 10, 'silver linden': 10, 'arabica coffee': 9, 'banana': 9, 'cognac brandy': 9, 'fried potato': 9, 'japanese whisky': 9, 'malt whisky': 9, 'peas': 9, 'peppermint': 9, 'scotch whisky': 9, 'sherry': 9, 'bourbon whisky': 8, 'cabbage': 8, 'canadian whisky': 8, 'carrot': 8, 'cauliflower': 8, 'coffee and coffee products': 8, 'finnish whisky': 8, 'lean fish': 8, 'peanut': 8, 'pork': 8, 'robusta coffee': 8, 'whisky': 8, 'beef': 7, 'buckwheat': 7, 'butter': 7, 'capsicum': 7, 'caviar': 7, 'cherry tomato': 7, 'drumstick leaf': 7, 'melon': 7, 'mushroom': 7, 'n

In [12]:
# N = nx.read_gexf("food-full.gexf", node_type=str, relabel=True)

In [None]:
# pogrupować graf i pokazac grupy, mozna jakąś jedną w zoomie pokazać
# networkx grupowanie mozna zobaczyc, infomap/infonode

### Louvain communities

In [51]:
partitions = nx.community.louvain_communities(G, weight='weight', resolution=1, seed=123)
len(partitions)

6

In [65]:
lenghts = [len(partition) for partition in partitions]
lenghts

[163, 136, 30, 139, 32, 78]

In [53]:
for i, partition in enumerate(partitions):
    food_cat = []
    for food in list(partition):
        food_cat.append(food_df.loc[food_df['food'] == food, 'category'].values[0])
    values, counts = np.unique(food_cat, return_counts=True)
    cats_print = dict(zip(values, counts))
    print(f"Partition {i}: {(sorted([(v, k) for k, v in cats_print.items()], reverse=True))}")

Partition 0: [(46, 'seasoning'), (44, 'herb'), (25, 'fruit'), (23, 'vegetable'), (7, 'fish'), (4, 'seafood'), (4, 'mushroom'), (2, 'nut'), (2, 'meat'), (2, 'beverage'), (1, 'none'), (1, 'grain'), (1, 'dairy'), (1, 'bean')]
Partition 1: [(18, 'vegetable'), (15, 'seasoning'), (11, 'seafood'), (11, 'grain'), (10, 'beverage'), (10, 'bakery product'), (9, 'nut'), (8, 'meat'), (8, 'herb'), (8, 'fruit'), (8, 'dairy'), (7, 'fish'), (5, 'bean'), (4, 'mushroom'), (2, 'none'), (1, 'sugar'), (1, 'alcohol')]
Partition 2: [(28, 'dairy'), (1, 'vegetable'), (1, 'fish')]
Partition 3: [(71, 'fruit'), (36, 'alcohol'), (8, 'vegetable'), (4, 'seasoning'), (3, 'nut'), (3, 'none'), (2, 'sugar'), (2, 'meat'), (2, 'grain'), (2, 'fish'), (2, 'dairy'), (1, 'seafood'), (1, 'herb'), (1, 'beverage'), (1, 'bakery product')]
Partition 4: [(15, 'vegetable'), (6, 'fruit'), (3, 'nut'), (2, 'seasoning'), (2, 'meat'), (2, 'fish'), (1, 'herb'), (1, 'grain')]
Partition 5: [(31, 'vegetable'), (15, 'bean'), (8, 'herb'), (7, '

In [47]:
for i, partition in enumerate(partitions):
    partition_foods = list(partition)
    G = generate_food_graph(partition_foods,  
                        food_df,
                        molecules_to_include,
                        food_molecule_dict,
                        shared=0)
    nx.write_gexf(G, f"networks/partition_{i}.gexf")

30135it [00:00, 70707.72it/s]
2080it [00:00, 64243.13it/s]
13366it [00:00, 34963.80it/s]
18915it [00:00, 65753.27it/s]


In [54]:
G.degree

DegreeView({'abalone': 30, 'abiyuch': 40, 'achilleas': 393, 'alaska pollock': 12, 'alcoholic beverages': 403, 'alfalfa': 238, 'allium': 56, 'allspice': 254, 'almond': 165, 'amaranth': 178, 'american cranberry': 49, 'angelica': 395, 'anguilliformes': 46, 'anise': 174, 'anise brandy': 146, 'anise hyssop': 13, 'apple': 364, 'apple brandy': 218, 'apple cider vinegar': 257, 'apricot': 292, 'arabica coffee': 271, 'armagnac brandy': 146, 'arrack': 28, 'arrowhead': 96, 'artemisia': 70, 'artichoke': 55, 'asafoetida': 80, 'ashgourd': 20, 'asian pear': 279, 'asparagus': 233, 'atlantic croaker': 57, 'atlantic halibut': 243, 'avocado': 134, 'babaco': 39, 'bakery products': 25, 'banana': 345, 'bantu beer': 87, 'barley': 189, 'bartlett pear': 173, 'basil': 274, 'basmati rice': 138, 'beans': 284, 'beaver': 48, 'beef': 309, 'beef processed': 393, 'beer': 417, 'beli': 45, 'beluga whale': 49, 'bergamot': 75, 'berry': 95, 'bilberry': 241, 'bilberry wine': 248, 'bitter cherry': 76, 'bitter gourd': 12, 'bit

In [57]:
nnodes = G.number_of_nodes()
deg = sum(d for n, d in G.degree()) / float(nnodes)
deg

126.12307692307692

### Leiden communities

In [66]:
from cdlib import algorithms

coms = algorithms.leiden(G)
len(coms.communities)

5

In [75]:
coms.communities[4]

['angelica',
 'annual wild rice',
 'arrowroot',
 'broccoli',
 'bulgur',
 'capsicum',
 'chard',
 'climbing bean',
 'common buckwheat',
 'common wheat',
 'comte cheese',
 'corn',
 'cornbread',
 'cucumber',
 'eggplant',
 'european anchovy',
 'fish',
 'flour',
 'green zucchini',
 'ham',
 'hard wheat',
 'hops oil',
 'japanese pumpkin',
 'lettuce',
 'lingcod',
 'millet',
 'nanking cherry',
 'oat',
 'oats',
 'oil palm',
 'olive',
 'oriental wheat',
 'pumpkin',
 'quinoa',
 'raccoon',
 'red beetroot',
 'red rice',
 'romaine lettuce',
 'rum',
 'rye',
 'saskatoon berry',
 'shea tree',
 'sorghum',
 'spelt',
 'sunburst squash (pattypan squash)',
 'swiss chard',
 'tartary buckwheat',
 'teff',
 'triticale',
 'turnip',
 'wheat',
 'white cabbage',
 'whitefish',
 'yellow wax bean',
 'yellow zucchini']

In [67]:
lenghts = [len(partition) for partition in coms.communities]
lenghts

[222, 221, 102, 70, 55]

In [73]:
for i, com in enumerate(coms.communities):
    G = generate_food_graph(com,  
                        food_df,
                        molecules_to_include,
                        food_molecule_dict,
                        shared=0)
    print(f"== Community {i} ==")
    print(G)
    find_graph_statistics(G)
    nx.write_gexf(G, f"networks/com_{i}.gexf")

24531it [00:00, 69899.42it/s]


== Community 0 ==
Graph with 222 nodes and 16204 edges
Average deegre: 145.981981981982
Network diameter: 3
Network density: 0.6605519546696017


24310it [00:00, 71605.71it/s]


== Community 1 ==
Graph with 221 nodes and 15182 edges
Average deegre: 137.39366515837105
Network diameter: 4
Network density: 0.6245166598107774


5151it [00:00, 68870.21it/s]


== Community 2 ==
Graph with 102 nodes and 2773 edges
Average deegre: 54.372549019607845
Network diameter: 4
Network density: 0.5383420695010678


2415it [00:00, 74869.50it/s]


== Community 3 ==
Graph with 70 nodes and 1111 edges
Average deegre: 31.742857142857144
Network diameter: 3
Network density: 0.4600414078674948


1485it [00:00, 72570.04it/s]

== Community 4 ==
Graph with 55 nodes and 920 edges
Average deegre: 33.45454545454545
Network diameter: 2
Network density: 0.6195286195286195





In [72]:
for i, partition in enumerate(coms.communities):
    if len(partition) < 2:
        continue
    else:
        food_cat = []
        for food in list(partition):
            food_cat.append(food_df.loc[food_df['food'] == food, 'category'].values[0])
        values, counts = np.unique(food_cat, return_counts=True)
        cats_print = dict(zip(values, counts))
        print(f"Partition {i}: {(sorted([(v, k) for k, v in cats_print.items()], reverse=True))}")

Partition 0: [(41, 'vegetable'), (36, 'meat'), (32, 'dairy'), (20, 'fruit'), (13, 'seafood'), (12, 'fish'), (11, 'seasoning'), (10, 'alcohol'), (8, 'grain'), (8, 'bakery product'), (7, 'nut'), (7, 'herb'), (7, 'bean'), (6, 'beverage'), (4, 'none')]
Partition 1: [(55, 'seasoning'), (50, 'fruit'), (49, 'herb'), (29, 'vegetable'), (8, 'fish'), (7, 'beverage'), (5, 'nut'), (4, 'seafood'), (4, 'meat'), (3, 'mushroom'), (2, 'dairy'), (2, 'alcohol'), (1, 'sugar'), (1, 'none'), (1, 'grain')]
Partition 2: [(36, 'fruit'), (26, 'alcohol'), (8, 'vegetable'), (6, 'seasoning'), (6, 'nut'), (4, 'dairy'), (3, 'bakery product'), (2, 'sugar'), (2, 'none'), (2, 'meat'), (2, 'herb'), (2, 'grain'), (2, 'fish'), (1, 'seafood')]
Partition 3: [(14, 'bean'), (13, 'vegetable'), (12, 'fruit'), (6, 'seasoning'), (6, 'herb'), (5, 'mushroom'), (4, 'dairy'), (3, 'fish'), (2, 'seafood'), (2, 'grain'), (1, 'sugar'), (1, 'beverage'), (1, 'bakery product')]
Partition 4: [(18, 'grain'), (16, 'vegetable'), (4, 'fish'), (3

In [84]:
def get_cluster_molecules(partition: list):
    molecules_df = read_molecules()
    cluster_df = food_df[food_df['food'].isin(partition)]
    all_molecules = cluster_df['foodb_ids'].values.tolist()
    cluster_molecules = [x for xs in all_molecules for x in xs]
    c = Counter(cluster_molecules)

    def map_to_name(key):
        new_key = molecules_df.loc[molecules_df['foodbid'] == key, 'common name'].values
        if new_key.size == 0:
            return key
        else:
            return new_key[0] # molecules_df.loc[molecules_df['foodbid'] == key, 'common name'].values[0]
    
    molecules_count = {map_to_name(k):v for k, v in c.items()}
    sorted_molecules_counts = dict(sorted(molecules_count.items(), key=lambda item: item[1], reverse=True))
    sorted_count = dict(Counter(sorted_molecules_counts).most_common(5))
    return sorted_count


In [85]:
for part in coms.communities:
    print(get_cluster_molecules(part))

{'lauric acid': 196, 'CID 644104': 194, 'Decanoic acid': 189, 'indole': 188, '2,3-butanedione': 188}
{'phenylacetaldehyde': 203, 'Geraniol': 201, 'CID 644104': 201, '2-Phenylethanol': 200, 'benzyl alcohol': 200}
{'cis-3-Hexenyl acetate': 90, 'CID 644104': 90, 'Isobutyraldehyde': 90, 'Propionaldehyde': 88, '2-Phenylethanol': 87}
{'CID 644104': 69, 'betaine': 68, 'succinic acid': 67, 'L-histidine': 67, 'Decanoic acid': 66}
{'succinic acid': 55, 'isoliquiritigenin': 55, 'Sinapic Acid': 55, 'octanoic acid': 55, 'Decanoic acid': 55}


In [105]:
nx.panther_similarity(G, 'cinnamon', k=12, path_length=5, c=0.5, delta=0.1, eps=None)


{'ginger': 0.000711392491235236,
 'pepper (spice)': 0.0006365090711052112,
 'pepper': 0.0006262976956329351,
 'ceylon cinnamon': 0.000616086320160659,
 'herbs and spices': 0.0005922597773920147,
 'rosemary': 0.0005650294427992784,
 'soy milk': 0.0005582218591510944,
 'dill': 0.0005514142755029103,
 'laurel': 0.000530991524558358,
 'celery': 0.0005207801490860819,
 'nutmeg': 0.0005139725654378978}

In [None]:
def generate_food_graph(nodes, molecules_to_include, fm_dict, shared=20):
    """Function to generate NetworkX graphs with selection of foods&molecules"""
    # Create an empty graph
    G = nx.Graph()

    # Add edges for shared molecules
    for food1, food2 in tqdm(combinations(nodes, 2)):
        molecules1 = set(fm_dict[food_df_part.loc[food_df_part['food_id']==food1, 'food'].iloc[0]])
        molecules2 = set(fm_dict[food_df_part.loc[food_df_part['food_id']==food2, 'food'].iloc[0]])
        shared_molecules = molecules1.intersection(molecules2)
        shared_molecules_to_include = shared_molecules.intersection(molecules_to_include)
        
        if len(shared_molecules_to_include) > shared:
            G.add_edge(food1, food2, weight=np.int64(len(shared_molecules_to_include)))
    
    return G

In [None]:
food_df_part = food_df.loc[food_df['food'].isin(partitions[i])]
food_molecule_dict = dict(zip(food_df_part['food'], food_df_part['foodb_ids']))
G = generate_food_graph(food_df_part['food_id'].values.tolist(), 
                    molecules_to_include, 
                    food_molecule_dict,
                    shared=0)
print(nx.density(G))

## Node2vec

In [107]:
from node2vec import Node2Vec
from gensim.models import Word2Vec

In [108]:
node2vec = Node2Vec(G, dimensions=128, walk_length=5, num_walks=100, workers=4, weight_key = 'weight', temp_folder='output')

Computing transition probabilities:   0%|          | 0/670 [00:00<?, ?it/s]

Generating walks (CPU: 1):   0%|          | 0/25 [00:00<?, ?it/s]
[A

Generating walks (CPU: 1):  12%|█▏        | 3/25 [00:00<00:01, 11.67it/s]
[A

Generating walks (CPU: 1):  20%|██        | 5/25 [00:00<00:02,  7.93it/s]
[A

Generating walks (CPU: 1):  24%|██▍       | 6/25 [00:01<00:03,  5.28it/s]
[A

Generating walks (CPU: 1):  28%|██▊       | 7/25 [00:01<00:03,  4.89it/s]
[A

Generating walks (CPU: 1):  32%|███▏      | 8/25 [00:01<00:03,  4.75it/s]
[A

Generating walks (CPU: 1):  36%|███▌      | 9/25 [00:01<00:03,  4.59it/s]
[A

Generating walks (CPU: 1):  40%|████      | 10/25 [00:01<00:03,  4.81it/s]
[A

Generating walks (CPU: 1):  44%|████▍     | 11/25 [00:02<00:02,  5.01it/s]
[A

Generating walks (CPU: 1):  48%|████▊     | 12/25 [00:02<00:02,  5.17it/s]
[A

Generating walks (CPU: 1):  52%|█████▏    | 13/25 [00:02<00:02,  5.29it/s]
[A

Generating walks (CPU: 1):  56%|█████▌    | 14/25 [00:02<00:02,  5.38it/s]
[A

Generating walks (CPU: 1):  60%|██████    | 15/25 [00:0

In [109]:
model = node2vec.fit(window=10, min_count=1)

In [27]:
model.save('output/node2vec')

In [112]:
model.wv.most_similar('tomato', topn=10)

[('corn oil', 0.767562210559845),
 ('feijoa', 0.7613927125930786),
 ('mate', 0.7538378834724426),
 ('cherry tomato', 0.7343324422836304),
 ('starfruit', 0.7312519550323486),
 ('achilleas', 0.7302459478378296),
 ('gooseberry', 0.725249707698822),
 ('cherry', 0.7234303951263428),
 ('tortilla', 0.7188439965248108),
 ('sandalwood', 0.7170975804328918)]

In [15]:
# model.load('node2vec.model')
model = Word2Vec.load('output/node2vec.model')

### Embeddings visualization

In [28]:
FOODS = ['tomato', 'onion', 'cinnamon', 'pepper'] #, 'beer', 'rice']

embeddings = []
words = []
for target in FOODS:
    for similar_word, _ in model.wv.most_similar(target, topn=15):
        words.append(similar_word)
        embeddings.append(model.wv[similar_word])

In [64]:
from sklearn.manifold import TSNE
import numpy as np

tsne_model_en_2d = TSNE(metric='manhattan', perplexity=25, method="barnes_hut", n_components=2, init='pca', n_iter=300, random_state=32)

In [65]:
np.asarray(embeddings).shape

(60, 128)

In [66]:
tsne_embeddings = tsne_model_en_2d.fit_transform(np.asarray(embeddings))
colors = np.concatenate([([i]*15) for i in FOODS], axis=0)

In [67]:
import plotly.express as px
import plotly.io as pio

def plot_reduction(embedding, name):
    fig = px.scatter(
        embedding, 
        x=0, y=1, 
        color=colors,
        text=words,
        hover_data=[words],
        labels={'hover_data_0':'food',
                'color': 'target',
                },
    #text=flavor_df['food']
    )
    fig.update_traces(textposition='top center')

    fig.update_layout(
        legend = dict(
            font = dict(
                family = "CMU Serif", 
                size = 14,
            )),
        font=dict(
            family="CMU Serif",
            size=10,
        ),
        title_font=dict(
            family="CMU Serif",
            size=14, 
            ),)
    fig.update_xaxes(title_font = dict(
                family = "CMU Serif", 
                size = 14,
            ))
    fig.update_yaxes(title_font = dict(
                family = "CMU Serif", 
                size = 14,
            ))
    fig.update_layout( 
        template = 'ggplot2', 
        height=500,
        width = 900,
        margin=dict(l=20, r=20, t=20, b=20),
        # title_text='Visualization by  UMAP'
    )
    config = {
    'toImageButtonOptions': {
        'format': 'png', # one of png, svg, jpeg, webp
        'height': 600,
        'width': 900,
        'scale':6 # Multiply title/legend/axis/canvas sizes by this factor
    }
    }
    
    fig.show(config=config)
    pio.write_image(fig, f"images/{name}.png", scale=6, width=900, height=500)

In [68]:
plot_reduction(tsne_embeddings, 'node2vec')

In [None]:
# Train logistic regression classifier on train-set edge embeddings
from sklearn.linear_model import LogisticRegression
edge_classifier = LogisticRegression(random_state=0)
edge_classifier.fit(train_edge_embs, train_edge_labels)