## Data exploration
___

In [6]:
import pandas as pd
import os
import numpy as np

if os.path.basename(os.getcwd()) != 'food-pairing':
    os.chdir(os.path.dirname(os.getcwd()))

import re
import ast
from utils.data_loading import read_compounds, read_food_molecules


In [5]:
compounds_df, result_df = read_compounds()

  compounds_df = pd.read_csv("foodb/Content.csv")


In [6]:
compounds_df.head()

Unnamed: 0,id,source_id,source_type,food_id,orig_food_id,orig_food_common_name,orig_food_scientific_name,orig_food_part,orig_source_id,orig_source_name,...,citation_type,creator_id,updater_id,created_at,updated_at,orig_method,orig_unit_expression,standard_content,preparation_type,export
0,1,1,Nutrient,4,29,Kiwi,Actinidia chinensis PLANCHON [Actinidiaceae],Fruit,FAT,FAT,...,DATABASE,,,2014-11-05 13:42:11 UTC,2019-12-11 20:23:57 UTC,,,1955.0,raw,0
1,2,1,Nutrient,6,53,Onion,Allium cepa L. [Liliaceae],Bulb,FAT,FAT,...,DATABASE,,,2014-11-05 13:42:11 UTC,2019-12-11 20:23:57 UTC,,,1853.95,raw,0
2,3,1,Nutrient,6,53,Onion,Allium cepa L. [Liliaceae],Leaf,FAT,FAT,...,DATABASE,,,2014-11-05 13:42:11 UTC,2019-12-11 20:23:57 UTC,,,4150.0,raw,0
3,4,1,Nutrient,9,55,Chives,Allium schoenoprasum L. [Liliaceae],Leaf,FAT,FAT,...,DATABASE,,,2014-11-05 13:42:11 UTC,2019-12-11 20:23:57 UTC,,,3900.0,raw,0
4,5,1,Nutrient,11,70,Cashew,Anacardium occidentale L. [Anacardiaceae],Fruit,FAT,FAT,...,DATABASE,,,2014-11-05 13:42:11 UTC,2019-12-11 20:23:57 UTC,,,2500.0,other,0


In [7]:
result_df.head()

Unnamed: 0,orig_food_common_name,source_id
0,100% barley malt reference beer,"[30981, 8298, 3241, 8133, 19928, 19914, 8281, ..."
1,20% buckwheat malt beer,"[11907, 2536, 30981, 8298, 3241, 8133, 19928, ..."
2,40% buckwheat malt beer,"[11907, 2536, 30981, 8298, 3241, 8133, 19928, ..."
3,9-grain bread,[2608]
4,AMARANTH FLAKES,"[2, 3, 4, 5, 11, 15, 20, 21, 23, 24, 29, 34, 3..."


In [8]:
result_df.to_csv("data/food_compounds.csv")

In [9]:
foods_df = pd.read_csv("foodb/Food.csv")

In [10]:
foods_df.head(3)

Unnamed: 0,id,name,name_scientific,description,itis_id,wikipedia_id,picture_file_name,picture_content_type,picture_file_size,picture_updated_at,...,food_type,created_at,updated_at,creator_id,updater_id,export_to_afcdb,category,ncbi_taxonomy_id,export_to_foodb,public_id
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,,Angelica,1.jpg,image/jpeg,111325.0,2012-04-20 09:29:57 UTC,...,Type 1,2011-02-09 00:37:14 UTC,2019-05-14 18:04:13 UTC,,2.0,False,specific,357850.0,True,FOOD00001
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,,Savoy cabbage,2.jpg,image/jpeg,155178.0,2012-04-20 09:39:54 UTC,...,Type 1,2011-02-09 00:37:15 UTC,2019-05-14 18:04:13 UTC,,,False,specific,1216010.0,True,FOOD00002
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,845789.0,Tilia tomentosa,3.jpg,image/jpeg,56367.0,2012-04-20 09:41:25 UTC,...,Type 1,2011-02-09 00:37:15 UTC,2019-05-17 16:19:45 UTC,,,False,specific,,True,FOOD00003


In [11]:
foods_df['category'].unique()

array(['specific', 'generic', nan], dtype=object)

## Preprocessing
___

In [46]:
def remove_dupes(lst):
    return list(set(lst))

def calculate_list_length(lst):
    return len(lst)

def sort_list(lst):
    return sorted(lst)

def to_integer(lst):
    return [int(x) for x in lst]

def string_to_list(string):
    return ast.literal_eval(string)

In [13]:
def read_flavors():
    flavor_df = pd.read_csv(
        "data/flavordb_v3.csv", 
        sep=';', 
        index_col=False
        )
    flavor_df['molecules'] = flavor_df['molecules'].apply(string_to_list)
    flavor_df['synonyms'] = flavor_df['synonyms'].apply(string_to_list)

    return flavor_df

In [14]:
flavor_df = read_flavors()

flavor_df.head()

Unnamed: 0,entity id,alias,synonyms,scientific name,category,molecules
0,1,bakery products,[bakery products],poacceae,bakery,"[22201, 26331, 26808, 31252, 7976, 27457]"
1,2,bread,[bread],poacceae,bakery,"[1032, 10393, 10430, 10448, 107, 10797, 107971..."
2,3,rye bread,[rye bread],rye,bakery,"[10448, 1049, 1130, 1183, 12020, 12209, 12366,..."
3,4,wheaten bread,"[soda scones, soda farls]",wheat,bakery,"[1049, 1060, 1146, 11747, 12170, 14286, 19310,..."
4,5,white bread,[white bread],wheat,bakery,"[10883, 11173, 11559, 12170, 5365891, 5960, 61..."


In [15]:
result_df['list_length'] = result_df['source_id'].apply(calculate_list_length)
min_length = result_df['list_length'].min()
max_length = result_df['list_length'].max()
average_length = result_df['list_length'].mean()

print("Minimum length:", min_length)
print("Maximum length:", max_length)
print("Average length:", average_length)

Minimum length: 1
Maximum length: 2043150
Average length: 543.8676672656168


In [22]:
nan_counts = compounds_df.isna().sum()
print(nan_counts)

id                                 0
source_id                          0
source_type                        0
food_id                            0
orig_food_id                 4308143
orig_food_common_name              0
orig_food_scientific_name    5099435
orig_food_part               5104956
orig_source_id               4448551
orig_source_name             4445227
orig_content                 4289574
orig_min                     5099438
orig_max                     5099476
orig_unit                    4286563
orig_citation                5142217
citation                           0
citation_type                      0
creator_id                   5145532
updater_id                   5145532
created_at                         0
updated_at                         0
orig_method                  5138235
orig_unit_expression         5139940
standard_content             4286932
preparation_type             1324146
export                             0
dtype: int64


In [26]:
result_df.head()

Unnamed: 0,orig_food_common_name,source_id
0,100% barley malt reference beer,"[30981, 8298, 3241, 8133, 19928, 19914, 8281, ..."
1,20% buckwheat malt beer,"[11907, 2536, 30981, 8298, 3241, 8133, 19928, ..."
2,40% buckwheat malt beer,"[11907, 2536, 30981, 8298, 3241, 8133, 19928, ..."
3,9-grain bread,[2608]
4,AMARANTH FLAKES,"[2, 3, 4, 5, 11, 15, 20, 21, 23, 24, 29, 34, 3..."


In [29]:
def read_food_molecules(source: str = "flavordb"):
    if source == "flavordb":
        flavor_df = pd.read_csv(
            "data/flavordb_v4.csv", 
            sep=';', 
            index_col=False
            )
        flavor_df['synonyms'] = flavor_df['synonyms'].apply(string_to_list)
    else:
        flavor_df = pd.read_csv(
            "data/foodb_molecules.csv", 
            sep=';', 
            index_col=False
            )
    flavor_df['molecules'] = flavor_df['molecules'].apply(string_to_list)

    return flavor_df

In [31]:
foodb_molecules = read_food_molecules(source = "foodb")

In [34]:
foodb_molecules['molecules'] = foodb_molecules['molecules'].apply(remove_dupes)

In [35]:
foodb_molecules['list_length'] = foodb_molecules['molecules'].apply(calculate_list_length)
min_length = foodb_molecules['list_length'].min()
max_length = foodb_molecules['list_length'].max()
average_length = foodb_molecules['list_length'].mean()

print("Minimum length:", min_length)
print("Maximum length:", max_length)
print("Average length:", average_length)

Minimum length: 1
Maximum length: 42428
Average length: 74.08751717577424


In [36]:
foodb_molecules.head()

Unnamed: 0,ingredient,molecules,list_length
0,100% barley malt reference beer,"[31105, 8132, 30981, 8133, 3241, 8298, 19914, ...",9
1,20% buckwheat malt beer,"[31105, 11907, 8132, 30981, 8133, 2536, 3241, ...",11
2,40% buckwheat malt beer,"[31105, 11907, 8132, 30981, 8133, 2536, 3241, ...",11
3,9-grain bread,[2608],1
4,AMARANTH FLAKES,"[2, 3, 4, 5, 3716, 16258, 12163, 3337, 13831, ...",58


In [40]:
print("before: " + str(len(compounds_df)))
foodb_copy = compounds_df.drop_duplicates(subset=['orig_food_common_name', 'source_id', 'standard_content'])
print("after: " + str(len(foodb_copy)))

before: 5145532
after: 792944


In [46]:
def sort_molecules_by_quantity(group):
    sorted_molecules = group.sort_values(by='standard_content', ascending=False)['source_id'].tolist()
    return sorted_molecules

food_molecules_df_v2 = foodb_copy.groupby(['orig_food_id', 'orig_food_common_name']).apply(sort_molecules_by_quantity).reset_index()

# Rename columns
food_molecules_df_v2.columns = ['id','ingredient', 'molecules']

In [47]:
food_molecules_df_v2.to_csv("data/foodb_molecules.csv", sep=';', index=False)

In [None]:
# histogram występowania molekuł
# wyrzucić te co występują raz albo bardzo rzadk
# zrobic jeden duży wektor ze wszystkimi molekułami [0-1]

# ogarnac dane
# i wypisac liste metod (must-have i dobrze by było)

## Plots
___

In [4]:
flavor_df = read_food_molecules()

In [47]:
flavor_df['molecules'] = flavor_df['molecules'].apply(remove_dupes)

In [48]:
from collections import Counter
from more_itertools import collapse
import plotly.express as px

molecules = collapse(flavor_df['molecules'].values.tolist())
c = Counter(molecules)

In [23]:
fig = px.histogram(x=list(c.values()))
fig.update_layout(
    height=400,
    width = 800,
    title_text='FlavorDB molecules histogram'
)
fig.show()

In [49]:
fig = px.histogram(x=list(c.values()))
fig.update_layout(
    height=400,
    width = 800,
    title_text='FlavorDB molecules histogram'
)
fig.show()

In [44]:
c_sort = [v for k,v in c.most_common()]

In [45]:
sorted(c_sort)

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
