## Data exploration
___

In [107]:
import pandas as pd
import os
import numpy as np

# Relative paths used below are resolved against the working directory.
# When the current directory is not named 'food-pairing', step up to its parent.
# NOTE(review): assumes the kernel starts exactly one level below the project
# root, and re-running this cell from elsewhere keeps climbing — confirm.
if os.path.basename(os.getcwd()) != 'food-pairing':
    os.chdir(os.path.dirname(os.getcwd()))

import re
import ast
from utils.data_loading import read_compounds, read_food_molecules


In [10]:
def string_to_list(string):
    """Turn the text form of a Python literal (e.g. ``"[1, 2]"``) into the object it spells.

    Parsing is delegated to ``ast.literal_eval``, so only literal syntax is
    accepted; anything else raises instead of being executed.
    """
    parsed = ast.literal_eval(string)
    return parsed

# Load the semicolon-separated compounds table.
compounds_df = pd.read_csv('compounds.csv', sep=';')

# The list-valued columns come back from the CSV as text; parse each one into
# a real Python list.
for list_column in ('foods', 'quantities'):
    compounds_df[list_column] = compounds_df[list_column].apply(string_to_list)

In [11]:
# Preview the first rows to check that both list columns were parsed.
compounds_df.head()

Unnamed: 0,compound_id,foods,quantities
0,FDB000004,"[Highbush blueberry, Lowbush blueberry]","[0.103886, 0.664873]"
1,FDB000013,[Strawberry],[1.223154]
2,FDB000014,"[Cloudberry, Red raspberry, Strawberry, Wheat]","[0.0, 0.0, 10.444922, 1067.213885]"
3,FDB000024,"[Highbush blueberry, Lowbush blueberry]","[0.069621, 0.157808]"
4,FDB000025,"[Highbush blueberry, Lowbush blueberry]","[0.039365, 1.765813]"


In [26]:
# Unpack the two parallel list columns together so that every row holds a
# single (compound, food, quantity) triple.
df_exploded = compounds_df.explode(['foods', 'quantities'])

# Regroup by food: each food gets the list of its compound ids and the
# matching list of quantities.
result_df = df_exploded.groupby('foods')[['compound_id', 'quantities']].agg(list).reset_index()

# NOTE(review): the saved preview contains a food literally named 0 —
# presumably a placeholder in the raw data; confirm and filter if so.
result_df.head()

Unnamed: 0,foods,compound_id,quantities
0,0,"[FDB000048, FDB000115, FDB000232, FDB000299, F...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Abalone,"[FDB000236, FDB000316, FDB000432, FDB000442, F...","[0, 0, 0, 0, 587.0, 0.0, 0, 1355.5, 0, 1141.5,..."
2,Abiyuch,"[FDB000236, FDB000240, FDB000316, FDB000374, F...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Acerola,"[FDB000236, FDB000240, FDB000316, FDB000374, F...","[0, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0, 0, 0, 0, 0..."
4,Acorn,"[FDB000236, FDB000240, FDB000316, FDB000374, F...","[0, 0, 0, 0, 0, 0, 165.04675, 0, 0, 339.346, 0..."


In [85]:
# Persist the per-food table without the positional index column.
# `index` is documented as a bool, so pass False explicitly rather than
# relying on None being falsy.
# NOTE(review): the cell that reads this file back expects 'public_ids' and
# 'molecules' columns, which the groupby result above does not have — confirm
# the file is regenerated elsewhere before that read.
result_df.to_csv("data/foodb_compounds.csv", sep=';', index=False)

In [54]:
def string_to_int(lst):
    """Return a new list holding ``float(x)`` for every element of ``lst``.

    Despite the name, the elements are converted to floats, not ints.
    """
    return list(map(float, lst))

In [94]:
# Read the per-food table back from disk.
# NOTE(review): this expects 'public_ids' and 'molecules' columns in the file;
# confirm which cell produced that version of data/foodb_compounds.csv.
result_df = pd.read_csv("data/foodb_compounds.csv", sep=';', index_col=False)

# Every list column is serialised as text in the CSV; parse each one back.
for list_column in ('public_ids', 'molecules', 'quantities'):
    result_df[list_column] = result_df[list_column].apply(string_to_list)

# Normalise every quantity to float (string_to_int converts with float()).
result_df['quantities'] = result_df['quantities'].apply(string_to_int)

result_df.head()

Unnamed: 0,foods,quantities,public_ids,molecules
0,Abalone,"[2572.5, 1756.0, 1355.5, 1329.5, 1295.0, 1141....","[FDB012535, FDB012567, FDB000474, FDB002257, F...","[12538, 12570, 474, 2257, 1946, 484, 556, 1274..."
1,Abiyuch,"[8550.0, 2252.25, 1901.9, 304.0, 100.1, 54.1, ...","[FDB003715, FDB012530, FDB012528, FDB003521, F...","[3716, 12533, 12531, 3522, 1193, 1224, 3521, 1..."
2,Acerola,"[4500.0, 1651.733333, 129.666667, 18.0, 15.71,...","[FDB003715, FDB001224, FDB003521, FDB031004, F...","[3716, 1224, 3522, 31167, 2602, 3514, 3521, 12..."
3,Acorn,"[1162.333333, 748.666667, 624.75, 576.333333, ...","[FDB012535, FDB012567, FDB003521, FDB001946, F...","[12538, 12570, 3522, 1946, 2257, 474, 556, 570..."
4,Acorn squash,"[347.0, 220.0, 38.0, 36.0, 33.0, 32.0, 11.0, 3...","[FDB003521, FDB014613, FDB019865, FDB003520, F...","[3522, 14616, 19872, 3521, 3514, 31167, 1224, ..."


In [98]:
# Load the FooDB food table; its 'name' and 'id' columns are used below to
# build a name -> id lookup.
foods_ids = pd.read_csv('foodb/Food.csv')

In [99]:
# Map each food name to its id; if a name repeats, the last row wins.
food_id_dict = foods_ids.set_index('name')['id'].to_dict()

In [101]:
def public_to_id(food):
    """Look up the id stored for ``food`` in the notebook-level ``food_id_dict``.

    Raises ``KeyError`` when ``food`` has no entry in that mapping.
    """
    return food_id_dict[food]

# Attach the id of every food; a name missing from food_id_dict raises KeyError.
result_df['food_id'] = result_df['foods'].apply(public_to_id)

In [103]:
# Use the singular column name 'food' from here on.
result_df = result_df.rename(columns={'foods': 'food'})

In [104]:
# Keep only the export columns, in their final order.
result_final = result_df[['food_id', 'food', 'public_ids', 'molecules', 'quantities']]

In [105]:
# Preview the table that is about to be exported.
result_final.head()

Unnamed: 0,food_id,food,public_ids,molecules,quantities
0,280,Abalone,"[FDB012535, FDB012567, FDB000474, FDB002257, F...","[12538, 12570, 474, 2257, 1946, 484, 556, 1274...","[2572.5, 1756.0, 1355.5, 1329.5, 1295.0, 1141...."
1,281,Abiyuch,"[FDB003715, FDB012530, FDB012528, FDB003521, F...","[3716, 12533, 12531, 3522, 1193, 1224, 3521, 1...","[8550.0, 2252.25, 1901.9, 304.0, 100.1, 54.1, ..."
2,282,Acerola,"[FDB003715, FDB001224, FDB003521, FDB031004, F...","[3716, 1224, 3522, 31167, 2602, 3514, 3521, 12...","[4500.0, 1651.733333, 129.666667, 18.0, 15.71,..."
3,283,Acorn,"[FDB012535, FDB012567, FDB003521, FDB001946, F...","[12538, 12570, 3522, 1946, 2257, 474, 556, 570...","[1162.333333, 748.666667, 624.75, 576.333333, ..."
4,1004,Acorn squash,"[FDB003521, FDB014613, FDB019865, FDB003520, F...","[3522, 14616, 19872, 3521, 3514, 31167, 1224, ...","[347.0, 220.0, 38.0, 36.0, 33.0, 32.0, 11.0, 3..."


In [106]:
# Write the final per-food table without the positional index column.
# `index` is documented as a bool, so pass False explicitly rather than
# relying on None being falsy.
result_final.to_csv("data/foodb.csv", sep=';', index=False)

## Preprocessing
___

In [16]:
def remove_dupes(lst):
    """Return the distinct items of ``lst`` in order of first appearance.

    ``dict.fromkeys`` is used instead of ``set`` so the result is deterministic
    and any meaningful ordering of the input (for example molecules sorted by
    quantity) survives deduplication. Items must be hashable.
    """
    return list(dict.fromkeys(lst))

def calculate_list_length(lst):
    """Return how many items ``lst`` holds (a thin wrapper over ``len``)."""
    item_count = len(lst)
    return item_count

def sort_list(lst):
    """Return a new ascending-sorted list of the items in ``lst``; the input is left untouched."""
    ordered = list(lst)
    ordered.sort()
    return ordered

def to_integer(lst):
    """Return a new list with ``int(x)`` applied to every element of ``lst``."""
    return list(map(int, lst))

def string_to_list(string):
    """Parse the text form of a Python literal with ``ast.literal_eval`` and return the result.

    NOTE(review): an identical helper is already defined earlier in this
    notebook; this later definition shadows it.
    """
    value = ast.literal_eval(string)
    return value

In [14]:
def check_lengths(df):
    """Print the minimum, maximum and mean length of the entries in ``df['molecules']``.

    Side effect: a ``list_length`` column holding the per-row length is written
    onto ``df`` itself. Nothing is returned.
    """
    lengths = df['molecules'].apply(calculate_list_length)
    df['list_length'] = lengths

    # Compute all three statistics before printing anything.
    summary = (
        ("Minimum length:", lengths.min()),
        ("Maximum length:", lengths.max()),
        ("Average length:", lengths.mean()),
    )
    for label, value in summary:
        print(label, value)

In [40]:
# Report the row count before and after dropping rows that repeat the same
# (food name, source id, standard content) combination.
# NOTE(review): these columns are not the ones produced by the compounds.csv
# load earlier in the notebook — presumably compounds_df was rebound to a raw
# FooDB content table in a cell that is no longer present; confirm.
print(f"before: {len(compounds_df)}")
foodb_copy = compounds_df.drop_duplicates(subset=['orig_food_common_name', 'source_id', 'standard_content'])
print(f"after: {len(foodb_copy)}")

before: 5145532
after: 792944


In [46]:
def sort_molecules_by_quantity(group):
    """Return the ``source_id`` values of ``group`` ordered by descending ``standard_content``.

    ``group`` is a DataFrame (one food's rows when used with ``groupby.apply``);
    it is not modified.
    """
    ranked = group.sort_values(by='standard_content', ascending=False)
    return ranked['source_id'].tolist()

# One row per (food id, food name) pair, holding that food's molecule ids
# ordered by sort_molecules_by_quantity.
food_groups = foodb_copy.groupby(['orig_food_id', 'orig_food_common_name'])
food_molecules_df_v2 = food_groups.apply(sort_molecules_by_quantity).reset_index()

# Give the three resulting columns their final names.
food_molecules_df_v2.columns = ['id', 'ingredient', 'molecules']

In [47]:
# Persist the per-food molecule lists (semicolon-separated, no index column).
food_molecules_df_v2.to_csv("data/foodb_molecules.csv", sep=';', index=False)

In [25]:
# Load the FlavorDB molecule table; index_col=False stops pandas from using
# the first column as the index.
flavordb_molecules = pd.read_csv('data/molecules.csv', index_col=False)

In [26]:
# Display the loaded table (pandas elides the middle rows in the rendered output).
flavordb_molecules

Unnamed: 0,id,pubchem id,common name,flavors
0,0,4,1-Aminopropan-2-ol,['fishy']
1,1,49,3-Methyl-2-oxobutanoic acid,['fruity']
2,2,58,2-oxobutanoic acid,"['caramel', 'sweet', 'creamy', 'brown', 'lacto..."
3,3,70,4-Methyl-2-oxovaleric acid,['fruity']
4,4,72,"3,4-Dihydroxybenzoic Acid","['mild', 'phenolic', 'balsamic']"
...,...,...,...,...
2537,2537,5315892,cinnamyl alcohol,"['bitter', 'sweet', 'powdery', 'cinnamic', 'sp..."
2538,2538,5318042,trans-2-Hexen-1-Ol,"['wine', 'unripe banana', 'leaf', 'alcohol', '..."
2539,2539,5363388,cis-3-Hexenyl acetate,"['sweet', 'banana', 'fruity', 'fresh', 'grassy..."
2540,2540,5365811,4-Hexen-3-One,"['pungent', 'spicy', 'metallic', 'ethereal', '..."


In [29]:
# Report the row count before and after keeping one row per 'pubchem id'.
print(f"before: {len(flavordb_molecules)}")
molecules_copy = flavordb_molecules.drop_duplicates(subset=['pubchem id'])
print(f"after: {len(molecules_copy)}")

before: 2542
after: 1788


In [30]:
# Save the de-duplicated molecule table (default comma separator, no index column).
molecules_copy.to_csv('data/molecules_v2.csv', index=False)

In [None]:
# histogram of molecule occurrences
# drop the molecules that occur only once or very rarely
# build one large vector covering all molecules [0-1]

# get the data in order
# and write down a list of methods (must-have and nice-to-have)

## Plots
___

In [None]:
from collections import Counter
from more_itertools import collapse
import plotly.express as px

def plot_histogram(df, title='FlavorDB molecules histogram', nbins=610):
    """Show a histogram of how often molecules occur across ``df['molecules']``.

    All values of the ``molecules`` column are flattened into one stream, the
    occurrences of each molecule are counted, and the distribution of those
    per-molecule counts is plotted.

    Args:
        df: DataFrame with a ``molecules`` column of (possibly nested) iterables.
        title: Figure title; defaults to the previously hard-coded FlavorDB title
            so existing calls render unchanged.
        nbins: Bin count handed to ``px.histogram``; defaults to the previously
            hard-coded 610.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    molecules = collapse(df['molecules'].values.tolist())
    occurrence_counts = Counter(molecules)
    fig = px.histogram(
        x=list(occurrence_counts.values()),
        nbins=nbins,
    )
    fig.update_layout(
        height=400,
        width=800,
        title_text=title,
    )
    fig.show()

In [108]:
# Load the FooDB food -> molecules table through the project helper.
# NOTE(review): the saved output of this cell is a FileNotFoundError for
# 'data/foodb_molecules.csv'; presumably the working directory was not the
# project root when it ran — confirm.
food_df = read_food_molecules('foodb')

FileNotFoundError: [Errno 2] No such file or directory: 'data/foodb_molecules.csv'

In [5]:
# Keep only the ingredients whose molecule list holds exactly one entry.
# NOTE(review): flavor_df is not assigned in any visible cell — confirm where
# it is loaded so the notebook survives Restart & Run All.
has_single_molecule = flavor_df['molecules'].apply(len) == 1
filtered_df = flavor_df[has_single_molecule]

filtered_df.head()

Unnamed: 0,id,ingredient,synonyms,scientific name,category,molecules
86,87,ghee,[ghee],cattle,dairy,[7714]
94,95,achilleas,"[allheal, achillea, yarrow, bloodwort]",achillea,essential oil,[2758]
100,101,cedar,[cedar],cedrus,essential oil,[1201521]
131,132,yarrow,"[thousand-leaf, devils nettle, sanguinary, old...",achillea,essential oil,[6654]
141,142,prawn,[prawn],crustacean,seafood,[15380]


In [6]:
# Number of ingredients that list exactly one molecule.
len(filtered_df)

20

In [8]:
# Distinct molecule ids found in those single-molecule ingredients.
single_elements = filtered_df['molecules'].explode().unique().tolist()

In [9]:
# Display the distinct molecule ids collected above.
single_elements

[7714,
 2758,
 1201521,
 6654,
 15380,
 5311110,
 5364578,
 3591866,
 7127,
 1550890,
 32594,
 1130,
 644104]

In [12]:
# Look up how often each of those molecules occurs according to `c`.
# NOTE(review): `c` is only ever assigned as a local inside plot_histogram in
# the visible cells — presumably a notebook-level Counter built in a cell that
# is no longer present; confirm.
singles_counts = list(map(c.get, single_elements))

In [13]:
# Display the counts, in the same order as single_elements.
singles_counts

[17, 64, 9, 123, 10, 8, 4, 1, 33, 1, 17, 506, 609]

In [17]:
# Print min / max / mean molecule-list length for the FlavorDB ingredients.
# Note that check_lengths also adds a 'list_length' column to flavor_df.
check_lengths(flavor_df)

Minimum length: 1
Maximum length: 391
Average length: 71.6243842364532
