In [2]:
import pandas as pd
import os
import numpy as np

if os.path.basename(os.getcwd()) != 'food-pairing':
    os.chdir(os.path.dirname(os.getcwd()))

import re
import ast
from utils.data_loading import read_foods, read_molecules
from utils.ml_utils import molecules2vec


## Data exploration
___

In [8]:
def string_to_list(string):
    return ast.literal_eval(string)

def string_to_int(lst):
    lst = [(float(x)) for x in lst] 
    return lst

In [None]:
molecules_df = pd.read_csv(
    "data/flavordb_molecules.csv", 
    sep=';',
    index_col=False
    )
molecules_df['flavors'] = molecules_df['flavors'].apply(string_to_list)

In [None]:
flavordb_foods = read_food_molecules()

In [None]:
def translate_to_foodb(lst):
    foodb_ids = lst.copy()
    for i in range(len(foodb_ids)):
        try:
            foodb_ids[i] = molecules_df.loc[molecules_df['pubchem id']== foodb_ids[i], 'foodbid'].values[0]
        except IndexError:
            print(f"No matching ID found for {foodb_ids[i]}")
            foodb_ids[i] = 0
   # lst = [molecules_df.loc[molecules_df['pubchem id']== x, 'foodbid'].values[0] for x in lst]
    return foodb_ids

In [None]:
flavordb_foods['foodb_ids'] = flavordb_foods['molecules'].apply(translate_to_foodb)

In [None]:
flavordb_foods.to_csv("data/flavordb_foods.csv", sep=';', index=None)

## Preprocessing
___

In [9]:
def remove_dupes(lst):
    return list(set(lst))

def calculate_list_length(lst):
    return len(lst)

def sort_list(lst):
    return sorted(lst)

def to_integer(lst):
    return [int(x) for x in lst]

def string_to_list(string):
    try:
        return ast.literal_eval(string)
    except:
        return [s.strip("'") for s in string[1:-1].split(', ')]

In [10]:
def check_lengths(df):
    df['list_length'] = df['foodb_ids'].apply(calculate_list_length)
    min_length = df['list_length'].min()
    max_length = df['list_length'].max()
    average_length = df['list_length'].mean()

    print("Minimum length:", min_length)
    print("Maximum length:", max_length)
    print("Average length:", average_length)

In [None]:
def sort_molecules_by_quantity(group):
    sorted_molecules = group.sort_values(by='standard_content', ascending=False)['source_id'].tolist()
    return sorted_molecules

food_molecules_df_v2 = foodb_copy.groupby(['orig_food_id', 'orig_food_common_name']).apply(sort_molecules_by_quantity).reset_index()

# Rename columns
food_molecules_df_v2.columns = ['id','ingredient', 'molecules']

In [None]:
food_molecules_df_v2.to_csv("data/foodb_molecules.csv", sep=';', index=False)

In [None]:
flavordb_molecules = pd.read_csv('data/molecules.csv', index_col=False)

In [None]:
flavordb_molecules

In [None]:
print("before: " + str(len(flavordb_molecules)))
molecules_copy = flavordb_molecules.drop_duplicates(subset=['pubchem id'])
print("after: " + str(len(molecules_copy)))

In [None]:
molecules_copy.to_csv('data/molecules_v2.csv', index=False)

In [None]:
def read_food_molecules(source: str = "flavordb") -> pd.DataFrame:
    if source == "flavordb":
        flavor_df = pd.read_csv(
            "data/flavordb_foods.csv", 
            sep=';', 
            index_col=False
            )
        flavor_df['synonyms'] = flavor_df['synonyms'].apply(string_to_list)
        flavor_df['foodb_ids'] = flavor_df['foodb_ids'].apply(string_to_list)
    else:
        flavor_df = pd.read_csv(
            "data/foodb_v2.csv", 
            sep=';', 
            index_col=False
            )

        flavor_df['public_ids'] = flavor_df['public_ids'].apply(string_to_list)
        flavor_df['food_id'] = flavor_df['food_id'].astype(float)
        flavor_df['food_id'] = flavor_df['food_id'].astype(int)
    
    flavor_df['molecules'] = flavor_df['molecules'].apply(string_to_list)

    return flavor_df

## Molecules shared across databases
___

In [None]:
flavordb_df = read_food_molecules()
foodb_df = read_food_molecules("foodb")

In [None]:
def flatten(xss):
    return [x for xs in xss for x in xs]

In [None]:
shared = []
foodb = list(set(flatten(foodb_df['public_ids'].values.tolist())))
flavordb = list(set(flatten(flavordb_df['foodb_ids'].values.tolist())))

In [None]:
len(flavordb)

In [None]:
def maybeMakeNumber(s):
    """Returns a string 's' into a integer if possible, a float if needed or
    returns it as is."""

    # handle None, "", 0
    if not s:
        return s
    try:
        f = float(s)
        i = int(f)
        return i if f == i else f
    except ValueError:
        return s

In [None]:
for x in foodb:
    x = maybeMakeNumber(x)

In [None]:
len(flavordb)

In [None]:
1130 in foodb

In [None]:
for molecule in foodb:
    if molecule in flavordb:
        shared.append(molecule)
    else:
        print(f"{molecule} not found in FooDB")

In [None]:
len(shared)

In [None]:
apple_foodb = (foodb_df[foodb_df['food']=='apple'])['public_ids'].values.tolist()[0]

In [None]:
apple_flavordb = (flavordb_df[flavordb_df['food']=='apple'])['foodb_ids'].values.tolist()[0]

In [None]:
shared = []
for molecule in apple_flavordb:
    if molecule in apple_foodb:
        shared.append(molecule)
    else:
        print(f"{molecule} not found in FooDB")

In [None]:
len(shared)

## Removing molecules from FooDB that are absent in FlavorDB
___

In [None]:
foodb_molecules_full = pd.read_csv(
    "foodb/Compounds_v3.csv",
    sep=';',
    index_col=False,
)

foodb_molecules_filtered = foodb_molecules_full[foodb_molecules_full['public_id'].isin(flavordb)]

In [None]:
print(len(foodb_molecules_filtered))
foodb_molecules_filtered = foodb_molecules_filtered.drop_duplicates(subset=['public_id'])
print(len(foodb_molecules_filtered))

In [None]:
foodb_molecules_filtered.to_csv("data/foodb_molecules_filtered.csv", sep=';', index=None)

In [None]:
foodb_foods_full = pd.read_csv(
    "data/foodb.csv",
    sep=';',
    index_col=False,
)

In [None]:
foodb_foods_full.head(3)

In [None]:
foodb_foods_full['public_ids'] = foodb_foods_full['public_ids'].apply(string_to_list)
foodb_foods_full['molecules'] = foodb_foods_full['molecules'].apply(string_to_list)
foodb_foods_full['quantities'] = foodb_foods_full['quantities'].apply(string_to_list)

In [None]:
class resetable_range:
    def __init__(self, val):
        self.max = val
        self.val = 0
    def __iter__(self):
        return self
    def __next__(self):
        val = self.val
        if self.val == self.max:
            raise StopIteration
        self.val += 1
        return val
    def reset(self, val):
        self.val = val

In [None]:
for row in range(len(foodb_foods_full)):
  public_to_check = foodb_foods_full.at[row, 'public_ids']
  id_to_check = foodb_foods_full.at[row, 'molecules']
  quantites_to_check = foodb_foods_full.at[row, 'quantities']
  l = resetable_range(len(public_to_check))
  for i in l:
    if i >= len(public_to_check):
      break
    if public_to_check[i] in flavordb: # if the molecule exists in both datasets
      pass
    else:
      del public_to_check[i]
      del id_to_check[i]
      del quantites_to_check[i]
      l.reset(i)
  foodb_foods_full.at[row, 'public_ids'] = public_to_check
  foodb_foods_full.at[row, 'molecules'] = id_to_check
  foodb_foods_full.at[row, 'quantities'] = quantites_to_check

  

In [None]:
foodb_foods_full.to_csv("data/foodb_foods_filtered.csv", sep=';', index=None)

## Removing molecules from FlavorDB that are absent in FooDB
___

In [None]:
flavordb_molecules_full = read_molecules()

In [None]:
print(len(flavordb_molecules_full))
flavordb_molecules_full = flavordb_molecules_full.drop_duplicates(subset=['foodbid'])
print(len(flavordb_molecules_full))

In [None]:
flavordb_molecules_filtered = flavordb_molecules_full[flavordb_molecules_full['foodbid'].isin(foodb)]
len(flavordb_molecules_filtered)

In [None]:
flavordb_molecules_filtered.to_csv("data/flavordb_molecules_filtered.csv", sep=';', index=None)

In [None]:
flavordb_foods_full = read_food_molecules()

In [None]:
flavordb_foods_full.head(3)

In [None]:
for row in range(len(flavordb_foods_full)):
  public_to_check = flavordb_foods_full.at[row, 'foodb_ids']
  id_to_check = flavordb_foods_full.at[row, 'molecules']
  l = resetable_range(len(public_to_check))
  for i in l:
    if i >= len(public_to_check):
      break
    if public_to_check[i] in foodb: # if the molecule exists in both datasets
      pass
    else:
      del public_to_check[i]
      del id_to_check[i]
      l.reset(i)
  flavordb_foods_full.at[row, 'foodb_ids'] = public_to_check
  flavordb_foods_full.at[row, 'molecules'] = id_to_check

In [None]:
flavordb_foods_full.to_csv("data/flavordb_foods_filtered.csv", sep=';', index=None)

## Joinig FlavorDB and FooDB data into single dataset
___

In [None]:
foods_df = flavordb_foods_full.copy()

foodb_foods_filtered = pd.read_csv(
            "data/foodb_foods_filtered.csv", 
            sep=';', 
            index_col=False
            )

foodb_foods_filtered['public_ids'] = foodb_foods_filtered['public_ids'].apply(string_to_list)
foodb_foods_filtered['molecules'] = foodb_foods_filtered['molecules'].apply(string_to_list)
foodb_foods_filtered['quantities'] = foodb_foods_filtered['quantities'].apply(string_to_list)

In [None]:
flavordb_foods_full.head()

In [None]:
foodb_foods_filtered.head()

In [None]:
foodb_foods = foodb_foods_filtered['food'].values.tolist() # 982 entites
flavordb_foods = flavordb_foods_full['food'].values.tolist() # 800 entities

# 517 shared foods between datasets

In [None]:
shared = []
for entity in flavordb_foods:
    if entity in foodb_foods:
        shared.append(entity)

In [None]:
foods_df['quantities'] = pd.Series()

In [None]:
foods_df.head()

In [None]:
# adding foodb data (qunatities) to foods in flavordb

for row in range(len(foods_df)): # copy of flavordb_foods_full
  food_to_check = foods_df.at[row, 'food']
  if food_to_check in shared:
    
    foodb_molecules_ids = foodb_foods_full.loc[foodb_foods_full['food'] == food_to_check, 'public_ids']
    foodb_molecules_quants = foodb_foods_full.loc[foodb_foods_full['food'] == food_to_check, 'quantities']

    ids_quants_dict = dict(zip(foodb_molecules_ids.values[0], foodb_molecules_quants.values[0]))

    entity_molecules_ids = list(set(foodb_molecules_ids.values[0] + foods_df.at[row, 'foodb_ids']))
    entity_molecules_quantities = [ids_quants_dict[x] if x in ids_quants_dict.keys() else 0 for x in entity_molecules_ids ]
    
    foods_df.at[row, 'foodb_ids'] = entity_molecules_ids
    foods_df.at[row, 'quantities'] = entity_molecules_quantities
  
  else:
    foods_df.at[row, 'quantities'] = []
  

In [None]:
foods_df.head()

In [None]:
foodb_foods_full.head(1)

In [None]:
for row in range(len(foodb_foods_full)): # adding foodb rows
    food_to_check = foodb_foods_full.at[row, 'food']
    if food_to_check not in shared:
        foodb_slice = foodb_foods_full.loc[foodb_foods_full['food'] == food_to_check]
        #print(foodb_slice['food_id'].values[0])
        #break
        foods_df.loc[len(foods_df)] = [
            foodb_slice['food_id'].values[0],
            foodb_slice['food'].values[0],
            [],
            "", # scientific name
            "", # category
            foodb_slice['molecules'].values[0],
            "", # group
            foodb_slice['public_ids'].values[0],
            foodb_slice['quantities'].values[0],
        ]


In [None]:
foods_df.head(-2)

In [None]:
foods_df.to_csv("food.csv", sep=';', index=None)

In [None]:
(foods_df.sort_values(by=['food'],ascending = True)).to_csv("data/food.csv", sep=';', index=None)

## Removing sparse molecules and foods
___

In [4]:
from collections import Counter
from more_itertools import collapse

In [5]:
food_df = molecules2vec(read_foods(), read_molecules())

check_lengths(food_df)

Minimum length: 11
Maximum length: 382
Average length: 108.77574171029669


In [11]:
molecules = collapse(food_df['foodb_ids'].values.tolist())
c = Counter(molecules)

In [8]:
molecules_to_count = [k for k, v in c.items() if v > 5]

In [13]:
# remove molecules 
molecules_df = read_molecules()

filtered_df = molecules_df[molecules_df['foodbid'].isin(molecules_to_count)]
    

In [15]:
len(filtered_df)

780

In [16]:
filtered_df.to_csv("data/flavordb_molecules_cut.csv", sep=';', index=False)

In [26]:
def remove_sparse(lst):
    new_lst = []
    for molecule in lst:
        if molecule in molecules_to_count:
            new_lst.append(molecule)
    if len(new_lst) > 10:
        return new_lst
    else:
        return pd.NA

In [29]:
food_df['foodb_ids'] = food_df['foodb_ids'].apply(remove_sparse)

In [30]:
len(food_df)

1227

In [31]:
df = food_df.copy()
df = df.dropna(subset='foodb_ids')
len(df)

1146

In [35]:
# removing molecules that are sparse in the data and foods with less than 5 molecules
def clean_df(df):
    for row in range(len(df)): 
        try:
            # print(df.loc[row, 'foodb_ids'])
            molecules = df.loc[row, 'foodb_ids']
            molecules_quants = df.loc[row, 'quantities']
            if molecules_quants is pd.NA:
                molecules_to_keep = [m for m in molecules if m in molecules_to_count]
                if len(molecules_to_keep) > 5:
                    df.at[row, 'foodb_ids'] = molecules_to_keep
                else:
                    df.at[row, 'foodb_ids'] = pd.NA
            else:
                ids_quants_dict = dict(zip(molecules, molecules_quants))

                ids_to_keep = {k:v for k, v in ids_quants_dict.items() if k in molecules_to_count}

                if len(ids_to_keep.keys()) > 5:
                    entity_molecules_ids = list(ids_to_keep.keys())
                    entity_molecules_quantities = list(ids_to_keep.values())

                    df.loc[row, 'foodb_ids'] = entity_molecules_ids
                    df.loc[row, 'quantities'] = entity_molecules_quantities
                
                else:
                    df.loc[row, 'foodb_ids'] = pd.NA
                    df.loc[row, 'quantities'] = pd.NA
        except:
            pass
    return df

In [36]:
df = clean_df(food_df.copy())

In [38]:
df = df.dropna(subset='foodb_ids')

In [40]:
len(df)

919

In [16]:
check_lengths(df)

Minimum length: 6
Maximum length: 382
Average length: 105.23989898989899


In [33]:
df = df.drop(columns=['quantities'])

In [34]:
df.to_csv("data/food_cut.csv", sep=';', index=None)

## Removing invalid entries
___

In [None]:
# removing entries with no molecules and entries such as "other product", "unclassified product"

In [None]:
def check_if_empty(lst):
    if not lst:
        return np.nan
    else:
        return lst    

In [None]:
food_df = read_foods()
food_df['foodb_ids'] = food_df['foodb_ids'].apply(check_if_empty)
food_df_filtered = food_df.dropna(subset=['foodb_ids'], how='any')

In [None]:
print(len(food_df))
print(len(food_df_filtered))

In [None]:
food_df_filtered = food_df_filtered[~food_df_filtered['food'].str.contains('other|unclassified')]

In [None]:
len(food_df_filtered)

In [None]:
food_df_filtered.to_csv("food.csv", sep=';', index=None)

## Categories cleanup - FooDB
___

In [None]:
foods_df = read_foods()

In [None]:
foods_df = foods_df.drop('category', axis=1)

In [None]:
clean_categories = pd.read_csv("data/cleanfoods.csv", sep=';', index_col=None)

In [None]:
merged_df = pd.merge(foods_df, clean_categories[['food', 'category']], on='food', how='left')

In [None]:
len(foods_df['category'].unique())

In [None]:
foods_df['category'] = foods_df['category'].str.replace('cheese','dairy')


In [None]:
food_df = merged_df.filter(['food_id', 'food', 'synonyms', 'scientific name', 'category','molecules',
       'foodb_ids', 'quantities'])

In [None]:
food_df.head(3)

In [None]:
foods_df.to_csv("data/food.csv", sep=';', index=None)

## Reduced data
___

In [4]:
from collections import Counter
from more_itertools import collapse

In [5]:
flavor_molecules = read_molecules()
food_df = read_foods()
food_df = food_df.dropna(subset='category')

molecules = collapse(food_df['foodb_ids'].values.tolist())
c = Counter(molecules)
molecules_to_include = [k for k, v in c.items() if 100 > v]

In [13]:
def remove_sparse(lst):
    lst = [item for item in lst if item not in molecules_to_include]
    if len(lst) > 5:
        return lst
    else:
        return pd.NA

for index, row in food_df.iterrows():
    food_df.at[index, 'foodb_ids'] = remove_sparse(row['foodb_ids'])

In [15]:
food_df = food_df.dropna(subset=['foodb_ids'])
check_lengths(food_df)

Minimum length: 6
Maximum length: 179
Average length: 94.08984725965858


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['list_length'] = df['foodb_ids'].apply(calculate_list_length)


In [17]:
food_df.to_csv("data/food_reduced.csv", sep=';', index=None)