In [1]:
import pubchempy as pcp
import pandas as pd
import numpy as np
from tqdm import tqdm

import os
# If the current directory is not named 'food-pairing', move up to its parent
# so the relative data paths used below resolve from there.
current_dir = os.getcwd()
if os.path.basename(current_dir) != 'food-pairing':
    os.chdir(os.path.dirname(current_dir))

In [2]:
# Load the semicolon-separated compounds table into a DataFrame.
compounds_df = pd.read_csv(
    'foodb/Compounds_v2.csv',
    sep=';',
    index_col=None,
)

In [3]:
# Report the row count, drop rows whose `description` is missing, renumber the
# index from 0 (the lookup loop further down addresses rows by position via
# `.at[row, ...]`), then report the row count again.
print(len(compounds_df))
compounds_df = compounds_df.dropna(subset=['description']).reset_index(drop=True)
print(len(compounds_df))

15229
15229


In [4]:
# Four `public_id` values used to spot-check the lookups in the next cells.
sample_lst = [
    'FDB012535',
    'FDB012567',
    'FDB000474',
    'FDB002257',
]

In [5]:
# For each sample public_id, take the `description` value of the first
# matching row. Raises IndexError if a public_id has no match.
cids = [
    compounds_df.loc[compounds_df['public_id'] == public_id, 'description'].values[0]
    for public_id in sample_lst
]

In [6]:
# Display the `description` values collected for the sample compounds.
cids

['56-86-0', '56-84-8', '56-87-1', '74-79-3']

In [7]:
# Spot check: take the first CID that pubchempy returns for each sample value.
[pcp.get_cids(identifier)[0] for identifier in cids]

[33032, 5960, 5962, 6322]

In [8]:
def flatten(xss):
    """Return one flat list holding the items of every inner iterable, in order."""
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

def translate_to_cid(x):
    """Translate an identifier string to the first CID pubchempy returns for it.

    Parameters
    ----------
    x : object
        Identifier passed to ``pcp.get_cids``. Any value that is not a ``str``
        (for example a NaN from an empty CSV field) is treated as "no
        identifier" and is never sent to pubchempy.

    Returns
    -------
    int or str
        The first element of ``pcp.get_cids(x)`` when it is an ``int``;
        otherwise the sentinel ``" "`` (a single space). The sentinel is
        returned when ``x`` is not a string, when the lookup raises, when the
        lookup returns nothing, or when the first result is not an ``int``.
    """
    if not isinstance(x, str):
        return " "
    try:
        pubchem_id = pcp.get_cids(x)[0]
    except Exception:
        # Best-effort lookup: any ordinary failure (request error, empty result
        # -> IndexError, ...) maps to the sentinel. `except Exception` rather
        # than a bare `except:` so KeyboardInterrupt / SystemExit still
        # propagate and a long-running batch can be interrupted.
        return " "
    if not isinstance(pubchem_id, int):
        return " "
    return pubchem_id

In [9]:
# Fill a `cid_molecules` column by translating each row's `description` with
# translate_to_cid, writing a checkpoint CSV every 100 rows.
#
# The same `description` can occur on several rows, so successful lookups are
# memoized to avoid repeating the remote call. Failed lookups (sentinel " ")
# are deliberately not cached, so every row still gets its own attempt.
cid_cache = {}
for row in tqdm(range(len(compounds_df))):
    desc = compounds_df.at[row, 'description']
    cid = cid_cache.get(desc)
    if cid is None:
        cid = translate_to_cid(desc)
        if cid != " ":
            cid_cache[desc] = cid
    compounds_df.at[row, 'cid_molecules'] = cid
    # `row` is 0-based, so row + 1 is the number of rows processed so far.
    if (row + 1) % 100 == 0:
        compounds_df.to_csv("foodb/Compounds_v3.csv", sep=';', index=None)

100%|██████████| 15229/15229 [2:09:48<00:00,  1.96it/s]  


In [10]:
# Persist the table with the new `cid_molecules` column.
# NOTE(review): this writes to the same path the notebook loaded
# `compounds_df` from, so the original input file is overwritten.
compounds_df.to_csv(
    "foodb/Compounds_v2.csv",
    sep=';',
    index=None,
)

In [11]:
# Preview the first rows of the compounds table, including `cid_molecules`.
compounds_df.head()

Unnamed: 0,id,public_id,name,description,cid_molecules
0,4,FDB000004,Cyanidin 3-(6''-acetyl-galactoside),350602-26-5,122396785.0
1,13,FDB000013,Cyanidin 3-(6''-succinyl-glucoside),216692-08-9,
2,25,FDB000025,Peonidin 3-(6''-acetyl-galactoside),75-07-0,177.0
3,27,FDB000027,Malvidin 3-(6''-acetyl-galactoside),75-07-0,177.0
4,35,FDB000035,Peonidin 3-(6''-p-coumaroyl-glucoside),147-85-3,145742.0


In [4]:
from utils.data_loading import read_food_molecules

# Build the foods table with the project helper.
# NOTE(review): 'foodb' is presumably the data directory the helper reads
# from — confirm in utils.data_loading.
foods_db = read_food_molecules('foodb')

In [13]:
# Re-declares the same four `public_id` values as the earlier `sample_lst` cell.
sample_lst = ['FDB012535', 'FDB012567', 'FDB000474', 'FDB002257']

In [22]:
# One Series per sample public_id, holding the `cid_molecules` values of the
# matching rows (the Series is empty when nothing matches).
ids = [
    compounds_df.loc[compounds_df['public_id'] == public_id, 'cid_molecules']
    for public_id in sample_lst
]

In [36]:
def translate_list_to_cid(lst):
    """Map each public_id in ``lst`` to its ``cid_molecules`` value.

    Looks every id up in the module-level ``compounds_df``. An id contributes
    its ``cid_molecules`` value only when exactly one row matches; when there
    is no match or more than one match, ``0`` is used instead.

    Returns a list with one entry per element of ``lst``, in the same order.
    """
    translated = []
    for public_id in lst:
        matches = compounds_df.loc[compounds_df['public_id'] == public_id, 'cid_molecules']
        if len(matches) == 1:
            translated.append(matches.values[0])
        else:
            translated.append(0)
    return translated

In [37]:
# Add a `cid_molecules` column: for each food, the translated list that
# corresponds element-by-element to its `public_ids` list.
public_id_lists = foods_db['public_ids']
foods_db['cid_molecules'] = public_id_lists.apply(translate_list_to_cid)

In [38]:
# Preview the first rows of the foods table, including `cid_molecules`.
foods_db.head()

Unnamed: 0,food_id,food,public_ids,molecules,quantities,cid_molecules
0,280,abalone,"[FDB012535, FDB012567, FDB000474, FDB002257, F...","[12538, 12570, 474, 2257, 1946, 484, 556, 1274...","[2572.5, 1756.0, 1355.5, 1329.5, 1295.0, 1141....","[33032, 5960, 5962, 6322, 6106, 750, 5950, 595..."
1,281,abiyuch,"[FDB003715, FDB012530, FDB012528, FDB003521, F...","[3716, 12533, 12531, 3522, 1193, 1224, 3521, 1...","[8550.0, 2252.25, 1901.9, 304.0, 100.1, 54.1, ...","[5988, 107526, 439709, 5462222, 6255, 54670067..."
2,282,acerola,"[FDB003715, FDB001224, FDB003521, FDB031004, F...","[3716, 1224, 3522, 31167, 2602, 3514, 3521, 12...","[4500.0, 1651.733333, 129.666667, 18.0, 15.71,...","[5988, 54670067, 5462222, 888, 68247, 5460341,..."
3,283,acorn,"[FDB012535, FDB012567, FDB003521, FDB001946, F...","[12538, 12570, 3522, 1946, 2257, 474, 556, 570...","[1162.333333, 748.666667, 624.75, 576.333333, ...","[33032, 5960, 5462222, 6106, 6322, 5962, 5950,..."
4,1004,acorn squash,"[FDB003521, FDB014613, FDB019865, FDB003520, F...","[3522, 14616, 19872, 3521, 3514, 31167, 1224, ...","[347.0, 220.0, 38.0, 36.0, 33.0, 32.0, 11.0, 3...","[5462222, 5280489, 5280899, 5462309, 5460341, ..."


In [39]:
# Save the foods table, now carrying `cid_molecules`, as semicolon-separated CSV.
foods_db.to_csv(
    "data/foodb_v2.csv",
    sep=';',
    index=None,
)