**Author :  Ong Cheng Kei TP055620** <br>
**Description :**
<br>This file contains code that generates text embeddings used to compute the similarity between ingredient names in two databases: Nutrition5k and USDA-FNDDS.<br>This module exposes functions that can be called to get the most similar ingredient in either database.

In [1]:
import os
from pathlib import Path
from pprint import pprint
from types import SimpleNamespace

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from IPython.display import display

In [2]:
# Setting root directory to FoodNet: the closest ancestor of the working
# directory that is named "food_v5".
dir_parents = Path.cwd().parents
# Path.parents is ordered nearest-first. next() with a default never indexes past
# the end of the sequence, so a missing root reaches the explicit error below
# instead of failing with an IndexError.
root_dir = next((parent for parent in dir_parents if parent.name == "food_v5"), None)
if root_dir is None:
    # Raised explicitly (rather than via `assert`) so the check also runs under -O.
    raise AssertionError(
        "Unable to find FoodNet root directory. Please change the root directory or set the working directory under the FoodNet root directory."
    )

### Google Universal Sentence Encoder for word embeddings

In [3]:
# The encoder is loaded once, when this cell runs, and shared by every call to embed().
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)


def embed(input):
    """Run the loaded TF Hub module on ``input`` and return its output.

    Args:
        input: The value handed to the module unchanged; the callers in this
            notebook pass a list of strings.

    Returns:
        Whatever ``model`` returns for ``input``; for the list-of-strings calls in
        this notebook that is a float32 tensor with one row per string.
    """
    embeddings = model(input)
    return embeddings


if __name__ == "__main__":
    # Only announce the load when the notebook itself is being run.
    print("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


### Create embeddings for all items in the nutrition databases

In [4]:
if __name__ == "__main__":
    # Smoke test, skipped when the notebook is imported as a module: embed two
    # sample phrases and show the resulting tensor.
    display(embed(["poultry and chicken", "i love milk"]))

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.00842834,  0.00642012,  0.05519094, ...,  0.00923468,
         0.04580159, -0.04145262],
       [ 0.04911732, -0.00836502,  0.02530871, ..., -0.03663807,
         0.02890901, -0.07283901]], dtype=float32)>

In [5]:
# Load the FNDDS category names, one per line of the cleaned text file.
# resolve(strict=True) fails immediately if the file does not exist.
fndds_dir = root_dir / "Food Datasets" / "USDA-FNDDS"
with open((fndds_dir / "cleaned_food_category.txt").resolve(strict=True), "r") as file:
    fndds_category = file.read().split("\n")
# A trailing newline leaves one empty entry at the end. Remove it only when it is
# actually there, so a file without a final newline keeps its last category.
if fndds_category[-1] == "":
    fndds_category.pop()

In [6]:
if __name__ == "__main__":
    # Print the loaded category names; skipped when the notebook is imported.
    pprint(fndds_category)

['broccoli',
 'soft drinks',
 'cottage cheese or ricotta cheese',
 'pineapple',
 'reduced fat flavored milk',
 'dips and gravies and other sauces',
 'candy not containing chocolate',
 'olives and pickles and pickled vegetables',
 'macaroni and cheese',
 'corn',
 'salad dressings and vegetable oils',
 'fried vegetables',
 'pasta and noodles and cooked grains',
 'eggs and omelets',
 'bagels and english muffins',
 'greek yogurt',
 'seafood mixed dishes',
 'sugars and honey',
 'mustard and other condiments',
 'fried rice and lo mein or chow mein',
 'bananas',
 'beans and peas and legumes',
 'jams and syrups and toppings',
 'mayonnaise',
 'vegetable juice',
 'egg sandwiches or breakfast sandwiches',
 'citrus fruits',
 'potato chips',
 'nuts and seeds',
 'bacon',
 'frankfurters',
 'pears',
 'cream cheese and sour cream and whipped cream',
 'stir-fry and soy-based sauce mixtures',
 'apple juice',
 'whole milk',
 'doughnuts and sweet rolls and pastries',
 'cookies and brownies',
 'pasta mixed 

In [7]:
# Load the cleaned FNDDS nutrient table. The file is tab-separated despite its
# .csv extension, and resolve(strict=True) fails early if it is missing.
df_fndds_nutrient_values = pd.read_csv(
    (fndds_dir / "cleaned_fndds_nutrient_values.csv").resolve(strict=True), sep="\t"
)

In [8]:
if __name__ == "__main__":
    # Preview the FNDDS nutrient table; skipped when the notebook is imported.
    display(df_fndds_nutrient_values)

Unnamed: 0,Main food description,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),Total Fat (g)
0,"milk, not further specified",reduced fat milk,0.51,0.0334,0.0487,0.0199
1,"milk, whole",whole milk,0.60,0.0328,0.0467,0.0320
2,"milk, low sodium, whole",whole milk,0.61,0.0310,0.0446,0.0346
3,"milk, calcium fortified, whole",whole milk,0.60,0.0328,0.0467,0.0320
4,"milk, calcium fortified, low fat",lowfat milk,0.43,0.0338,0.0519,0.0095
...,...,...,...,...,...,...
6085,gin,liquor and cocktails,2.31,0.0000,0.0000,0.0000
6086,rum,liquor and cocktails,2.31,0.0000,0.0000,0.0000
6087,rum cooler,liquor and cocktails,0.68,0.0000,0.1007,0.0000
6088,vodka,liquor and cocktails,2.31,0.0000,0.0000,0.0000


In [9]:
# Location of the Nutrition5k ingredient metadata CSV. The original hard-coded
# path stays as the default for backward compatibility; set the
# NUTRITION5K_INGREDIENTS_CSV environment variable to point at the file on
# machines where the dataset lives elsewhere. An unset or empty variable falls
# back to the default.
nutrition5k_dir = Path(
    os.environ.get("NUTRITION5K_INGREDIENTS_CSV")
    or "D:/nutrition5k/metadata/ingredients_metadata.csv"
)
# resolve(strict=True) fails early if the file does not exist.
df_nutrition5k_nutrient_values = pd.read_csv(nutrition5k_dir.resolve(strict=True))

In [10]:
if __name__ == "__main__":
    # Preview the first rows of the Nutrition5k table; skipped when imported.
    display(df_nutrition5k_nutrient_values.head())

Unnamed: 0,ingr,id,cal/g,fat(g),carb(g),protein(g)
0,cottage cheese,1,0.98,0.043,0.034,0.11
1,strawberries,2,0.33,0.003,0.08,0.007
2,garden salad,3,0.646,0.034,0.032,0.061
3,bacon,4,5.41,0.42,0.014,0.37
4,potatoes,5,0.77,0.001,0.17,0.02


In [11]:
# Embed every FNDDS category name in one call. get_most_similar_from_fndds maps a
# row position of this result back to a name through fndds_category.
fndds_category_embeddings = embed(fndds_category)

In [12]:
# Per-category lookup tables, both keyed by category name:
#   fndds_description[category]            -> food descriptions in that category
#   fndds_description_embeddings[category] -> embeddings of those descriptions
fndds_description = {}
fndds_description_embeddings = {}
for category in fndds_category:
    # Descriptions are taken in table row order, so position i in the list and
    # row i of the embedding tensor refer to the same food.
    all_food_in_category = df_fndds_nutrient_values.loc[
        df_fndds_nutrient_values["WWEIA Category description"] == category,
        "Main food description",
    ].tolist()
    fndds_description[category] = all_food_in_category
    fndds_description_embeddings[category] = embed(all_food_in_category)

In [13]:
# Ingredient names in table row order, and their embeddings computed in one call.
# get_ingredient_nutrient_from_nutrition5k indexes this list by position.
nutrition5k_ingredient = df_nutrition5k_nutrient_values["ingr"].tolist()
nutrition5k_ingredient_embeddings = embed(nutrition5k_ingredient)

In [14]:
if __name__ == "__main__":
    # Print the Nutrition5k embedding tensor; skipped when the notebook is imported.
    print("Below is an overview of Nutrition5k ingredient embeddings\n")
    pprint(nutrition5k_ingredient_embeddings)

Below is an overview of Nutrition5k ingredient embeddings

<tf.Tensor: shape=(555, 512), dtype=float32, numpy=
array([[-0.04213649,  0.02428613,  0.03386062, ...,  0.02607823,
         0.02512661, -0.05266538],
       [ 0.0114829 ,  0.00819828,  0.03557925, ...,  0.00613289,
         0.03251709, -0.06750827],
       [-0.01347016, -0.01774764,  0.01833391, ..., -0.04396158,
         0.07233529, -0.06495779],
       ...,
       [ 0.04466048,  0.01656268,  0.02949798, ...,  0.00699599,
        -0.03023824, -0.07513157],
       [ 0.03270739, -0.01751684,  0.0051727 , ..., -0.0397653 ,
        -0.01495523, -0.07337393],
       [-0.05237374, -0.0495625 ,  0.03297006, ..., -0.04588482,
        -0.04796786, -0.0534832 ]], dtype=float32)>


In [15]:
if __name__ == "__main__":
    # Print the FNDDS category embedding tensor; skipped when the notebook is imported.
    print("Below is an overview of FNDDS category embeddings\n")
    pprint(fndds_category_embeddings)

Below is an overview of FNDDS category embeddings

<tf.Tensor: shape=(134, 512), dtype=float32, numpy=
array([[-0.02048537, -0.03889232,  0.07314803, ..., -0.00016655,
         0.0091969 , -0.06343628],
       [-0.02819614,  0.00764418, -0.03117243, ...,  0.04469761,
        -0.01590378, -0.04597705],
       [-0.02105323, -0.01606695,  0.0235441 , ...,  0.03196798,
         0.02654512, -0.07217751],
       ...,
       [ 0.01148288,  0.00819833,  0.03557925, ...,  0.00613291,
         0.03251706, -0.06750827],
       [ 0.01639188,  0.02775561, -0.01526352, ..., -0.00422445,
        -0.03507275, -0.0226869 ],
       [ 0.00719603, -0.0188462 ,  0.04508138, ...,  0.00653856,
         0.04128977, -0.0376791 ]], dtype=float32)>


In [16]:
def get_cosine_similarity(matrix_embedding, target_vector_embedding):
    """Return the cosine similarity between each row of a matrix and one vector.

    Args:
        matrix_embedding: Tensor of embeddings, one embedding per row.
        target_vector_embedding: Embedding vector whose length matches the row
            width of ``matrix_embedding``.

    Returns:
        Tensor with one similarity score per row of ``matrix_embedding``.
    """
    # An inner product is only a cosine similarity for unit-length vectors, so
    # both operands are L2-normalised along the embedding axis first.
    unit_rows = tf.nn.l2_normalize(matrix_embedding, axis=-1)
    unit_target = tf.nn.l2_normalize(target_vector_embedding, axis=-1)
    return tf.linalg.matvec(unit_rows, unit_target)


def get_most_similar_from_nutrition5k(target_vector_embedding):
    """Find the Nutrition5k ingredient whose embedding best matches the target.

    Args:
        target_vector_embedding: Embedding of a single query text.

    Returns:
        Tuple ``(value, index)``: the best similarity score and the row position
        of the winning ingredient in ``nutrition5k_ingredient_embeddings``, both
        as NumPy scalars.
    """
    scores = get_cosine_similarity(
        nutrition5k_ingredient_embeddings, target_vector_embedding
    )
    best = tf.math.top_k(scores, k=1)
    # top_k yields one-element tensors; unwrap each into a plain NumPy scalar.
    best_score = tf.reshape(best.values, [1])[0].numpy()
    best_position = tf.reshape(best.indices, [1])[0].numpy()
    return (best_score, best_position)


def get_most_similar_from_fndds(target_vector_embedding):
    """Find the FNDDS food that best matches the target, searching in two stages.

    The best-matching category is chosen first; the best-matching food
    description is then searched for inside that category only.

    Args:
        target_vector_embedding: Embedding of a single query text.

    Returns:
        Tuple ``(value, index, category)``: the best similarity score, the
        position of the winning food within ``fndds_description_embeddings[category]``
        (both NumPy scalars), and the name of the chosen category.
    """
    # Stage 1: pick the category whose name is closest to the target.
    category_scores = get_cosine_similarity(
        fndds_category_embeddings, target_vector_embedding
    )
    best_category = tf.math.top_k(category_scores, k=1)
    category_position = tf.reshape(best_category.indices, [1])[0].numpy()
    category = get_category_from_fndds(category_position)

    # Stage 2: pick the closest food description inside that category.
    food_scores = get_cosine_similarity(
        fndds_description_embeddings[category], target_vector_embedding
    )
    best_food = tf.math.top_k(food_scores, k=1)
    best_score = tf.reshape(best_food.values, [1])[0].numpy()
    best_position = tf.reshape(best_food.indices, [1])[0].numpy()
    return (best_score, best_position, category)


def get_ingredient_nutrient_from_nutrition5k(index):
    """Return the Nutrition5k nutrient row for the ingredient at ``index``.

    Args:
        index: Position of the ingredient in ``nutrition5k_ingredient``, for
            example the index returned by ``get_most_similar_from_nutrition5k``.

    Returns:
        pandas.Series holding that row of ``df_nutrition5k_nutrient_values``.

    Raises:
        IndexError: If ``index`` is outside the table.
    """
    # nutrition5k_ingredient is the "ingr" column in row order, so the position
    # identifies the row directly. Selecting by position always yields exactly
    # one row; matching on the name instead would return several rows (and
    # squeeze() a DataFrame, not a Series) whenever a name is duplicated.
    return df_nutrition5k_nutrient_values.iloc[index]


def get_category_from_fndds(index):
    """Return the FNDDS category name stored at position ``index``.

    Args:
        index: Position in ``fndds_category``.

    Returns:
        The category name at that position.
    """
    category_name = fndds_category[index]
    return category_name


def get_ingredient_nutrient_from_fndds(category, index):
    """Return the FNDDS nutrient row for one food inside a category.

    Args:
        category: Category name, a key of ``fndds_description``.
        index: Position of the food within ``fndds_description[category]``, for
            example the index returned by ``get_most_similar_from_fndds``.

    Returns:
        pandas.Series holding that food's row of ``df_fndds_nutrient_values``.

    Raises:
        KeyError: If ``category`` is not a known category.
        IndexError: If ``index`` is outside the category.
    """
    if category not in fndds_description:
        raise KeyError(category)
    # fndds_description[category] was built from exactly this filter, in row
    # order, so ``index`` is a position within these rows. Selecting by position
    # always yields one row; matching on the description alone would return
    # several rows whenever the same description occurs more than once.
    rows_in_category = df_fndds_nutrient_values.loc[
        df_fndds_nutrient_values["WWEIA Category description"] == category, :
    ]
    return rows_in_category.iloc[index]

In [17]:
# Map each exposed function's name to the function itself, in listing order.
exported = dict(
    (fn.__name__, fn)
    for fn in (
        get_ingredient_nutrient_from_fndds,
        get_ingredient_nutrient_from_nutrition5k,
        get_most_similar_from_fndds,
        get_most_similar_from_nutrition5k,
        embed,
    )
)

In [18]:
# Wrap the name -> function mapping so callers can use attribute access
# (for example ``exported.embed``). The isinstance guard keeps the cell safe to
# re-run: a SimpleNamespace is not a mapping, so unpacking it with ** a second
# time would raise TypeError.
if not isinstance(exported, SimpleNamespace):
    exported = SimpleNamespace(**exported)

In [19]:
if __name__ != "__main__":
    # Confirm the load when the notebook is imported rather than run directly.
    print("Module ingredient_embeddings_similarity.ipynb is loaded")