In [1]:
import pandas as pd 
import re 
import inflect

In [2]:
df_lebanese_dishes = pd.read_csv("data/lebanese-dishes.csv")
print(f"df_lebanese_dishes.shape {df_lebanese_dishes.shape}")
df_lebanese_dishes.head()

df_lebanese_dishes.shape (94, 3)


Unnamed: 0,Source,Dish,Ingredients
0,20210422-LU-RePa-Report.pdf,Baba ghanouj,"Aubergines, garlic cloves, lemon juice, tahini..."
1,20210422-LU-RePa-Report.pdf,Batata mehchi,"Lamb ground, onions, butter, salt, pepper, pin..."
2,20210422-LU-RePa-Report.pdf,Borgul bi banadoura,"Coarse bulgur wheat, small pearl onions, chick..."
3,20210422-LU-RePa-Report.pdf,Chichbarak,"Chichbarak Dough: multi-purpose flour, salt, w..."
4,20210422-LU-RePa-Report.pdf,Falafel,Dry peeled fava beans dried chickpeas (aka Gar...


In [3]:
ingredients = set([ingredient.strip().lower() for sublist in df_lebanese_dishes["Ingredients"].str.split(',') for ingredient in sublist])

num_ingredients_per_recipe = df_lebanese_dishes['Ingredients'].str.split(',').apply(len)

print(f"Data is collected from {df_lebanese_dishes['Source'].nunique()} sources")
print(f"There is {len(ingredients)} ingredients")
print(f"Nb Ingredients per Recipes: Min={num_ingredients_per_recipe.min()}, Max={num_ingredients_per_recipe.max()}")

Data is collected from 2 sources
There is 344 ingredients
Nb Ingredients per Recipes: Min=4, Max=18


we will conduct a series of actions to clean the text and extract the ingredients we need

In [4]:
import re

NON_ESSENTIAL_INGREDIENTS = {
    "water", "salt", "black pepper", "sugar", "oil", "olive oil",
    "butter", "flour", "vinegar", "spices", "herbs", "yeast",
    "baking powder", "baking soda"
}

SYNONYM_MAPPING = {
    "olive oil": "oil",
    "vegetable oil": "oil",
    "canola oil": "oil",
    "tomatoes": "tomato",
    "chicken breast": "chicken",
    "chicken thigh": "chicken",
    "chicken wings": "chicken",
    "beef steak": "beef",
    "red onions": "onion",
    "white onions": "onion",
    "garlic cloves": "garlic",
    "fresh basil": "basil",
    "dried basil": "basil",
    "coriander leaves": "coriander",
    "coriander powder": "coriander"
}

def normalize_ingredients(ingredients):
    cleaned_ingredients = ""  # Use a set to avoid duplicates
    
    for ingredient in ingredients.split(", "):
        # Convert to lowercase
        ingredient = ingredient.lower().strip()

        # Remove extra descriptive words (like "chopped", "fresh", "dried")
        ingredient = re.sub(r"\b(fresh|dried|chopped|sliced|minced|crushed|powder|ground|whole|cloves|leaves|pieces)\b", "", ingredient).strip()

        # Map to synonym if exists
        if ingredient in SYNONYM_MAPPING:
            ingredient = SYNONYM_MAPPING[ingredient]

        # Remove non-essential ingredients
        if ingredient not in NON_ESSENTIAL_INGREDIENTS and ingredient is not None and ingredient != "":
            if cleaned_ingredients is not None and cleaned_ingredients != "":
                cleaned_ingredients += ", "
            cleaned_ingredients += ingredient
            
    return cleaned_ingredients

In [5]:
normalized_ingredients = []
for idx, row in df_lebanese_dishes.iterrows(): 
  ingredients = normalize_ingredients(row['Ingredients'])
  print(idx, row['Ingredients'])
  print(idx, ingredients)
  normalized_ingredients.append(ingredients) 

0 Aubergines, garlic cloves, lemon juice, tahini, pomegranate seeds, salt
0 aubergines, garlic, lemon juice, tahini, pomegranate seeds
1 Lamb ground, onions, butter, salt, pepper, pine nuts, potato, tomato juice
1 lamb, onions, pepper, pine nuts, potato, tomato juice
2 Coarse bulgur wheat, small pearl onions, chickpeas, cinnamon stick, caraway seed, vegetable oil, mild white pepper, salt
2 coarse bulgur wheat, small pearl onions, chickpeas, cinnamon stick, caraway seed, mild white pepper
3 Chichbarak Dough: multi-purpose flour, salt, water warm to form a paste, yeast, sugar; Meat Stuffing: ground beef, salt to taste, black pepper to taste, cinnamon powder to taste, onion finely chopped, pine nuts, olive oil, bushel of parsley chopped; Chich Barak Stew: yogurt, water, starch, garlic cloves crushed (optional), rice, dried mint, salt to taste
3 chichbarak dough: multi-purpose flour, water warm to form a paste, sugar; meat stuffing:  beef, salt to taste, black pepper to taste, cinnamon  to

In [6]:
df_lebanese_dishes["Normalized Ingredients"] = normalized_ingredients
df_lebanese_dishes.to_csv('data/normalized_ingredients_algorithms.csv', index=False)