In [1]:
import pandas as pd 
import re 
import inflect

In [2]:
df = pd.read_csv("data/lebanese-dishes.csv")
print(f"df.shape {df.shape}")
df.head()

df.shape (94, 3)


Unnamed: 0,Source,Dish,Ingredients
0,20210422-LU-RePa-Report.pdf,Baba ghanouj,"Aubergines, garlic cloves, lemon juice, tahini..."
1,20210422-LU-RePa-Report.pdf,Batata mehchi,"Lamb ground, onions, butter, salt, pepper, pin..."
2,20210422-LU-RePa-Report.pdf,Borgul bi banadoura,"Coarse bulgur wheat, small pearl onions, chick..."
3,20210422-LU-RePa-Report.pdf,Chichbarak,"Chichbarak Dough: multi-purpose flour, salt, w..."
4,20210422-LU-RePa-Report.pdf,Falafel,Dry peeled fava beans dried chickpeas (aka Gar...


In [3]:
df_dic = pd.read_csv("data/ingredients_2.csv")
print(f"df_dic.shape {df_dic.shape}")
df_dic.head()

df_dic.shape (436, 3)


Unnamed: 0,Old Ingredients,Main Ingredient,Nb Words
0,allspice,,1
1,almond,,1
2,all-purpose flour,flour,2
3,amaretti,,1
4,anchovy,,1


In [5]:
ingredients = set([ingredient.strip().lower() for sublist in df["Ingredients"].str.split(',') for ingredient in sublist])
ingredients_dic = set([ingredient.strip().lower() for ingredient in df_dic["Old Ingredients"] ])

num_ingredients_per_recipe = df['Ingredients'].str.split(',').apply(len)

print(f"Data is collected from {df['Source'].nunique()} sources")
print(f"There is {len(ingredients)} ingredients")
print(f"Nb Ingredients per Recipes: Min={num_ingredients_per_recipe.min()}, Max={num_ingredients_per_recipe.max()}")

print(f"Dict {len(ingredients_dic)}")

Data is collected from 2 sources
There is 344 ingredients
Nb Ingredients per Recipes: Min=4, Max=18
Dict 427


In [6]:
for ing in ingredients: 
    if len(ing.split()) != 2 or "oil" not in ing: 
        continue 
    for dict in ingredients_dic: 
        if dict in ing: 
            print(f"{ing} -> {dict}")


sunflower oil -> sunflower oil
sunflower oil -> oil
vegetable oil -> vegetable oil
vegetable oil -> oil
corn oil -> oil
corn oil -> corn
vegetable oils -> vegetable oil
vegetable oils -> oil
olive oil -> oil
olive oil -> olive
olive oil -> olive oil


we will conduct a series of actions to clean the text and extract the ingredients we need

In [5]:
# def clean_ingredient(ingredient):
#     ingredient = re.sub(r'[^a-zA-Z\s]', '', ingredient) # keep only letters and spaces
#     return ingredient.strip().lower()

# # Stopwords to remove
# remove_words = {"a", "pinch", "of", "or", "and", "ingredients", "dissolved", "with", "cream"}

# 
# {p.singular_noun(ingredient) or ingredient for ingredient in ingredients if len(ingredient.split(" ")) == 1}

units = {"cup", "cups", "tbsp", "tsp", "teaspoon", "tablespoon", "grams", "g", "kg", "oz", "ml", "liter"}
details = {"paste", "dough"}
main_ingredients = {"chicken", "oil", "cheese"} 

p = inflect.engine()

def keep_letters_and_spaces(ingredient): 
    return re.sub(r'[^a-zA-Z\s]', '', ingredient)
    
def to_singular(ingredient): 
    return p.singular_noun(ingredient) or ingredient

def unwantted_single_word(ingredient): 
    lst = ["minced", "melted", "diced", "grilled", "herb", # verbs 
           "water", "bread", "pepper", "salt", "sugar", "vegetable"]    
    return "" if len(ingredient.split()) == 1 and ingredient in lst else ingredient 

def remove_quantities(ingredient):
    words = ingredient.split()
    words = [word for word in words if not re.match(r"^\d+/?\d*$", word) and word not in units and word not in details]
    return " ".join(words)

def map_ingredients(ingredient): 
    for main_ingredient in main_ingredients: 
        if main_ingredient in ingredient: 
            return main_ingredient
    return ingredient

threshold_test = 5

for ing in ingredients: 
    if len(ing.split()) == threshold_test: 
        print(ing)
    # ing_0 = ing
    # ing = keep_letters_and_spaces(ing)
    # ing = to_singular(ing)
    # ing = unwantted_single_word(ing)
    # ing = remove_quantities(ing)
    # ing = map_ingredients(ing)
    
    # if ing != ing_0: 
    #     print(f"{ing_0} -> {ing}")
    # else: 
    #     print(f"{ing}")



sugar. sugar syrup ingredients: sugar
sugar; meat stuffing: ground beef
freshly peeled crushed garlic cloves
blossom water. filling ingredients: water
if spicy falafel is desired)
kallaj sheets. kashta ingredients: milk
rose water. hulled unsalted pistachio
rose water for filling: walnuts
unsalted butter at room temperature
piquant post spicy mint blend
blanched almonds plus whole almonds
salt and pepper to taste
finely ground beef (or lamb
fried nuts almond and cashew
few drops of pomegranate molasses


In [7]:
import re
from collections import defaultdict

# Define non-essential ingredients (stopword list)
NON_ESSENTIAL_INGREDIENTS = {
    "water", "salt", "black pepper", "sugar", "oil", "olive oil",
    "butter", "flour", "vinegar", "spices", "herbs", "yeast",
    "baking powder", "baking soda"
}

# Define synonym mappings
SYNONYM_MAPPING = {
    "olive oil": "oil",
    "vegetable oil": "oil",
    "canola oil": "oil",
    "tomatoes": "tomato",
    "chicken breast": "chicken",
    "chicken thigh": "chicken",
    "chicken wings": "chicken",
    "beef steak": "beef",
    "red onions": "onion",
    "white onions": "onion",
    "garlic cloves": "garlic",
    "fresh basil": "basil",
    "dried basil": "basil",
    "coriander leaves": "coriander",
    "coriander powder": "coriander"
}

# Function to clean and normalize ingredients
def normalize_ingredients(ingredient_list):
    cleaned_ingredients = set()  # Use a set to avoid duplicates
    
    for ingredient in ingredient_list:
        # Convert to lowercase
        ingredient = ingredient.lower().strip()

        # Remove extra descriptive words (like "chopped", "fresh", "dried")
        ingredient = re.sub(r"\b(fresh|dried|chopped|sliced|minced|crushed|powder|ground|whole|cloves|leaves|pieces)\b", "", ingredient).strip()

        # Map to synonym if exists
        if ingredient in SYNONYM_MAPPING:
            ingredient = SYNONYM_MAPPING[ingredient]

        # Remove non-essential ingredients
        if ingredient not in NON_ESSENTIAL_INGREDIENTS:
            cleaned_ingredients.add(ingredient)
    
    return list(cleaned_ingredients)

# Example usage
# raw_ingredients = ["Fresh Basil", "Garlic Cloves", "Olive Oil", "Chicken Breast", "Salt", "Black Pepper", "Red Onions"]
raw_ingredients = ingredients.copy()
cleaned_ingredients = normalize_ingredients(raw_ingredients)
print(cleaned_ingredients)


['roasted pistachio mixture', '', 'green coriander', 'garlic', 'tamarind sauce', 'free-range chicken', 'cheese or nuts', 'all spice', 'shoulder of lamb', 'bread', 'sugar; meat stuffing:  beef', 'lettuces or romaine lettuce', 'eggplant', 'carrot', 'paprika', 'cold water', 'coarse bulgur wheat', 'green onions', 'liquid whipping cream', 'salt and black pepper', 'kallaj sheets. kashta ingredients: milk', 'cream', 'hulled unsalted pistachios', 'caraway butter', 'red lemon blossom', 'egg', 'lamb cubed', 'sea bass', 'shredded phyllo dough', 'basmati rice', 'honey', 'fragrant sugar syrup. walnuts', 'pita bread', 'parsley', 'thin dough', 'dry mint', 'green cardamoms', 'onions (finely )', 'krefeh packet', 'chicken or rabbit', 'dry peeled fava beans  chickpeas (aka garbanzo beans)', 'potato', 'red chili pepper (optional', 'fine powdered sugar', 'rose syrup. sugar', 'unsalted diced butter', 'dry instant yeast', 'radishes', 'blanched almonds', 'liquid cream', 'and cinnamon', 'chickpeas', 'peppers)'