# Scrapping Cooking Recipe Website : Giallozafferano (https://www.giallozafferano.it/)

The purpose of this code is to retrieve information from the recipes of a number of dishes for our application in order to integrate allergies.

There are 4 parts:
- The first is to retrieve the name and link of all the recipes available on this site.
- Then we take the links and extract the ingredients and quantities.
- Then we propose two alternatives for translating our data
- Finally, we create an index to detect whether the recipe contains allergens or not.


In [1]:
# Import packages
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import time
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import re
from dotenv import load_dotenv
import os
import pandas as pd
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Connect the driver to te website
options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)

url = "https://www.giallozafferano.it/"
driver.get(url)
time.sleep(3)

# for cookies
try:
    manage_button = driver.find_element(By.CSS_SELECTOR, ".amecp_button-customize")
    manage_button.click()

except Exception as e:
    print(f"Errore: {e}")

time.sleep(3)

try:
    manage_button = driver.find_element(By.CSS_SELECTOR, "#iubFooterBtn")
    manage_button.click()

except Exception as e:
    print(f"Errore: {e}")

# Use the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.__version__)
print(device)

2.5.1+cpu
cpu


### Usefull functions :

In [None]:
def get_ingredients(url):
    """
    Scrapes the website Giallozafferano to get recipes and ingredient.

    Args:
        url (str): URL of each recipes to extract ingredients from.

    Returns:
        list[str] | None: A list of ingredients in the format "name (quantity)", or None if an error occurs.
    """
    try:
        response = requests.get(url, verify=False)
        if response.status_code != 200:
            print(f"Error accessing {url}: Status Code {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        ingredients = soup.find_all('dd', class_='gz-ingredient')
        ingredients_clean = []
        for ingredient in ingredients:
            name_ingredient = ingredient.find('a').get_text(strip=True)
            quantity_ingredient = ingredient.find('span').get_text(strip=True)
            name_ingredient = re.sub(r'\s+', ' ', name_ingredient)
            quantity_ingredient = re.sub(r'\(.*?\)', '', quantity_ingredient)
            quantity_ingredient = re.sub(r'\s+', ' ', quantity_ingredient)
            ingredients_clean.append(name_ingredient + " (" + quantity_ingredient.strip() + ")")

        return ingredients_clean

    except Exception as e:
        print(f"Error when extracting from {url}: {e}")
        return None

def translator(language_from, language_to, text):
    """
    Translates text from one language to another using specific Hugging Face translation models.

    Args:
        language_from (str): Source language code (e.g., "fr" for French, "it" for Italian).
        language_to (str): Target language code (e.g., "en" for English, "fr" for French).
        text (str): The text to translate.

    Returns:
        str: The translated text.
    """
    if language_from == "fr" and language_to == "en":
        api_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
        translation = api_translator(text)[0]['translation_text']
    if language_from == "it" and language_to == "fr":
        api_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-it-fr")
        translation = api_translator(text)[0]['translation_text']
    if language_from == "it" and language_to == "en":
        api_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-it-en")
        translation = api_translator(text)[0]['translation_text']
    return translation

# I) Extract recipes names and link

In [None]:
# locate and click the menu button on the web page
menu_button = driver.find_element(By.CSS_SELECTOR, "#gz-header-hamburger > span:nth-child(1) > svg:nth-child(1)")
menu_button.click()

# Wait until the submenu is present and then find it
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "gz-header-submenu")))
submenu = driver.find_element(By.CLASS_NAME, "gz-header-submenu")

# Extract links from the submenu (categories)
category_links = submenu.find_elements(By.TAG_NAME, "a")

# List and dict to store info
categories = []
categories_urls = []
all_recipes = {}

# Loop through each category link to extract name and ulr
for link in category_links:
    categories_name = link.get_attribute("title").strip()# Extract category name
    categories.append(categories_name)
    categories_url = link.get_attribute("href").strip()# Extract category url
    categories_urls.append(categories_url)

# Loop through each category to scrape recipe data
for categories_name, categories_url in zip(categories, categories_urls):
    driver.get(categories_url) # Navigate to the category url

    # Wait for the recipe cards to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.gz-card"))
    )

    while True:
        try:
            # Find all recipe cards on the current page
            recipe_cards = driver.find_elements(By.CSS_SELECTOR, "article.gz-card")

            for card in recipe_cards:
                try:
                    # Extract the title (name of the recipe) and the url of each recipe
                    title_element = card.find_element(By.CLASS_NAME, "gz-title").find_element(By.TAG_NAME, "a")
                    recipe_title = title_element.text.strip()
                    recipe_url = title_element.get_attribute("href").strip()

                    # Add the recipe to the dictionary
                    all_recipes[recipe_title] = recipe_url
                    print(len(all_recipes), end='\r')
                except Exception as e:
                    print(f"Error processing a recipe: {e}")

            try:
                # Check if a "next" button exists and navigate to the next page
                next_arrow = driver.find_element(By.CLASS_NAME, "gz-arrow.next")
                next_url = next_arrow.get_attribute("href")

                if next_url:
                    driver.get(next_url)
                    time.sleep(3)# break to allow the page to load
                else:
                    break# exit the loop if no next url exists

            except Exception as e:
                break

        except Exception as e:
            print(f"Error loading category {categories_name}: {e}")
            break

# Convert to a dataframe
df = pd.DataFrame(list(all_recipes.items()), columns = ["Recipes", "Links"])

In [12]:
df.head()

Unnamed: 0,Recipes,Links
0,Focaccia (fügassa) alla genovese,https://ricette.giallozafferano.it/Focaccia-fu...
1,Ravioli cinesi al vapore,https://ricette.giallozafferano.it/Ravioli-cin...
2,Hummus,https://ricette.giallozafferano.it/Hummus.html
3,Vitello tonnato,https://ricette.giallozafferano.it/Vitello-ton...
4,Capesante gratinate,https://ricette.giallozafferano.it/Capesante-g...


# II) Extract ingredients

In [13]:
# Init a dict to store recipes and the ingredients
recipes_ingredients = {}

# Iterate through rows of the dataframe using tqdm for progress tracking
for _, row in tqdm(df.iterrows()):
    title = row['Recipes']
    url = row['Links']

    # Get the list of ingredients from the URL
    ingredients = get_ingredients(url)
    if ingredients:
        recipes_ingredients[title] = ingredients
    else:
        recipes_ingredients[title] = []


# Prepare a dataset for conversion into a df
dataset = []
for title, ingredients in recipes_ingredients.items():
    all_ingredient = "" # Initialize a string to concatenate all ingredients
    for ingredient in ingredients:
        all_ingredient += ingredient + ", " # Append each ingredient to the string
    dataset.append({
        "Recipes": title,
        "Ingredients": all_ingredient,
    })

df = pd.DataFrame(dataset)

6723it [1:06:43,  1.68it/s]


# III) Translation :

## Method 1 : Hugging face

In [12]:
# Prepare list to store info
translated_recipes_en = []
translated_ingredients_en = []
translated_recipes_fr = []
translated_ingredients_fr = []

In [33]:
# We choose to translate in english because it's gonna be the main language of our application and in french because the menu of the Crous is in french.
# So it's just to simplifie the navigation between the different part of our application.

# Translation with the function "translator" using HF models (very long)
for _, row in tqdm(df.iterrows()):
    recipe = row['Recipes (Italian)']
    ingredients = row['Ingredients (Italian)']
    translated_recipes_en.append(translator("it", "en", recipe))
    translated_ingredients_en.append(translator("it", "en", ingredients))
    translated_recipes_fr.append(translator("it", "fr", recipe))
    translated_ingredients_fr.append(translator("it", "fr", ingredients))


Device set to use cuda:0
Device set to use cuda:0
1it [00:04,  4.57s/it]Device set to use cuda:0
Device set to use cuda:0
2it [00:08,  4.44s/it]Device set to use cuda:0
Device set to use cuda:0
3it [00:13,  4.33s/it]Device set to use cuda:0
Device set to use cuda:0
4it [00:17,  4.20s/it]Device set to use cuda:0
Device set to use cuda:0
5it [00:20,  4.05s/it]Device set to use cuda:0
Device set to use cuda:0
6it [00:25,  4.07s/it]Device set to use cuda:0
Device set to use cuda:0
7it [00:30,  4.42s/it]Device set to use cuda:0
Device set to use cuda:0
8it [00:34,  4.30s/it]Device set to use cuda:0
Device set to use cuda:0
9it [00:38,  4.20s/it]Device set to use cuda:0
Device set to use cuda:0
10it [00:42,  4.14s/it]Device set to use cuda:0
Device set to use cuda:0
11it [00:48,  4.83s/it]Device set to use cuda:0
Device set to use cuda:0
12it [00:53,  4.87s/it]Device set to use cuda:0
Device set to use cuda:0
13it [00:57,  4.67s/it]Device set to use cuda:0
Device set to use cuda:0
14it [01:0

In [36]:
# Add the translation to the dataframe
df['Recipes (English)'] = translated_recipes_en
df['Ingredients (English)'] = translated_ingredients_en
df['Recipes (French)'] = translated_recipes_fr
df['Ingredients (French)'] = translated_ingredients_fr

In [13]:
df.head()

Unnamed: 0,Recipes (Italian),Ingredients (Italian),Recipes (English),Ingredients (English),Recipes (French),Ingredients (French)
0,Focaccia (fügassa) alla genovese,"Farina 00 (400 g), Farina Manitoba (250 g), Ac...",Genoese focaccia (fügassa),"flour 00 (400 g), flour Manitoba (250 g), wate...",Focaccia (fügassa) alla génoise,"Farine 00 (400 g), Farine Manitoba (250 g), Ea..."
1,Ravioli cinesi al vapore,"Farina 00 (250 g), Acqua (115 g), Sale fino (1...",Steamed Chinese Ravioli,"flour 00 (250 g), water (115 g), salt up to (1...",Raviolis chinois à la vapeur,"Farine 00 (250 g), Eau (115 g), Sel fin (1 pin..."
2,Hummus,"Ceci precotti (500 g), Succo di limone (90 g),...",Hummus,"Pre-cooked chickpeas (500 g), Lemon juice (90 ...",Hummus,"Chiches précuites (500 g), jus de citron (90 g..."
3,Vitello tonnato,"Vitello (800 g), Sedano (1 costa), Carote (1),...",Calf tonned,"Calf (800 g), celery (1 coast), carrots (1), g...",Veau tonné,"Vin blanc (250 g), Eau (1,5 l), Laurier (1 feu..."
4,Capesante gratinate,"Capesante (8), Pane (mollica 100 g), Scorza di...",Capesanta gratinata,"Capesante (8), Bread (mollica 100 g), Lemon pe...",Capesant gratté,"Capesant (8), Pain (mollyca 100 g), Scorce de ..."


## Method 2 : API Deepl

In [None]:
# Other method to translate recipes and ingredients more faster (But need credit, and a lot !)
import deepl

# Environnement path for the "deepl_api_key"
path = "C:/Users/busch/OneDrive/Documents/Fac/M2/UE1 - Advanced programming and data visualization/Advanced Programming/projet/environment/"
load_dotenv(f"{path}.env")
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")

# Define the translator
translator = deepl.Translator(DEEPL_API_KEY)

# Define 2 functions to translate in french and in english
def translate_text_EN(text):
    return translator.translate_text(text, target_lang="EN-US").text

def translate_text_FR(text):
    return translator.translate_text(text, target_lang="FR").text 

tqdm.pandas() 

#Add the translation to the df
df['Recipes (English)'] = df['Recipes (Italian)'].progress_apply(translate_text_EN)
df['Ingredients (English)'] = df['Ingredients (Italian)'].progress_apply(translate_text_EN)
df['Recipes (French)'] = df['Recipes (Italian)'].progress_apply(translate_text_FR)
df['Ingredients (French)'] = df['Ingredients (Italian)'].progress_apply(translate_text_FR)

In [14]:
df.head()

Unnamed: 0,Recipes (Italian),Ingredients (Italian),Recipes (English),Ingredients (English),Recipes (French),Ingredients (French)
0,Focaccia (fügassa) alla genovese,"Farina 00 (400 g), Farina Manitoba (250 g), Ac...",Genoese focaccia (fügassa),"flour 00 (400 g), flour Manitoba (250 g), wate...",Focaccia (fügassa) alla génoise,"Farine 00 (400 g), Farine Manitoba (250 g), Ea..."
1,Ravioli cinesi al vapore,"Farina 00 (250 g), Acqua (115 g), Sale fino (1...",Steamed Chinese Ravioli,"flour 00 (250 g), water (115 g), salt up to (1...",Raviolis chinois à la vapeur,"Farine 00 (250 g), Eau (115 g), Sel fin (1 pin..."
2,Hummus,"Ceci precotti (500 g), Succo di limone (90 g),...",Hummus,"Pre-cooked chickpeas (500 g), Lemon juice (90 ...",Hummus,"Chiches précuites (500 g), jus de citron (90 g..."
3,Vitello tonnato,"Vitello (800 g), Sedano (1 costa), Carote (1),...",Calf tonned,"Calf (800 g), celery (1 coast), carrots (1), g...",Veau tonné,"Vin blanc (250 g), Eau (1,5 l), Laurier (1 feu..."
4,Capesante gratinate,"Capesante (8), Pane (mollica 100 g), Scorza di...",Capesanta gratinata,"Capesante (8), Bread (mollica 100 g), Lemon pe...",Capesant gratté,"Capesant (8), Pain (mollyca 100 g), Scorce de ..."


In [23]:
# Save the df
df.to_csv("full_translated_data.csv", index=False)

# IV) Index for allergies

In [28]:
# path = "C:/Users/busch/OneDrive/Documents/Fac/M2/UE1 - Advanced programming and data visualization/Advanced Programming/projet/db/"
df = pd.read_csv("full_translated_data.csv")

In [30]:
df = df.drop(columns = ["gluten", "shellfish", "eggs", "peanut", "soia", "lactose", "nut", "celery", "mustard", "sesamo", "lupins", "Molluschi"])

In [31]:
# Allergens defined by the French Ministry of Health
 
h_gluten = ["grano", "segale", "orzo", "avena", "farro", "kamut", 
             "wheat", "rye", "barley", "oats", "spelt", "kamut", 
             "blé", "seigle", "orge", "avoine", "épeautre", "kamut", 'pain', "bread", "dolore", 'farine', "farina", "flour", 'céréales', 'céréale', "cereali", "cereale", "cereals", "cereal", 'pâtes', 'pâte', "pasta", "pastas", 'cracker', 'crackers', 'sandwich', 'sandwichs', 'dessert', 'desserts', "dolce", "dolci"]
h_shellfish = ["gamberi", "gamberetti", "scampi", "granchi", "aragoste", "astici",
               "shrimps", "prawns", "langoustines", "crabs", "lobsters", "spiny lobsters",
               "crevettes", "langoustine", "crabe", "homard", "langouste"]
h_eggs = ["uova", "eggs", "œufs", "oeuf", "oeufs", "uova", "uovo", "egg", "eggs"]
h_peanut = ["arachide", "peanut", "cacahuète"]
h_soia = ["soia", "soya", "soja"]
h_lactose = ["latte", "formaggio", "burro", "panna", "yogurt", "crema di latte", "ricotta", "mozzarella", "parmigiano", "gorgonzola",
    "milk", "cheese", "butter", "cream", "yogurt", "heavy cream", "ricotta", "mozzarella", "parmesan",
    "lait", "fromage", "beurre", "crème", "yaourt", "crème épaisse", "ricotta", "mozzarella", "parmesan", "lait", "crème", "crèmes", 'yaourt', 'yaourts', 'fromage', 'fromages', 'beurre', 'mozzarella', "milk", "cream", "yoghurt", "cheese", "butter", "latte", "panna", "yogurt", "formaggio", "burro"] 
h_nut = ["noci", "nocciole", "mandorle", "pistacchi", "anacardi", "pinoli", "arachidi", "castagne", "pecan", "noci del Brasile", "noci macadamia",
    "walnuts", "hazelnuts", "almonds", "pistachios", "cashews", "pine nuts", "peanuts", "chestnuts", "pecans", "brazil nuts", "macadamia nuts",
    "noix", "noisette", "amande", "pistaches", "pignons", "arachides", "châtaignes", "pecan"]
h_celery = ["sedano", "celery", "céleri"]
h_mustard = ['moutarde', "mustard", "senape", "mostarda"]
h_sesamo =["sesamo", "sesame", "sésame"]
h_lupins = ["lupini", "lupins", "lupin"]
h_Molluschi = ["canestrello", "cannolicchio", "capasanta", "dattero di mare", "fasolaro", 
    "garagolo", "lumachino", "cozza", "murice", "ostrica", "patella", "tartufo di mare", "tellina", "vongola", 
    "scallop", "razor clam", "queen scallop", "date mussel", "smooth clam", "whelk", "periwinkle", "mussel", "murex", "oyster", 
    "limpet", "sea truffle", "bean clam", "clam", 
    "pétoncle", "couteau", "coquille Saint-Jacques", "dattier de mer", "praire", 
    "buccin", "bigorneau", "moule", "murex", "huître", 
    "patelle", "truffe de mer", "telline", "palourde"]

In [None]:
# Allergens defined by the French Ministry of Health
h_lactose = ["lait", "crème", "crèmes", 'yaourt', 'yaourts', 'fromage', 'fromages', 'beurre', 'mozzarella', "milk", "cream", "yoghurt", "cheese", "butter", "latte", "panna", "yogurt", "formaggio", "burro"]
h_egg = ["oeuf", "oeufs", "uova", "uovo", "egg", "eggs"]
h_arachide = ['céréales', 'céréale', "cereali", "cereale", "cereals", "cereal", 'chili', 'chilis', "peperoncino", "peperoncini", 'fruits secs', "frutta secca", "dried fruit", 'chips', "patatine fritte", 'amande', 'amandes', "mandorla", "mandorle", "almonds", "almond", 'noisette', 'noisettes', "hazelnut", "hazelnuts", "nocciola", "nocciole", 'nougat', 'nougats', "torrone", "torroni", "torroncini", 'gâteaux', 'gâteau', "cakes", "cake", "torta", "dolce", "torte", "dolci", 'biscuits', 'biscuit', "biscotti", "biscotto", 'beignets', 'beignet', "doughnut", "donut", "doughnuts", "donuts", "ciambelle", "ciambella", 'pâtisseries', 'pâtisserie', "pasticcini", "pasticceria", "pasticcerie", "pasticceria", "pastries", "pastry", "bakery", 'graines de sésame', 'sesame seeds' , "sesame seed", "semi di sesamo", 'soja', "soia", "soy", "soybean", "soybeans", 'noix', "nuts", "walnuts", "walnut", "nut", "dadi", 'fruits à coque', "noccioline", "noci", "nocciole"]
h_gluten = ['pain', "bread", "dolore", 'farine', "farina", "flour", 'céréales', 'céréale', "cereali", "cereale", "cereals", "cereal", 'pâtes', 'pâte', "pasta", "pastas", 'cracker', 'crackers', 'sandwich', 'sandwichs', 'dessert', 'desserts', "dolce", "dolci"]
h_mustard = ['moutarde', "mustard", "senape", "mostarda"]

Count the iteration of allergenes in the 3 languages

In [32]:
gluten1 = []
shellfish1 = []
eggs1 = []
peanut1 = []
soia1 = []
lactose1 = []
nut1 = []
celery1 = []
mustard1 = []
sesamo1 = []
lupins1 = []
Molluschi1 = []

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_gluten:
        if l in str(k).lower():
            i = 1
    gluten1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_shellfish:
        if l in str(k).lower():
            i = 1
    shellfish1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_eggs:
        if l in str(k).lower():
            i = 1
    eggs1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_peanut:
        if l in str(k).lower():
            i = 1
    peanut1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_soia:
        if l in str(k).lower():
            i = 1
    soia1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_lactose:
        if l in str(k).lower():
            i = 1
    lactose1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_nut:
        if l in str(k).lower():
            i = 1
    nut1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_celery:
        if l in str(k).lower():
            i = 1
    celery1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_mustard:
        if l in str(k).lower():
            i = 1
    mustard1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_sesamo:
        if l in str(k).lower():
            i = 1
    sesamo1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_lupins:
        if l in str(k).lower():
            i = 1
    lupins1.append(i)

for k in list(df["Ingredients (Italian)"]):
    i = 0
    for l in h_Molluschi:
        if l in str(k).lower():
            i = 1
    Molluschi1.append(i)

In [33]:
gluten2 = []
shellfish2 = []
eggs2 = []
peanut2 = []
soia2 = []
lactose2 = []
nut2 = []
celery2 = []
mustard2 = []
sesamo2 = []
lupins2 = []
Molluschi2 = []

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_gluten:
        if l in str(k).lower():
            i = 1
    gluten2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_shellfish:
        if l in str(k).lower():
            i = 1
    shellfish2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_eggs:
        if l in str(k).lower():
            i = 1
    eggs2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_peanut:
        if l in str(k).lower():
            i = 1
    peanut2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_soia:
        if l in str(k).lower():
            i = 1
    soia2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_lactose:
        if l in str(k).lower():
            i = 1
    lactose2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_nut:
        if l in str(k).lower():
            i = 1
    nut2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_celery:
        if l in str(k).lower():
            i = 1
    celery2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_mustard:
        if l in str(k).lower():
            i = 1
    mustard2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_sesamo:
        if l in str(k).lower():
            i = 1
    sesamo2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_lupins:
        if l in str(k).lower():
            i = 1
    lupins2.append(i)

for k in list(df["Ingredients (English)"]):
    i = 0
    for l in h_Molluschi:
        if l in str(k).lower():
            i = 1
    Molluschi2.append(i)

In [34]:
gluten3 = []
shellfish3 = []
eggs3 = []
peanut3 = []
soia3 = []
lactose3 = []
nut3 = []
celery3 = []
mustard3 = []
sesamo3 = []
lupins3 = []
Molluschi3 = []

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_gluten:
        if l in str(k).lower():
            i = 1
    gluten3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_shellfish:
        if l in str(k).lower():
            i = 1
    shellfish3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_eggs:
        if l in str(k).lower():
            i = 1
    eggs3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_peanut:
        if l in str(k).lower():
            i = 1
    peanut3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_soia:
        if l in str(k).lower():
            i = 1
    soia3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_lactose:
        if l in str(k).lower():
            i = 1
    lactose3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_nut:
        if l in str(k).lower():
            i = 1
    nut3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_celery:
        if l in str(k).lower():
            i = 1
    celery3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_mustard:
        if l in str(k).lower():
            i = 1
    mustard3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_sesamo:
        if l in str(k).lower():
            i = 1
    sesamo3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_lupins:
        if l in str(k).lower():
            i = 1
    lupins3.append(i)

for k in list(df["Ingredients (French)"]):
    i = 0
    for l in h_Molluschi:
        if l in str(k).lower():
            i = 1
    Molluschi3.append(i)

In [35]:
df["gluten1"] = gluten1
df["shellfish1"] = shellfish1
df["eggs1"] = eggs1
df["peanut1"] = peanut1
df["soia1"] = soia1
df["lactose1"] = lactose1
df["nut1"] = nut1
df["celery1"] = celery1
df["mustard1"] = mustard1
df["sesamo1"] = sesamo1
df["lupins1"] = lupins1
df["Molluschi1"] = Molluschi1

df["gluten2"] = gluten2
df["shellfish2"] = shellfish2
df["eggs2"] = eggs2
df["peanut2"] = peanut2
df["soia2"] = soia2
df["lactose2"] = lactose2
df["nut2"] = nut2
df["celery2"] = celery2
df["mustard2"] = mustard2
df["sesamo2"] = sesamo2
df["lupins2"] = lupins2
df["Molluschi2"] = Molluschi2

df["gluten3"] = gluten3
df["shellfish3"] = shellfish3
df["eggs3"] = eggs3
df["peanut3"] = peanut3
df["soia3"] = soia3
df["lactose3"] = lactose3
df["nut3"] = nut3
df["celery3"] = celery3
df["mustard3"] = mustard3
df["sesamo3"] = sesamo3
df["lupins3"] = lupins3
df["Molluschi3"] = Molluschi3

In [36]:
df["gluten"] = df["gluten1"] + df["gluten2"] + df["gluten3"]
df["shellfish"] = df["shellfish1"] + df["shellfish2"] + df["shellfish3"]
df["eggs"] = df["eggs1"] + df["eggs2"] + df["eggs3"]
df["peanut"] = df["peanut1"] + df["peanut2"] + df["peanut3"]
df["soia"] = df["soia1"] + df["soia2"] + df["soia3"]
df["lactose"] = df["lactose1"] + df["lactose2"] + df["lactose3"]
df["nut"] = df["nut1"] + df["nut2"] + df["nut3"]
df["celery"] = df["celery1"] + df["celery2"] + df["celery3"]
df["mustard"] = df["mustard1"] + df["mustard2"] + df["mustard3"]
df["sesamo"] = df["sesamo1"] + df["sesamo2"] + df["sesamo3"]
df["lupins"] = df["lupins1"] + df["lupins2"] + df["lupins3"]
df["Molluschi"] = df["Molluschi1"] + df["Molluschi2"] + df["Molluschi3"]

In [37]:
df = df.drop(columns=["gluten1", "shellfish1", "eggs1", "peanut1", "soia1", "lactose1", "nut1", "celery1", "mustard1", "sesamo1", "lupins1", "Molluschi1",
                      "gluten2", "shellfish2", "eggs2", "peanut2", "soia2", "lactose2", "nut2", "celery2", "mustard2", "sesamo2", "lupins2", "Molluschi2",
                      "gluten3", "shellfish3", "eggs3", "peanut3", "soia3", "lactose3", "nut3", "celery3", "mustard3", "sesamo3", "lupins3", "Molluschi3"])

In [18]:
df

Unnamed: 0,Recipes (Italian),Ingredients (Italian),Recipes (English),Ingredients (English),Recipes (French),Ingredients (French),gluten,shellfish,eggs,peanut,soia,lactose,nut,celery,mustard,sesamo,lupins,Molluschi
0,Focaccia (fügassa) alla genovese,"Farina 00 (400 g), Farina Manitoba (250 g), Ac...",Genoese focaccia (fügassa),"flour 00 (400 g), flour Manitoba (250 g), wate...",Focaccia (fügassa) alla génoise,"Farine 00 (400 g), Farine Manitoba (250 g), Ea...",0,0,0,0,0,0,0,0,0,0,0,0
1,Ravioli cinesi al vapore,"Farina 00 (250 g), Acqua (115 g), Sale fino (1...",Steamed Chinese Ravioli,"flour 00 (250 g), water (115 g), salt up to (1...",Raviolis chinois à la vapeur,"Farine 00 (250 g), Eau (115 g), Sel fin (1 pin...",0,0,0,0,2,0,0,0,0,0,0,0
2,Hummus,"Ceci precotti (500 g), Succo di limone (90 g),...",Hummus,"Pre-cooked chickpeas (500 g), Lemon juice (90 ...",Hummus,"Chiches précuites (500 g), jus de citron (90 g...",0,0,0,1,0,0,0,0,0,3,0,0
3,Vitello tonnato,"Vitello (800 g), Sedano (1 costa), Carote (1),...",Calf tonned,"Calf (800 g), celery (1 coast), carrots (1), g...",Veau tonné,"Vin blanc (250 g), Eau (1,5 l), Laurier (1 feu...",0,0,3,0,0,0,0,2,0,0,0,0
4,Capesante gratinate,"Capesante (8), Pane (mollica 100 g), Scorza di...",Capesanta gratinata,"Capesante (8), Bread (mollica 100 g), Lemon pe...",Capesant gratté,"Capesant (8), Pain (mollyca 100 g), Scorce de ...",0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6718,Piadina ai fichi,"Piadelle Gustose (3), Fichi (300 g), Gorgonzol...",Fig Piadina,"Piadelle Gustose (3), Figs (300 g), Gorgonzola...",Piadina aux figues,"Piadelle Gustose (3), Fichiers (300 g), Gorgon...",0,0,0,0,0,3,2,0,0,0,0,0
6719,Piadina con fiori di zucca ripieni e fritti,"Piadine (sfogliatissime all'olio EVO 4), Fiori...",Piadina with stuffed and fried pumpkin flowers,"Piadine (boiled with EVO oil 4), Pumpkin bloss...",Piadina avec des fleurs de courge farcies et f...,"Piadines (feuilleuses très à l'huile EVO 4), F...",0,0,0,0,0,3,3,0,0,0,0,0
6720,"Torta di riso con piselli, asparagi e zucchine","Riso integrale (350 g), Aglio (2 spicchi), Uov...","Rice cake with peas, asparagus and zucchini","Integral rice (350 g), Garlic (2 cloves), Eggs...","Gâteau de riz avec pois, asperges et courgettes","Riz entier (350 g), Ail (2 gousses), Oeufs (12...",0,0,2,0,0,0,0,0,0,0,0,0
6721,"Insalata di riso con pesche, piselli e stracchino","Riso integrale (300 g), Pesche (200 g), Stracc...","Rice salad with peaches, peas and stracchino","Integral rice (300 g), Peaches (200 g), Stracc...","Salade de riz avec des pêches, des pois et de ...","Riz intégral (300 g), Pêches (200 g), Étrangle...",0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
colonnes_a_modifier = ["gluten", "shellfish", "eggs", "peanut", "soia", "lactose", "nut", "celery", "mustard", "sesamo", "lupins", "Molluschi"]
df[colonnes_a_modifier] = df[colonnes_a_modifier].applymap(lambda x: 1 if x >= 1 else 0)

  df[colonnes_a_modifier] = df[colonnes_a_modifier].applymap(lambda x: 1 if x >= 1 else 0)


In [39]:
df

Unnamed: 0,Recipes (Italian),Ingredients (Italian),Recipes (English),Ingredients (English),Recipes (French),Ingredients (French),gluten,shellfish,eggs,peanut,soia,lactose,nut,celery,mustard,sesamo,lupins,Molluschi
0,Focaccia (fügassa) alla genovese,"Farina 00 (400 g), Farina Manitoba (250 g), Ac...",Genoese focaccia (fügassa),"flour 00 (400 g), flour Manitoba (250 g), wate...",Focaccia (fügassa) alla génoise,"Farine 00 (400 g), Farine Manitoba (250 g), Ea...",1,0,0,0,0,0,0,0,0,0,0,0
1,Ravioli cinesi al vapore,"Farina 00 (250 g), Acqua (115 g), Sale fino (1...",Steamed Chinese Ravioli,"flour 00 (250 g), water (115 g), salt up to (1...",Raviolis chinois à la vapeur,"Farine 00 (250 g), Eau (115 g), Sel fin (1 pin...",1,0,0,0,1,0,0,0,0,0,0,0
2,Hummus,"Ceci precotti (500 g), Succo di limone (90 g),...",Hummus,"Pre-cooked chickpeas (500 g), Lemon juice (90 ...",Hummus,"Chiches précuites (500 g), jus de citron (90 g...",1,0,0,1,0,0,0,0,0,1,0,0
3,Vitello tonnato,"Vitello (800 g), Sedano (1 costa), Carote (1),...",Calf tonned,"Calf (800 g), celery (1 coast), carrots (1), g...",Veau tonné,"Vin blanc (250 g), Eau (1,5 l), Laurier (1 feu...",0,0,1,0,0,0,0,1,0,0,0,0
4,Capesante gratinate,"Capesante (8), Pane (mollica 100 g), Scorza di...",Capesanta gratinata,"Capesante (8), Bread (mollica 100 g), Lemon pe...",Capesant gratté,"Capesant (8), Pain (mollyca 100 g), Scorce de ...",1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6718,Piadina ai fichi,"Piadelle Gustose (3), Fichi (300 g), Gorgonzol...",Fig Piadina,"Piadelle Gustose (3), Figs (300 g), Gorgonzola...",Piadina aux figues,"Piadelle Gustose (3), Fichiers (300 g), Gorgon...",0,0,0,0,0,1,1,0,0,0,0,0
6719,Piadina con fiori di zucca ripieni e fritti,"Piadine (sfogliatissime all'olio EVO 4), Fiori...",Piadina with stuffed and fried pumpkin flowers,"Piadine (boiled with EVO oil 4), Pumpkin bloss...",Piadina avec des fleurs de courge farcies et f...,"Piadines (feuilleuses très à l'huile EVO 4), F...",1,0,0,0,0,1,1,0,0,0,0,0
6720,"Torta di riso con piselli, asparagi e zucchine","Riso integrale (350 g), Aglio (2 spicchi), Uov...","Rice cake with peas, asparagus and zucchini","Integral rice (350 g), Garlic (2 cloves), Eggs...","Gâteau de riz avec pois, asperges et courgettes","Riz entier (350 g), Ail (2 gousses), Oeufs (12...",0,0,1,0,0,0,0,0,0,0,0,0
6721,"Insalata di riso con pesche, piselli e stracchino","Riso integrale (300 g), Pesche (200 g), Stracc...","Rice salad with peaches, peas and stracchino","Integral rice (300 g), Peaches (200 g), Stracc...","Salade de riz avec des pêches, des pois et de ...","Riz intégral (300 g), Pêches (200 g), Étrangle...",0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# Check the number
print(sum(list(df["gluten"])))
print(sum(list(df["shellfish"])))
print(sum(list(df["eggs"])))
print(sum(list(df["peanut"])))
print(sum(list(df["soia"])))
print(sum(list(df["lactose"])))
print(sum(list(df["nut"])))
print(sum(list(df["celery"])))
print(sum(list(df["mustard"])))
print(sum(list(df["sesamo"])))
print(sum(list(df["lupins"])))
print(sum(list(df["Molluschi"])))

3761
255
3192
699
143
4116
1563
558
120
128
5
137


In [41]:
# SAve 
df.to_csv("full_translated_data.csv", index=False)

# V) Text similarity

In [24]:
def similarities(plats, data):
    sim = {plat: None for plat in plats}
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    for plat in tqdm(plats):
        tot_sim = []
        for _, row in data.iterrows():
            recipe = row['Recipes (French)']
            embeddings1 = model.encode([plat], convert_to_tensor=True)
            embeddings2 = model.encode([recipe], convert_to_tensor=True)
            tot_sim.append(torch.nn.functional.cosine_similarity(embeddings1, embeddings2).item())
        sim[plat] = tot_sim
        
    return sim

In [25]:
with open("menu.txt", 'r', encoding='utf-8') as fichier:
    menus = fichier.read()
print(menus)

Menu du samedi 04/01 décembre 2024 à Esplanade pour le déjeuné :

Repas
- Cordon bleu, sauce crème maison
- Haricots beurre
- Purée de pommes de terre

- Bourguignon de champignons maison
- Purée de pommes de terre

Dessert
- Financier aux myrtilles
- Eclair vanille
- Mini donut

Origines de nos viandes du jour
- Volaille origine France


Menu du dimanche 04/01 décembre 2024 à Gallia pour le diner :

Repas
- Spaghetti carbonara
- Spaghetti bolognaise

Desserts
- Fondant au chocolat
- Fraise chantilly

Origines de nos viandes du jour
- Viandes origine France

Menu du dimanche 04/01 décembre 2024 à PEGE pour le diner :

Repas
- Pizza
- Tarte Flambée

Desserts
- Tarte aux pommes

Origines de nos viandes du jour
- Viande origines hors UE


In [26]:
plat = []
for menu in [x for x in menus.split("\n") if x]:
    if menu.split()[0] == "-":
        plat.append(menu.split("- ")[1])
plat

['Cordon bleu, sauce crème maison',
 'Haricots beurre',
 'Purée de pommes de terre',
 'Bourguignon de champignons maison',
 'Purée de pommes de terre',
 'Financier aux myrtilles',
 'Eclair vanille',
 'Mini donut',
 'Volaille origine France',
 'Spaghetti carbonara',
 'Spaghetti bolognaise',
 'Fondant au chocolat',
 'Fraise chantilly',
 'Viandes origine France',
 'Pizza',
 'Tarte Flambée',
 'Tarte aux pommes',
 'Viande origines hors UE']

In [27]:
index = similarities(plat, df)

100%|██████████| 18/18 [17:26<00:00, 58.12s/it]


In [128]:
index["Pizza"].index(max(index["Pizza"]))

6195

In [129]:
plat = "Pizza"
print(max(index[plat]))
df.iloc[index[plat].index(max(index[plat]))]

0.7063373327255249


Recipes (Italian)                                         Pizza Margherita
Ingredients (Italian)    Farina 0 (500 g), Acqua (a temperatura ambient...
Recipes (English)                                         Pizza Margherita
Ingredients (English)    Flour 0 (500 g), Water (at room temperature 25...
Recipes (French)                                          Pizza Margherita
Ingredients (French)     Farine 0 (500 g), Eau (à température ambiante ...
L                                                                        1
E                                                                        0
P                                                                        0
G                                                                        1
M                                                                        0
Name: 6195, dtype: object