In [1]:
# Imports
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import sqlite3
import time

In [2]:
def scrape_gousto_recipe(url, driver):
    """
    Scrapes the title and ingredient lines from a Gousto recipe webpage.

    Navigates the given WebDriver to ``url``, waits up to 10 seconds for an
    ``<h1>`` element to be present, then reads the heading text and the text of
    every ``<li>`` inside the ingredient list. Any exception raised while
    scraping is printed and swallowed, and ``None`` is returned instead.

    Args:
        url (str): The URL of the Gousto recipe webpage.
        driver (selenium.webdriver.remote.webdriver.WebDriver): The Selenium WebDriver instance.

    Returns:
        dict: A dictionary containing the scraped recipe data, including:
            - 'title' (str): Text of the first ``<h1>``, or "Title not found".
            - 'ingredients' (list[str]): Unique, non-blank ingredient lines in
              the order they appear on the page. Empty when the ingredient
              selector matches nothing.
            - 'url' (str): The URL of the scraped recipe.
        None: If an error occurs during scraping.
    """

    try:
        driver.get(url)

        # Wait for the main recipe title to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        # Get heading of page, corresponding to recipe title (single lookup)
        headings = driver.find_elements(By.TAG_NAME, "h1")
        title = headings[0].text.strip() if headings else "Title not found"

        # Get the list of ingredients.
        # NOTE(review): the class suffix looks build-generated; presumably it
        # changes when the site is redeployed - verify if this returns nothing.
        ingredient_elements = driver.find_elements(
            By.CSS_SELECTOR, "ul.IngredientList_ingredientList__14UI0 li"
        )
        ingredient_lines = (element.text.strip() for element in ingredient_elements)

        # Remove duplicates found in some erroneous ingredient lists.
        # dict.fromkeys keeps the first occurrence of each line, so the page
        # order is preserved (a set would shuffle it between runs). Blank
        # entries are dropped because they carry no ingredient.
        unique_ingredients = list(
            dict.fromkeys(line for line in ingredient_lines if line)
        )

        # Return a dictionary with necessary information
        return {
            'title': title,
            'ingredients': unique_ingredients,
            'url': url
        }

    # Handle exceptions during scraping by returning None instead
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

In [8]:
# Building blocks shared by the ingredient-line patterns below.
_NUMBER_PATTERN = r"\d+(?:\.\d+)?"
_UNIT_PATTERN = r"(?:g|kg|ml|l|tsp|tbsp|cup|oz|lb|pinch|dash|pcs)\b"

# Case 1: "Name (200g)" / "Name (200g) x2" / "Name (2)"
_PARENTHESES_RE = re.compile(
    rf"^(.*?)\s*\((?P<quantity>{_NUMBER_PATTERN})\s*(?P<unit>{_UNIT_PATTERN})?\)"
    rf"\s*(?:x\s*(?P<multiplier>{_NUMBER_PATTERN}))?$"
)
# Case 2: "2 x Name" / "2x Name". The \b after x stops a name that merely
# starts with "x" (e.g. "2 xanthan gum") from losing its first letter.
_QUANTITY_X_NAME_RE = re.compile(
    rf"^(?P<quantity>{_NUMBER_PATTERN})\s*x\b\s*(?P<name>.*)$"
)
# Case 3: "Name x2". The \b before x stops a name that merely ends in "x"
# (e.g. "Spice mix 2") from being split in the middle of a word.
_NAME_X_QUANTITY_RE = re.compile(
    rf"^(?P<name>.*)\s*\bx\s*(?P<quantity>{_NUMBER_PATTERN})$"
)
# Case 4: "200g Name" / "1 tbsp Name"
_QUANTITY_UNIT_NAME_RE = re.compile(
    rf"^(?P<quantity>{_NUMBER_PATTERN})\s*(?P<unit>{_UNIT_PATTERN})\s*(?P<name>.*)$"
)
# Case 5: "2 Name"
_QUANTITY_NAME_RE = re.compile(
    rf"^(?P<quantity>{_NUMBER_PATTERN})\s*(?P<name>.*)$"
)


def parse_ingredient(ingredient_line):
    """
    Parses an ingredient line to extract the ingredient name, quantity, and unit.

    The allergen marker '†' is removed and the line is stripped, then the line
    is tried against these formats in order; the first match wins:

        1. ``Name (quantity[unit])`` with an optional trailing ``x multiplier``
        2. ``quantity x Name``
        3. ``Name x quantity``
        4. ``quantity unit Name``
        5. ``quantity Name``
        6. anything else is returned as a bare name

    Args:
        ingredient_line (str): The ingredient line to parse.

    Returns:
        tuple: A tuple containing (name, quantity, unit), where:
            - name (str): The name of the ingredient.
            - quantity (str or None): The quantity as a string, or None when the
              line has no quantity. For format 1 this is ``str(float)`` of the
              quantity times the multiplier (e.g. '400.0'); for formats 2-5 it
              is the digits exactly as written in the line (e.g. '2').
            - unit (str or None): One of g, kg, ml, l, tsp, tbsp, cup, oz, lb,
              pinch, dash, pcs, or None when no unit was recognised.

    Examples:
        >>> parse_ingredient("Chicken breast (200g)")
        ('Chicken breast', '200.0', 'g')
        >>> parse_ingredient("Chicken breast (200g) x2")
        ('Chicken breast', '400.0', 'g')
        >>> parse_ingredient("Onion x2")
        ('Onion', '2', None)
        >>> parse_ingredient("Salt")
        ('Salt', None, None)
    """
    # Remove allergen marker from ingredients where needed
    ingredient_str = ingredient_line.replace('†', '').strip()

    # Case 1: Parentheses with optional x quantity
    parentheses_match = _PARENTHESES_RE.match(ingredient_str)
    if parentheses_match:
        quantity = float(parentheses_match.group("quantity"))
        multiplier = parentheses_match.group("multiplier")
        if multiplier:
            quantity *= float(multiplier)
        return (
            parentheses_match.group(1).strip(),
            str(quantity),
            parentheses_match.group("unit"),
        )

    # Case 2: Quantity x Name, then Case 3: Name x Quantity (neither has a unit)
    for multiplier_pattern in (_QUANTITY_X_NAME_RE, _NAME_X_QUANTITY_RE):
        multiplier_match = multiplier_pattern.match(ingredient_str)
        if multiplier_match:
            return (
                multiplier_match.group("name").strip(),
                multiplier_match.group("quantity"),
                None,
            )

    # Case 4: Quantity Unit Name
    quantity_unit_name_match = _QUANTITY_UNIT_NAME_RE.match(ingredient_str)
    if quantity_unit_name_match:
        return (
            quantity_unit_name_match.group("name").strip(),
            quantity_unit_name_match.group("quantity"),
            quantity_unit_name_match.group("unit"),
        )

    # Case 5: Quantity Name
    quantity_name_match = _QUANTITY_NAME_RE.match(ingredient_str)
    if quantity_name_match:
        return (
            quantity_name_match.group("name").strip(),
            quantity_name_match.group("quantity"),
            None,
        )

    # Case 6: Name Only
    return ingredient_str, None, None

In [4]:
def insert_recipe_data(recipe_data, db_path):
    """
    Inserts one scraped recipe and its parsed ingredients into a SQLite database.

    A row is added to ``recipes``; then, for every ingredient line, the
    ingredient is looked up by name in ``ingredients`` (and inserted if
    missing) and a linking row is added to ``recipe_ingredients``. All inserts
    run in a single transaction: they are committed together on success and
    rolled back if any statement raises. The connection is always closed.

    Args:
        recipe_data (dict): Must provide 'title', 'url' and 'ingredients'
            (an iterable of ingredient lines passed to ``parse_ingredient``).
        db_path (str): Path of the SQLite database file. The tables
            ``recipes``, ``ingredients`` and ``recipe_ingredients`` must
            already exist; this function does not create them.

    Raises:
        sqlite3.Error: If any statement fails (for example a missing table).
    """
    conn = sqlite3.connect(db_path)
    try:
        # As a context manager the connection commits on success and rolls
        # back on an exception; it does not close, hence the outer finally.
        with conn:
            cursor = conn.cursor()

            # Insert into the recipes table
            cursor.execute("INSERT INTO recipes (title, instructions_url) VALUES (?, ?)",
                           (recipe_data['title'], recipe_data['url']))
            recipe_id = cursor.lastrowid  # Get the newly inserted recipe ID

            for line in recipe_data['ingredients']:
                ingredient_name, quantity, unit = parse_ingredient(line)

                # Check if the ingredient already exists
                cursor.execute("SELECT ingredient_id FROM ingredients WHERE ingredient_name = ?",
                               (ingredient_name,))
                existing_ingredient = cursor.fetchone()

                if existing_ingredient:
                    ingredient_id = existing_ingredient[0]
                else:
                    # Insert the new ingredient
                    cursor.execute("INSERT INTO ingredients (ingredient_name) VALUES (?)",
                                   (ingredient_name,))
                    # NOTE(review): assumes ingredient_id is the table's rowid
                    # (INTEGER PRIMARY KEY) - confirm against the schema.
                    ingredient_id = cursor.lastrowid

                # Insert into the recipe_ingredients table
                cursor.execute("INSERT INTO recipe_ingredients (recipe_id, ingredient_id, quantity, unit) VALUES (?, ?, ?, ?)",
                               (recipe_id, ingredient_id, quantity, unit))
    finally:
        conn.close()

In [5]:
def get_recipe_urls_from_category(category_url, driver):
    """
    Collects the href of every cookbook link found on a category page.

    Args:
        category_url (str): URL of the page to load.
        driver: The Selenium WebDriver instance used to load the page.

    Returns:
        list: The ``href`` attribute of each ``<a>`` whose href contains
        '/cookbook/', in the order the elements are returned by the driver.
        Duplicates are not removed here.
    """
    driver.get(category_url)
    # Fixed two-second pause to allow the page to load.
    time.sleep(2)

    # Find all links containing cookbook
    cookbook_anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/cookbook/']")
    return [anchor.get_attribute("href") for anchor in cookbook_anchors]

In [6]:
def scrape_all_gousto_recipes(category_url, driver):
    """
    Scrapes every distinct cookbook link found on a category page.

    Args:
        category_url (str): URL of the page whose cookbook links are followed.
        driver: The Selenium WebDriver instance shared by all page loads.

    Returns:
        list: The dictionaries returned by ``scrape_gousto_recipe`` for each
        distinct URL, in first-seen order. URLs whose scrape returned a falsy
        value (such as ``None`` on error) are left out.
    """
    scraped_recipes = []

    # dict.fromkeys drops repeated URLs while keeping first-seen order, so
    # each page is visited once.
    distinct_urls = dict.fromkeys(get_recipe_urls_from_category(category_url, driver))
    for recipe_url in distinct_urls:
        scraped = scrape_gousto_recipe(recipe_url, driver)
        if scraped:
            scraped_recipes.append(scraped)
        # Rate limiting: pause after every page, whether or not it succeeded.
        time.sleep(1)

    return scraped_recipes

In [None]:
# Scrape every cookbook link on the category page, then store the results.
category_url = 'https://www.gousto.co.uk/cookbook/recipes'

# Replace with your chromedriver path.
service = Service(executable_path='chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service)
try:
    all_recipe_data = scrape_all_gousto_recipes(category_url, driver)
finally:
    # Always shut the browser down, even if scraping raised.
    driver.quit()

# Store all_recipe_data into the database, skipping pages that yielded no
# ingredient lines.
for recipe in all_recipe_data:
    if not recipe['ingredients']:
        continue
    insert_recipe_data(recipe, 'recipes.sqlite')

Error scraping https://www.gousto.co.uk/cookbook/meatball-recipes: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=134.0.6998.36)
Stacktrace:
	GetHandleVerifier [0x00007FF61363DF85+26693]
	(No symbol) [0x00007FF61359EAD0]
	(No symbol) [0x00007FF6134291CA]
	(No symbol) [0x00007FF613401763]
	(No symbol) [0x00007FF6134AF5EE]
	(No symbol) [0x00007FF6134CF975]
	(No symbol) [0x00007FF6134A76C3]
	(No symbol) [0x00007FF613470490]
	(No symbol) [0x00007FF613471743]
	GetHandleVerifier [0x00007FF61399436D+3525677]
	GetHandleVerifier [0x00007FF6139A7F3B+3606523]
	GetHandleVerifier [0x00007FF61399CEE3+3561379]
	GetHandleVerifier [0x00007FF613707C0A+853194]
	(No symbol) [0x00007FF6135A990F]
	(No symbol) [0x00007FF6135A5674]
	(No symbol) [0x00007FF6135A5816]
	(No symbol) [0x00007FF613594D89]
	BaseThreadInitThunk [0x00007FFD43347374+20]
	RtlUserThreadStart [0x00007FFD435BCC91+33]
Error scraping https://www.gousto.co.uk/cookbook/mushro

In [10]:
# Troubleshooting: scrape a single recipe and show how each ingredient line parses.
service = Service(executable_path='chromedriver-win64/chromedriver.exe')  # Replace with your chromedriver path.
driver = webdriver.Chrome(service=service)
try:
    troubleshoot = scrape_gousto_recipe('https://www.gousto.co.uk/cookbook/vegan-recipes/sri-lankan-coconut-dal-with-aubergine-pickle', driver)
finally:
    # Quit in a finally block so a failure cannot leave a Chrome process
    # running; the parsing below only needs the scraped strings.
    driver.quit()

# scrape_gousto_recipe returns None on error, so guard before subscripting.
if troubleshoot is None:
    print("Scrape failed; see the error printed above.")
else:
    for line in troubleshoot['ingredients']:
        print(parse_ingredient(line))
    #insert_recipe_data(troubleshoot, 'recipes.sqlite')

('Ground turmeric', '1.0', 'tsp')
('Red lentils', '100.0', 'g')
('Coriander', '20.0', 'g')
('Aubergine', None, None)
('Curry powder', '1.0', 'tbsp')
('Cider vinegar', '15.0', 'ml')
('Solid creamed coconut', '50.0', 'g')
('Vegetable stock mix', '11.0', 'g')
('Shallot', '3', None)
('Fresh root ginger', '15.0', 'g')
('Black mustard seeds', '1.0', 'tsp')
('White basmati rice', '130.0', 'g')
('Garlic clove', '3', None)
('Green chilli', None, None)
