In [2]:
# Imports
import requests
from bs4 import BeautifulSoup
import sqlite3
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import sqlite3
import time

In [3]:
def scrape_gousto_recipe(url, driver):
    try:
        driver.get(url)

        # Wait for the main recipe title to load (adjust as needed)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        title = driver.find_element(By.TAG_NAME, "h1").text.strip() if driver.find_elements(By.TAG_NAME, "h1") else "Title not found"

        ingredients_list = []
        ingredients_elements = driver.find_elements(By.CSS_SELECTOR, "ul.IngredientList_ingredientList__14UI0 li")
        for ingredient in ingredients_elements:
            ingredients_list.append(ingredient.text.strip())
        ingredients = "\n".join(ingredients_list)

        instructions_list = []
        instructions_elements = driver.find_elements(By.CSS_SELECTOR, "li[class*='instruction']")# Adjust CSS selector
        for instruction in instructions_elements:
            instructions_list.append(instruction.text.strip())
        instructions = "\n".join(instructions_list)

        # Extract meat type, portions, etc. (requires inspecting the website's HTML structure).
        meat_type = "To be determined"
        portions = "To be determined"

        return {
            'title': title,
            'ingredients': ingredients,
            'url': url
        }

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

In [11]:
def parse_ingredient(ingredient_line):
    """
    Parses an ingredient line into its name, quantity, and unit.

    Args:
        ingredient_line: The ingredient line string.

    Returns:
        A tuple containing (name, quantity, unit), or (name, None, None) if no quantity/unit found.
    """
    ingredient_line.replace('†','')
    pattern = r"^(.*?)(?:\s*\(([\d\.]+)\s*([a-zA-Z]*)\))?(?:\s*x(\d+))?\.?$"
    match = re.match(pattern, ingredient_line.strip())

    if match:
        name = match.group(1).strip()
        quantity = match.group(2)
        unit = match.group(3)
        multiple = match.group(4)

        if multiple:
            if quantity:
                quantity = float(quantity) * int(multiple)
            else:
                quantity = int(multiple)

        if quantity:
            return (name, float(quantity), unit) if unit else (name, float(quantity), None)
        else:
            return (name, None, None)
    else:
        return (ingredient_line.strip(), None, None)

In [70]:
for line in ingredients.split('\n'):
    print(parse_ingredient(line))

('Red chilli', None, None)
('Cornflour', 4.0, 'tbsp')
('Spring onion', None, None)
('Five-spice mix', 1.0, 'tsp')
('Red pepper', None, None)
('British chicken breast portions', 2.0, 'pcs')
('Fresh root ginger', 15.0, 'g')
('Red chilli relish', 25.0, 'g')
('Garlic clove', 2.0, None)
('Chinese rice wine', 15.0, 'ml')
('Soy sauce', 15.0, 'ml')
('White potato', 4.0, None)
('Honey', 25.0, 'g')


In [12]:
def insert_recipe_data(recipe_data, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(tables)
    # Insert into the recipes table
    cursor.execute("INSERT INTO recipes (title) VALUES (?)",
                   (recipe_data['title'],))
    #cursor.execute("INSERT INTO recipes (title, instructions_url) VALUES (?, ?)",
                   #(recipe_data['title'], recipe_data['url'],))
    recipe_id = cursor.lastrowid # Get the newly inserted recipe ID

    for line in recipe_data['ingredients'].split('\n'):
        ingredient_name, quantity, unit = parse_ingredient(line)
        # Check if the ingredient already exists
        cursor.execute("SELECT ingredient_id FROM ingredients WHERE ingredient_name = ?", (ingredient_name,))
        existing_ingredient = cursor.fetchone()

        if existing_ingredient:
            ingredient_id = existing_ingredient[0]
        else:
            # Insert the new ingredient
            cursor.execute("INSERT INTO ingredients (ingredient_name) VALUES (?)", (ingredient_name,))
            ingredient_id = cursor.lastrowid

        # Insert into the recipe_ingredients table
        cursor.execute("INSERT INTO recipe_ingredients (recipe_id, ingredient_id, quantity, unit) VALUES (?, ?, ?, ?)",
                       (recipe_id, ingredient_id, quantity, unit))
        
    # Insert ingredients and link them to the recipe
#    for ingredient_name, quantity, unit in recipe_data['ingredients']:
        # Check if the ingredient already exists
#        cursor.execute("SELECT ingredient_id FROM ingredients WHERE ingredient_name = ?", (ingredient_name,))
#        existing_ingredient = cursor.fetchone()

#        if existing_ingredient:
#            ingredient_id = existing_ingredient[0]
#        else:
#            # Insert the new ingredient
#            cursor.execute("INSERT INTO ingredients (ingredient_name) VALUES (?)", (ingredient_name,))
#            ingredient_id = cursor.lastrowid

        # Insert into the recipe_ingredients table
#        cursor.execute("INSERT INTO recipe_ingredients (recipe_id, ingredient_id, quantity, unit) VALUES (?, ?, ?, ?)",
#                       (recipe_id, ingredient_id, quantity, unit))

    conn.commit()
    conn.close()

In [10]:
conn = sqlite3.connect('recipes_test.db')
cursor = conn.cursor()
conn.commit()
conn.close()

In [13]:
service = Service(executable_path='chromedriver-win64/chromedriver.exe') # Replace with your chromedriver path.
driver = webdriver.Chrome(service=service)
url = 'https://www.gousto.co.uk/cookbook/recipes/sticky-chilli-chicken-breast-with-salt-pepper-chips'
try:
    recipe_data = scrape_gousto_recipe(url, driver)

    if recipe_data:
        insert_recipe_data(recipe_data, 'C:/Users/brxce/Documents/Python Projects/mealplanner/recipes.sqlite')
        print("Recipe stored successfully!")

finally:
    driver.quit()

[('recipes',), ('ingredients',), ('recipe_ingredients',)]
Recipe stored successfully!


In [19]:
conn.close()

In [20]:
service = Service(executable_path='chromedriver-win64/chromedriver.exe') # Replace with your chromedriver path.
driver = webdriver.Chrome(service=service)
url = 'https://www.gousto.co.uk/cookbook/recipes/sticky-chilli-chicken-breast-with-salt-pepper-chips'
recipe_data = scrape_gousto_recipe(url, driver)
print(recipe_data['title'])

Sticky Chicken With Salt & Pepper Chips


In [91]:
print(len('Sticky Chicken With Salt & Pepper Chips'))

39


In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

def get_recipe_urls_from_category(category_url, driver):
    driver.get(category_url)
    time.sleep(2) #allow page to load.
    recipe_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/cookbook/']") #find all links containing cookbook
    urls = []
    for link in recipe_links:
        urls.append(link.get_attribute("href"))
    return urls

In [19]:
service = Service(executable_path='chromedriver-win64/chromedriver.exe') # Replace with your chromedriver path.
driver = webdriver.Chrome(service=service)
urls = get_recipe_urls_from_category('https://www.gousto.co.uk/cookbook/', driver)
driver.quit()

In [20]:
set(urls)

{'https://www.gousto.co.uk/cookbook/',
 'https://www.gousto.co.uk/cookbook/10-minute-meals',
 'https://www.gousto.co.uk/cookbook/20-minute-meals',
 'https://www.gousto.co.uk/cookbook/600-calorie-meals',
 'https://www.gousto.co.uk/cookbook/american-recipes',
 'https://www.gousto.co.uk/cookbook/asian-recipes',
 'https://www.gousto.co.uk/cookbook/aubergine-recipes',
 'https://www.gousto.co.uk/cookbook/avocado-recipes',
 'https://www.gousto.co.uk/cookbook/bacon-recipes',
 'https://www.gousto.co.uk/cookbook/beef',
 'https://www.gousto.co.uk/cookbook/beef-pork',
 'https://www.gousto.co.uk/cookbook/beef-recipes',
 'https://www.gousto.co.uk/cookbook/beef-recipes/open-steak-sandwich-balsamic-onions-chips',
 'https://www.gousto.co.uk/cookbook/beef-recipes/simply-perfect-beef-spag-bol',
 'https://www.gousto.co.uk/cookbook/beetroot-recipes',
 'https://www.gousto.co.uk/cookbook/burger-recipes',
 'https://www.gousto.co.uk/cookbook/cauliflower-recipes',
 'https://www.gousto.co.uk/cookbook/chicken',
 

In [22]:
urls = ['https://www.gousto.co.uk/cookbook/10-minute-meals?page=12',
 'https://www.gousto.co.uk/cookbook/20-minute-meals?page=7',
 'https://www.gousto.co.uk/cookbook/600-calorie-meals',
 'https://www.gousto.co.uk/cookbook/american-recipes',
 'https://www.gousto.co.uk/cookbook/asian-recipes',
 'https://www.gousto.co.uk/cookbook/aubergine-recipes',
 'https://www.gousto.co.uk/cookbook/avocado-recipes',
 'https://www.gousto.co.uk/cookbook/bacon-recipes',
 'https://www.gousto.co.uk/cookbook/beef',
 'https://www.gousto.co.uk/cookbook/beef-pork',
 'https://www.gousto.co.uk/cookbook/beef-recipes',
 'https://www.gousto.co.uk/cookbook/beef-recipes/open-steak-sandwich-balsamic-onions-chips',
 'https://www.gousto.co.uk/cookbook/beef-recipes/simply-perfect-beef-spag-bol',
 'https://www.gousto.co.uk/cookbook/beetroot-recipes',
 'https://www.gousto.co.uk/cookbook/burger-recipes',
 'https://www.gousto.co.uk/cookbook/cauliflower-recipes',
 'https://www.gousto.co.uk/cookbook/chicken',
 'https://www.gousto.co.uk/cookbook/chicken-breast-recipes',
 'https://www.gousto.co.uk/cookbook/chicken-recipes',
 'https://www.gousto.co.uk/cookbook/chicken-recipes/chicken-stuffing-sarnie-with-plum-chutney',
 'https://www.gousto.co.uk/cookbook/chicken-recipes/joes-popeyes-chicken-with-crispy-potatoes',
 'https://www.gousto.co.uk/cookbook/chicken-thigh-recipes',
 'https://www.gousto.co.uk/cookbook/chickpea-recipes',
 'https://www.gousto.co.uk/cookbook/chilli-recipes',
 'https://www.gousto.co.uk/cookbook/chinese-recipes',
 'https://www.gousto.co.uk/cookbook/christmas-inspired',
 'https://www.gousto.co.uk/cookbook/christmas-recipes',
 'https://www.gousto.co.uk/cookbook/cod-recipes',
 'https://www.gousto.co.uk/cookbook/curry-recipes',
 'https://www.gousto.co.uk/cookbook/dairy-free',
 'https://www.gousto.co.uk/cookbook/dairy-free-recipes',
 'https://www.gousto.co.uk/cookbook/diy-pizza-base',
 'https://www.gousto.co.uk/cookbook/easter',
 'https://www.gousto.co.uk/cookbook/egg-recipes',
 'https://www.gousto.co.uk/cookbook/everyday-favourites',
 'https://www.gousto.co.uk/cookbook/family-classics',
 'https://www.gousto.co.uk/cookbook/fathers-day',
 'https://www.gousto.co.uk/cookbook/fish-recipes',
 'https://www.gousto.co.uk/cookbook/fish-recipes/cheesy-cod-spinach-gratin-with-crispy-potatoes',
 'https://www.gousto.co.uk/cookbook/flavour-saviour',
 'https://www.gousto.co.uk/cookbook/flavours-of-mexico',
 'https://www.gousto.co.uk/cookbook/french-recipes',
 'https://www.gousto.co.uk/cookbook/gluten-free',
 'https://www.gousto.co.uk/cookbook/gluten-free-recipes',
 'https://www.gousto.co.uk/cookbook/gousto-x-marmite',
 'https://www.gousto.co.uk/cookbook/greek-recipes',
 'https://www.gousto.co.uk/cookbook/haddock-recipes',
 'https://www.gousto.co.uk/cookbook/halloumi-recipes',
 'https://www.gousto.co.uk/cookbook/healthy-choices',
 'https://www.gousto.co.uk/cookbook/indian-recipes',
 'https://www.gousto.co.uk/cookbook/indulge',
 'https://www.gousto.co.uk/cookbook/italian-recipes',
 'https://www.gousto.co.uk/cookbook/japanese-recipes',
 'https://www.gousto.co.uk/cookbook/kale-recipes',
 'https://www.gousto.co.uk/cookbook/lamb-recipes',
 'https://www.gousto.co.uk/cookbook/lean-in-15',
 'https://www.gousto.co.uk/cookbook/leek-recipes',
 'https://www.gousto.co.uk/cookbook/lentil-recipes',
 'https://www.gousto.co.uk/cookbook/lighter',
 'https://www.gousto.co.uk/cookbook/low-calorie-recipes',
 'https://www.gousto.co.uk/cookbook/meatball-recipes',
 'https://www.gousto.co.uk/cookbook/mediterranean-recipes',
 'https://www.gousto.co.uk/cookbook/mexican-recipes',
 'https://www.gousto.co.uk/cookbook/mince-recipes',
 'https://www.gousto.co.uk/cookbook/moroccan-recipes',
 'https://www.gousto.co.uk/cookbook/mushroom-recipes',
 'https://www.gousto.co.uk/cookbook/noodle-recipes',
 'https://www.gousto.co.uk/cookbook/one-pot-meals',
 'https://www.gousto.co.uk/cookbook/other-meats',
 'https://www.gousto.co.uk/cookbook/oven-ready',
 'https://www.gousto.co.uk/cookbook/pasta-recipes',
 'https://www.gousto.co.uk/cookbook/plant-based',
 'https://www.gousto.co.uk/cookbook/plant-based-recipes',
 'https://www.gousto.co.uk/cookbook/plant-bistro',
 'https://www.gousto.co.uk/cookbook/pork-fillet-recipes',
 'https://www.gousto.co.uk/cookbook/pork-recipes',
 'https://www.gousto.co.uk/cookbook/pork-recipes/sticky-chinese-pork-chops-sesame-pak-choi',
 'https://www.gousto.co.uk/cookbook/prawn-recipes',
 'https://www.gousto.co.uk/cookbook/prepped-in-5',
 'https://www.gousto.co.uk/cookbook/pumpkin-recipes',
 'https://www.gousto.co.uk/cookbook/quinoa-recipes',
 'https://www.gousto.co.uk/cookbook/recipes',
 'https://www.gousto.co.uk/cookbook/recipes/beetroot-wellington-with-onion-gravy',
 'https://www.gousto.co.uk/cookbook/recipes/cheesy-pizza-topped-chicken-with-mixed-salad',
 'https://www.gousto.co.uk/cookbook/recipes/chicken-date-tamarind-curry-with-brown-rice',
 'https://www.gousto.co.uk/cookbook/recipes/plant-based-butternut-squash-mac-n-cheeze',
 'https://www.gousto.co.uk/cookbook/recipes/plant-based-smoky-bacon-fried-rice',
 'https://www.gousto.co.uk/cookbook/recipes/speedy-ginger-chilli-prawns-with-rice',
 'https://www.gousto.co.uk/cookbook/recipes/speedy-harissa-tomato-chicken-with-couscous',
 'https://www.gousto.co.uk/cookbook/recipes/spicy-rainbow-veg-black-bean-curry',
 'https://www.gousto.co.uk/cookbook/recipes/tofu-fingers-minty-peas-tartare-sauce',
 'https://www.gousto.co.uk/cookbook/red-cabbage-recipes',
 'https://www.gousto.co.uk/cookbook/rice-recipes',
 'https://www.gousto.co.uk/cookbook/risotto-recipes',
 'https://www.gousto.co.uk/cookbook/salad-recipes',
 'https://www.gousto.co.uk/cookbook/salmon-recipes',
 'https://www.gousto.co.uk/cookbook/sausage-recipes',
 'https://www.gousto.co.uk/cookbook/sea-bass-recipes',
 'https://www.gousto.co.uk/cookbook/soup-recipes',
 'https://www.gousto.co.uk/cookbook/spanish-recipes',
 'https://www.gousto.co.uk/cookbook/steak-recipes',
 'https://www.gousto.co.uk/cookbook/stew-recipes',
 'https://www.gousto.co.uk/cookbook/stir-fry-recipes',
 'https://www.gousto.co.uk/cookbook/sweet-potato-recipes',
 'https://www.gousto.co.uk/cookbook/thai-recipes',
 'https://www.gousto.co.uk/cookbook/tofu-recipes',
 'https://www.gousto.co.uk/cookbook/turkey-recipes',
 'https://www.gousto.co.uk/cookbook/turkish-recipes',
 'https://www.gousto.co.uk/cookbook/valentines-day',
 'https://www.gousto.co.uk/cookbook/vegan-recipes',
 'https://www.gousto.co.uk/cookbook/vegan-recipes/tofu-nuggets-bbq-beans-wedges',
 'https://www.gousto.co.uk/cookbook/vegetarian',
 'https://www.gousto.co.uk/cookbook/vegetarian-recipes',
 'https://www.gousto.co.uk/cookbook/vietnamese-recipes',
 'https://www.gousto.co.uk/cookbook/world-food']

In [24]:
urls = ['https://www.gousto.co.uk/cookbook/10-minute-meals']

In [26]:
def get_recipe_urls_from_category(category_url, driver):
    driver.get(category_url)
    time.sleep(2) #allow page to load.
    recipe_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/cookbook/']") #find all links containing cookbook
    urls = []
    for link in recipe_links:
        urls.append(link.get_attribute("href"))
    return urls

def scrape_all_gousto_recipes(category_urls, driver):
    visited_urls = set()
    all_recipe_data = []

    for category_url in category_urls:
        recipe_urls = get_recipe_urls_from_category(category_url, driver)
        for recipe_url in recipe_urls:
            if recipe_url not in visited_urls:
                visited_urls.add(recipe_url)
                recipe_data = scrape_gousto_recipe(recipe_url, driver) #your existing scrape function.
                if recipe_data:
                    all_recipe_data.append(recipe_data)
                time.sleep(1) #rate limiting.
    return all_recipe_data

In [27]:
service = Service(executable_path='chromedriver-win64/chromedriver.exe') # Replace with your chromedriver path.
driver = webdriver.Chrome(service=service)
category_urls = urls

try:
    all_recipe_data = scrape_all_gousto_recipes(category_urls, driver)
    #store all_recipe_data into the database.
finally:
    driver.quit()

In [35]:
for i in all_recipe_data:
    if i['ingredients']:
        insert_recipe_data(i, 'C:/Users/brxce/Documents/Python Projects/mealplanner/recipes.sqlite')

[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]
[('recipes',), ('ingredients',), ('recipe_ingredients',)]


In [30]:
insert_recipe_data(all_recipe_data, 'C:/Users/brxce/Documents/Python Projects/mealplanner/recipes.sqlite')

[('recipes',), ('ingredients',), ('recipe_ingredients',)]


TypeError: list indices must be integers or slices, not str

In [41]:
service = Service(executable_path='chromedriver-win64/chromedriver.exe') # Replace with your chromedriver path.
driver = webdriver.Chrome(service=service)
url = 'https://www.gousto.co.uk/cookbook/20-minute-meals'
scrape_gousto_recipe(url, driver)

{'title': 'Speedy',
 'ingredients': '',
 'url': 'https://www.gousto.co.uk/cookbook/20-minute-meals'}