In [1]:
# autoreload mode 2: re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# dotenv extension: load environment variables from a local .env file.
# NOTE(review): no cell visible here reads an environment variable — confirm this is still needed.
%load_ext dotenv
%dotenv

In [2]:
import os
import json
import time
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright

In [3]:
# Crawl configuration shared by the cells below.
url = "https://www.bbcgoodfood.com"  # site root; search-page URLs are built from it
base_dir = 'data/raw'                # directory that receives the scraped JSON file
delay_seconds = 5                    # pause (in seconds) after every page load

# exist_ok=True creates the directory (and any missing parents) only when needed,
# without a separate existence check that could race with another process.
os.makedirs(base_dir, exist_ok=True)

In [4]:
# Start the Playwright driver and open a visible (headless=False) Chromium window.
# `playwright`, `browser` and `page` are reused by the crawling cells below and are
# released by the final cleanup cell, so this cell must run before any cell that
# touches `page`.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()

In [5]:
# Absolute XPath of the recipe link inside each search-result card, copied from the
# browser's "Inspect" tool. It is tied to the exact page layout, so it will match
# nothing if the site's markup changes.
# NOTE(review): consider a shorter, attribute-based selector that is less brittle.
post_xpath = "/html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div/div[1]/div/article/div[2]/a"

In [6]:
recipes_links = []
page_id = 1
page_url = page.url
while page_id < 101:
    page_url = f'{url}/search?page={page_id}'
    # print(page_url)
    await page.goto(page_url)
    await page.wait_for_load_state()
    time.sleep(delay_seconds)
    if page.url != page_url:
        break
    for elm in await page.locator("xpath=" + post_xpath).element_handles():
        post_url = urljoin(page_url, await elm.get_attribute("href"))
        if "premium" not in post_url:
            recipes_links.append(post_url)
    page_id += 1
print(f'{page_id - 1} pages added to recipes_links')
print(len(recipes_links), len(set(recipes_links)))

100 pages added to recipes_links
2617 2617


In [7]:
# Sanity check: display the first ten collected recipe URLs.
recipes_links[:10]

['https://www.bbcgoodfood.com/recipes/medal-cookies',
 'https://www.bbcgoodfood.com/recipes/next-level-moussaka',
 'https://www.bbcgoodfood.com/recipes/crispy-chipotle-chicken-fajitas',
 'https://www.bbcgoodfood.com/recipes/mango-sgroppino',
 'https://www.bbcgoodfood.com/recipes/green-chicken-salad',
 'https://www.bbcgoodfood.com/recipes/spicy-cucumber-watermelon-salad',
 'https://www.bbcgoodfood.com/recipes/mediterranean-salad-with-hummus-dressing',
 'https://www.bbcgoodfood.com/recipes/summer-tomato-cheese-toastie',
 'https://www.bbcgoodfood.com/recipes/beef-red-chimichurri-quinoa',
 'https://www.bbcgoodfood.com/recipes/beef-sandwich-with-pink-pickled-onions']

In [8]:
def formatTitle(txt):
    """Turn a hyphenated slug into a lowercase, space-separated title.

    Every "-" in ``txt`` becomes a single space and the result is lowercased,
    e.g. ``"Medal-Cookies"`` -> ``"medal cookies"``.
    """
    words = txt.split("-")
    spaced = " ".join(words)
    return spaced.lower()

In [9]:
def save_page(data, path: str, encoding: str = "utf-8") -> None:
    """Serialize ``data`` as pretty-printed JSON to the file at ``path``.

    Args:
        data: JSON-serializable object (the notebook passes a list of recipe dicts).
        path: Destination file; an existing file is overwritten.
        encoding: Text encoding used for the output file.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the file cannot be opened for writing.
    """
    # Honor the caller-supplied encoding; it was previously ignored in favor of a
    # hard-coded "utf-8".
    with open(path, "w", encoding=encoding) as f:
        # ensure_ascii=False keeps non-ASCII characters readable instead of \u escapes.
        json.dump(data, f, ensure_ascii=False, indent=2)

In [10]:
def get_title(url):
    """Derive a human-readable recipe title from a recipe URL.

    The title is taken from the second path segment of ``url`` (the slug in
    ``/recipes/<slug>``) and normalized with ``formatTitle``.

    Args:
        url: Absolute recipe URL, e.g. ``https://host/recipes/medal-cookies``.

    Returns:
        The formatted title, e.g. ``"medal cookies"``. When the path has fewer
        than two segments, the last available segment is used instead (an empty
        string for a bare host), rather than raising ``IndexError``.
    """
    # "/recipes/medal-cookies".split('/') -> ['', 'recipes', 'medal-cookies']
    path_components = urlparse(url).path.split('/')
    # Index 2 is the slug for regular recipe URLs; fall back to the last segment
    # for shorter paths so one odd link cannot abort a long crawl.
    slug = path_components[2] if len(path_components) > 2 else path_components[-1]
    return formatTitle(slug)


### Get the HTML for each recipe

In [12]:
all_recipes = []
path = f'{base_dir}/all_recipes.json'

for ix, recipe_link in enumerate(recipes_links):
    print(ix, recipe_link)
    title = get_title(recipe_link)
    try:
        await page.goto(recipe_link)
        await page.wait_for_load_state()
    except Exception as e:
        print(f"Error loading {recipe_link}: {e}")
        continue  # Continuar con la siguiente URL si hay un error
    time.sleep(delay_seconds)
    html = await page.content()
    print(len(html))
    
    # Add the recipe to the list of all recipes
    recipe_info = {
        "id": ix,
        "title": title,
        "url": recipe_link,
        "html": html,
    }
    
    all_recipes.append(recipe_info)
    
    # if ix > 10:
    #     break

# Save all the recipes in one json file
save_page(all_recipes, path)

0 https://www.bbcgoodfood.com/recipes/medal-cookies
528207
1 https://www.bbcgoodfood.com/recipes/next-level-moussaka
518448
2 https://www.bbcgoodfood.com/recipes/crispy-chipotle-chicken-fajitas
521737
3 https://www.bbcgoodfood.com/recipes/mango-sgroppino
484091
4 https://www.bbcgoodfood.com/recipes/green-chicken-salad
499546
5 https://www.bbcgoodfood.com/recipes/spicy-cucumber-watermelon-salad
515926
6 https://www.bbcgoodfood.com/recipes/mediterranean-salad-with-hummus-dressing
512537
7 https://www.bbcgoodfood.com/recipes/summer-tomato-cheese-toastie
502292
8 https://www.bbcgoodfood.com/recipes/beef-red-chimichurri-quinoa
504568
9 https://www.bbcgoodfood.com/recipes/beef-sandwich-with-pink-pickled-onions
495828
10 https://www.bbcgoodfood.com/recipes/vermicelli-noodle-beef-salad
495833
11 https://www.bbcgoodfood.com/recipes/chocolate-chip-pecan-butternut-bread
510379
12 https://www.bbcgoodfood.com/recipes/warm-trout-melon-salad-with-lime-chilli-dressing
514223
13 https://www.bbcgoodfood

In [13]:
# Release browser resources: close the Chromium window first, then stop the
# Playwright driver that was started in the launch cell.
await browser.close()
await playwright.stop()