## Web Scraping with Playwright

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before code is executed.
%load_ext autoreload
%autoreload 2
# Load the dotenv IPython extension and run it (presumably python-dotenv,
# reading variables from a local .env file -- confirm the file exists).
%load_ext dotenv
%dotenv

In [2]:
import os
import time
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright

In [3]:
# Scraper configuration shared by the cells below.
url = "https://www.bbcgoodfood.com"  # site root that the search URLs are built from
base_dir = 'data/raw'                # local directory, created below if missing
delay_seconds = 15                   # pause applied after every page load, in seconds

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(base_dir, exist_ok=True)

In [4]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()

### Get all recipe categories by meal type

In [5]:
# Absolute XPath, copied from the browser's element inspector, that matches
# the <li> entries read below for their "data-item-name" attribute (one per
# meal type). NOTE(review): an absolute path like this presumably breaks as
# soon as the site layout changes -- re-check it in the inspector if the
# locator returns no elements.
post_xpath = "/html/body/div/div[4]/main/div[2]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/ul/li"

### Format text function

In [6]:
def formatText(txt):
    """Convert a display label into a lowercase, hyphen-separated token.

    Each space character in ``txt`` is replaced by a single hyphen (runs of
    spaces therefore yield runs of hyphens) and the result is lowercased.

    Args:
        txt: The label to convert, e.g. ``"Afternoon Tea"``.

    Returns:
        The converted string, e.g. ``"afternoon-tea"``.
    """
    hyphenated = "-".join(txt.split(" "))
    return hyphenated.lower()

In [7]:
post_links = []
page_url = page.url
meal_types = [] # avoid duplicates 
while True:
    page_url = f'{url}/search?'
    print(page_url)
    await page.goto(page_url)
    await page.wait_for_load_state()
    time.sleep(delay_seconds)
    # Indicator of whether a new meal type was found in this iteration
    found_new_meal_type = False

    # Iterator to find meal types from filter input
    for elm in await page.locator("xpath="+post_xpath).element_handles():
        item_name = await elm.get_attribute("data-item-name")
        meal_type = formatText(item_name)
        if meal_type not in meal_types:
            meal_types.append(meal_type)
            found_new_meal_type = True
            post_url = urljoin(page_url, f'search?tab=recipe&mealType={meal_type}')
            post_links.append(post_url)
    
    # If no new meal type was found, exit the loop
    if not found_new_meal_type:
        break
print(len(post_links), len(set(post_links)), len(meal_types))

https://www.bbcgoodfood.com/search?
https://www.bbcgoodfood.com/search?
26 26 26


In [8]:
# Display the per-meal-type search URLs for a visual check.
post_links

['https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=breads',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=breakfast',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=brunch',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=buffet',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=canapes',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=cheese-course',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=cocktails',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=condiment',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=dessert',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=dinner',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=drink',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=fish-course',
 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=hdpsummer24',
 'https://www.bbcgoodfood.com/search?tab=r

In [9]:
# One record per meal type: its name, its search URL, and an empty list that
# is filled with the recipe links found for it later on.
recipes_links = [
    {"meal": category, "link": search_url, "recipes_links": []}
    for category, search_url in zip(meal_types, post_links)
]

In [10]:
# Display the per-meal-type records for a visual check.
recipes_links

[{'meal': 'afternoon-tea',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea',
  'recipes_links': []},
 {'meal': 'breads',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=breads',
  'recipes_links': []},
 {'meal': 'breakfast',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=breakfast',
  'recipes_links': []},
 {'meal': 'brunch',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=brunch',
  'recipes_links': []},
 {'meal': 'buffet',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=buffet',
  'recipes_links': []},
 {'meal': 'canapes',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=canapes',
  'recipes_links': []},
 {'meal': 'cheese-course',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=cheese-course',
  'recipes_links': []},
 {'meal': 'cocktails',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=cocktails',
  'recipes_links': []},
 {'mea

### Get All Recipes Links

In [16]:
# XPath for the <a> element of each recipe card on a search results page.
# It is the sample paths listed below with the positional indexes of the
# varying <div> levels removed, so one expression covers all of them.
recipes_xpath = "/html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div/div[1]/div/article/div[2]/a"

# Sample absolute XPaths copied from individual recipe cards; they differ
# only in the indexes of the two <div> levels generalized above.
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[3]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[1]/div[1]/div[2]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[1]/div[1]/div[3]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[3]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[6]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[2]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[3]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[6]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[2]/div[1]/div[2]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[1]/div[1]/div[2]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[1]/div[1]/div[5]/article/div[2]/a
# /html/body/div/div[4]/main/div[2]/div/div[5]/div/div[1]/div[1]/div[1]/div[2]/article/div[2]/a

### Get recipe links for every meal type

In [12]:
for item in recipes_links:
    meal = item['meal']
    base_url = item['link']
    page_number = 1

    while True:
        page_url = f'{base_url}&page={page_number}'
        print(page_url)
        await page.goto(page_url)
        await page.wait_for_load_state()
        time.sleep(delay_seconds)

        # check if the page has loaded correctly
        if page.url != page_url:
            break
        
        print("loading...")
        recipe_count = 0
        for elm in await page.locator("xpath=" + post_xpath).element_handles():
            post_url = urljoin(page_url, await elm.get_attribute("href"))
            name = ' '.join(post_url[post_url.rfind("/") + 1:].split('-')).title()
            item['recipes_links'].append({'name': name, 'link': post_url})
            recipe_count += 1
        
        print(f"{recipe_count} recipes added to '{meal}'.")

        if recipe_count == 0:
            break

        page_number += 1

print("\nDone!")

https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=1
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=2
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=3
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=4
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=5
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=6
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=7
loading...
26 recipes added to 'afternoon-tea'.
https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=8
loading...
26 recipes added to 'afternoon-tea'.
https://

In [14]:
recipes_links

[{'meal': 'afternoon-tea',
  'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea',
  'recipes_links': [{'name': 'Search?Tab=Recipe&Mealtype=Afternoon Tea&Page=1',
    'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=1'},
   {'name': 'Search?Tab=Recipe&Mealtype=Afternoon Tea&Page=1',
    'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=1'},
   {'name': 'Search?Tab=Recipe&Mealtype=Afternoon Tea&Page=1',
    'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=1'},
   {'name': 'Search?Tab=Recipe&Mealtype=Afternoon Tea&Page=1',
    'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=1'},
   {'name': 'Search?Tab=Recipe&Mealtype=Afternoon Tea&Page=1',
    'link': 'https://www.bbcgoodfood.com/search?tab=recipe&mealType=afternoon-tea&page=1'},
   {'name': 'Search?Tab=Recipe&Mealtype=Afternoon Tea&Page=1',
    'link': 'https://www.bbcgoodfood