# Data Extraction Pipeline

In [2]:
import requests
import time
import re
# import scrapy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

## Helper Functions

For WordPress Recipe Maker Websites

In [3]:
def extract_nutrition(url_soup) -> pd.DataFrame:
    class_regex = re.compile("wprm-nutrition-label-text-nutrition-container wprm-nutrition-label-text-nutrition-container-")
    nutrition_regex = re.compile("wprm-nutrition-label-container wprm-nutrition-label-container-simple wprm-block-text-.*")

    nutrition_parent = url_soup.find("div", attrs={"class": nutrition_regex})

    nutrition_df = pd.DataFrame([])
    if nutrition_parent:
        nutritions = nutrition_parent.find_all("span", attrs={"class": class_regex})

        for nutrition in nutritions:
            # Extract the nutrition label text
            nutrition_name = re.sub("wprm-nutrition-label-text-nutrition-container-", "", nutrition.get("class")[1]).strip()
            nutrition_amount = nutrition.find("span", attrs={"class": "wprm-nutrition-label-text-nutrition-value"}).text.strip()
            nutrition_df = pd.concat([nutrition_df, pd.DataFrame({nutrition_name: [nutrition_amount]})], axis=1)
    
    return nutrition_df

def extract_instructions(url_soup) -> int:
    instructions = url_soup.find_all("div", attrs={"class": "wprm-recipe-instruction-group"})

    total_steps = 0
    if instructions:
        instruction_regex = re.compile("^wprm-recipe-[0-9]*-step-[0-9]-[0-9]$")

        for instruction in instructions:
            num_steps = len(instruction.find_all("li", attrs={"id": instruction_regex}))
            total_steps += num_steps
    else:
        print(f"    No instructions found for recipe")
    
    return total_steps

def extract_times(url_soup) -> pd.DataFrame:
    time_regex = re.compile("wprm-recipe-details wprm-recipe-details-[a-z]* wprm-recipe-.*")

    time_info = url_soup.find_all("span", attrs={"class": time_regex})

    compiled_times = {}
    if time_info:
        for time in time_info:
            time_desc = time.get("class")[2][12:] # Get time type (prep/cook/total)
            current_time = time.text # Get time in string
            
            # Convert current time into minutes
            # Initialize total time in minutes if it hasn't exist
            try:
                compiled_times[time_desc]
            except: 
                compiled_times[time_desc] = 0

            minutes = re.search("(.*) minute(s)?", current_time)
            hours = re.search("(.*) hour(s)?", current_time)

            if minutes:
                compiled_times[time_desc] += int(minutes.group(1))
                
            if hours:
                compiled_times[time_desc] += int(hours.group(1)) * 60
    else:
        print(f"    No time info found for recipe")
                
    return pd.DataFrame([compiled_times])

## The Woks of Life

### Get all recipe links

In [None]:
pages = range(1, 94) # Adjust this range to scrape more or fewer pages
link_img_df = pd.DataFrame([])

for page in pages:
    page_url = f"https://thewoksoflife.com/blog/page/{page}/"

    response = requests.get(page_url)
    main_soup = BeautifulSoup(response.text, 'html.parser')

    # Get all recipe links on the page
    df = pd.DataFrame([])
    for recipe in main_soup.find_all('a', attrs={"target": "_self"}):
        img = recipe.find("img")

        # Check if image exists, if not, skip the recipe
        if not img:
            continue

        recipe_url = recipe.get("href")
        img_url = img.get("data-lazy-src") or img.get("src")
        df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)

    # Append to main dataframe
    link_img_df = pd.concat([link_img_df, df], ignore_index=True)
    print(f"Found {len(df)} recipe links in category")

print(f"Total recipe links found: {len(link_img_df)}")
link_img_df

     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
Found 18 recipe links in category
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     No image found
     N

Unnamed: 0,recipe_url,img_url
0,https://thewoksoflife.com/chinese-beef-stick-s...,https://thewoksoflife.com/wp-content/uploads/2...
1,https://thewoksoflife.com/chinese-meat-pie/,https://thewoksoflife.com/wp-content/uploads/2...
2,https://thewoksoflife.com/edamame-hummus/,https://thewoksoflife.com/wp-content/uploads/2...
3,https://thewoksoflife.com/chinese-eggs-puff-pa...,https://thewoksoflife.com/wp-content/uploads/2...
4,https://thewoksoflife.com/string-bean-stir-fry...,https://thewoksoflife.com/wp-content/uploads/2...
...,...,...
1668,https://thewoksoflife.com/from-expat-to-local-...,https://thewoksoflife.com/wp-content/uploads/2...
1669,https://thewoksoflife.com/the-one-phrase-youll...,https://thewoksoflife.com/wp-content/uploads/2...
1670,https://thewoksoflife.com/stir-fried-snow-pea-...,https://thewoksoflife.com/wp-content/uploads/2...
1671,https://thewoksoflife.com/how-did-i-end-up-in-...,https://thewoksoflife.com/wp-content/uploads/2...


### Scrape all recipe links

In [13]:
woks_of_life_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        woks_of_life_df = pd.concat([woks_of_life_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

woks_of_life_df

Scraping recipe: https://thewoksoflife.com/chinese-beef-stick-skewers/, 1/1673
Scraping recipe: https://thewoksoflife.com/chinese-meat-pie/, 2/1673
Scraping recipe: https://thewoksoflife.com/edamame-hummus/, 3/1673
Scraping recipe: https://thewoksoflife.com/chinese-eggs-puff-pastry/, 4/1673
Scraping recipe: https://thewoksoflife.com/string-bean-stir-fry-pork-zha-cai/, 5/1673
Scraping recipe: https://thewoksoflife.com/miso-caesar-dressing/, 6/1673
Scraping recipe: https://thewoksoflife.com/shaking-beef-recipe-vietnamese-bo-luc-lac/, 7/1673
Scraping recipe: https://thewoksoflife.com/steamed-fish-black-bean-sauce/, 8/1673
Scraping recipe: https://thewoksoflife.com/green-sauce-recipe/, 9/1673
Scraping recipe: https://thewoksoflife.com/cold-tossed-tofu/, 10/1673
Scraping recipe: https://thewoksoflife.com/pork-dill-dumplings/, 11/1673
Scraping recipe: https://thewoksoflife.com/coconut-chia-pudding-grass-jelly/, 12/1673
Scraping recipe: https://thewoksoflife.com/cantonese-braised-ribs-bitter-

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,custom_time,total_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,cook_time,serving_size
0,Chinese Beef on a Stick (Beef Skewers),https://thewoksoflife.com/chinese-beef-stick-s...,"[flank steak, sugar, cornstarch, garlic powder...",https://thewoksoflife.com/wp-content/uploads/2...,4,15.0,240.0,255.0,137,3,...,275,271,0.2,1,0.1,0.1,25,1,,
1,"Chinese Meat Pie, 千层肉饼",https://thewoksoflife.com/chinese-meat-pie/,"[all-purpose flour, boiling water, room temper...",https://thewoksoflife.com/wp-content/uploads/2...,7,10.0,60.0,110.0,498,51,...,483,392,3,2,338,7,48,4,40.0,
2,Edamame Hummus,https://thewoksoflife.com/edamame-hummus/,"[frozen shelled edamame, tahini, water, lemon,...",https://thewoksoflife.com/wp-content/uploads/2...,3,5.0,,15.0,193,8,...,299,251,3,1,126,14,49,2,10.0,
3,Chinese Egg Puff Pastry,https://thewoksoflife.com/chinese-eggs-puff-pa...,"[puff pastry dough, eggs, water, toasted sesam...",https://thewoksoflife.com/wp-content/uploads/2...,6,15.0,,45.0,461,32,...,494,159,2,1,364,2,69,3,30.0,
4,String Bean Stir-Fry with Pork & Zha cai,https://thewoksoflife.com/string-bean-stir-fry...,"[ground pork, cornstarch, white pepper, water,...",https://thewoksoflife.com/wp-content/uploads/2...,4,20.0,,35.0,239,11,...,468,392,3,4,816,16,58,2,15.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,Braised Oxtails,https://thewoksoflife.com/braised-oxtails/,"[oxtails, oil, fresh ginger, garlic, Shaoxing ...",https://thewoksoflife.com/wp-content/uploads/2...,4,15.0,,135.0,474,4,...,962,38,1,2,,0.7,43,7.2,120.0,
1306,Tea Eggs,https://thewoksoflife.com/tea-eggs/,"[eggs, water, dry Chinese tea leaves of your c...",https://thewoksoflife.com/wp-content/uploads/2...,3,720.0,,765.0,66,1,...,276,67,1,1,238,1,30,1,45.0,
1307,Roast Pork with Five-Spice,https://thewoksoflife.com/roast-pork-with-five...,"[pork belly, salt, sugar, five spice powder, g...",https://thewoksoflife.com/wp-content/uploads/2...,4,60.0,,135.0,,,...,,,,,,,,,75.0,
1308,Chinese Banquet Fried Rice,https://thewoksoflife.com/banquet-fried-rice/,"[cooked rice, vegetable oil, eggs, handful of ...",https://thewoksoflife.com/wp-content/uploads/2...,6,10.0,,25.0,391,54,...,726,262,3,4,3945,13.6,51,1.5,15.0,


In [19]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
woks_of_life_df.img_url = woks_of_life_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)
woks_of_life_df.img_url

0       https://thewoksoflife.com/wp-content/uploads/2...
1       https://thewoksoflife.com/wp-content/uploads/2...
2       https://thewoksoflife.com/wp-content/uploads/2...
3       https://thewoksoflife.com/wp-content/uploads/2...
4       https://thewoksoflife.com/wp-content/uploads/2...
                              ...                        
1305    https://thewoksoflife.com/wp-content/uploads/2...
1306    https://thewoksoflife.com/wp-content/uploads/2...
1307    https://thewoksoflife.com/wp-content/uploads/2...
1308    https://thewoksoflife.com/wp-content/uploads/2...
1309    https://thewoksoflife.com/wp-content/uploads/2...
Name: img_url, Length: 1310, dtype: object

In [20]:
str_cols = woks_of_life_df.columns[:4]
numeric_cols = woks_of_life_df.columns[4:]
woks_of_life_df[numeric_cols] = woks_of_life_df[numeric_cols].astype(np.float64)
woks_of_life_df[str_cols] = woks_of_life_df[str_cols].astype("string")
woks_of_life_df.to_pickle("../data/raw/woks_of_life_recipes.pkl", protocol=4)

## Just One Cookbook

### Get all recipe links

In [23]:
pages = range(1, 22) # Adjust this range to scrape more or fewer pages
link_img_df = pd.DataFrame([])

for page in pages:
    page_url = f"https://www.justonecookbook.com/categories/recipes/?fwp_paged={page}/" 

    response = requests.get(page_url)
    main_soup = BeautifulSoup(response.text, 'html.parser')

    # Get all recipe links on the page
    df = pd.DataFrame([])
    for recipe in main_soup.find_all('div', attrs={"class": "gridlink"}):
        img = recipe.find("img")

        # Check if image exists, if not, skip the recipe
        if not img:
            continue

        recipe_url = recipe.find("a").get("href")
        img_url = img.get("data-lazy-src") or img.get("src")
        df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)

    # Append to main dataframe
    link_img_df = pd.concat([link_img_df, df], ignore_index=True)
    print(f"Found {len(df)} recipe links in category")

print(f"Total recipe links found: {len(link_img_df)}")
link_img_df

Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Found 48 recipe links in category
Total recipe links found: 1008


Unnamed: 0,recipe_url,img_url
0,https://www.justonecookbook.com/matcha-mochi-w...,https://www.justonecookbook.com/wp-content/upl...
1,https://www.justonecookbook.com/salmon-foil/,https://www.justonecookbook.com/wp-content/upl...
2,https://www.justonecookbook.com/how-to-make-cu...,https://www.justonecookbook.com/wp-content/upl...
3,https://www.justonecookbook.com/matcha-butter-...,https://www.justonecookbook.com/wp-content/upl...
4,https://www.justonecookbook.com/okonomiyaki/,https://www.justonecookbook.com/wp-content/upl...
...,...,...
1003,https://www.justonecookbook.com/potato-leek-soup/,https://www.justonecookbook.com/wp-content/upl...
1004,https://www.justonecookbook.com/shrimp-celery-...,https://www.justonecookbook.com/wp-content/upl...
1005,https://www.justonecookbook.com/easy-paella/,https://www.justonecookbook.com/wp-content/upl...
1006,https://www.justonecookbook.com/meat-doria/,https://www.justonecookbook.com/wp-content/upl...


### Scrape all recipe links

In [25]:
just_one_cookbook_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        just_one_cookbook_df = pd.concat([just_one_cookbook_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

just_one_cookbook_df

Scraping recipe: https://www.justonecookbook.com/matcha-mochi-waffles/, 1/1008
Scraping recipe: https://www.justonecookbook.com/salmon-foil/, 2/1008
Scraping recipe: https://www.justonecookbook.com/how-to-make-curry-roux/, 3/1008
Scraping recipe: https://www.justonecookbook.com/matcha-butter-mochi/, 4/1008
Scraping recipe: https://www.justonecookbook.com/okonomiyaki/, 5/1008
Scraping recipe: https://www.justonecookbook.com/salmon-and-ikura-dashi-chazuke/, 6/1008
Scraping recipe: https://www.justonecookbook.com/loco-moco/, 7/1008
Scraping recipe: https://www.justonecookbook.com/takoyaki-recipe/, 8/1008
Scraping recipe: https://www.justonecookbook.com/plum-cake/, 9/1008
Scraping recipe: https://www.justonecookbook.com/stir-fried-pork-liver-and-garlic-chives/, 10/1008
Scraping recipe: https://www.justonecookbook.com/beef-udon/, 11/1008
Scraping recipe: https://www.justonecookbook.com/nitsuke-recipe/, 12/1008
Scraping recipe: https://www.justonecookbook.com/crispy-baked-chicken/, 13/1008
S

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,total_time,prep_time,cook_time,calories,carbohydrates,...,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,polyunsaturated_fat,monounsaturated_fat,custom_time
0,Matcha Mochi Waffles,https://www.justonecookbook.com/matcha-mochi-w...,"[unsalted butter, matcha (green tea powder), m...",https://www.justonecookbook.com/wp-content/upl...,20,105,15.0,25.0,,,...,,,,,,,,,,
1,Salmon in Foil (Video) 鮭のホイル焼き,https://www.justonecookbook.com/salmon-foil/,"[skin-on salmon fillets, Diamond Crystal koshe...",https://www.justonecookbook.com/wp-content/upl...,23,160,8.0,17.0,314,13,...,3,5,7459,6,43,1,,,,
2,How to Make Japanese Curry Roux (Video) カレールーの作り方,https://www.justonecookbook.com/how-to-make-cu...,"[unsalted butter, all-purpose flour (plain flo...",https://www.justonecookbook.com/wp-content/upl...,16,560,5.0,30.0,1175,92,...,13,1,3132,4,155,12,1,4,22,
3,Matcha Butter Mochi,https://www.justonecookbook.com/matcha-butter-...,"[matcha (green tea powder), unsalted butter, m...",https://www.justonecookbook.com/wp-content/upl...,22,372,20.0,50.0,,,...,,,,,,,,,,60.0
4,Okonomiyaki Recipe (Video) お好み焼き,https://www.justonecookbook.com/okonomiyaki/,"[all-purpose flour (plain flour), Diamond Crys...",https://www.justonecookbook.com/wp-content/upl...,25,240,15.0,25.0,725,62,...,9,12,502,73,153,4,,5,16,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,Potato Leek Soup ポテトとリークのスープ,https://www.justonecookbook.com/potato-leek-soup/,"[unsalted butter, russet potato, leek, vegetab...",https://www.justonecookbook.com/wp-content/upl...,7,160,10.0,30.0,181,19,...,1,7,744,6,161,1,,1,2,
974,Easy Shrimp and Celery Stir-Fry,https://www.justonecookbook.com/shrimp-celery-...,"[celery, large egg (50 g each w/o shell), toas...",https://www.justonecookbook.com/wp-content/upl...,9,120,5.0,15.0,257,7,...,1,4,542,2,101,3,,5,5,
975,Easy Paella-Inspired Rice Casserole,https://www.justonecookbook.com/easy-paella/,"[manila clams, Spanish saffron, warm water, ch...",https://www.justonecookbook.com/wp-content/upl...,11,270,10.0,60.0,485,32,...,3,4,1867,52,179,5,,3,8,240.0
976,Meat Doria (Rice Gratin) ミートドリア,https://www.justonecookbook.com/meat-doria/,"[cooked Japanese short-grain rice, onion, cele...",https://www.justonecookbook.com/wp-content/upl...,14,260,15.0,45.0,764,66,...,3,12,3201,14,430,7,,2,16,


In [26]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
just_one_cookbook_df.img_url = just_one_cookbook_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)
just_one_cookbook_df.img_url

0      https://www.justonecookbook.com/wp-content/upl...
1      https://www.justonecookbook.com/wp-content/upl...
2      https://www.justonecookbook.com/wp-content/upl...
3      https://www.justonecookbook.com/wp-content/upl...
4      https://www.justonecookbook.com/wp-content/upl...
                             ...                        
973    https://www.justonecookbook.com/wp-content/upl...
974    https://www.justonecookbook.com/wp-content/upl...
975    https://www.justonecookbook.com/wp-content/upl...
976    https://www.justonecookbook.com/wp-content/upl...
977    https://www.justonecookbook.com/wp-content/upl...
Name: img_url, Length: 978, dtype: object

In [27]:
str_cols = just_one_cookbook_df.columns[:4]
numeric_cols = just_one_cookbook_df.columns[4:]
just_one_cookbook_df[numeric_cols] = just_one_cookbook_df[numeric_cols].astype(np.float64)
just_one_cookbook_df[str_cols] = just_one_cookbook_df[str_cols].astype("string")
just_one_cookbook_df.to_pickle("../data/raw/just_one_cookbook_recipes.pkl", protocol=4)

## RecipeTinEats

### Get all recipe and its image links

In [30]:
pages = range(1, 81) # Adjust this range to scrape more or fewer pages
link_img_df = pd.DataFrame([])

for page in pages:
    page_url = f"https://www.recipetineats.com/recipes/?fwp_paged={page}/" 

    response = requests.get(page_url)
    main_soup = BeautifulSoup(response.text, 'html.parser')

    # Get all recipe links on the page
    df = pd.DataFrame([])
    for recipe in main_soup.find_all('a', attrs={"class": "entry-image-link"}):
        img = recipe.find("img")

        # Check if image exists, if not, skip the recipe
        if not img:
            continue

        recipe_url = recipe.get("href")
        img_url = img.get("data-lazy-src") or img.get("src")
        df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)

    # Append to main dataframe
    link_img_df = pd.concat([link_img_df, df], ignore_index=True)
    print(f"Found {len(df)} recipe links in category")

print(f"Total recipe links found: {len(link_img_df)}")
link_img_df

Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recipe links in category
Found 20 recip

Unnamed: 0,recipe_url,img_url
0,https://www.recipetineats.com/birria-tacos/,https://www.recipetineats.com/tachyon/2025/09/...
1,https://www.recipetineats.com/seafood-chowder/,https://www.recipetineats.com/tachyon/2025/09/...
2,https://www.recipetineats.com/maple-pecan-pie-...,https://www.recipetineats.com/tachyon/2025/08/...
3,https://www.recipetineats.com/moroccan-spiral-...,https://www.recipetineats.com/tachyon/2025/08/...
4,https://www.recipetineats.com/crispy-chinese-l...,https://www.recipetineats.com/tachyon/2025/08/...
...,...,...
1595,https://www.recipetineats.com/chickpea-rice-pi...,https://www.recipetineats.com/tachyon/2014/06/...
1596,https://www.recipetineats.com/vegetarian-chow-...,https://www.recipetineats.com/tachyon/2014/06/...
1597,https://www.recipetineats.com/cinnamon-sugar-t...,https://www.recipetineats.com/tachyon/2014/06/...
1598,https://www.recipetineats.com/dan-dan-noodle-s...,https://www.recipetineats.com/tachyon/2014/06/...


### Scrape all recipe links

In [34]:
recipe_tin_eats_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        recipe_tin_eats_df = pd.concat([recipe_tin_eats_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

recipe_tin_eats_df

Scraping recipe: https://www.recipetineats.com/birria-tacos/, 1/1600
Scraping recipe: https://www.recipetineats.com/seafood-chowder/, 2/1600
Scraping recipe: https://www.recipetineats.com/maple-pecan-pie-bars/, 3/1600
Scraping recipe: https://www.recipetineats.com/moroccan-spiral-meatball-zucchini-bake/, 4/1600
Scraping recipe: https://www.recipetineats.com/crispy-chinese-lemon-chicken/, 5/1600
Scraping recipe: https://www.recipetineats.com/broccoli-pearl-crunch-salad/, 6/1600
Scraping recipe: https://www.recipetineats.com/mini-cinnamon-muffins/, 7/1600
Scraping recipe: https://www.recipetineats.com/filipino-pork-adobo/, 8/1600
Scraping recipe: https://www.recipetineats.com/starring-dozer-row-row-row-for-charity/, 9/1600
    No ingredients found for recipe
Scraping recipe: https://www.recipetineats.com/vodka-pasta/, 10/1600
Scraping recipe: https://www.recipetineats.com/quick-and-dirty-focaccia/, 11/1600
Scraping recipe: https://www.recipetineats.com/chicken-breast-recipe/, 12/1600
Scr

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,custom_time,serving_size
0,Birria Tacos,https://www.recipetineats.com/birria-tacos/,"[guajillo chillies, dried, ancho chillies, dri...",https://www.recipetineats.com/tachyon/2025/09/...,17,30.0,180.0,,,,...,,,,,,,,,,
1,Seafood Chowder,https://www.recipetineats.com/seafood-chowder/,[seafood marinara mix or mixed fresh seafood –...,https://www.recipetineats.com/tachyon/2025/09/...,11,20.0,20.0,40.0,623,41,...,1596,842,5,6,7723,21,74,2,,
2,Maple pecan pie bars,https://www.recipetineats.com/maple-pecan-pie-...,"[unsalted butter, brown sugar, cooking salt / ...",https://www.recipetineats.com/tachyon/2025/08/...,10,20.0,30.0,,246,24,...,124,97,1,16,270,0.1,32,1,120.0,
3,Moroccan Spiral Meatball Zucchini Bake,https://www.recipetineats.com/moroccan-spiral-...,"[panko breadcrumbs, onion, lamb mince, egg, ga...",https://www.recipetineats.com/tachyon/2025/08/...,10,25.0,35.0,,495,10,...,736,458,1,2,517,4,58,3,,
4,Crispy Chinese Lemon Chicken,https://www.recipetineats.com/crispy-chinese-l...,"[chicken thigh fillets, cooking salt / kosher ...",https://www.recipetineats.com/tachyon/2025/08/...,11,20.0,10.0,,358,32,...,811,390,0.4,18,62,12,30,1,20.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,Chickpea Rice Pilaf,https://www.recipetineats.com/chickpea-rice-pi...,"[uncooked basmati rice, water, cooking salt / ...",https://www.recipetineats.com/tachyon/2014/06/...,4,5.0,10.0,15.0,,,...,,,,,,,,,,
1402,Real Vegetarian Chow Mein,https://www.recipetineats.com/vegetarian-chow-...,"[fresh chow mein noodles, oil, garlic cloves, ...",https://www.recipetineats.com/tachyon/2014/06/...,11,10.0,5.0,15.0,390,40.4,...,1079,667,3.7,8.7,7900,99.8,270,3.6,,436
1403,Cinnamon Sugar Tortilla Crisps,https://www.recipetineats.com/cinnamon-sugar-t...,"[ tortillas, butter, cinnamon, sugar]",https://www.recipetineats.com/tachyon/2014/06/...,7,5.0,5.0,10.0,9,1.2,...,4,3,0.5,0.1,,,,,,3
1404,Dan Dan Noodle Soup (Vegetarian),https://www.recipetineats.com/dan-dan-noodle-s...,"[dried rice stick noodles, vegetable or chicke...",https://www.recipetineats.com/tachyon/2014/06/...,7,5.0,10.0,15.0,288,42.4,...,1554,852,4.7,6.2,8550,138.6,170,3.2,,599


In [35]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
recipe_tin_eats_df.img_url = recipe_tin_eats_df.img_url.str.replace(r'\?resize=.*$', '', regex=True)
recipe_tin_eats_df

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,custom_time,serving_size
0,Birria Tacos,https://www.recipetineats.com/birria-tacos/,"[guajillo chillies, dried, ancho chillies, dri...",https://www.recipetineats.com/tachyon/2025/09/...,17,30.0,180.0,,,,...,,,,,,,,,,
1,Seafood Chowder,https://www.recipetineats.com/seafood-chowder/,[seafood marinara mix or mixed fresh seafood –...,https://www.recipetineats.com/tachyon/2025/09/...,11,20.0,20.0,40.0,623,41,...,1596,842,5,6,7723,21,74,2,,
2,Maple pecan pie bars,https://www.recipetineats.com/maple-pecan-pie-...,"[unsalted butter, brown sugar, cooking salt / ...",https://www.recipetineats.com/tachyon/2025/08/...,10,20.0,30.0,,246,24,...,124,97,1,16,270,0.1,32,1,120.0,
3,Moroccan Spiral Meatball Zucchini Bake,https://www.recipetineats.com/moroccan-spiral-...,"[panko breadcrumbs, onion, lamb mince, egg, ga...",https://www.recipetineats.com/tachyon/2025/08/...,10,25.0,35.0,,495,10,...,736,458,1,2,517,4,58,3,,
4,Crispy Chinese Lemon Chicken,https://www.recipetineats.com/crispy-chinese-l...,"[chicken thigh fillets, cooking salt / kosher ...",https://www.recipetineats.com/tachyon/2025/08/...,11,20.0,10.0,,358,32,...,811,390,0.4,18,62,12,30,1,20.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,Chickpea Rice Pilaf,https://www.recipetineats.com/chickpea-rice-pi...,"[uncooked basmati rice, water, cooking salt / ...",https://www.recipetineats.com/tachyon/2014/06/...,4,5.0,10.0,15.0,,,...,,,,,,,,,,
1402,Real Vegetarian Chow Mein,https://www.recipetineats.com/vegetarian-chow-...,"[fresh chow mein noodles, oil, garlic cloves, ...",https://www.recipetineats.com/tachyon/2014/06/...,11,10.0,5.0,15.0,390,40.4,...,1079,667,3.7,8.7,7900,99.8,270,3.6,,436
1403,Cinnamon Sugar Tortilla Crisps,https://www.recipetineats.com/cinnamon-sugar-t...,"[ tortillas, butter, cinnamon, sugar]",https://www.recipetineats.com/tachyon/2014/06/...,7,5.0,5.0,10.0,9,1.2,...,4,3,0.5,0.1,,,,,,3
1404,Dan Dan Noodle Soup (Vegetarian),https://www.recipetineats.com/dan-dan-noodle-s...,"[dried rice stick noodles, vegetable or chicke...",https://www.recipetineats.com/tachyon/2014/06/...,7,5.0,10.0,15.0,288,42.4,...,1554,852,4.7,6.2,8550,138.6,170,3.2,,599


In [36]:
str_cols = recipe_tin_eats_df.columns[:4]
numeric_cols = recipe_tin_eats_df.columns[4:]
recipe_tin_eats_df[numeric_cols] = recipe_tin_eats_df[numeric_cols].astype(np.float64)
recipe_tin_eats_df[str_cols] = recipe_tin_eats_df[str_cols].astype("string")
recipe_tin_eats_df.to_pickle("../data/raw/recipe_tin_eats_recipes.pkl", protocol=4)

## Omnivores Cookbook

### Get all recipe and its image links

In [567]:
pages = range(1, 17) # Adjust this range to scrape more or fewer pages
link_img_df = pd.DataFrame([])

for page in pages:
    page_url = f"https://omnivorescookbook.com/category/recipe/?_paged={page}/"

    response = requests.get(page_url)
    main_soup = BeautifulSoup(response.text, 'html.parser')

    # Get all recipe links on the page
    df = pd.DataFrame([])
    for recipe in main_soup.find_all('div', attrs={"class": "gridlink"}):
        img_url = recipe.parent.find("div", attrs={"class": "gridimage-a"}).find("img").get("data-src") or recipe.parent.find("div", attrs={"class": "gridimage-a"}).find("img").get("src")
        
        # Check if image exists, if not, skip the recipe
        if not img_url:
            print("No image found")
            continue

        recipe_url = recipe.find("a").get("href")
        df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)

    # Append to main dataframe
    link_img_df = pd.concat([link_img_df, df], ignore_index=True)
    print(f"Found {len(df)} recipe links in category")

print(f"Total recipe links found: {len(link_img_df)}")
link_img_df

Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Found 40 recipe links in category
Total recipe links found: 640


Unnamed: 0,recipe_url,img_url
0,https://omnivorescookbook.com/ginger-fried-rice/,https://omnivorescookbook.com/wp-content/uploa...
1,https://omnivorescookbook.com/kung-pao-tofu/,https://omnivorescookbook.com/wp-content/uploa...
2,https://omnivorescookbook.com/air-fryer-chines...,https://omnivorescookbook.com/wp-content/uploa...
3,https://omnivorescookbook.com/pork-lo-mein/,https://omnivorescookbook.com/wp-content/uploa...
4,https://omnivorescookbook.com/sichuan-dumplings/,https://omnivorescookbook.com/wp-content/uploa...
...,...,...
635,https://omnivorescookbook.com/tomato-noodle-soup/,https://omnivorescookbook.com/wp-content/uploa...
636,https://omnivorescookbook.com/chinese-spicy-ro...,https://omnivorescookbook.com/wp-content/uploa...
637,https://omnivorescookbook.com/honey-lotus-root...,https://omnivorescookbook.com/wp-content/uploa...
638,https://omnivorescookbook.com/chinese-pork-stock/,https://omnivorescookbook.com/wp-content/uploa...


### Scrape all recipe links

In [568]:
omnivores_cookbook_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        omnivores_cookbook_df = pd.concat([omnivores_cookbook_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

omnivores_cookbook_df

Scraping recipe: https://omnivorescookbook.com/ginger-fried-rice/, 1/640
Scraping recipe: https://omnivorescookbook.com/kung-pao-tofu/, 2/640
Scraping recipe: https://omnivorescookbook.com/air-fryer-chinese-roast-chicken/, 3/640
Scraping recipe: https://omnivorescookbook.com/pork-lo-mein/, 4/640
Scraping recipe: https://omnivorescookbook.com/sichuan-dumplings/, 5/640
Scraping recipe: https://omnivorescookbook.com/dumpling-wrappers-recipe/, 6/640
Scraping recipe: https://omnivorescookbook.com/zucchini-and-pork-stir-fry/, 7/640
Scraping recipe: https://omnivorescookbook.com/air-fryer-honey-sriracha-wings/, 8/640
Scraping recipe: https://omnivorescookbook.com/shanghai-fried-noodles/, 9/640
Scraping recipe: https://omnivorescookbook.com/rose-lemonade/, 10/640
Scraping recipe: https://omnivorescookbook.com/hojicha-purin/, 11/640
Scraping recipe: https://omnivorescookbook.com/cashew-shrimp/, 12/640
Scraping recipe: https://omnivorescookbook.com/yunnan-lime-chicken-salad/, 13/640
Scraping rec

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,sodium,potassium,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7,15,10.0,25.0,1,413,...,680,169,1.2,0.8,46,3,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8,20,20.0,40.0,1,302,...,534,374,3,7.3,229,3,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3,10,50.0,180.0,1,237,...,983,213,0.1,1,14,1,120.0,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7,15,15.0,30.0,1,457,...,724,723,3.9,11.1,96,4,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6,45,15.0,60.0,1,346,...,1062,303,1.1,4.1,9,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,Tomato Noodle Soup – The Ultimate Comfort Food,https://omnivorescookbook.com/tomato-noodle-soup/,"[vegetable oil, green onion, tomatoes, pork st...",https://omnivorescookbook.com/wp-content/uploa...,7,5,20.0,25.0,739,423,...,2005,529,4.3,7.8,70,3.1,,39.6,2600,
624,Chinese Spicy Roast Fish (重庆烤鱼),https://omnivorescookbook.com/chinese-spicy-ro...,"[whole sea bass, olive oil, chili pepper flake...",https://omnivorescookbook.com/wp-content/uploa...,10,20,30.0,50.0,265,410,...,2004,401,10.4,21.6,30,1.3,,7.4,550,
625,Honey Lotus Root with Sticky Rice (糯米藕),https://omnivorescookbook.com/honey-lotus-root...,"[glutinous rice, white sugar, 400 grams fresh ...",https://omnivorescookbook.com/wp-content/uploa...,10,20,80.0,100.0,109,194,...,29,409,,14.3,40,2,,40.4,,
626,Chinese Pork Stock,https://omnivorescookbook.com/chinese-pork-stock/,"[pork leg bones, Shaoxing wine, thumb ginger]",https://omnivorescookbook.com/wp-content/uploa...,9,5,240.0,245.0,43,61,...,24,179,,,,3,,,,


In [571]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
omnivores_cookbook_df.img_url = omnivores_cookbook_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)
omnivores_cookbook_df

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,sodium,potassium,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"['ground chicken', 'Shaoxing wine', 'soy sauce...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,680.0,169.0,1.2,0.8,46.0,3.0,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"['firm tofu', 'soy sauce', 'maple syrup', 'cor...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,534.0,374.0,3.0,7.3,229.0,3.0,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"['whole chicken', 'Shaoxing wine', 'salt', 'bl...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,983.0,213.0,0.1,1.0,14.0,1.0,120.0,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"['fresh lo mein noodles', 'chicken stock', 'oy...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,724.0,723.0,3.9,11.1,96.0,4.0,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"['soy sauce', 'scallions', 'ginger', 'bay leaf...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1062.0,303.0,1.1,4.1,9.0,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,Tomato Noodle Soup – The Ultimate Comfort Food,https://omnivorescookbook.com/tomato-noodle-soup/,"['vegetable oil', 'green onion', 'tomatoes', '...",https://omnivorescookbook.com/wp-content/uploa...,7.0,5.0,20.0,25.0,739.0,423.0,...,2005.0,529.0,4.3,7.8,70.0,3.1,,39.6,2600.0,
624,Chinese Spicy Roast Fish (重庆烤鱼),https://omnivorescookbook.com/chinese-spicy-ro...,"['whole sea bass', 'olive oil', 'chili pepper ...",https://omnivorescookbook.com/wp-content/uploa...,10.0,20.0,30.0,50.0,265.0,410.0,...,2004.0,401.0,10.4,21.6,30.0,1.3,,7.4,550.0,
625,Honey Lotus Root with Sticky Rice (糯米藕),https://omnivorescookbook.com/honey-lotus-root...,"['glutinous rice', 'white sugar', '400 grams f...",https://omnivorescookbook.com/wp-content/uploa...,10.0,20.0,80.0,100.0,109.0,194.0,...,29.0,409.0,,14.3,40.0,2.0,,40.4,,
626,Chinese Pork Stock,https://omnivorescookbook.com/chinese-pork-stock/,"['pork leg bones', 'Shaoxing wine', 'thumb gin...",https://omnivorescookbook.com/wp-content/uploa...,9.0,5.0,240.0,245.0,43.0,61.0,...,24.0,179.0,,,,3.0,,,,


In [572]:
str_cols = omnivores_cookbook_df.columns[:4]
numeric_cols = omnivores_cookbook_df.columns[4:]
omnivores_cookbook_df[numeric_cols] = omnivores_cookbook_df[numeric_cols].astype(np.float64)
omnivores_cookbook_df[str_cols] = omnivores_cookbook_df[str_cols].astype("string")
omnivores_cookbook_df.to_pickle("../data/raw/omnivores_cookbook_recipes.pkl", protocol=4)

## Love and Lemons

### Get all recipe and its image links

In [41]:
link_img_df = pd.DataFrame([])

page_url = "https://www.loveandlemons.com/recipes/"
response = requests.get(page_url)
main_soup = BeautifulSoup(response.text, 'html.parser')

# Get all recipe links on the page
df = pd.DataFrame([])
for recipe in main_soup.find('ol', attrs={"id": "recipeindex"}).find_all("li"):
    img = recipe.find("img")

    # Check if image exists, if not, skip the recipe
    if not img:
        continue

    recipe_url = recipe.find("a").get("href")
    img_url = img.get("data-original") or img.get("src")
    df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)

    # Append to main dataframe
    link_img_df = pd.concat([link_img_df, df], ignore_index=True)

link_img_df = link_img_df.drop_duplicates(subset=["recipe_url"]).reset_index(drop=True)
print(f"Total recipe links found: {len(link_img_df)}")

Total recipe links found: 1519


### Scrape all recipe links

In [58]:
love_and_lemons_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})] or [id.text for id in recipe_soup.find_all("li", attrs={"class": "ingredient"})]
      
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        love_and_lemons_df = pd.concat([love_and_lemons_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

love_and_lemons_df

Scraping recipe: https://www.loveandlemons.com/easy-dinner-ideas/, 1/1519
Scraping recipe: https://www.loveandlemons.com/salad-recipes/, 2/1519
Scraping recipe: https://www.loveandlemons.com/soup-recipes/, 3/1519
Scraping recipe: https://www.loveandlemons.com/healthy-breakfast-ideas/, 4/1519
Scraping recipe: https://www.loveandlemons.com/healthy-lunch-ideas/, 5/1519
Scraping recipe: https://www.loveandlemons.com/butternut-squash-soup/, 6/1519
Scraping recipe: https://www.loveandlemons.com/pesto-recipe/, 7/1519
Scraping recipe: https://www.loveandlemons.com/how-to-cook-spaghetti-squash/, 8/1519
Scraping recipe: https://www.loveandlemons.com/broccoli-cheddar-soup/, 9/1519
Scraping recipe: https://www.loveandlemons.com/tzatziki-sauce/, 10/1519
Scraping recipe: https://www.loveandlemons.com/overnight-oats-recipe/, 11/1519
Scraping recipe: https://www.loveandlemons.com/minestrone-soup/, 12/1519
Scraping recipe: https://www.loveandlemons.com/baked-potato/, 13/1519
Scraping recipe: https://ww

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,custom_time
0,60 Easy Dinner Ideas,https://www.loveandlemons.com/easy-dinner-ideas/,"[ marinara sauce, ricotta cheese, garlic clove...",https://cdn.loveandlemons.com/wp-content/uploa...,7,10.0,20.0,30.0,
1,51 Best Salad Recipes,https://www.loveandlemons.com/salad-recipes/,"[extra-virgin olive oil, red wine vinegar, gar...",https://cdn.loveandlemons.com/wp-content/uploa...,2,10.0,5.0,15.0,
2,35 Best Soup Recipes,https://www.loveandlemons.com/soup-recipes/,"[extra-virgin olive oil, large white onion, se...",https://cdn.loveandlemons.com/wp-content/uploa...,4,10.0,30.0,40.0,
3,60 Healthy Breakfast Ideas,https://www.loveandlemons.com/healthy-breakfas...,"[large yellow tomato, diced red onion, chopped...",https://cdn.loveandlemons.com/wp-content/uploa...,4,15.0,10.0,25.0,
4,41 Healthy Lunch Ideas,https://www.loveandlemons.com/healthy-lunch-id...,"[A grain, Massaged kale, A legume, A roasted v...",https://cdn.loveandlemons.com/wp-content/uploa...,2,15.0,20.0,35.0,
...,...,...,...,...,...,...,...,...,...
1504,Roasted Broccoli & Cranberry Salad,https://www.loveandlemons.com/roasted-broccoli...,"[2 cups broccoli florets, ¼ cup pine nuts, 4 c...",https://cdn.loveandlemons.com/wp-content/uploa...,0,,,,
1505,Molten Chocolate Cakes,https://www.loveandlemons.com/chocolate-molten...,[2 sticks + 6 tablespoons butter or vegan butt...,https://cdn.loveandlemons.com/wp-content/uploa...,0,,,,
1506,Sweet Curry Brown Sugar Popcorn,https://www.loveandlemons.com/sweet-curry-popc...,"[2 to 3 tablespoons coconut oil, ½ cup popping...",https://cdn.loveandlemons.com/wp-content/uploa...,0,,,,
1507,Vegan Nachos with Cashew “Cheese”,https://www.loveandlemons.com/vegan-nachos-cas...,"[Corn tortillas or tortilla chips, 1 ripe avoc...",https://cdn.loveandlemons.com/wp-content/uploa...,0,,,,


In [59]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
love_and_lemons_df.img_url = love_and_lemons_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)
love_and_lemons_df.img_url

0       https://cdn.loveandlemons.com/wp-content/uploa...
1       https://cdn.loveandlemons.com/wp-content/uploa...
2       https://cdn.loveandlemons.com/wp-content/uploa...
3       https://cdn.loveandlemons.com/wp-content/uploa...
4       https://cdn.loveandlemons.com/wp-content/uploa...
                              ...                        
1504    https://cdn.loveandlemons.com/wp-content/uploa...
1505    https://cdn.loveandlemons.com/wp-content/uploa...
1506    https://cdn.loveandlemons.com/wp-content/uploa...
1507    https://cdn.loveandlemons.com/wp-content/uploa...
1508    https://cdn.loveandlemons.com/wp-content/uploa...
Name: img_url, Length: 1509, dtype: object

In [60]:
str_cols = love_and_lemons_df.columns[:4]
numeric_cols = love_and_lemons_df.columns[4:]
love_and_lemons_df[numeric_cols] = love_and_lemons_df[numeric_cols].astype(np.float64)
love_and_lemons_df[str_cols] = love_and_lemons_df[str_cols].astype("string")
love_and_lemons_df.to_pickle("../data/raw/love_and_lemons_recipes.pkl", protocol=4)

## Minimalist Baker

### Get all recipe and its image links

In [None]:
pages = range(1, 86) # Adjust this range to scrape more or fewer pages
link_img_df = pd.DataFrame([])

for page in pages:
    page_url = f"https://minimalistbaker.com/recipe-index/?fwp_paged={page}/"
    
    response = requests.get(page_url)
    main_soup = BeautifulSoup(response.text, 'html.parser')

    # Get all recipe links on the page
    df = pd.DataFrame([])
    for recipe in main_soup.find_all('a', attrs={"class": "post-summary__image"}):
        img = recipe.find("img")
        
        # Check if image exists, if not, skip the recipe
        if not img_url:
            print("No image found")
            continue

        recipe_url = recipe.get("href")
        img_url = img.get("src")
        df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)
        print(f"Found {len(df)} recipe links in category")

    # Append to main dataframe
    link_img_df = pd.concat([link_img_df, df], ignore_index=True)

link_img_df = link_img_df.drop_duplicates(subset=["recipe_url"]).reset_index(drop=True)
print(f"Total recipe links found: {len(link_img_df)}")
link_img_df

Total recipe links found: 1700


Unnamed: 0,recipe_url,img_url
0,https://minimalistbaker.com/cauliflower-salad/,https://minimalistbaker.com/wp-content/uploads...
1,https://minimalistbaker.com/cherry-tomato-whit...,https://minimalistbaker.com/wp-content/uploads...
2,https://minimalistbaker.com/crispy-oven-roaste...,https://minimalistbaker.com/wp-content/uploads...
3,https://minimalistbaker.com/frozen-peach-daiqu...,https://minimalistbaker.com/wp-content/uploads...
4,https://minimalistbaker.com/lemony-quinoa-kale...,https://minimalistbaker.com/wp-content/uploads...
...,...,...
1695,https://minimalistbaker.com/vegan-peach-cupcak...,https://minimalistbaker.com/wp-content/uploads...
1696,https://minimalistbaker.com/7-ingredient-muesl...,https://minimalistbaker.com/wp-content/uploads...
1697,https://minimalistbaker.com/cucumber-cooler-co...,https://minimalistbaker.com/wp-content/uploads...
1698,https://minimalistbaker.com/strawberry-cake-ma...,https://minimalistbaker.com/wp-content/uploads...


### Scrape all recipe links

In [74]:
minimalist_baker_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        minimalist_baker_df = pd.concat([minimalist_baker_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

minimalist_baker_df

Scraping recipe: https://minimalistbaker.com/cauliflower-salad/, 1/1700
Scraping recipe: https://minimalistbaker.com/cherry-tomato-white-bean-pasta/, 2/1700
Scraping recipe: https://minimalistbaker.com/crispy-oven-roasted-okra/, 3/1700
Scraping recipe: https://minimalistbaker.com/frozen-peach-daiquiri/, 4/1700
Scraping recipe: https://minimalistbaker.com/lemony-quinoa-kale-salad-with-chickpeas/, 5/1700
Scraping recipe: https://minimalistbaker.com/gluten-free-funfetti-cupcakes/, 6/1700
Scraping recipe: https://minimalistbaker.com/coconut-lime-energy-bites/, 7/1700
Scraping recipe: https://minimalistbaker.com/zucchini-pesto-pasta/, 8/1700
Scraping recipe: https://minimalistbaker.com/jerk-tofu-roasted-plantain-bowls/, 9/1700
Scraping recipe: https://minimalistbaker.com/easy-mango-cucumber-salad/, 10/1700
Scraping recipe: https://minimalistbaker.com/vegan-chocolate-cheesecake-cups/, 11/1700
Scraping recipe: https://minimalistbaker.com/cucumber-lime-agua-fresca/, 12/1700
Scraping recipe: ht

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,total_time,serving_size,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,cook_time
0,Cauliflower Salad with Dates & Pistachios,https://minimalistbaker.com/cauliflower-salad/,"[lemon zest, lemon juice, olive oil, maple syr...",https://minimalistbaker.com/wp-content/uploads...,3,10.0,10,1,130,19.7,...,0,259,433,3.4,14.1,54,43,42,0.9,
1,Garlicky Cherry Tomato Pasta with White Beans,https://minimalistbaker.com/cherry-tomato-whit...,"[olive oil, garlic, sliced , shallot, minced...",https://minimalistbaker.com/wp-content/uploads...,5,10.0,25,1,538,90.1,...,0,399,1068,16.9,7.4,333,35,88,3.9,15.0
2,Crispy Oven Roasted Okra,https://minimalistbaker.com/crispy-oven-roaste...,"[okra , avocado or olive oil, sea salt]",https://minimalistbaker.com/wp-content/uploads...,5,5.0,25,1,98,8.4,...,0,300,339,3.6,1.7,40,26,94,0.7,20.0
3,Frozen Peach Daiquiri (2 Ways!),https://minimalistbaker.com/frozen-peach-daiqu...,"[frozen sliced peaches, frozen pineapple chunk...",https://minimalistbaker.com/wp-content/uploads...,3,5.0,5,1,55,14.3,...,0,4,156,1.8,10.6,80,93,19,0.7,
4,Lemony Quinoa Kale Salad with Crispy Chickpeas,https://minimalistbaker.com/lemony-quinoa-kale...,"[chopped sun-dried tomatoes, cooked quinoa, ch...",https://minimalistbaker.com/wp-content/uploads...,7,10.0,30,1,474,51.4,...,0,1013,819,11,12.2,290,39,159,4.1,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1679,Vegan Peach Cupcakes with Honey Buttercream Fr...,https://minimalistbaker.com/vegan-peach-cupcak...,"[light vanilla soy milk, apple cider vinegar, ...",https://minimalistbaker.com/wp-content/uploads...,12,25.0,70,1,202,31,...,0,22,,1,16,,,,,45.0
1680,7-Ingredient Muesli Bread,https://minimalistbaker.com/7-ingredient-muesl...,"[warm water, unbleached flour, whole wheat flo...",https://minimalistbaker.com/wp-content/uploads...,10,30.0,60,1,106,19.4,...,0,196,67,1.1,1.5,0,0,10,1.3,30.0
1681,Cucumber Cooler Cocktails,https://minimalistbaker.com/cucumber-cooler-co...,"[gin , cucumber, lime, tonic water, leaves, su...",https://minimalistbaker.com/wp-content/uploads...,4,5.0,5,1,162,12,...,0,16,,0,11.2,,,,,
1682,Strawberry Cake Magic Shell,https://minimalistbaker.com/strawberry-cake-ma...,"[white chocolate chips , coconut oil, strawber...",https://minimalistbaker.com/wp-content/uploads...,3,,5,2,144,12,...,0,,,0,12,,,,,5.0


In [75]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
minimalist_baker_df.img_url = minimalist_baker_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)
minimalist_baker_df.img_url

0       https://minimalistbaker.com/wp-content/uploads...
1       https://minimalistbaker.com/wp-content/uploads...
2       https://minimalistbaker.com/wp-content/uploads...
3       https://minimalistbaker.com/wp-content/uploads...
4       https://minimalistbaker.com/wp-content/uploads...
                              ...                        
1679    https://minimalistbaker.com/wp-content/uploads...
1680    https://minimalistbaker.com/wp-content/uploads...
1681    https://minimalistbaker.com/wp-content/uploads...
1682    https://minimalistbaker.com/wp-content/uploads...
1683    https://minimalistbaker.com/wp-content/uploads...
Name: img_url, Length: 1684, dtype: object

In [76]:
str_cols = minimalist_baker_df.columns[:4]
numeric_cols = minimalist_baker_df.columns[4:]
minimalist_baker_df[numeric_cols] = minimalist_baker_df[numeric_cols].astype(np.float64)
minimalist_baker_df[str_cols] = minimalist_baker_df[str_cols].astype("string")
minimalist_baker_df.to_pickle("../data/raw/minimalist_baker_recipes.pkl", protocol=4)

## Spoon Fork Bacon (redo image)

### Get all recipe links

#### Grab all categories and total pages

In [77]:
response = requests.get("https://www.spoonforkbacon.com/recipes/")
main_soup = BeautifulSoup(response.text, 'html.parser')

category_links = [category.get("href") for categories in main_soup.find_all("ul", attrs={"class": "feast-category-index-list feast-grid-half feast-desktop-grid-fourth"}) for category in categories.find_all("a")]

total_pages = []
for category_link in category_links:
    response = requests.get(category_link)
    category_soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get total pages in the category, if no pagination, then 1 page
    try:
        pages = np.int64(category_soup.find("li", attrs={"class": "pagination-next"}).find_previous_sibling().find("a").get("href").split("/")[-2])
    except:
        pages = 1

    total_pages.append(pages)
    print(f"Total pages in {category_link}: {pages}")

Total pages in https://www.spoonforkbacon.com/category/asian-recipes/: 6
Total pages in https://www.spoonforkbacon.com/category/appetizers-snacks-recipes/: 12
Total pages in https://www.spoonforkbacon.com/category/chicken-poultry-recipes/: 6
Total pages in https://www.spoonforkbacon.com/category/pasta-noodles-recipes/: 7
Total pages in https://www.spoonforkbacon.com/category/appetizers-snacks-recipes/: 12
Total pages in https://www.spoonforkbacon.com/category/asian-recipes/: 6
Total pages in https://www.spoonforkbacon.com/category/bbq-recipes/: 3
Total pages in https://www.spoonforkbacon.com/category/beef-recipes/: 3
Total pages in https://www.spoonforkbacon.com/category/bread-scones-muffins-biscuits-recipes/: 3
Total pages in https://www.spoonforkbacon.com/category/breakfast-brunch-recipes/: 6
Total pages in https://www.spoonforkbacon.com/category/burgers-sandwiches-sliders-recipes/: 3
Total pages in https://www.spoonforkbacon.com/category/cakes-cupcakes-recipes/: 2
Total pages in htt

#### Grab all recipe links and its image from each categories

In [83]:
link_img_df = pd.DataFrame([])
for pages, category_link in zip(total_pages, category_links):
    for page in range(1, pages + 1):
        print(f"Category link: {category_link}, page: {page}/{pages}")
        response = requests.get(category_link + f"page/{page}/") if page > 1 else requests.get(category_link)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get all recipe links on the page
        df = pd.DataFrame([])
        for recipe in soup.find_all("a", attrs={"class": "entry-image-link"}):
            img = recipe.find("img")

            # Check if image exists, if not, skip the recipe
            if not img:
                print("No image found")
                continue

            recipe_url = recipe.get("href")
            img_url = img.get("data-src") or img.get("src")
            df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe_url], "img_url": [img_url]})], ignore_index=True)

        # Append to main dataframe
        link_img_df = pd.concat([link_img_df, df], ignore_index=True)
        print(f"Found {len(df)} recipe links in category")

link_img_df = link_img_df.drop_duplicates(subset=["recipe_url"]).reset_index(drop=True)
link_img_df

Category link: https://www.spoonforkbacon.com/category/asian-recipes/, page: 1/6
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/asian-recipes/, page: 2/6
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/asian-recipes/, page: 3/6
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/asian-recipes/, page: 4/6
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/asian-recipes/, page: 5/6
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/asian-recipes/, page: 6/6
Found 1 recipe links in category
Category link: https://www.spoonforkbacon.com/category/appetizers-snacks-recipes/, page: 1/12
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/appetizers-snacks-recipes/, page: 2/12
Found 16 recipe links in category
Category link: https://www.spoonforkbacon.com/category/

Unnamed: 0,recipe_url,img_url
0,https://www.spoonforkbacon.com/air-fryer-crab-...,https://www.spoonforkbacon.com/wp-content/uplo...
1,https://www.spoonforkbacon.com/easy-char-siu-c...,https://www.spoonforkbacon.com/wp-content/uplo...
2,https://www.spoonforkbacon.com/garlic-noodles/,https://www.spoonforkbacon.com/wp-content/uplo...
3,https://www.spoonforkbacon.com/fried-rice-recipe/,https://www.spoonforkbacon.com/wp-content/uplo...
4,https://www.spoonforkbacon.com/cold-soba-noodl...,https://www.spoonforkbacon.com/wp-content/uplo...
...,...,...
867,https://www.spoonforkbacon.com/vegan-swedish-m...,https://www.spoonforkbacon.com/wp-content/uplo...
868,https://www.spoonforkbacon.com/roasted-chickpe...,https://www.spoonforkbacon.com/wp-content/uplo...
869,https://www.spoonforkbacon.com/cauliflower-rec...,https://www.spoonforkbacon.com/wp-content/uplo...
870,https://www.spoonforkbacon.com/jalapeno-chedda...,https://www.spoonforkbacon.com/wp-content/uplo...


### Scrape all recipe links

In [85]:
spoon_fork_bacon_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        spoon_fork_bacon_df = pd.concat([spoon_fork_bacon_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue
    
spoon_fork_bacon_df

Scraping recipe: https://www.spoonforkbacon.com/air-fryer-crab-rangoon/, 1/872
Scraping recipe: https://www.spoonforkbacon.com/easy-char-siu-chinese-bbq-pork/, 2/872
Scraping recipe: https://www.spoonforkbacon.com/garlic-noodles/, 3/872
Scraping recipe: https://www.spoonforkbacon.com/fried-rice-recipe/, 4/872
Scraping recipe: https://www.spoonforkbacon.com/cold-soba-noodles-with-shrimp-and-edamame-in-creamy-peanut-sauce/, 5/872
Scraping recipe: https://www.spoonforkbacon.com/bang-bang-shrimp-recipe/, 6/872
Scraping recipe: https://www.spoonforkbacon.com/gochujang-noodles/, 7/872
Scraping recipe: https://www.spoonforkbacon.com/korean-fried-chicken/, 8/872
Scraping recipe: https://www.spoonforkbacon.com/sushi-pizza/, 9/872
Scraping recipe: https://www.spoonforkbacon.com/salmon-recipe-with-soy-glaze/, 10/872
Scraping recipe: https://www.spoonforkbacon.com/masago-sauce/, 11/872
Scraping recipe: https://www.spoonforkbacon.com/yaki-udon/, 12/872
Scraping recipe: https://www.spoonforkbacon.co

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,custom_time,serving_size
0,Air Fryer Crab Rangoon,https://www.spoonforkbacon.com/air-fryer-crab-...,"[cream cheese, imitation crabmeat, thinly slic...",https://www.spoonforkbacon.com/wp-content/uplo...,7,15.0,6.0,21.0,62,6,...,100,24,0.2,1,148,0.2,14,0.3,,
1,Easy Char Siu (Chinese BBQ Pork),https://www.spoonforkbacon.com/easy-char-siu-c...,"[honey, soy sauce, hoisin sauce, unseasoned ri...",https://www.spoonforkbacon.com/wp-content/uplo...,15,5.0,45.0,770.0,228,18,...,1137,483,1,16,4,1,23,2,720.0,
2,Garlic Noodles,https://www.spoonforkbacon.com/garlic-noodles/,"[Chinese style egg noodles , unsalted butter,...",https://www.spoonforkbacon.com/wp-content/uplo...,5,5.0,13.0,18.0,457,67,...,1337,83,2,4,474,2,153,1,,
3,The BEST Fried Rice Recipe,https://www.spoonforkbacon.com/fried-rice-recipe/,"[sesame oil, large eggs, diced Chinese sausage...",https://www.spoonforkbacon.com/wp-content/uplo...,8,10.0,10.0,20.0,400,47,...,898,161,0.4,1,318,2,41,3,,
4,Soba Noodles with Shrimp,https://www.spoonforkbacon.com/cold-soba-noodl...,"[buckwheat soba noodles, warm water, creamy pe...",https://www.spoonforkbacon.com/wp-content/uplo...,7,15.0,8.0,63.0,465,66,...,1363,523,4,7,30,4,94,4,40.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,Vegan Swedish Meatballs over Mashed Potatoes a...,https://www.spoonforkbacon.com/vegan-swedish-m...,"[extra virgin olive oil, divided, sliced cremi...",https://www.spoonforkbacon.com/wp-content/uplo...,9,,,,,,...,,,,,,,,,,
830,Roasted Chickpea Stuffed Sweet Potatoes with C...,https://www.spoonforkbacon.com/roasted-chickpe...,"[cans chickpeas, drained and pat dry, extra vi...",https://www.spoonforkbacon.com/wp-content/uplo...,6,,,,791,115,...,730,1602,24,22,32188,11,194,10,,
831,Best Cauliflower Recipes,https://www.spoonforkbacon.com/cauliflower-rec...,"[head cauliflower, stem and outer leaves remov...",https://www.spoonforkbacon.com/wp-content/uplo...,10,20.0,16.0,36.0,821,79,...,1495,449,3,26,220,57,63,2,,
832,Jalapeño and Cheddar Spoon Bread,https://www.spoonforkbacon.com/jalapeno-chedda...,"[unsalted butter, all purpose flour, yellow c...",https://www.spoonforkbacon.com/wp-content/uplo...,8,10.0,30.0,65.0,361,35,...,1299,178,2,15,735,5,282,1,,


In [86]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
spoon_fork_bacon_df.img_url = spoon_fork_bacon_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)
spoon_fork_bacon_df.img_url

0      https://www.spoonforkbacon.com/wp-content/uplo...
1      https://www.spoonforkbacon.com/wp-content/uplo...
2      https://www.spoonforkbacon.com/wp-content/uplo...
3      https://www.spoonforkbacon.com/wp-content/uplo...
4      https://www.spoonforkbacon.com/wp-content/uplo...
                             ...                        
829    https://www.spoonforkbacon.com/wp-content/uplo...
830    https://www.spoonforkbacon.com/wp-content/uplo...
831    https://www.spoonforkbacon.com/wp-content/uplo...
832    https://www.spoonforkbacon.com/wp-content/uplo...
833    https://www.spoonforkbacon.com/wp-content/uplo...
Name: img_url, Length: 834, dtype: object

In [87]:
str_cols = spoon_fork_bacon_df.columns[:4]
numeric_cols = spoon_fork_bacon_df.columns[4:]
spoon_fork_bacon_df[numeric_cols] = spoon_fork_bacon_df[numeric_cols].astype(np.float64)
spoon_fork_bacon_df[str_cols] = spoon_fork_bacon_df[str_cols].astype("string")
spoon_fork_bacon_df.to_pickle("../data/raw/spoon_fork_bacon_recipes.pkl", protocol=4)

## Daily Dish Recipes

### Get all recipe links

#### Grab all categories and total pages

In [419]:
response = requests.get("https://dailydishrecipes.com/recipe-index/")
main_soup = BeautifulSoup(response.text, 'html.parser')

category_links = list(set([category.get("href") for category in main_soup.find_all("a", attrs={"class": "kb-advanced-image-link"})]))

total_pages = []
for category_link in category_links:
    response = requests.get(category_link)
    category_soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get total pages in the category, if no pagination, then 1 page
    try:
        pages = np.int64(category_soup.find_all("a", attrs={"class": "page-numbers"})[-2].text)
    except:
        pages = 1

    total_pages.append(pages)
    print(f"Total pages in {category_link}: {pages}")

Total pages in https://dailydishrecipes.com/category/cooking-method/cast-iron-skillet-recipes/: 1
Total pages in https://dailydishrecipes.com/category/breakfast-and-brunch-recipes/breads/: 3
Total pages in https://dailydishrecipes.com/category/main-dish/tacos-burritos-quesadillas/: 1
Total pages in https://dailydishrecipes.com/category/breakfast-and-brunch-recipes/donut-recipes/: 1
Total pages in https://dailydishrecipes.com/category/main-dish/sandwiches-and-sliders/: 3
Total pages in https://dailydishrecipes.com/category/desserts-and-sweets/cookies/: 2
Total pages in https://dailydishrecipes.com/category/cooking-method/30-minutes-or-less/: 2
Total pages in https://dailydishrecipes.com/category/seasonal-recipes/summer-recipes/: 4
Total pages in https://dailydishrecipes.com/category/seasonal-recipes/fall-recipes/: 6
Total pages in https://dailydishrecipes.com/category/just-a-few-ingredients/two-ingredients/: 1
Total pages in https://dailydishrecipes.com/category/condiments-and-more/jell

#### Grab all recipe links and its image from each categories

In [527]:
link_img_df = pd.DataFrame([])
for pages, category_link in zip(total_pages, category_links):
    for page in range(1, pages + 1):
        print(f"Category link: {category_link}, page: {page}/{pages}")
        response = requests.get(category_link + f"page/{page}/") if page > 1 else requests.get(category_link)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get all recipe links on the page
        df = pd.DataFrame([])
        for recipe in soup.find_all("li", attrs={"class": "entry-list-item"}):
            img_url = recipe.find("img")

            # Check if image exists, if not, skip the recipe
            if not img_url:
                print("No image found")
                continue

            df = pd.concat([df, pd.DataFrame({"recipe_url": [recipe.find("a").get("href")], "img_url": [img_url.get("src")]})], ignore_index=True)

        # Append to main dataframe
        link_img_df = pd.concat([link_img_df, df], ignore_index=True)
        print(f"Found {len(df)} recipe links in category")

link_img_df = link_img_df.drop_duplicates(subset=["recipe_url"]).reset_index(drop=True)

Category link: https://dailydishrecipes.com/category/cooking-method/cast-iron-skillet-recipes/, page: 1/1
Found 7 recipe links in category
Category link: https://dailydishrecipes.com/category/breakfast-and-brunch-recipes/breads/, page: 1/3
Found 24 recipe links in category
Category link: https://dailydishrecipes.com/category/breakfast-and-brunch-recipes/breads/, page: 2/3
Found 24 recipe links in category
Category link: https://dailydishrecipes.com/category/breakfast-and-brunch-recipes/breads/, page: 3/3
Found 5 recipe links in category
Category link: https://dailydishrecipes.com/category/main-dish/tacos-burritos-quesadillas/, page: 1/1
Found 18 recipe links in category
Category link: https://dailydishrecipes.com/category/breakfast-and-brunch-recipes/donut-recipes/, page: 1/1
Found 19 recipe links in category
Category link: https://dailydishrecipes.com/category/main-dish/sandwiches-and-sliders/, page: 1/3
Found 24 recipe links in category
Category link: https://dailydishrecipes.com/cat

### Scrape all recipe links

In [540]:
daily_dish_recipes_df = pd.DataFrame([])

# Scrape each recipe link
for i, recipe in link_img_df.iterrows():
    recipe_link = recipe["recipe_url"]
    img_url = recipe["img_url"]
    
    try:
        print(f"Scraping recipe: {recipe_link}, {i+1}/{len(link_img_df)}")
        response = requests.get(recipe_link)

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {recipe_link}")
            continue
            
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Get recipe title
        title_regex = re.compile(r'\b[\w-]*title[\w-]*\b', re.IGNORECASE)
        recipe_title = recipe_soup.find("h1", attrs={"class": title_regex}).text.strip()

        # Get ingredients
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue

        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        df = pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_link], "ingredients": [ingredients], "img_url": [img_url], "num_steps": [total_steps]})
        combined_recipe = pd.concat([df, compiled_times, nutrition_df], axis=1)
        daily_dish_recipes_df = pd.concat([daily_dish_recipes_df, combined_recipe], ignore_index=True)
    except Exception as e:
        print(f"Error processing recipe {recipe_link}: {e}")
        continue

daily_dish_recipes_df

Scraping recipe: https://dailydishrecipes.com/creamy-chicken-spinach-skillet/, 1/894
Scraping recipe: https://dailydishrecipes.com/sweet-skillet-cornbread/, 2/894
Scraping recipe: https://dailydishrecipes.com/creamy-shrimp-sausage-skillet/, 3/894
Scraping recipe: https://dailydishrecipes.com/snickerdoodle-skillet-cake/, 4/894
Scraping recipe: https://dailydishrecipes.com/loaded-skillet-white-pesto-pizza/, 5/894
Scraping recipe: https://dailydishrecipes.com/one-skillet-steak-fajitas/, 6/894
Scraping recipe: https://dailydishrecipes.com/fast-and-delicious-breakfast-skillet-potatoes/, 7/894
Scraping recipe: https://dailydishrecipes.com/chocolate-brownie-cranberry-muffins/, 8/894
Scraping recipe: https://dailydishrecipes.com/best-air-fryer-cheese-garlic-bread/, 9/894
Scraping recipe: https://dailydishrecipes.com/our-favorite-homemade-sandwich-bread/, 10/894
Scraping recipe: https://dailydishrecipes.com/mocha-nut-bread/, 11/894
Scraping recipe: https://dailydishrecipes.com/best-biscuits-for

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,custom_time,serving_size
0,Creamy Chicken Spinach Skillet,https://dailydishrecipes.com/creamy-chicken-sp...,"[olive oil, salt, black pepper, ground cumin, ...",https://dailydishrecipes.com/wp-content/upload...,10,5.0,20.0,25.0,340,5,...,717,576,1,2,1622,6,165,1,,
1,Sweet Skillet Cornbread,https://dailydishrecipes.com/sweet-skillet-cor...,"[cornmeal, all purpose flour, granulated sugar...",https://dailydishrecipes.com/wp-content/upload...,5,5.0,40.0,45.0,412,47,...,431,147,2,18,366,0.4,162,2,,
2,Creamy Shrimp and Sausage Skillet,https://dailydishrecipes.com/creamy-shrimp-sau...,"[olive oil, garlic, Smoked Sausage, shrimp, Ol...",https://dailydishrecipes.com/wp-content/upload...,6,5.0,20.0,25.0,234,3,...,623,170,0.3,1,938,3,25,1,,
3,Snickerdoodle Skillet Cake,https://dailydishrecipes.com/snickerdoodle-ski...,"[butter, granulated sugar, eggs, all purpose f...",https://dailydishrecipes.com/wp-content/upload...,9,10.0,30.0,40.0,356,49,...,321,130,1,27,514,0.01,17,2,,
4,Cast Iron Skillet Loaded White Pesto Pizza,https://dailydishrecipes.com/loaded-skillet-wh...,"[pizza crust, olive oil, white pesto sauce, ki...",https://dailydishrecipes.com/wp-content/upload...,7,8.0,8.0,16.0,453,28,...,1042,132,2,2,949,8,175,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,Garlic Herb Spiralizer Fries,https://dailydishrecipes.com/garlic-herb-spira...,"[russet potatoes, extra virgin olive oil, garl...",https://dailydishrecipes.com/wp-content/upload...,7,15.0,25.0,40.0,183,35,...,301,794,3,1,101,14,38,2,,
870,Parmesan Hasselback Potatoes,https://dailydishrecipes.com/parmesan-hasselba...,"[baking potatoes, butter, garlic cloves, salt,...",https://dailydishrecipes.com/wp-content/upload...,9,15.0,45.0,60.0,239,40,...,308,923,4,1,269,13,89,3,,
871,Mashed Broccoli and Cauliflower Casserole,https://dailydishrecipes.com/mashed-cauliflowe...,"[cauliflower, broccoli, butter, garlic, sea sa...",https://dailydishrecipes.com/wp-content/upload...,4,20.0,25.0,45.0,59,5,...,77,247,2,1,288,50,59,0.4,,
872,Baked Asparagus Fries with Garlic Aioli Sauce,https://dailydishrecipes.com/baked-asparagus-f...,"[asparagus, egg whites, flour, panko bread cru...",https://dailydishrecipes.com/wp-content/upload...,9,10.0,20.0,30.0,930,50,...,1733,671,7,7,1839,24,239,8,,


In [552]:
# Get full resolution image URLs by removing size specifiers (e.g., -300x200)
daily_dish_recipes_df.img_url = daily_dish_recipes_df.img_url.str.replace(r'-\d+x\d+(?=\.(?:jpg|jpeg|png|webp))','', regex=True)

In [554]:
str_cols = daily_dish_recipes_df.columns[:4]
numeric_cols = daily_dish_recipes_df.columns[4:]
daily_dish_recipes_df[numeric_cols] = daily_dish_recipes_df[numeric_cols].astype(np.float64)
daily_dish_recipes_df[str_cols] = daily_dish_recipes_df[str_cols].astype("string")
daily_dish_recipes_df.to_pickle("../data/raw/daily_dish_recipes.pkl", protocol=4)

## Sandbox

In [None]:
response = requests.get("https://dailydishrecipes.com/smores-pastries/")
url_soup = BeautifulSoup(response.text, 'html.parser')

srcset_regex = re.compile(r'.*\d')
image_regex = re.compile(r'\b(wp-image).*\b')
img_url = url_soup.find("div", attrs={"class": "wprm-recipe-image"}).find("img").get("srcset")
img_url


# Some images are lazy-loaded, so we need to get the actual image URL from data
# if img_url.startswith("data:image/"):
#     img_url = url_soup.find("figure", attrs={"class": title_img_regex}).find("img").get("data-lazy-src")

    # except:
    #     # Fallback method if the above fails
    #     print("    Fallback image extraction method used")
    #     regex = re.compile(r"\w*wp.*image\w*") 
    #     img_url = url_soup.find_all("img", attrs={"class": regex})[1]["src"]

    # return img_url