# Data Extraction Pipeline

In [189]:
import requests
import time
import re
# import scrapy
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

## Helper Functions

In [71]:
def extract_nutrition(url_soup) -> pd.DataFrame:
    class_regex = re.compile("wprm-nutrition-label-text-nutrition-container wprm-nutrition-label-text-nutrition-container-")
    nutrition_regex = re.compile("wprm-nutrition-label-container wprm-nutrition-label-container-simple wprm-block-text-.*")

    nutrition_parent = url_soup.find("div", attrs={"class": nutrition_regex})

    nutrition_df = pd.DataFrame([])
    if nutrition_parent:
        nutritions = nutrition_parent.find_all("span", attrs={"class": class_regex})

        for nutrition in nutritions:
            # Extract the nutrition label text
            nutrition_name = re.sub("wprm-nutrition-label-text-nutrition-container-", "", nutrition.get("class")[1]).strip()
            nutrition_amount = nutrition.find("span", attrs={"class": "wprm-nutrition-label-text-nutrition-value"}).text.strip()
            nutrition_df = pd.concat([nutrition_df, pd.DataFrame({nutrition_name: [nutrition_amount]})], axis=1)
    
    return nutrition_df

def extract_instructions(url_soup) -> int:
    instructions = url_soup.find_all("div", attrs={"class": "wprm-recipe-instruction-group"})

    total_steps = 0
    if instructions:
        instruction_regex = re.compile("^wprm-recipe-[0-9]*-step-[0-9]-[0-9]$")

        for instruction in instructions:
            num_steps = len(instruction.find_all("li", attrs={"id": instruction_regex}))
            total_steps += num_steps
    else:
        print(f"    No instructions found for recipe")
    
    return total_steps

def extract_times(url_soup) -> pd.DataFrame:
    time_regex = re.compile("wprm-recipe-details wprm-recipe-details-[a-z]* wprm-recipe-.*")

    time_info = url_soup.find_all("span", attrs={"class": time_regex})

    compiled_times = {}
    if time_info:
        for time in time_info:
            time_desc = time.get("class")[2][12:] # Get time type (prep/cook/total)
            current_time = time.text # Get time in string
            
            # Convert current time into minutes
            # Initialize total time in minutes if it hasn't exist
            try:
                compiled_times[time_desc]
            except: 
                compiled_times[time_desc] = 0

            minutes = re.search("(.*) minute(s)?", current_time)
            hours = re.search("(.*) hour(s)?", current_time)

            if minutes:
                compiled_times[time_desc] += int(minutes.group(1))
                
            if hours:
                compiled_times[time_desc] += int(hours.group(1)) * 60
    else:
        print(f"    No time info found for recipe")
                
    return pd.DataFrame([compiled_times])

## The Woks of Life

In [None]:
# The Woks of Life Recipe Scraper
pages = range(1, 83) # Adjust this range to scrape more or fewer pages (max 83)
recipes = pd.DataFrame([])

for page in pages:
    print(f"Scraping page: {page}")
    page_url = f"https://thewoksoflife.com/blog/page/{page}/"

    response = requests.get(page_url)

    if response.status_code != 200:
        raise Exception(f"Failed to retrieve the page: {page_url}")
        
    main_soup = BeautifulSoup(response.text, 'html.parser')

    for recipe in main_soup.find_all('a', attrs={"class": "entry-title-link"}):
        recipe_url = recipe.get('href')
        recipe_title = recipe.text

        response = requests.get(recipe_url)
        if response.status_code != 200:
            print(f"Failed to retrieve recipe page: {recipe_url}")
            continue

        # Process the recipe page
        print(f"Processing recipe: {recipe_url}")
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Extract ingredients information
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue
        
        # Extract nutrition information
        class_regex = re.compile("wprm-nutrition-label-text-nutrition-container wprm-nutrition-label-text-nutrition-container-")
        nutrition_parent = recipe_soup.find("div", attrs={"class": "wprm-nutrition-label-container wprm-nutrition-label-container-simple wprm-block-text-normal"})

        nutrition_df = pd.DataFrame([])
        if nutrition_parent:
            nutritions = nutrition_parent.find_all("span", attrs={"class": class_regex})

            for nutrition in nutritions:
                # Extract the nutrition label text
                nutrition_name = re.sub("wprm-nutrition-label-text-nutrition-container-", "", nutrition.get("class")[1]).strip()
                nutrition_amount = nutrition.find("span", attrs={"class": "wprm-nutrition-label-text-nutrition-value"}).text.strip()
                nutrition_df = pd.concat([nutrition_df, pd.DataFrame({nutrition_name: [nutrition_amount]})], axis=1)

        # Extract instructions information
        instructions = recipe_soup.find_all("div", attrs={"class": "wprm-recipe-instruction-group"})

        total_steps = 0
        if instructions:
            instruction_regex = re.compile("^wprm-recipe-[0-9]*-step-[0-9]-[0-9]$")

            for instruction in instructions:
                num_steps = len(instruction.find_all("li", attrs={"id": instruction_regex}))
                total_steps += num_steps
        else:
            print(f"    No instructions found for recipe")

        # Extract time information
        time_regex = re.compile("wprm-recipe-details wprm-recipe-details-[a-z]* wprm-recipe-.*")

        time_info = recipe_soup.find_all("span", attrs={"class": time_regex})

        compiled_times = {}
        if time_info:
            for time in time_info:
                time_desc = time.get("class")[2][12:] # Get time type (prep/cook/total)
                current_time = time.text # Get time in string
                
                # Convert current time into minutes
                # Initialize total time in minutes if it hasn't exist
                try:
                    compiled_times[time_desc]
                except: 
                    compiled_times[time_desc] = 0

                minutes = re.search("(.*) minute(s)?", current_time)
                hours = re.search("(.*) hour(s)?", current_time)

                if minutes:
                    compiled_times[time_desc] += int(minutes.group(1))
                    
                if hours:
                    compiled_times[time_desc] += int(hours.group(1)) * 60
        else:
            print(f"    No time info found for recipe")
                    
        compiled_times = pd.DataFrame([compiled_times])

        # Combine nutrition data with the recipe data
        combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
        recipes = pd.concat([recipes, combined_recipe], ignore_index=True)

recipes

Scraping page: 1
Processing recipe: https://thewoksoflife.com/little-crispy-pork-xiaosurou/
Processing recipe: https://thewoksoflife.com/2025-spring-farm-garden-update/
    No ingredients found for recipe
Processing recipe: https://thewoksoflife.com/ginger-beef/
Processing recipe: https://thewoksoflife.com/garlic-fried-rice/
Processing recipe: https://thewoksoflife.com/seared-ahi-tuna/
Processing recipe: https://thewoksoflife.com/how-to-start-solids/
    No ingredients found for recipe
Processing recipe: https://thewoksoflife.com/wontons-with-peanut-sauce/
Processing recipe: https://thewoksoflife.com/chickpea-tofu/
Processing recipe: https://thewoksoflife.com/sourdough-english-muffins/
    No time info found for recipe
Processing recipe: https://thewoksoflife.com/century-eggs-guide/
    No ingredients found for recipe
Processing recipe: https://thewoksoflife.com/braised-daikon-shiitake-mushrooms/
Processing recipe: https://thewoksoflife.com/banana-oat-pancakes/
Processing recipe: https

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,protein,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Little Crispy Pork (Xiaosurou – 小酥肉),https://thewoksoflife.com/little-crispy-pork-x...,"[whole red Sichuan peppercorns, boneless pork ...",7,40.0,35.0,75.0,317,19,15,...,117,441,440,1,1,122,1,35,1,
1,Ginger Beef,https://thewoksoflife.com/ginger-beef/,"[flank steak, cornstarch, neutral oil, oyster ...",7,30.0,15.0,45.0,374,17,25,...,68,612,438,0.4,5,30,1,35,2,
2,Garlic Fried Rice,https://thewoksoflife.com/garlic-fried-rice/,"[neutral oil, garlic, cooked white Jasmine ric...",3,10.0,10.0,20.0,311,59,7,...,,610,124,1,1,31,3,37,1,
3,Seared Ahi Tuna,https://thewoksoflife.com/seared-ahi-tuna/,"[ahi tuna steaks, fresh mandarin orange or cle...",7,20.0,5.0,25.0,324,15,23,...,32,400,583,5,7,1969,20,108,3,
4,Wontons with Peanut Sauce,https://thewoksoflife.com/wontons-with-peanut-...,"[wontons, creamy peanut butter, sugar, soy sau...",5,15.0,10.0,25.0,161,6,5,...,,252,146,1,3,17,1,19,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,Scallion Ginger Shrimp Recipe (Redux!),https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4,10.0,5.0,15.0,191,2,24,...,286,1043,145,1,1,120,7.2,173,2.6,
1169,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4,30.0,30.0,60.0,517,53,13,...,61,418,346,2,28,225,,97,2.2,
1170,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3,10.0,,10.0,206,30,1,...,,593,234,2,28,691,23,9,1,
1171,Cantonese Chicken & Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3,20.0,10.0,30.0,,,,...,,,,,,,,,,


In [None]:
recipes[recipes.columns[3:]] = recipes[recipes.columns[3:]].astype(np.float64)
recipes[recipes.columns[:2]] = recipes[recipes.columns[:2]].astype("string")
recipes.to_pickle("woks_of_life_recipes.pkl")

In [None]:
# recipe = main_soup.find_all('a', attrs={"class": "entry-title-link"})[0]
# recipe_url = recipe.get('href')
recipe_url = "https://thewoksoflife.com/braised-daikon-shiitake-mushrooms/"
response = requests.get(recipe_url)
recipe_soup = BeautifulSoup(response.text, 'html.parser')

[id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]
nutrition_parent = recipe_soup.find("div", attrs={"class": "wprm-nutrition-label-container wprm-nutrition-label-container-simple wprm-block-text-normal"})

class_regex = re.compile("wprm-nutrition-label-text-nutrition-container wprm-nutrition-label-text-nutrition-container-")
nutrition_df = pd.DataFrame([])
for nutrition in nutrition_parent.find_all("span", attrs={"class": class_regex}):
    # Extract the nutrition label text
    nutrition_name = re.sub("wprm-nutrition-label-text-nutrition-container-", "", nutrition.get("class")[1]).strip()
    nutrition_amount = nutrition.find("span", attrs={"class": "wprm-nutrition-label-text-nutrition-value"}).text.strip()
    nutrition_df = pd.concat([nutrition_df, pd.DataFrame({nutrition_name: [nutrition_amount]})], axis=1)



# Extract time information
time_regex = re.compile("wprm-recipe-details wprm-recipe-details-[a-z]* wprm-recipe-.*")

time_info = recipe_soup.find_all("span", attrs={"class": time_regex})

if not time_info:
    print(f"    No time information found for recipe")
    # continue

compiled_times = {}
for time in time_info:
    time_desc = time.get("class")[2][12:] # Get time type (prep/cook/total)
    current_time = time.text # Get time in string
    
    # print(current_time)

    # Convert current time into minutes

    # Initialize total time in minutes if it hasn't exist
    try:
        compiled_times[time_desc]
    except: 
        compiled_times[time_desc] = 0

    minutes = re.search("(.*) minute(s)?", current_time)
    hours = re.search("(.*) hour(s)?", current_time)

    if minutes:
        compiled_times[time_desc] += int(minutes.group(1))
        
    if hours:
        compiled_times[time_desc] += int(hours.group(1)) * 60

compiled_times = pd.DataFrame([compiled_times])

Unnamed: 0,prep_time,cook_time,total_time
0,120,30,150


## Just One Cookbook

In [None]:
# Just One Cookbook Scraper
pages = range(1, 24) # Adjust this range to scrape more or fewer pages (max 24)
recipes = pd.DataFrame([])

for page in pages:
    print(f"Scraping page: {page}")
    page_url = f"https://www.justonecookbook.com/recipes/page/{page}/" # Adjust

    response = requests.get(page_url)

    if response.status_code != 200:
        raise Exception(f"Failed to retrieve the page: {page_url}")
        
    main_soup = BeautifulSoup(response.text, 'html.parser')

    for recipe in main_soup.find_all("h3", attrs={"class": "article-title"}): # Adjust
        recipe_url = recipe.find("a").get('href') # Adjust
        recipe_title = recipe.find("a").text.strip() # Adjust
        
        response = requests.get(recipe_url)
        if response.status_code != 200:
            print(f"Failed to retrieve recipe page: {recipe_url}")
            continue

        # Process the recipe page
        print(f"Processing recipe: {recipe_url}")
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Extract ingredients information
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue
        
        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
        recipes = pd.concat([recipes, combined_recipe], ignore_index=True)

recipes

Scraping page: 1
Processing recipe: https://www.justonecookbook.com/easy-tonkotsu-ramen-recipe/
Processing recipe: https://www.justonecookbook.com/pan-fried-curry-chicken/
Processing recipe: https://www.justonecookbook.com/udon-noodles/
Processing recipe: https://www.justonecookbook.com/carrot-ginger-dressing/
Processing recipe: https://www.justonecookbook.com/tomato-egg-vermicelli-soup/
Processing recipe: https://www.justonecookbook.com/how-to-slice-meat/
Processing recipe: https://www.justonecookbook.com/butter-ponzu-beef/
Processing recipe: https://www.justonecookbook.com/japanese-cutting-techniques/
    No ingredients found for recipe
Processing recipe: https://www.justonecookbook.com/honey-gochujang-chicken/
Processing recipe: https://www.justonecookbook.com/spinach-and-mushroom-miso-soup/
Processing recipe: https://www.justonecookbook.com/chicken-bok-choy-stir-fry/
Processing recipe: https://www.justonecookbook.com/karaage/
Processing recipe: https://www.justonecookbook.com/moms-

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe 豚骨ラーメン,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken (Video),https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15,40.0,10.0,10.0,,290,10,...,76,989,441,0.5,4,46,1,11,1,
2,Homemade Udon Noodles (Video) 手打ちうどん,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36,120.0,60.0,,150.0,361,76,...,,198,106,3,1,,,21,5,
3,Carrot Ginger Dressing 人参ドレッシング,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10,20.0,10.0,,,121,7,...,,170,96,1,5,3792,2,13,1,
4,Tomato Egg Vermicelli Soup (Video),https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10,30.0,5.0,10.0,,123,16,...,93,299,215,2,3,737,11,79,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,Hamachi Teriyaki with Yuzu Kosho ぶりの照り焼き（柚子胡椒バ...,https://www.justonecookbook.com/hamachi-yellow...,"[yellowtail (hamachi, buri), freshly ground bl...",8,40.0,5.0,15.0,,314,8,...,70,591,725,1,4,273,3,61,1,
1003,Teriyaki Pork Balls 照り焼きポークボール,https://www.justonecookbook.com/teriyaki-pork-...,"[sake, potato starch or cornstarch, thinly sli...",6,60.0,15.0,15.0,,429,25,...,107,769,874,1,7,75,3,35,1,
1004,Kinmedai Nitsuke (Braised Alfonsino) キンメダイ煮付け,https://www.justonecookbook.com/kinmedai-nitsuke/,"[kinmedai (alfonsino), sake, mirin, sugar, soy...",8,60.0,5.0,25.0,,170,6,...,57,539,351,1,5,,1,12,1,
1005,What is Otoshibuta (Drop Lid) and How to Make ...,https://www.justonecookbook.com/how-to-make-ot...,"[aluminum foil, chopstick]",4,,5.0,,,,,...,,,,,,,,,,


In [30]:
recipes[recipes.columns[3:]] = recipes[recipes.columns[3:]].astype(np.float64)
recipes[recipes.columns[:2]] = recipes[recipes.columns[:2]].astype("string")
recipes.to_pickle("just_one_cookbook_recipes.pkl")

In [160]:
pd.read_pickle("just_one_cookbook_recipes.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time
0,Easy Tonkotsu Ramen Recipe 豚骨ラーメン,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",36,190,60,130,60
1,Pan-Fried Curry Chicken (Video),https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15,20,10,10,0
2,Homemade Udon Noodles (Video) 手打ちうどん,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",37,60,60,0,150
3,Carrot Ginger Dressing 人参ドレッシング,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10,10,10,0,0
4,Tomato Egg Vermicelli Soup (Video),https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10,15,5,10,0
...,...,...,...,...,...,...,...,...
1002,Hamachi Teriyaki with Yuzu Kosho ぶりの照り焼き（柚子胡椒バ...,https://www.justonecookbook.com/hamachi-yellow...,"[yellowtail (hamachi, buri), freshly ground bl...",8,20,5,15,0
1003,Teriyaki Pork Balls 照り焼きポークボール,https://www.justonecookbook.com/teriyaki-pork-...,"[sake, potato starch or cornstarch, thinly sli...",6,30,15,15,0
1004,Kinmedai Nitsuke (Braised Alfonsino) キンメダイ煮付け,https://www.justonecookbook.com/kinmedai-nitsuke/,"[kinmedai (alfonsino), sake, mirin, sugar, soy...",8,30,5,25,0
1005,What is Otoshibuta (Drop Lid) and How to Make ...,https://www.justonecookbook.com/how-to-make-ot...,"[aluminum foil, chopstick]",4,0,5,0,0


## RecipeTinEats

In [None]:
# RecipeTinEats Scraper
pages = range(1, 80) # Adjust this range to scrape more or fewer pages (max 80)
recipes = pd.DataFrame([])

for page in pages:
    print(f"Scraping page: {page}")
    page_url = f"https://www.recipetineats.com/recipes/?fwp_paged={page}/" # Adjust

    response = requests.get(page_url)

    if response.status_code != 200:
        raise Exception(f"Failed to retrieve the page: {page_url}")
        
    main_soup = BeautifulSoup(response.text, 'html.parser')

    for recipe in main_soup.find_all('a', attrs={"class": "entry-title-link"}): # Adjust
        recipe_url = recipe.get('href') # Adjust
        recipe_title = recipe.text.strip() # Adjust

        response = requests.get(recipe_url)
        if response.status_code != 200:
            print(f"Failed to retrieve recipe page: {recipe_url}")
            continue

        # Process the recipe page
        print(f"Processing recipe: {recipe_url}")
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Extract ingredients information
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue
        
        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)


        # Combine nutrition data with the recipe data
        combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
        recipes = pd.concat([recipes, combined_recipe], ignore_index=True)

recipes

Scraping page: 1
Processing recipe: https://www.recipetineats.com/cheese-herb-garlic-quick-bread/
Processing recipe: https://www.recipetineats.com/ginger-chicken-and-rice/
Processing recipe: https://www.recipetineats.com/easy-french-apple-tart/
Processing recipe: https://www.recipetineats.com/chinese-eggplant-and-minced-pork/
Processing recipe: https://www.recipetineats.com/tomahawk-steak/
Processing recipe: https://www.recipetineats.com/cowboy-butter/
Processing recipe: https://www.recipetineats.com/country-harvest-root-vegetable-soup/
Processing recipe: https://www.recipetineats.com/dozer-turns-13/
    No ingredients found for recipe
Processing recipe: https://www.recipetineats.com/puttanesca-fish-tray-bake/
Processing recipe: https://www.recipetineats.com/b85-beef-sausage-rolls/
Processing recipe: https://www.recipetineats.com/melting-afghani-chickpea-curry/
Processing recipe: https://www.recipetineats.com/bake-with-brooki-penguin-plagiarism-allegations-statement/
    No ingredients

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,custom_time,total_time,serving_size,calories,...,potassium,fiber,sugar,vitamin_a,calcium,iron,polyunsaturated_fat,monounsaturated_fat,trans_fat,vitamin_c
0,"Cheese, Herb & Garlic Quick Bread...",https://www.recipetineats.com/cheese-herb-garl...,"[block of cheese, fresh rosemary, fresh parsle...",10,15.0,50.0,15.0,80.0,104,271,...,230,1.2,1.7,300,240,2.2,,,,
1,Ginger chicken and rice,https://www.recipetineats.com/ginger-chicken-a...,"[oil, chicken thighs, long grain rice, chicken...",8,10.0,25.0,10.0,45.0,,520,...,831,3,10,154,41,3,4,9,0.1,2
2,Easy French apple tart,https://www.recipetineats.com/easy-french-appl...,"[butter puff pastry, caster sugar, gala apples...",8,12.0,30.0,,,,340,...,126,3,15,137,11,1,2,10,0.1,4
3,Chinese eggplant and minced pork –...,https://www.recipetineats.com/chinese-eggplant...,"[vegetable oil, large eggplant or 2 medium/sma...",10,10.0,20.0,,30.0,,383,...,374,4,11,58,24,1,8,20,0.1,4
4,Tomahawk Steak,https://www.recipetineats.com/tomahawk-steak/,"[tomahawk steak, cooking salt/kosher salt, bla...",13,5.0,60.0,15.0,80.0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382,Mexican Couscous Express,https://www.recipetineats.com/mexican-couscous...,"[salt, cumin powder, onion powder, couscous, b...",5,10.0,,,10.0,,,...,,,,,,,,,,
1383,Middle Eastern Roasted Eggplant with Couscous,https://www.recipetineats.com/middle-eastern-r...,"[medium eggplants, garlic cloves, ground cumin...",7,15.0,40.0,,55.0,,,...,,,,,,,,,,
1384,Crispy Couscous Pancake with Tomato &...,https://www.recipetineats.com/crispy-couscous-...,"[olive oil, couscous, vegetable stock powder (...",10,5.0,20.0,,25.0,,,...,,,,,,,,,,
1385,Watermelon Salad,https://www.recipetineats.com/watermelon-salad/,"[watermelon, small Spanish red onion, red wine...",3,10.0,,,10.0,,,...,,,,,,,,,,


In [32]:
recipes[recipes.columns[3:]] = recipes[recipes.columns[3:]].astype(np.float64)
recipes[recipes.columns[:2]] = recipes[recipes.columns[:2]].astype("string")
recipes.to_pickle("recipe_tin_eats_recipes.pkl")

## Natasha's Kitchen

In [None]:
# Natasha's Kitchen Scraper
page_url = f"https://natashaskitchen.com/recipes/" # Adjust

response = requests.get(page_url)

if response.status_code != 200:
    raise Exception(f"Failed to retrieve the page: {page_url}")
    
main_soup = BeautifulSoup(response.text, 'html.parser')

recipes = pd.DataFrame(columns=["recipe_title", "recipe_url", "ingredients", "num_steps"])

for page in main_soup.find('div', attrs={"class": "ingredients"}).find_all("li"):
    subpage_url = page.find("a").get("href")
    print(f"---Processing subpage: {subpage_url}---")
    
    response = requests.get(subpage_url)
    subpage_soup = BeautifulSoup(response.text, 'html.parser')
    try:
        for recipe in subpage_soup.find("div", attrs={"class": "imagegrid imagegrid3-main"}).find_all("div", attrs={"class": "gridtitle"}):
            recipe_url = recipe.find("a").get("href")
            recipe_title = recipe.text.strip()
            
            print(f"Processing recipe: {recipe_url}")
            
            if recipe_url in list(recipes["recipe_url"]):
                print(f"    Recipe already exist: {recipe_url}")
                continue

            time.sleep(1) # Prevent timeout
            response = requests.get(recipe_url)

            if response.status_code != 200:
                print(f"    Failed to retrieve recipe page: {recipe_url}")
                continue

            # Process the recipe page
            recipe_soup = BeautifulSoup(response.text, 'html.parser')

            # Extract ingredients information
            ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

            if not ingredients:
                print(f"    No ingredients found for recipe")
                continue

            # Extract nutrition information
            nutrition_df = extract_nutrition(recipe_soup)

            # Extract instructions information
            total_steps = extract_instructions(recipe_soup)

            # Extract time information
            compiled_times = extract_times(recipe_soup)

            # Combine nutrition data with the recipe data
            combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
            recipes = pd.concat([recipes, combined_recipe], ignore_index=True)
    except:
        print(f"Error processing: {subpage_url}")

recipes

---Processing subpage: https://natashaskitchen.com/tag/chow-mein-noodles/---
Error processing: https://natashaskitchen.com/tag/chow-mein-noodles/
---Processing subpage: https://natashaskitchen.com/tag/chuck-roast/---
Processing recipe: https://natashaskitchen.com/pot-roast/
Processing recipe: https://natashaskitchen.com/vegetable-beef-soup-recipe/
Processing recipe: https://natashaskitchen.com/mississippi-pot-roast-recipe/
---Processing subpage: https://natashaskitchen.com/tag/cilantro/---
Processing recipe: https://natashaskitchen.com/shrimp-tacos/
Processing recipe: https://natashaskitchen.com/ceviche-recipe/
Processing recipe: https://natashaskitchen.com/easy-taco-salad-recipe/
Processing recipe: https://natashaskitchen.com/persimmon-pomegranate-salad/
Processing recipe: https://natashaskitchen.com/fish-taco-bowl/
Processing recipe: https://natashaskitchen.com/cilantro-lime-rice/
Processing recipe: https://natashaskitchen.com/7-layer-dip-recipe/
Processing recipe: https://natashaski

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,custom_time,total_time,serving_size,calories,...,trans_fat,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron
0,Classic Pot Roast (with VIDEO),https://natashaskitchen.com/pot-roast/,"[beef chuck roast, coarse salt, freshly ground...",10,30.0,210.0,60.0,300.0,7,511,...,2,156,1780,1326,2,4,6846,11,84,6
1,Vegetable Beef Soup Recipe,https://natashaskitchen.com/vegetable-beef-sou...,"[olive oil, beef stew meat, medium onion, beef...",6,20.0,95.0,,115.0,,263,...,,70,1474,1361,3,4,2983,23,71,4
2,Mississippi Pot Roast Recipe,https://natashaskitchen.com/mississippi-pot-ro...,"[chuck roast, ground black pepper, beef broth,...",5,15.0,480.0,,495.0,,386,...,1,132,416,605,0.3,0.2,223,6,34,4
3,Shrimp Tacos (with Video),https://natashaskitchen.com/shrimp-tacos/,"[sour cream, mayonnaise, lime juice, garlic po...",6,20.0,10.0,,30.0,,308,...,,163,849,391,5,4,739,39,218,2
4,Ceviche Recipe (with VIDEO),https://natashaskitchen.com/ceviche-recipe/,"[medium shrimp, lime juice, cucumber, avocados...",4,15.0,,120.0,135.0,,156,...,0.003,91,77,630,4,4,470,27,61,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,Chicken and Wild Rice Soup,https://natashaskitchen.com/chicken-wild-rice-...,"[unsalted butter, carrots, yellow onion, celer...",5,15.0,35.0,,50.0,,307,...,,92,322,415,2,2,3359,4,49,1
1038,Garlic and Herb Burger Recipe,https://natashaskitchen.com/garlic-and-herb-bu...,"[ground beef, large cloves garlic, Worcestersh...",4,15.0,12.0,,27.0,,596,...,2,123,524,767,2,6,1603,11,260,5
1039,Zucchini Dip Recipe,https://natashaskitchen.com/zucchini-dip-recipe/,"[or 5 small/medium zucchini/squash, med/large ...",5,10.0,30.0,,40.0,,,...,,,,,,,,,,
1040,Grilled Zucchini Recipe,https://natashaskitchen.com/grilled-garlic-zuc...,"[small/medium zucchini, extra virgin olive oil...",4,6.0,4.0,,10.0,,70,...,,,205,375,2,4,322,34,28,1


In [185]:
recipes[recipes.columns[3:]] = recipes[recipes.columns[3:]].astype(np.float64)
recipes[recipes.columns[:2]] = recipes[recipes.columns[:2]].astype("string")
recipes.to_pickle("natashas_kitchen_recipes.pkl")

recipes = pd.read_pickle("natashas_kitchen_recipes.pkl")
recipes

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,protein,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,custom_time,serving_size
0,Spice Rubbed Sirloin,https://natashaskitchen.com/spice-rubbed-sirloin/,"[Top Sirloin Steak, paprika, brown sugar, chil...",8.0,40.0,10.0,50.0,,,,...,,,,,,,,,,
1,Loaded Cheeseburger Recipe,https://natashaskitchen.com/loaded-cheeseburge...,"[ground chuck, McCormick Grill Mates ""Montreal...",3.0,10.0,12.0,22.0,,,,...,,,,,,,,,,
2,Maple-Roasted Acorn Squash (VIDEO),https://natashaskitchen.com/maple-roasted-acor...,"[Acorn Squash, extra light olive oil, unsalted...",5.0,10.0,50.0,60.0,232.0,29.0,2.0,...,300.0,775.0,3.0,6.0,1141.0,24.00,88.0,2.0,,
3,Overnight Pizza Dough Recipe (VIDEO),https://natashaskitchen.com/pizza-dough-recipe/,"[warm water, active dry yeast, honey, fine sea...",7.0,10.0,0.0,10.0,193.0,41.0,5.0,...,439.0,56.0,1.0,1.0,,,8.0,2.0,,
4,English Muffins Recipe (VIDEO),https://natashaskitchen.com/english-muffins-re...,"[whole milk minus 1 Tbsp, unsalted butter, hon...",7.0,30.0,10.0,180.0,147.0,25.0,5.0,...,260.0,81.0,1.0,4.0,93.0,0.02,38.0,0.4,140.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2493,Lemon and White Wine Chicken,https://natashaskitchen.com/lemon-and-white-wi...,"[large boneless, all-purpose flour, salt, fres...",6.0,20.0,15.0,35.0,,,,...,,,,,,,,,,
2495,Whole Wheat and Blueberry Waffles,https://natashaskitchen.com/whole-wheat-and-bl...,"[large eggs, whole wheat flour, baking powder,...",5.0,5.0,15.0,20.0,,,,...,,,,,,,,,,
2497,Garlic and Herb Burger Recipe,https://natashaskitchen.com/garlic-and-herb-bu...,"[ground beef, large cloves garlic, Worcestersh...",4.0,15.0,12.0,27.0,596.0,28.0,37.0,...,524.0,767.0,2.0,6.0,1603.0,11.00,260.0,5.0,,
2499,Grilled Zucchini Recipe,https://natashaskitchen.com/grilled-garlic-zuc...,"[small/medium zucchini, extra virgin olive oil...",4.0,6.0,4.0,10.0,70.0,6.0,2.0,...,205.0,375.0,2.0,4.0,322.0,34.00,28.0,1.0,,


In [188]:
recipes[recipes.total_time == recipes.total_time.max()]

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,calories,carbohydrates,protein,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,custom_time,serving_size
2443,Sourdough Pizza Dough Recipe (VIDEO),https://natashaskitchen.com/sourdough-pizza-do...,"[00 flour, fine sea salt, water, active sourdo...",7.0,20.0,10.0,1380.0,239.0,50.0,7.0,...,585.0,67.0,2.0,0.2,,,11.0,3.0,1350.0,


## Love and Lemons

In [202]:
# Love and Lemons Scraper
page_url = f"https://www.loveandlemons.com/recipes/" # Adjust

response = requests.get(page_url)

if response.status_code != 200:
    raise Exception(f"Failed to retrieve the page: {page_url}")
    
main_soup = BeautifulSoup(response.text, 'html.parser')

recipes = pd.DataFrame(columns=["recipe_title", "recipe_url", "ingredients", "num_steps"])

for page in main_soup.find('ol', attrs={"id": "recipeindex"}).find_all("a"):
    recipe_url = page.get("href")
    recipe_title = page.text.strip()

    response = requests.get(recipe_url)
    if response.status_code != 200:
        print(f"Failed to retrieve recipe page: {recipe_url}")
        continue

    # Process the recipe page
    print(f"Processing recipe: {recipe_url}")
    recipe_soup = BeautifulSoup(response.text, 'html.parser')

    # Extract ingredients information
    ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

    if not ingredients:
        print(f"    No ingredients found for recipe")
        continue

    # Extract nutrition information
    nutrition_df = extract_nutrition(recipe_soup)

    # Extract instructions information
    total_steps = extract_instructions(recipe_soup)

    # Extract time information
    compiled_times = extract_times(recipe_soup)

    # Combine nutrition data with the recipe data
    combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
    recipes = pd.concat([recipes, combined_recipe], ignore_index=True)

recipes

Processing recipe: https://www.loveandlemons.com/easy-dinner-ideas/
Processing recipe: https://www.loveandlemons.com/salad-recipes/
Processing recipe: https://www.loveandlemons.com/healthy-breakfast-ideas/
Processing recipe: https://www.loveandlemons.com/picnic-food-ideas/
Processing recipe: https://www.loveandlemons.com/best-veggie-burger-recipe/
Processing recipe: https://www.loveandlemons.com/tzatziki-sauce/
Processing recipe: https://www.loveandlemons.com/pesto-recipe/
Processing recipe: https://www.loveandlemons.com/overnight-oats-recipe/
Processing recipe: https://www.loveandlemons.com/pasta-salad/
Processing recipe: https://www.loveandlemons.com/mexican-street-corn-salad/
    No time info found for recipe
Processing recipe: https://www.loveandlemons.com/chia-seed-pudding/
Processing recipe: https://www.loveandlemons.com/potato-salad-recipe/
Processing recipe: https://www.loveandlemons.com/cucumber-salad/
Processing recipe: https://www.loveandlemons.com/margarita-recipe/
Processi

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,custom_time
0,60 Easy Dinner Ideas,https://www.loveandlemons.com/easy-dinner-ideas/,"[ marinara sauce, ricotta cheese, garlic clove...",7,10.0,20.0,30.0,
1,51 Best Salad Recipes,https://www.loveandlemons.com/salad-recipes/,"[extra-virgin olive oil, red wine vinegar, gar...",2,10.0,5.0,15.0,
2,60 Healthy Breakfast Ideas,https://www.loveandlemons.com/healthy-breakfas...,"[large yellow tomato, diced red onion, chopped...",4,15.0,10.0,25.0,
3,51 Easy Picnic Food Ideas,https://www.loveandlemons.com/picnic-food-ideas/,"[avocados, lemon, toasted bread, heirloom toma...",2,5.0,,,
4,Best Veggie Burger,https://www.loveandlemons.com/best-veggie-burg...,"[extra-virgin olive oil, shallots, mushrooms, ...",8,80.0,60.0,140.0,
...,...,...,...,...,...,...,...,...
882,Spaghetti Squash w/ Chickpeas & Kale,https://www.loveandlemons.com/roasted-spaghett...,"[spaghetti squash, extra-virgin olive oil, sha...",5,10.0,50.0,60.0,
883,Vegan Carrot Cake,https://www.loveandlemons.com/vegan-carrot-cake/,"[flour, baking powder*, baking soda, cinnamon,...",8,20.0,30.0,50.0,
884,Carrot Greens Chimichurri,https://www.loveandlemons.com/carrot-greens-ch...,"[finely chopped carrot greens, dried oregano, ...",1,10.0,,,
885,Matcha Latte,https://www.loveandlemons.com/matcha-latte/,"[matcha powder, hot water, coconut milk, Maple...",3,5.0,,5.0,


In [None]:
recipes[recipes.columns[3:]] = recipes[recipes.columns[3:]].astype(np.float64)
recipes[recipes.columns[:2]] = recipes[recipes.columns[:2]].astype("string")
recipes.to_pickle("love_and_lemons_recipes.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,custom_time
0,60 Easy Dinner Ideas,https://www.loveandlemons.com/easy-dinner-ideas/,"[ marinara sauce, ricotta cheese, garlic clove...",7.0,10.0,20.0,30.0,
1,51 Best Salad Recipes,https://www.loveandlemons.com/salad-recipes/,"[extra-virgin olive oil, red wine vinegar, gar...",2.0,10.0,5.0,15.0,
2,60 Healthy Breakfast Ideas,https://www.loveandlemons.com/healthy-breakfas...,"[large yellow tomato, diced red onion, chopped...",4.0,15.0,10.0,25.0,
3,51 Easy Picnic Food Ideas,https://www.loveandlemons.com/picnic-food-ideas/,"[avocados, lemon, toasted bread, heirloom toma...",2.0,5.0,,,
4,Best Veggie Burger,https://www.loveandlemons.com/best-veggie-burg...,"[extra-virgin olive oil, shallots, mushrooms, ...",8.0,80.0,60.0,140.0,
...,...,...,...,...,...,...,...,...
882,Spaghetti Squash w/ Chickpeas & Kale,https://www.loveandlemons.com/roasted-spaghett...,"[spaghetti squash, extra-virgin olive oil, sha...",5.0,10.0,50.0,60.0,
883,Vegan Carrot Cake,https://www.loveandlemons.com/vegan-carrot-cake/,"[flour, baking powder*, baking soda, cinnamon,...",8.0,20.0,30.0,50.0,
884,Carrot Greens Chimichurri,https://www.loveandlemons.com/carrot-greens-ch...,"[finely chopped carrot greens, dried oregano, ...",1.0,10.0,,,
885,Matcha Latte,https://www.loveandlemons.com/matcha-latte/,"[matcha powder, hot water, coconut milk, Maple...",3.0,5.0,,5.0,


In [204]:
pd.read_pickle("love_and_lemons_recipes.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,custom_time
0,60 Easy Dinner Ideas,https://www.loveandlemons.com/easy-dinner-ideas/,"[ marinara sauce, ricotta cheese, garlic clove...",7.0,10.0,20.0,30.0,
1,51 Best Salad Recipes,https://www.loveandlemons.com/salad-recipes/,"[extra-virgin olive oil, red wine vinegar, gar...",2.0,10.0,5.0,15.0,
2,60 Healthy Breakfast Ideas,https://www.loveandlemons.com/healthy-breakfas...,"[large yellow tomato, diced red onion, chopped...",4.0,15.0,10.0,25.0,
3,51 Easy Picnic Food Ideas,https://www.loveandlemons.com/picnic-food-ideas/,"[avocados, lemon, toasted bread, heirloom toma...",2.0,5.0,,,
4,Best Veggie Burger,https://www.loveandlemons.com/best-veggie-burg...,"[extra-virgin olive oil, shallots, mushrooms, ...",8.0,80.0,60.0,140.0,
...,...,...,...,...,...,...,...,...
882,Spaghetti Squash w/ Chickpeas & Kale,https://www.loveandlemons.com/roasted-spaghett...,"[spaghetti squash, extra-virgin olive oil, sha...",5.0,10.0,50.0,60.0,
883,Vegan Carrot Cake,https://www.loveandlemons.com/vegan-carrot-cake/,"[flour, baking powder*, baking soda, cinnamon,...",8.0,20.0,30.0,50.0,
884,Carrot Greens Chimichurri,https://www.loveandlemons.com/carrot-greens-ch...,"[finely chopped carrot greens, dried oregano, ...",1.0,10.0,,,
885,Matcha Latte,https://www.loveandlemons.com/matcha-latte/,"[matcha powder, hot water, coconut milk, Maple...",3.0,5.0,,5.0,


## Minimalist Baker

In [216]:
# Minimalist Baker Scraper
pages = range(1, 87) # Adjust this range to scrape more or fewer pages (max 87)
recipes = pd.DataFrame([])

for page in pages:
    print(f"Scraping page: {page}")
    page_url = f"https://minimalistbaker.com/recipe-index/?fwp_paged={page}/" # Adjust

    response = requests.get(page_url)

    if response.status_code != 200:
        raise Exception(f"Failed to retrieve the page: {page_url}")
        
    main_soup = BeautifulSoup(response.text, 'html.parser')

    for recipe in main_soup.find_all('h3', attrs={"class": "post-summary__title"}): # Adjust
        recipe_url = recipe.find("a").get('href') # Adjust
        recipe_title = recipe.text.strip() # Adjust

        response = requests.get(recipe_url)
        if response.status_code != 200:
            print(f"Failed to retrieve recipe page: {recipe_url}")
            continue

        # Process the recipe page
        print(f"Processing recipe: {recipe_url}")
        recipe_soup = BeautifulSoup(response.text, 'html.parser')

        # Extract ingredients information
        ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

        if not ingredients:
            print(f"    No ingredients found for recipe")
            continue
        
        # Extract nutrition information
        nutrition_df = extract_nutrition(recipe_soup)

        # Extract instructions information
        total_steps = extract_instructions(recipe_soup)

        # Extract time information
        compiled_times = extract_times(recipe_soup)

        # Combine nutrition data with the recipe data
        combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
        recipes = pd.concat([recipes, combined_recipe], ignore_index=True)

recipes

Scraping page: 1
Processing recipe: https://minimalistbaker.com/zucchini-pesto-pasta/
Processing recipe: https://minimalistbaker.com/jerk-tofu-roasted-plantain-bowls/
Processing recipe: https://minimalistbaker.com/easy-mango-cucumber-salad/
Processing recipe: https://minimalistbaker.com/vegan-chocolate-cheesecake-cups/
Processing recipe: https://minimalistbaker.com/cucumber-lime-agua-fresca/
Processing recipe: https://minimalistbaker.com/creamy-mocha-chia-pudding/
Processing recipe: https://minimalistbaker.com/easy-raspberry-compote/
Processing recipe: https://minimalistbaker.com/creamy-tuscan-shrimp-pasta-dairy-free/
Processing recipe: https://minimalistbaker.com/gluten-free-lemon-poppy-seed-muffins-vegan/
Processing recipe: https://minimalistbaker.com/italian-lentil-soup-zuppa-di-lenticchie/
Processing recipe: https://minimalistbaker.com/chocolate-pot-de-creme-sweet-potato/
Processing recipe: https://minimalistbaker.com/banana-chocolate-pecan-muffins-vegan-gf/
Processing recipe: http

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,serving_size,calories,carbohydrates,...,trans_fat,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron
0,Zucchini Pesto Pasta with Roasted Tomatoes & C...,https://minimalistbaker.com/zucchini-pesto-pasta/,"[zucchini, halved lengthwise and cut into 1/4-...",6,15.0,25.0,40,1,614,91.3,...,0,0,581,1108,10.9,9.7,341,48,115,3.3
1,Jerk Tofu & Roasted Plantain Bowls,https://minimalistbaker.com/jerk-tofu-roasted-...,"[DIY Jamaican Jerk Seasoning, olive oil, tamar...",9,10.0,35.0,45,1,606,88.1,...,0,0,1436,1297,10.4,52.2,638,93,210,6
2,Easy Mango Cucumber Salad,https://minimalistbaker.com/easy-mango-cucumbe...,"[chopped ripe mango, cut into small cubes, cuc...",2,10.0,,10,1,108,26.7,...,0,0,298,450,3.7,22.1,328,62,33,0.6
3,No-Bake Vegan Chocolate Cheesecake Cups (5 Min...,https://minimalistbaker.com/vegan-chocolate-ch...,"[Oreo-style cookies, raw cashews, plain, unswe...",8,5.0,,5,1,334,29.3,...,0,0,27,424,3,17.2,0,0.8,39,2.4
4,Cucumber Lime Agua Fresca,https://minimalistbaker.com/cucumber-lime-agua...,"[water, cucumbers, roughly chopped , limes, ju...",3,10.0,,10,1,53,13.2,...,0,0,3,177,1.3,9.2,23,9,19,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1682,Cold Brew Mocha Frappe,https://minimalistbaker.com/cold-brew-mocha-fr...,"[cold brew coffee, ice cubes, light chocolate ...",3,10.0,,10,1,174,23.5,...,,,,,0,22,,,,
1683,French Toast Sugar Cookies,https://minimalistbaker.com/french-toast-sugar...,"[butter, sugar, egg, all-purpose flour, baking...",7,15.0,10.0,25,1,158,26,...,,,,,0,18.2,,,,
1684,Watermelon Limeade Cocktails,https://minimalistbaker.com/watermelon-limeade...,"[seedless watermelon, limes*, white rum , suga...",6,10.0,,10,1,147,31.1,...,0,0,,,2.5,23.1,,,,
1685,Strawberry Danish French Toast,https://minimalistbaker.com/strawberry-danish-...,"[wheat bread, egg, white, milk , vanilla, cinn...",5,5.0,5.0,10,0,356,38,...,,,,,5,16,,,,


In [217]:
recipes[recipes.columns[3:]] = recipes[recipes.columns[3:]].astype(np.float64)
recipes[recipes.columns[:2]] = recipes[recipes.columns[:2]].astype("string")
recipes.to_pickle("minimalist_baker_recipes.pkl")

In [219]:
pd.read_pickle("minimalist_baker_recipes.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,prep_time,cook_time,total_time,serving_size,calories,carbohydrates,...,trans_fat,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron
0,Zucchini Pesto Pasta with Roasted Tomatoes & C...,https://minimalistbaker.com/zucchini-pesto-pasta/,"[zucchini, halved lengthwise and cut into 1/4-...",6.0,15.0,25.0,40.0,1.0,614.0,91.3,...,0.0,0.0,581.0,1108.0,10.9,9.7,341.0,48.0,115.0,3.3
1,Jerk Tofu & Roasted Plantain Bowls,https://minimalistbaker.com/jerk-tofu-roasted-...,"[DIY Jamaican Jerk Seasoning, olive oil, tamar...",9.0,10.0,35.0,45.0,1.0,606.0,88.1,...,0.0,0.0,1436.0,1297.0,10.4,52.2,638.0,93.0,210.0,6.0
2,Easy Mango Cucumber Salad,https://minimalistbaker.com/easy-mango-cucumbe...,"[chopped ripe mango, cut into small cubes, cuc...",2.0,10.0,,10.0,1.0,108.0,26.7,...,0.0,0.0,298.0,450.0,3.7,22.1,328.0,62.0,33.0,0.6
3,No-Bake Vegan Chocolate Cheesecake Cups (5 Min...,https://minimalistbaker.com/vegan-chocolate-ch...,"[Oreo-style cookies, raw cashews, plain, unswe...",8.0,5.0,,5.0,1.0,334.0,29.3,...,0.0,0.0,27.0,424.0,3.0,17.2,0.0,0.8,39.0,2.4
4,Cucumber Lime Agua Fresca,https://minimalistbaker.com/cucumber-lime-agua...,"[water, cucumbers, roughly chopped , limes, ju...",3.0,10.0,,10.0,1.0,53.0,13.2,...,0.0,0.0,3.0,177.0,1.3,9.2,23.0,9.0,19.0,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1682,Cold Brew Mocha Frappe,https://minimalistbaker.com/cold-brew-mocha-fr...,"[cold brew coffee, ice cubes, light chocolate ...",3.0,10.0,,10.0,1.0,174.0,23.5,...,,,,,0.0,22.0,,,,
1683,French Toast Sugar Cookies,https://minimalistbaker.com/french-toast-sugar...,"[butter, sugar, egg, all-purpose flour, baking...",7.0,15.0,10.0,25.0,1.0,158.0,26.0,...,,,,,0.0,18.2,,,,
1684,Watermelon Limeade Cocktails,https://minimalistbaker.com/watermelon-limeade...,"[seedless watermelon, limes*, white rum , suga...",6.0,10.0,,10.0,1.0,147.0,31.1,...,0.0,0.0,,,2.5,23.1,,,,
1685,Strawberry Danish French Toast,https://minimalistbaker.com/strawberry-danish-...,"[wheat bread, egg, white, milk , vanilla, cinn...",5.0,5.0,5.0,10.0,0.0,356.0,38.0,...,,,,,5.0,16.0,,,,


## Combine

In [None]:

just_one_cookbook_df = pd.read_pickle("just_one_cookbook_recipes.pkl")
love_and_lemons_df = pd.read_pickle("love_and_lemons_recipes.pkl")
minimalist_baker_df = pd.read_pickle("minimalist_baker_recipes.pkl")
natashas_kitchen_df = pd.read_pickle("natashas_kitchen_recipes.pkl")
recipe_tin_eats_df = pd.read_pickle("recipe_tin_eats_recipes.pkl")
woks_of_life_df = pd.read_pickle("woks_of_life_recipes.pkl")

complete_cookbook = pd.concat([just_one_cookbook_df, love_and_lemons_df, minimalist_baker_df, natashas_kitchen_df, recipe_tin_eats_df, woks_of_life_df], axis=0)

# Set protocol=4 when saving for compability
# complete_cookbook.to_pickle("complete_cookbook.pkl")
complete_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe 豚骨ラーメン,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken (Video),https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles (Video) 手打ちうどん,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Carrot Ginger Dressing 人参ドレッシング,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,
4,Tomato Egg Vermicelli Soup (Video),https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168,Scallion Ginger Shrimp Recipe (Redux!),https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
1169,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
1170,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
1171,Cantonese Chicken & Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


## Testing

In [226]:
recipe_url = "https://www.gimmesomeoven.com/chimichurri-steak-bites/#tasty-recipes-89201"

recipe_title = "a" # Adjust

recipes = pd.DataFrame([])

response = requests.get(recipe_url)
print(response.status_code)
if response.status_code != 200:
    print(f"Failed to retrieve recipe page: {recipe_url}")
    # continue

# Process the recipe page
print(f"Processing recipe: {recipe_url}")
recipe_soup = BeautifulSoup(response.text, 'html.parser')

# Extract ingredients information
ingredients = [id.text for id in recipe_soup.find_all("span", attrs={"class": "wprm-recipe-ingredient-name"})]

if not ingredients:
    print(f"    No ingredients found for recipe")
    # continue

# Extract nutrition information
nutrition_df = extract_nutrition(recipe_soup)

# Extract instructions information
total_steps = extract_instructions(recipe_soup)

# Extract time information
compiled_times = extract_times(recipe_soup)

# Combine nutrition data with the recipe data
combined_recipe = pd.concat([pd.DataFrame({"recipe_title": [recipe_title], "recipe_url": [recipe_url], "ingredients": [ingredients], "num_steps": [total_steps]}), compiled_times, nutrition_df], axis=1)
recipes = pd.concat([recipes, combined_recipe], ignore_index=True)

recipes

403
Failed to retrieve recipe page: https://www.gimmesomeoven.com/chimichurri-steak-bites/#tasty-recipes-89201
Processing recipe: https://www.gimmesomeoven.com/chimichurri-steak-bites/#tasty-recipes-89201
    No ingredients found for recipe
    No instructions found for recipe
    No time info found for recipe


Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps
0,a,https://www.gimmesomeoven.com/chimichurri-stea...,[],0
