In [2]:
import numpy as np
import pandas as pd
import joblib

Goal: Get ingredients

List of fried chicken recipes:

# Option 1: Use previously scraped data

In [3]:
# Define path to processed data
data_file = "../11_raw_data/final_processed_df.csv"

In [4]:
df = pd.read_csv(data_file)

In [5]:
df.shape

(33691, 31)

In [7]:
df.head(1).T

Unnamed: 0,0
recipe_title,Corned Beef Roast
recipe_title_wc,3
average_rating,4.4
number_of_ratings,68.0
description,This corned beef roast is easy to prepare and ...
description_wc,21
additional_description,['Preheat the oven to 300 degrees F (150 degre...
additional_description_wc,81
description_flavour_text,Corned beef roast cooked in the oven for five ...
description_flavour_text_wc,26


In [12]:
search_terms = "|".join(["fried","chicken"])
cond = df["recipe_title"].str.contains(search_terms)
df.loc[cond,"recipe_title"]

529                           Quick and Easy Refried Beans
1062                        Easy Nachos with Refried Beans
1080                     Chicken Nachos with Refried Beans
4938                   Instant Pot® Charro (Refried Beans)
6876                    Refried Bean and Cheese Enchiladas
7580                       Refried Beans Without the Refry
8160                           Taco Dip with Refried Beans
14367                            Chef John's Refried Beans
15096                           Instant Pot® Refried Beans
15114                             Vegetarian Refried Beans
20188                           Spicy Cheesy Refried Beans
23517                               Fat Free Refried Beans
23763                                   Best Refried Beans
24420                                    Refried Bean Soup
25019    Pan-fried Polenta with Corn, Kale and Goat Cheese
25598                                  Vegan Refried Beans
28922                                  Refried Black Bea

Filtering existing data revealed no fried chicken recipes in existing data. Thus, option 2, scraping, is necessary.

# Option 2: Scraping directly from Allrecipes

In [47]:
import re
# Package for scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import time

## Construct Query URL

Copy and paste sample site search for visual comparison:

- https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=0&q=fried%20chicken
- https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=24&q=fried%20chicken
- https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=48&q=fried%20chicken

Each search result page contains 24 result tiles. Thus, adding 24 to the offset in a loop will allows us to scrape all recipes related to fried chicken.

In [23]:
# This is the search term users will enter to scrape recipes related to fried chicken
search_term = "fried chicken"

# Replace white space with encoding
search_term = re.sub(" ","%20",search_term)

# First page of search results can be gathered from offset = 0
base_offset = 0

# Construct query
query_url = f"https://www.allrecipes.com/search?{search_term}={search_term}&offset={base_offset}&q={search_term}"

In [24]:
# Check constructed query with query from actual site
query_from_site = "https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=0&q=fried%20chicken"

if query_url == query_from_site:
    print(f"Constructed query matches.")
else:
    print(f"Constructed query mismatch.")

Constructed query matches.


## Scraping recipe URLs from Allrecipes.com search result

In [27]:
# Define default options for Selenium webdriver
chrome_options = webdriver.ChromeOptions()

# Define command_executor, access this through localhost:4444
command_executor = "http://172.21.0.2:4444"

In [28]:
# Initiate webdriver, with command executor found within the Selenium Grid docker container
# For this line to work, the `selenium/standalone-chrome:118.0` docker image
driver = webdriver.Remote(
    command_executor = command_executor,
    options          = chrome_options
)

In [29]:
# Use the driver to scrape the url
driver.get(query_url)

In [30]:
# Extract the text of the response into a variable
html = driver.page_source

# Parse the response text using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Exit the driver
driver.quit()

In [41]:
# Extract and print URLs identified using class found through element inspector
for a in soup.find_all("a", {"class":"comp mntl-card-list-items mntl-document-card mntl-card card card--no-image"}):
    print(a.get("href"))

https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/
https://www.allrecipes.com/recipe/8841/oven-fried-chicken-ii/
https://www.allrecipes.com/recipe/89268/triple-dipped-fried-chicken/
https://www.allrecipes.com/recipe/220128/chef-johns-buttermilk-fried-chicken/
https://www.allrecipes.com/recipe/150306/the-best-chicken-fried-steak/
https://www.allrecipes.com/recipe/241152/fried-chicken-wings/
https://www.allrecipes.com/recipe/8970/millie-pasquinellis-fried-chicken/
https://www.allrecipes.com/recipe/16573/chicken-fried-chicken/
https://www.allrecipes.com/recipe/8635/southern-fried-chicken/
https://www.allrecipes.com/recipe/238844/korean-fried-chicken/
https://www.allrecipes.com/recipe/24778/better-than-best-fried-chicken/
https://www.allrecipes.com/recipe/159972/tender-pan-fried-chicken-breasts/
https://www.allrecipes.com/article/how-to-reheat-fried-chicken/
https://www.allrecipes.com/recipe/55867/baked-bbq-fried-chicken/
https://www.allrecipes.com/recipe/86047/garlic-chicken-fri

Defining a `class` while extracting recipe URLs prevents any URLs that are not related to the search term from appearing when extracting URLs from the BeautifulSoup object.
Now to write a loop to repeat the URL scraping process but for all search results pages:

In [48]:
# Initiate a blank list containing the base seach result page
search_result_pages = [query_url]

# Gather the number of search result pages
for a in soup.find_all("a", {"class":"button--outlined-little-round type--rabbit-bold"}):
    result_page = a.get("href")

    # Replace whitespace with encoding
    result_page = re.sub(" ", "%20",result_page)
    
    # Append processed search result pages
    search_result_pages.append(result_page)

# Initiate blank list to store URLs
recipe_url_list = []

# For each search result page, extract and append recipe urls
for index, result_page in enumerate(search_result_pages):
    
    # Define default options for Selenium webdriver
    chrome_options = webdriver.ChromeOptions()
    
    # Define command_executor, access this through localhost:4444
    command_executor = "http://172.21.0.2:4444"

    # Initiate webdriver, with command executor found within the Selenium Grid docker container
    driver = webdriver.Remote(
        command_executor = command_executor,
        options          = chrome_options
    )

    # Use the driver to scrape the url
    driver.get(result_page)

    # Extract the text of the response into a variable
    html = driver.page_source
    
    # Parse the response text using Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Exit the driver
    driver.quit()

    # Extract and print URLs identified using class found through element inspector
    for a in soup.find_all("a", {"class":"comp mntl-card-list-items mntl-document-card mntl-card card card--no-image"}):
        recipe_url = a.get("href")
        recipe_url_list.append(recipe_url)

    time.sleep(5)
    print(f"Completed {index + 1} of {len(search_result_pages)}.", end = "\r")

Completed 5 of 5.

Examine the results:

In [49]:
len(recipe_url_list)

120

In [50]:
recipe_url_list[0:5]

['https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/',
 'https://www.allrecipes.com/recipe/8841/oven-fried-chicken-ii/',
 'https://www.allrecipes.com/recipe/89268/triple-dipped-fried-chicken/',
 'https://www.allrecipes.com/recipe/220128/chef-johns-buttermilk-fried-chicken/',
 'https://www.allrecipes.com/recipe/150306/the-best-chicken-fried-steak/']

In [52]:
# Save the scraped data
dict = {"recipe_url":recipe_url_list}
df = pd.DataFrame(dict)
df.to_csv(f"../11_raw_data/{time.strftime('%Y%m%d-%H%M')}_scraped_recipe_url.csv")

# Part 2: Extract Material Data from Each URL

Sample extracting information from 1 URL first.

In [56]:
# Define sample URL
sample_url = df.loc[0,"recipe_url"]

In [61]:
sample_url

'https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/'

In [62]:
# Define default options for Selenium webdriver
chrome_options = webdriver.ChromeOptions()

# Define command_executor, access this through localhost:4444
command_executor = "http://172.21.0.2:4444"

# Initiate webdriver, with command executor found within the Selenium Grid docker container
driver = webdriver.Remote(
    command_executor = command_executor,
    options          = chrome_options
)

# Use the driver to scrape the url
driver.get(sample_url)

# Extract the text of the response into a variable
html = driver.page_source

# Parse the response text using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Exit the driver
driver.quit()

KeyboardInterrupt: 

In [60]:
driver.quit()

In [23]:
# Initiate a blank dictionary to store values
temp_dict = dict()

# column 00: url, the url of the recipe
try:
    temp_dict.update({"recipe_url": url})
except:
    temp_dict.update({"recipe_url": np.NaN})

# column 01: title, the title of the recipe
try:
    temp_dict.update({"title":
                      soup.find("h1", {"id": re.compile("^article-heading_*")}).get_text().strip(' \t\n\r')
                     })
except:
    temp_dict.update({"title": np.NaN})
    
# column 02: image, any image urls found within the recipes
try:
    t_main_img = [img.get("src") for img in soup.find("div", {"class": "loc article-content"}).find_all("img") if img.get("src") != ""]
    t_sub_img = [img.get("data-src") for img in soup.find("div", {"class": "loc article-content"}).find_all("img") if img.get("data-src") != None]
    t_img = list((set(t_main_img+t_sub_img)))
    temp_dict.update({"image":t_img})
except:
    temp_dict.update({"image": np.NaN})

# column 03: rating_average, the target feature
try:
    temp_dict.update({"rating_average":
                      float(soup.find("div", {"id": re.compile("mntl-recipe-review-bar__rating_*")}).get_text().strip(' \t\n\r'))
                     })
except:
    temp_dict.update({"rating_average": np.NaN})
    
# column 04: rating_count, the number of ratings for the recipe
try:
    temp_dict.update({"rating_count":
                      soup.find("div", {"id": re.compile("^mntl-recipe-review-bar__rating-count_*")}).get_text().strip(' \t\n\r()')
                     })
except:
    temp_dict.update({"rating_count": np.NaN})
    
# column 05: review_count, the number of reviews for the recipe
try:
    temp_dict.update({"review_count":
                     soup.find("div", {"id": re.compile("^mntl-recipe-review-bar__comment-count_*")}).get_text().strip(' \t\n\r()')
                     })
except:
    temp_dict.update({"review_count": np.NaN})
    
# column 06: description, the description section beneath each title of the recipe
try:
    temp_dict.update({"description":
                     soup.find("p", {"id" : re.compile("^article-subheading_*")}).get_text().strip(' \t\n\r')
                     })
except:
    temp_dict.update({"description": np.NaN})
    
# column 07: update_date, the last date of update for the recipe
try:
    temp_dict.update({"update_date":
                     soup.find_all("div", {"class": re.compile("^mntl-attribution__item-date*")})[0].get_text()
                     })
except:
    temp_dict.update({"update_date": np.NaN})    

# column 08: ingredient, a list of ingredients and their amounts
try:
    temp_dict.update({"ingredient":
                     [li.get_text().strip(' \t\n\r') for li in soup.find("div", {"id": re.compile("^mntl-structured-ingredients_*")}).find_all("li")]
                     })
except:
    temp_dict.update({"ingredient": np.NaN})
    
# column 09: direction, a list of cooking directions or instructions
try:
    temp_dict.update({"direction":
                      [li.get_text().strip(' \t\n\r') for li in soup.find("div", {"id": re.compile("^recipe__steps-content_*")}).find_all("li")]
                     })
except:
    temp_dict.update({"direction": np.NaN})
    
# column 10: nutrition_summary, a dictionary of nutritional information summary
try:
    tag = soup.find("div", {"id": re.compile("^mntl-nutrition-facts-summary_*")})
    
    t_value = [line.get_text() for line in tag.find_all("td",{"class":"mntl-nutrition-facts-summary__table-cell type--dog-bold"})]
    header_1 = [line.get_text() for line in tag.find_all("td",{"class":"mntl-nutrition-facts-summary__table-cell type--dogg"})]
    header_2 = [line.get_text() for line in tag.find_all("td",{"class":"mntl-nutrition-facts-summary__table-cell type--dog"})]
    t_header = header_1+header_2
    
    temp_dict.update({"nutrition_summary":
                      {key:value for (key,value) in zip(t_header,t_value)}
                     })
except:
    temp_dict.update({"nutrition_summary": np.NaN})
    
# column 11: nutrition_detail, a dictionary of detailed nutritional information
try:
    temp_dict.update({"nutrition_detail":
                      pd.read_html(str(soup.find_all("table",{"class": "mntl-nutrition-facts-label__table"})))[0]\
                      .iloc[:,0].to_list()
                     })
except:
    temp_dict.update({"nutrition_detail": np.NaN})

# column 12: time, a dictionary containing time related values in the recipe
try:
    t_value = [div.get_text().strip(' \t\n\r') for div in soup.find("div", {"id": re.compile("^recipe-details_*")}).find_all("div", {"class":re.compile("^mntl-recipe-details__val*")})]
    t_header = [div.get_text().strip(' \t\n\r') for div in soup.find("div", {"id": re.compile("^recipe-details_*")}).find_all("div", {"class":re.compile("^mntl-recipe-details__la*")})]
    temp_dict.update({"time":
                      {key:value for (key,value) in zip(t_header,t_value)}
                     })
except:
    temp_dict.update({"time": np.NaN})    
    
# column 13: label, a list containing the labels or tags associated with the recipe
try:
    temp_dict.update({"label":
                    [label.get_text() for label in soup.find("div", {"class":re.compile("^loc article-header")}).find_all("span",{"class":"link__wrapper"})]
                     })
except:
    temp_dict.update({"label": np.NaN})    
    
# column 14: review_dict, dictionary containing a JSON dictionary of reviews and other data elements of the webpage
try:
    temp_dict.update({"review_dict":
                     ast.literal_eval(
                         soup.find('script',{"class":"comp allrecipes-schema mntl-schema-unified"}).text
                     )})
except:
    temp_dict.update({"review_dict": np.NaN})

# column 15: description_additional, additional description if available for the recipe
try:
    temp_dict.update({"description_additional":
                     [p.get_text().strip(' \t\n\r') for p in soup.find_all('p',{"class":re.compile("^mntl-sc-block*")})]
                     })
except:
    temp_dict.update({"description_additional": np.NaN})
    
# Create a DataFrame with 1 row using the above data scraped into temp_dict
temp_df = pd.DataFrame({k: pd.Series([v]) for k,v in temp_dict.items()})

In [24]:
temp_df

Unnamed: 0,recipe_url,title,image,rating_average,rating_count,review_count,description,update_date,ingredient,direction,nutrition_summary,nutrition_detail,time,label,review_dict,description_additional
0,https://www.allrecipes.com/recipe/263037/insta...,Instant Pot Best Beef Stew,[https://imagesvc.meredithcorp.io/v3/mm/image?...,4.6,209,161 Reviews,"This Instant Pot stew recipe is the ultimate, ...","Updated on February 26, 2023","[1 tablespoon butter, 1 pound beef chuck, cut ...",[Gather all ingredients.\n\n\n\n\n\n\n\n\n\n\n...,"{'Calories': '352', 'Fat': '16g ', 'Carbs': '3...","[% Daily Value *, Total Fat 16g, Saturated Fat...","{'Prep Time:': '20 mins', 'Cook Time:': '55 mi...","[Recipes, Soups, Stews and Chili Recipes, Stew...","[{'@context': 'http://schema.org', '@type': ['...","[Gather all ingredients., Turn on a multi-func..."


In [15]:
# Packages for general data processing
import numpy as np
import pandas as pd
import re
import ast
import sys
import joblib

# Packages from scraping
from selenium import webdriver
from bs4 import BeautifulSoup