In [2]:
import numpy as np
import pandas as pd
import joblib

Goal: Get ingredients

List of fried chicken recipes:

# Option 1: Use previously scraped data

In [3]:
# Define path to processed data
data_file = "../11_raw_data/final_processed_df.csv"

In [4]:
df = pd.read_csv(data_file)

In [5]:
df.shape

(33691, 31)

In [7]:
df.head(1).T

Unnamed: 0,0
recipe_title,Corned Beef Roast
recipe_title_wc,3
average_rating,4.4
number_of_ratings,68.0
description,This corned beef roast is easy to prepare and ...
description_wc,21
additional_description,['Preheat the oven to 300 degrees F (150 degre...
additional_description_wc,81
description_flavour_text,Corned beef roast cooked in the oven for five ...
description_flavour_text_wc,26


In [12]:
search_terms = "|".join(["fried","chicken"])
cond = df["recipe_title"].str.contains(search_terms)
df.loc[cond,"recipe_title"]

529                           Quick and Easy Refried Beans
1062                        Easy Nachos with Refried Beans
1080                     Chicken Nachos with Refried Beans
4938                   Instant Pot® Charro (Refried Beans)
6876                    Refried Bean and Cheese Enchiladas
7580                       Refried Beans Without the Refry
8160                           Taco Dip with Refried Beans
14367                            Chef John's Refried Beans
15096                           Instant Pot® Refried Beans
15114                             Vegetarian Refried Beans
20188                           Spicy Cheesy Refried Beans
23517                               Fat Free Refried Beans
23763                                   Best Refried Beans
24420                                    Refried Bean Soup
25019    Pan-fried Polenta with Corn, Kale and Goat Cheese
25598                                  Vegan Refried Beans
28922                                  Refried Black Bea

Filtering existing data revealed no fried chicken recipes in existing data. Thus, option 2, scraping, is necessary.

# Option 2: Scraping directly from Allrecipes

In [47]:
import re
# Package for scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import time

## Construct Query URL

Copy and paste sample site search for visual comparison:

- https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=0&q=fried%20chicken
- https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=24&q=fried%20chicken
- https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=48&q=fried%20chicken

Each search result page contains 24 result tiles. Thus, adding 24 to the offset in a loop will allows us to scrape all recipes related to fried chicken.

In [23]:
# This is the search term users will enter to scrape recipes related to fried chicken
search_term = "fried chicken"

# Replace white space with encoding
search_term = re.sub(" ","%20",search_term)

# First page of search results can be gathered from offset = 0
base_offset = 0

# Construct query
query_url = f"https://www.allrecipes.com/search?{search_term}={search_term}&offset={base_offset}&q={search_term}"

In [24]:
# Check constructed query with query from actual site
query_from_site = "https://www.allrecipes.com/search?fried%20chicken=fried%20chicken&offset=0&q=fried%20chicken"

if query_url == query_from_site:
    print(f"Constructed query matches.")
else:
    print(f"Constructed query mismatch.")

Constructed query matches.


## Scraping recipe URLs from Allrecipes.com search result

In [27]:
# Define default options for Selenium webdriver
chrome_options = webdriver.ChromeOptions()

# Define command_executor, access this through localhost:4444
command_executor = "http://172.21.0.2:4444"

In [28]:
# Initiate webdriver, with command executor found within the Selenium Grid docker container
# For this line to work, the `selenium/standalone-chrome:118.0` docker image
driver = webdriver.Remote(
    command_executor = command_executor,
    options          = chrome_options
)

In [29]:
# Use the driver to scrape the url
driver.get(query_url)

In [30]:
# Extract the text of the response into a variable
html = driver.page_source

# Parse the response text using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Exit the driver
driver.quit()

In [41]:
# Extract and print URLs identified using class found through element inspector
for a in soup.find_all("a", {"class":"comp mntl-card-list-items mntl-document-card mntl-card card card--no-image"}):
    print(a.get("href"))

https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/
https://www.allrecipes.com/recipe/8841/oven-fried-chicken-ii/
https://www.allrecipes.com/recipe/89268/triple-dipped-fried-chicken/
https://www.allrecipes.com/recipe/220128/chef-johns-buttermilk-fried-chicken/
https://www.allrecipes.com/recipe/150306/the-best-chicken-fried-steak/
https://www.allrecipes.com/recipe/241152/fried-chicken-wings/
https://www.allrecipes.com/recipe/8970/millie-pasquinellis-fried-chicken/
https://www.allrecipes.com/recipe/16573/chicken-fried-chicken/
https://www.allrecipes.com/recipe/8635/southern-fried-chicken/
https://www.allrecipes.com/recipe/238844/korean-fried-chicken/
https://www.allrecipes.com/recipe/24778/better-than-best-fried-chicken/
https://www.allrecipes.com/recipe/159972/tender-pan-fried-chicken-breasts/
https://www.allrecipes.com/article/how-to-reheat-fried-chicken/
https://www.allrecipes.com/recipe/55867/baked-bbq-fried-chicken/
https://www.allrecipes.com/recipe/86047/garlic-chicken-fri

Defining a `class` while extracting recipe URLs prevents any URLs that are not related to the search term from appearing when extracting URLs from the BeautifulSoup object.
Now to write a loop to repeat the URL scraping process but for all search results pages:

In [48]:
# Initiate a blank list containing the base seach result page
search_result_pages = [query_url]

# Gather the number of search result pages
for a in soup.find_all("a", {"class":"button--outlined-little-round type--rabbit-bold"}):
    result_page = a.get("href")

    # Replace whitespace with encoding
    result_page = re.sub(" ", "%20",result_page)
    
    # Append processed search result pages
    search_result_pages.append(result_page)

# Initiate blank list to store URLs
recipe_url_list = []

# For each search result page, extract and append recipe urls
for index, result_page in enumerate(search_result_pages):
    
    # Define default options for Selenium webdriver
    chrome_options = webdriver.ChromeOptions()
    
    # Define command_executor, access this through localhost:4444
    command_executor = "http://172.21.0.2:4444"

    # Initiate webdriver, with command executor found within the Selenium Grid docker container
    driver = webdriver.Remote(
        command_executor = command_executor,
        options          = chrome_options
    )

    # Use the driver to scrape the url
    driver.get(result_page)

    # Extract the text of the response into a variable
    html = driver.page_source
    
    # Parse the response text using Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Exit the driver
    driver.quit()

    # Extract and print URLs identified using class found through element inspector
    for a in soup.find_all("a", {"class":"comp mntl-card-list-items mntl-document-card mntl-card card card--no-image"}):
        recipe_url = a.get("href")
        recipe_url_list.append(recipe_url)

    time.sleep(5)
    print(f"Completed {index + 1} of {len(search_result_pages)}.", end = "\r")

Completed 5 of 5.

Examine the results:

In [49]:
len(recipe_url_list)

120

In [50]:
recipe_url_list[0:5]

['https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/',
 'https://www.allrecipes.com/recipe/8841/oven-fried-chicken-ii/',
 'https://www.allrecipes.com/recipe/89268/triple-dipped-fried-chicken/',
 'https://www.allrecipes.com/recipe/220128/chef-johns-buttermilk-fried-chicken/',
 'https://www.allrecipes.com/recipe/150306/the-best-chicken-fried-steak/']

In [52]:
# Save the scraped data
dict = {"recipe_url":recipe_url_list}
df = pd.DataFrame(dict)
df.to_csv(f"../11_raw_data/{time.strftime('%Y%m%d-%H%M')}_scraped_recipe_url.csv")

# Part 2: Extract Material Data from Each URL

In [54]:
import re
# Package for scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import json

import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv("../11_raw_data/20231031-2020_scraped_recipe_url.csv", index_col = 0)
df.head()

Unnamed: 0,recipe_url
0,https://www.allrecipes.com/recipe/8805/crispy-...
1,https://www.allrecipes.com/recipe/8841/oven-fr...
2,https://www.allrecipes.com/recipe/89268/triple...
3,https://www.allrecipes.com/recipe/220128/chef-...
4,https://www.allrecipes.com/recipe/150306/the-b...


Sample extracting information from 1 URL first.

In [5]:
# Define sample URL
sample_url = df.loc[0,"recipe_url"]

In [6]:
sample_url

'https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/'

In [7]:
# Define default options for Selenium webdriver
chrome_options = webdriver.ChromeOptions()

# Define command_executor, access this through localhost:4444
command_executor = "http://172.22.0.2:4444"

# Initiate webdriver, with command executor found within the Selenium Grid docker container
driver = webdriver.Remote(
    command_executor = command_executor,
    options          = chrome_options
)

# Use the driver to scrape the url
driver.get(sample_url)

# Extract the text of the response into a variable
html = driver.page_source

# Parse the response text using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Exit the driver
driver.quit()

In [58]:
# Find the tag containing JSON data
tag = soup.find('script',{"class":"comp allrecipes-schema mntl-schema-unified"})

# Convert tag contents in JSON dictionary
tag_json = json.loads(tag.contents[0])[0]

# Examine the JSON dictionary keys available to us
for key in tag_json.keys():
    print(key)

@context
@type
headline
datePublished
dateModified
author
description
image
video
publisher
name
aggregateRating
cookTime
nutrition
prepTime
recipeCategory
recipeCuisine
recipeIngredient
recipeInstructions
recipeYield
totalTime
review
mainEntityOfPage
about


In [71]:
time_related_keys = [key for key in list(tag_json.keys()) if "time" in key.lower()]

keys_of_interest = [
    "name",
    "datePublished",
    "dateModified",
    "author",
    "description",
    "aggregateRating",
    "recipeIngredient"
] + time_related_keys

In [72]:
for key in keys_of_interest:
    print(tag_json[key])

Crispy Fried Chicken
1999-05-21T15:28:05.000-04:00
2023-03-01T18:32:09.024-05:00
[{'@type': 'Person', 'name': 'Anonymous'}]
Discover the technique for making deliciously crispy fried chicken with a crunchy coating on the outside and juicy, tender chicken inside.
{'@type': 'AggregateRating', 'ratingValue': '4.6', 'ratingCount': '743'}
['1 (4 pound) chicken, cut into pieces', '1 cup buttermilk', '2 cups all-purpose flour for coating', '1 teaspoon paprika', 'salt and pepper to taste', '2 quarts vegetable oil for frying']
PT35M
PT15M
PT80M


# Repeat Scrape for Each Recipe URL

In [74]:
# Initiate blank dictionary to store data from each recipe
dict = {
    "recipe_url":[],
    "contents":[]
}

In [75]:
for index, recipe_url in enumerate(df["recipe_url"]):
    # Define default options for Selenium webdriver
    chrome_options = webdriver.ChromeOptions()
    
    # Define command_executor, access this through localhost:4444
    command_executor = "http://172.22.0.2:4444"
    
    # Initiate webdriver, with command executor found within the Selenium Grid docker container
    driver = webdriver.Remote(
        command_executor = command_executor,
        options          = chrome_options
    )
    
    # Use the driver to scrape the url
    driver.get(recipe_url)
    
    # Extract the text of the response into a variable
    html = driver.page_source
    
    # Parse the response text using Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Exit the driver
    driver.quit()

    # Find the tag containing JSON data
    tag = soup.find('script',{"class":"comp allrecipes-schema mntl-schema-unified"})
    
    # Convert tag contents in JSON dictionary
    tag_json = json.loads(tag.contents[0])[0]

    # Extract info for each key
    dict["recipe_url"].append(recipe_url)
    dict["contents"].append(tag_json)

    # Rest before next scrape
    time.sleep(10)

In [77]:
fried_chicken_df = pd.DataFrame(dict)

In [78]:
fried_chicken_df.to_csv(f"../11_raw_data/{time.strftime('%Y%m%d-%H%M')}_scraped_fc_recipes.csv")