Goal: Extract ingredients and amounts from web-scrape

In [20]:
import numpy as np
import pandas as pd
import ast

In [11]:
df = pd.read_csv("../11_raw_data/20231031-2328_scraped_fc_recipes.csv", index_col = 0)

In [12]:
df.head()

Unnamed: 0,recipe_url,contents
0,https://www.allrecipes.com/recipe/8805/crispy-...,"{'@context': 'http://schema.org', '@type': ['R..."
1,https://www.allrecipes.com/recipe/8841/oven-fr...,"{'@context': 'http://schema.org', '@type': ['R..."
2,https://www.allrecipes.com/recipe/89268/triple...,"{'@context': 'http://schema.org', '@type': ['R..."
3,https://www.allrecipes.com/recipe/220128/chef-...,"{'@context': 'http://schema.org', '@type': ['R..."
4,https://www.allrecipes.com/recipe/150306/the-b...,"{'@context': 'http://schema.org', '@type': ['R..."


In [13]:
df = df.loc[[0,3],:]

In [14]:
df.head()

Unnamed: 0,recipe_url,contents
0,https://www.allrecipes.com/recipe/8805/crispy-...,"{'@context': 'http://schema.org', '@type': ['R..."
3,https://www.allrecipes.com/recipe/220128/chef-...,"{'@context': 'http://schema.org', '@type': ['R..."


In [22]:
ast.literal_eval(df.loc[0,"contents"]).keys()

dict_keys(['@context', '@type', 'headline', 'datePublished', 'dateModified', 'author', 'description', 'image', 'video', 'publisher', 'name', 'aggregateRating', 'cookTime', 'nutrition', 'prepTime', 'recipeCategory', 'recipeCuisine', 'recipeIngredient', 'recipeInstructions', 'recipeYield', 'totalTime', 'review', 'mainEntityOfPage', 'about'])

In [24]:
ast.literal_eval(df.loc[3,"contents"])["recipeIngredient"]

['1 (3 1/2) pound chicken, cut into 8 pieces',
 '1 teaspoon black pepper',
 '1 teaspoon salt',
 '1 teaspoon paprika',
 '0.5 teaspoon white pepper',
 '0.25 teaspoon dried rosemary',
 '0.25 teaspoon ground thyme',
 '0.25 teaspoon dried oregano',
 '0.25 teaspoon dried sage',
 '0.25 teaspoon cayenne pepper',
 '2 cups buttermilk',
 '2 cups flour',
 '1 teaspoon salt',
 '0.5 teaspoon paprika',
 '0.5 teaspoon cayenne pepper',
 '0.5 teaspoon garlic powder',
 '0.5 teaspoon white pepper',
 '0.5 teaspoon onion powder',
 '2.5 quarts peanut oil for frying']

Observing Chef John's Buttermilk Fried Chicken recipe's ingredient list, there appears to be no systematic way of separating the ingredients into batter and just the chicken. Thus, the ingredient amounts were aggregated by taking the sum of each ingredient.

## Extract Ingredient Amounts

In [45]:
# Store ingredients in a list
ing_list = ast.literal_eval(df.loc[3,"contents"])["recipeIngredient"]

for ing in ing_list:
    # print(str.split(ing," ")[0])
    print(str.split(ing," ")[0])

1
1
1
1
0.5
0.25
0.25
0.25
0.25
0.25
2
2
1
0.5
0.5
0.5
0.5
0.5
2.5


Ingredient amounts were extracted in decimal form.

## Extract Ingrdient Unit of Measurement

In [37]:
# Packages for pre-processing text
import nltk                       # Natural Language Tool Kit
nltk.download('stopwords')        # For processing stop words (words too common to hold significant meaning)
from nltk.corpus import stopwords # Import above downloaded stopwords
import re                         # Regular Expression
import string                     # For identifying punctuation

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [46]:
# Define common American measurements
measurements = [
    "teaspoon", 
    "tablespoon",
    "cup",
    "quart",
    "pound",
    "ounce"
]

# Define stopwords
eng_stopwords = stopwords.words("english")

# Define a stemmer
stemmer = nltk.stem.PorterStemmer()

for ing in ing_list:
    tokens = ing.split(" ")
    stemmed_tokens = []

    for token in tokens[1:]:
        if (not token in eng_stopwords) and token != "":
            stemmed_tokens.append(stemmer.stem(token))

    # print(stemmed_tokens)
    print([token for token in stemmed_tokens if token in measurements])

['pound']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['cup']
['cup']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['quart']


Measurements were successfully extracted.

## Extract Ingredient

In [47]:
# Define common American measurements
measurements = [
    "teaspoon", 
    "tablespoon",
    "cup",
    "quart",
    "pound",
    "ounce"
]

# Define stopwords
eng_stopwords = stopwords.words("english")

# Define a stemmer
stemmer = nltk.stem.PorterStemmer()

for ing in ing_list:
    tokens = ing.split(" ")
    stemmed_tokens = []

    for token in tokens[1:]:
        if (not token in eng_stopwords) and token != "":
            stemmed_tokens.append(stemmer.stem(token))

    # print(stemmed_tokens)
    print(stemmed_tokens)
    # print([token for token in stemmed_tokens if token in measurements])

['(3', '1/2)', 'pound', 'chicken,', 'cut', '8', 'piec']
['teaspoon', 'black', 'pepper']
['teaspoon', 'salt']
['teaspoon', 'paprika']
['teaspoon', 'white', 'pepper']
['teaspoon', 'dri', 'rosemari']
['teaspoon', 'ground', 'thyme']
['teaspoon', 'dri', 'oregano']
['teaspoon', 'dri', 'sage']
['teaspoon', 'cayenn', 'pepper']
['cup', 'buttermilk']
['cup', 'flour']
['teaspoon', 'salt']
['teaspoon', 'paprika']
['teaspoon', 'cayenn', 'pepper']
['teaspoon', 'garlic', 'powder']
['teaspoon', 'white', 'pepper']
['teaspoon', 'onion', 'powder']
['quart', 'peanut', 'oil', 'fri']


In [76]:
# Define common American measurements
ingredient_list = [
    "chicken",
    "cayenne",
    "paprika",
    "rosemary",
    "thyme",
    "oregano",
    "sage",
    "buttermilk",
    "salt",
    "flour",
    "onion",
    "garlic",
    "vegetable",
    "peanut",
    "coconut",
    "white",
    "black"
]

# Define stopwords
eng_stopwords = stopwords.words("english")

# Define a stemmer
stemmer = nltk.stem.PorterStemmer()

for ing in ing_list:
    for punctuation_mark in string.punctuation:
        ing = ing.replace(punctuation_mark,"").lower()
    
    tokens = ing.split(" ")
    stemmed_tokens = []

    # for token in tokens[1:]:
    #     if (not token in eng_stopwords) and token != "":
    #         stemmed_tokens.append(stemmer.stem(token))

    # print(stemmed_tokens)
    # print(stemmed_tokens)
    print([token for token in tokens if token in ingredient_list])

['chicken']
['black']
['salt']
['paprika']
['white']
['rosemary']
['thyme']
['oregano']
['sage']
['cayenne']
['buttermilk']
['flour']
['salt']
['paprika']
['cayenne']
['garlic']
['white']
['onion']
['peanut']


In [78]:
for ing in ing_list:

    # remove punctuation and take lower case
    for punctuation_mark in string.punctuation:
        ing = ing.replace(punctuation_mark,"").lower()

    tokens = ing.split(" ")
    extracted_ingredients = []
    
    if "pepper" in tokens:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token + " pepper")
    elif "powder" in tokens:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token + " powder")
    elif "oil" in tokens:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token + " oil")
    else:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token)
    print(extracted_ingredients)

['chicken']
['black pepper']
['salt']
['paprika']
['white pepper']
['rosemary']
['thyme']
['oregano']
['sage']
['cayenne pepper']
['buttermilk']
['flour']
['salt']
['paprika']
['cayenne pepper']
['garlic powder']
['white pepper']
['onion powder']
['peanut oil']


## Combining Ingredient Amounts, UoM and Ingredients

In [81]:
df

Unnamed: 0,recipe_url,contents
0,https://www.allrecipes.com/recipe/8805/crispy-...,"{'@context': 'http://schema.org', '@type': ['R..."
3,https://www.allrecipes.com/recipe/220128/chef-...,"{'@context': 'http://schema.org', '@type': ['R..."


In [119]:
# Initiate blank dictionary to store ingredients
dict = {
    "recipe_name":[],
    "ing_amt":[],
    "ing_uom":[],
    "ing_name":[]
}

for index, row in enumerate(zip(df["recipe_url"],df["contents"])):

    # Extract ingredients into a list from JSON dictionary
    ing_list = ast.literal_eval(row[1])["recipeIngredient"]
    recipe_name = ast.literal_eval(row[1])["name"]
        
    for ing in ing_list:
        dict["recipe_name"].append(recipe_name)
        
        # Extract ingredient amounts
        try:
            dict["ing_amt"].append(float(ing.split(" ")[0]))
        except:
            dict["ing_amt"].append(np.NaN)
        
        # remove punctuation and take lower case
        for punctuation_mark in string.punctuation:
            ing = ing.replace(punctuation_mark,"").lower()
        
        tokens = ing.split(" ")
        stemmed_tokens = []

        for token in tokens[1:]:
            if (not token in eng_stopwords) and token != "":
                stemmed_tokens.append(stemmer.stem(token))

        # Extract ingredient UoM
        try:
            dict["ing_uom"].append([uom for uom in stemmed_tokens if uom in measurements][0])
        except:
            dict["ing_uom"].append(np.NaN)

        # Extract ingredient name
        if "pepper" in tokens:
            dict["ing_name"].append([name + " pepper" for name in tokens if name in ingredient_list][0])
        elif "powder" in tokens:
            dict["ing_name"].append([name + " powder" for name in tokens if name in ingredient_list][0])
        elif "oil" in tokens:
            dict["ing_name"].append([name + " oil" for name in tokens if name in ingredient_list][0])
        else:
            dict["ing_name"].append([name for name in tokens if name in ingredient_list][0])

    
    
    # print(ing_list)

In [126]:
ing_df = pd.DataFrame(dict)
ing_df.head()

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name
0,Crispy Fried Chicken,1.0,pound,chicken
1,Crispy Fried Chicken,1.0,cup,buttermilk
2,Crispy Fried Chicken,2.0,cup,flour
3,Crispy Fried Chicken,1.0,teaspoon,paprika
4,Crispy Fried Chicken,,,salt pepper


Walah, the ingredients are extracted into a DataFrame where each row represents 1 ingredient with its amounts and unit of measurements.

## Unit Conversion

In [124]:
# metric conversions
conversion_rate = {
    # units regarding mass (metric unit gram)
    "pound"     : 453.59233, # https://www.metric-conversions.org/weight/pounds-to-grams.htm

    # units regarding volume (metric unit mL or cm3)
    "teaspoon"  : 4.9289215, # https://www.metric-conversions.org/volume/us-teaspoons-to-milliliters.htm#metricConversionTable?val=1
    "tablespoon": 14.786765, # https://www.metric-conversions.org/volume/us-tablespoons-to-milliliters.htm
    "quart"     : 946.35295, # https://www.metric-conversions.org/volume/us-liquid-quarts-to-milliliters.htm
    "cup"       : 236.58824  # https://www.metric-conversions.org/volume/us-cups-to-milliliters.htm 
}

In [131]:
# metric conversions
metric_uom = {
    # units regarding mass (metric unit gram)
    "pound"     : "g", # https://www.metric-conversions.org/weight/pounds-to-grams.htm

    # units regarding volume (metric unit mL or cm3)
    "teaspoon"  : "mL", # https://www.metric-conversions.org/volume/us-teaspoons-to-milliliters.htm#metricConversionTable?val=1
    "tablespoon": "mL", # https://www.metric-conversions.org/volume/us-tablespoons-to-milliliters.htm
    "quart"     : "mL", # https://www.metric-conversions.org/volume/us-liquid-quarts-to-milliliters.htm
    "cup"       : "mL"  # https://www.metric-conversions.org/volume/us-cups-to-milliliters.htm 
}

In [132]:
ing_df["ing_amt_metric"] = ing_df["ing_amt"] * ing_df["ing_uom"].map(conversion_rate)
ing_df["ing_uom_metric"] = ing_df["ing_uom"].map(metric_uom)

In [133]:
ing_df

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric
0,Crispy Fried Chicken,1.0,pound,chicken,453.59233,g
1,Crispy Fried Chicken,1.0,cup,buttermilk,236.58824,mL
2,Crispy Fried Chicken,2.0,cup,flour,473.17648,mL
3,Crispy Fried Chicken,1.0,teaspoon,paprika,4.928922,mL
4,Crispy Fried Chicken,,,salt pepper,,
5,Crispy Fried Chicken,2.0,quart,vegetable oil,1892.7059,mL
6,Chef John&#39;s Buttermilk Fried Chicken,1.0,pound,chicken,453.59233,g
7,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,black pepper,4.928922,mL
8,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,salt,4.928922,mL
9,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,paprika,4.928922,mL


Need to fix chicken weight.

## Getting Prices from Walmart

### Wholistic Approach

Go to walmart.com search engine, then extract info from search results.

In [134]:
unique_ing = ing_df["ing_name"].unique()

In [135]:
unique_ing

array(['chicken', 'buttermilk', 'flour', 'paprika', 'salt pepper',
       'vegetable oil', 'black pepper', 'salt', 'white pepper',
       'rosemary', 'thyme', 'oregano', 'sage', 'cayenne pepper',
       'garlic powder', 'onion powder', 'peanut oil'], dtype=object)

In [136]:
import re
# Package for scraping
from selenium import webdriver
from bs4 import BeautifulSoup
import time

In [137]:
# construct query url
search_term = "whole chicken"

# Replace white space with encoding
search_term = re.sub(" ","+",search_term)

# Construct_query
query_url = f"https://www.walmart.ca/en/search?q={search_term}"

In [138]:
query_url

'https://www.walmart.ca/en/search?q=whole+chicken'

In [139]:
# Define default options for Selenium webdriver
chrome_options = webdriver.ChromeOptions()

# Define command_executor, access this through localhost:4444
command_executor = "http://172.18.0.2:4444"

# Initiate webdriver, with command executor found within the Selenium Grid docker container
# For this line to work, the `selenium/standalone-chrome:118.0` docker image
driver = webdriver.Remote(
    command_executor = command_executor,
    options          = chrome_options
)

# Use the driver to scrape the url
driver.get(query_url)

# Extract the text of the response into a variable
html = driver.page_source

# Parse the response text using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Exit the driver
driver.quit()

In [141]:
for tag in soup.find_all("a"):
    print(tag.get("href"))

None
#maincontent
/en
/en
/en/my-items
/en/my-items
/en/sign-in?vid=oaoh
/en/orders
/en/ip/prime-organic-whole-chicken/4P4128A0DK1J?from=/search
/en/ip/mina-halal-whole-chicken/6000200224620?from=/search
/en/ip/your-fresh-market-barbecue-flattened-whole-chicken/6000206064495?from=/search
/en/ip/La-Grille-Montreal-Steak-Spice-Seasoning/206844?from=/search
/en/ip/maple-leaf-whole-chicken-wings/6000201049176?from=/search
/en/ip/Swanson-Meals-Chicken-With-Garlic-Parm-Seasoned-Veg-879GR/0VN5Z2IW4WEC?from=/search
/en/ip/Deli-Express-Buffalo-Chicken-Wings/3SGSBA7CAVO8?from=/search
/en/ip/maple-leaf-natural-selections-shaved-deli-chicken-breast-oven-roasted-family-size/6000192347348?from=/search
/en/ip/PF-Changs-Chicken-Pad-Thai/2WSQE9RINUDS?from=/search
/en/ip/Vendre-Gobblet-Gobblers-1/3N6H94YR9PLH?from=/search
/en/ip/prime-whole-chicken-raised-without-antibiotics/6000199462076?from=/search
/en/ip/maple-leaf-whole-chicken/6000193606661?from=/search
/en/ip/Butterball-Stuffed-Frozen-Turkey-Rais

In [143]:
soup.find_all("div",{"class":"gray mr1 f7 f6-l"})

[<div class="gray mr1 f7 f6-l">$1.12/100g</div>,
 <div class="gray mr1 f7 f6-l">$9.98/1kg</div>,
 <div class="gray mr1 f7 f6-l">$3.18/100g</div>,
 <div class="gray mr1 f7 f6-l">$12.98/1kg</div>,
 <div class="gray mr1 f7 f6-l">$3.73/100g</div>,
 <div class="gray mr1 f7 f6-l">$1.13/100g</div>,
 <div class="gray mr1 f7 f6-l">$9.98/1kg</div>,
 <div class="gray mr1 f7 f6-l">$6.87/1kg</div>,
 <div class="gray mr1 f7 f6-l">$19.77/1kg</div>,
 <div class="gray mr1 f7 f6-l">$6.87/1kg</div>,
 <div class="gray mr1 f7 f6-l">$6.87/1kg</div>]

In [144]:
for tag in soup.find_all("div",{"class":"gray mr1 f7 f6-l"}):
    print(tag.text)

$1.12/100g
$9.98/1kg
$3.18/100g
$12.98/1kg
$3.73/100g
$1.13/100g
$9.98/1kg
$6.87/1kg
$19.77/1kg
$6.87/1kg
$6.87/1kg


In [146]:
for tag in soup.find_all("span",{"class":"normal dark-gray mb0 mt1 lh-title f6 f5-l lh-copy"}):
    print(tag.text)

Prime Organic Whole Chicken, 1 Whole Chicken
Mina Halal Whole Chicken, 1 Whole Chicken, 1.44 - 1.76 kg
Your Fresh Market Barbecue Flattened Whole Chicken, 1 chicken, 1.10 - 1.40 kg
La Grille, Montreal Steak Spice Seasoning, 188g
Maple Leaf Whole Chicken Wings, 10 Wings, 0.81 - 0.99 kg
Swanson Meals Chicken With Garlic Parm Seasoned Veg 879GR, Chicken Garlic Parm Veg 879GR
Deli Express Buffalo Chicken Wings, 505 g
Maple Leaf Natural Selections Shaved Deli Chicken Breast Oven Roasted Family Size, 375 g
PF Changs Chicken Pad Thai, Pad Thai
Vendre Gobblet Gobblers #1
Prime Whole Chicken Raised Without Antibiotics, 1 Whole Chicken
Maple Leaf Whole Chicken, 1 Whole Chicken, 1.44 - 2.00 kg
Butterball Stuffed Frozen Turkey - Raised Without Antibiotics, 5-7 kg, 5.00 - 7.00 kg
Butterball Easy Fresh Turkey Breast Roast, Turkey Breast Roast
Butterball Seasoned Frozen Young Turkey, 5-7 kg, 5.00 - 7.00 kg
Butterball Stuffed Frozen Young Turkey - Raised without Antibiotics., 7-9 kg, 7.00 - 9.00 kg
Bu

In [147]:
for tag in soup.find_all("div",{"class":"mr1 mr2-xl b black lh-copy f5 f4-l"}):
    print(tag.text)

$17
$17.56
$20.27
$5.97
$12.85
$11.97
$9.47
$13.97
$6.98
$21.01


In [149]:
dict = {
    "prod_name": [],
    "prod_price":[],
    "prod_unit_price":[]
}

soups = [soup]

for soup in soups:
    # Extract prod_name
    for tag in soup.find_all("span",{"class":"normal dark-gray mb0 mt1 lh-title f6 f5-l lh-copy"}):
        dict["prod_name"].append(tag.text)

    # Extract prod_price
    for tag in soup.find_all("span",{"class":"normal dark-gray mb0 mt1 lh-title f6 f5-l lh-copy"}):
        dict["prod_price"].append(tag.text)

    # Extact prod_unit_price
    for tag in soup.find_all("div",{"class":"gray mr1 f7 f6-l"}):
        dict["prod_unit_price"].append(tag.text)


In [151]:
pd.DataFrame(dict)

ValueError: All arrays must be of the same length

In [153]:
len(dict["prod_name"])

17

In [154]:
len(dict["prod_price"])

17

In [155]:
len(dict["prod_unit_price"])

11

It seems that not every product has a unit price.

In [159]:
# For each result
len(soup.find_all("div",{"role":"group"}))

17

In [161]:
for tag in soup.find_all("div",{"role":"group"}):
    print(tag.find("span",{"class":"normal dark-gray mb0 mt1 lh-title f6 f5-l lh-copy"}).text)

Prime Organic Whole Chicken, 1 Whole Chicken
Mina Halal Whole Chicken, 1 Whole Chicken, 1.44 - 1.76 kg
Your Fresh Market Barbecue Flattened Whole Chicken, 1 chicken, 1.10 - 1.40 kg
La Grille, Montreal Steak Spice Seasoning, 188g
Maple Leaf Whole Chicken Wings, 10 Wings, 0.81 - 0.99 kg
Swanson Meals Chicken With Garlic Parm Seasoned Veg 879GR, Chicken Garlic Parm Veg 879GR
Deli Express Buffalo Chicken Wings, 505 g
Maple Leaf Natural Selections Shaved Deli Chicken Breast Oven Roasted Family Size, 375 g
PF Changs Chicken Pad Thai, Pad Thai
Vendre Gobblet Gobblers #1
Prime Whole Chicken Raised Without Antibiotics, 1 Whole Chicken
Maple Leaf Whole Chicken, 1 Whole Chicken, 1.44 - 2.00 kg
Butterball Stuffed Frozen Turkey - Raised Without Antibiotics, 5-7 kg, 5.00 - 7.00 kg
Butterball Easy Fresh Turkey Breast Roast, Turkey Breast Roast
Butterball Seasoned Frozen Young Turkey, 5-7 kg, 5.00 - 7.00 kg
Butterball Stuffed Frozen Young Turkey - Raised without Antibiotics., 7-9 kg, 7.00 - 9.00 kg
Bu

Use search results:

In [175]:
dict = {
    "prod_name": [],
    "prod_price":[],
    "prod_unit_price":[]
}

soups = [soup]

for soup in soups:

    for search_result in soup.find_all("div",{"role":"group"}):
        # Extract prod_name
        try:
            tag = search_result.find("span",{"class":"normal dark-gray mb0 mt1 lh-title f6 f5-l lh-copy"})
            dict["prod_name"].append(tag.text)
        except:
            dict["prod_name"].append(np.NaN)

        # Extract prod_price
        try:
            tag = search_result.find("div",{"class":["mr1 mr2-xl b black lh-copy f5 f4-l","mr1 mr2-xl normal gray lh-copy f5 f4-l"]})
            dict["prod_price"].append(tag.text)
        except:
            dict["prod_price"].append(np.NaN)
    
        # Extact prod_unit_price
        try:
            tag = search_result.find("div",{"class":"gray mr1 f7 f6-l"})
            dict["prod_unit_price"].append(tag.text)
        except:
            dict["prod_unit_price"].append(np.NaN)

In [177]:
chicken_df = pd.DataFrame(dict)

In [178]:
chicken_df.head()

Unnamed: 0,prod_name,prod_price,prod_unit_price
0,"Prime Organic Whole Chicken, 1 Whole Chicken",$17,$1.12/100g
1,"Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...",$17.56,$9.98/1kg
2,Your Fresh Market Barbecue Flattened Whole Chi...,$20.27,
3,"La Grille, Montreal Steak Spice Seasoning, 188g",$5.97,$3.18/100g
4,"Maple Leaf Whole Chicken Wings, 10 Wings, 0.81...",$12.85,$12.98/1kg


In [198]:
# https://blog.finxter.com/python-regex-and-operator-tutorial-video/
cond1 = chicken_df["prod_name"].str.lower().str.contains("(?=.*whole)(?=.*chicken)", regex = True)
cond2 = chicken_df["prod_name"].str.lower().str.contains("wing|breast|thigh")
cond3 = chicken_df["prod_unit_price"].isna()

In [195]:
chicken_df.loc[cond1]

Unnamed: 0,prod_name,prod_price,prod_unit_price
0,"Prime Organic Whole Chicken, 1 Whole Chicken",$17,$1.12/100g
1,"Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...",$17.56,$9.98/1kg
2,Your Fresh Market Barbecue Flattened Whole Chi...,$20.27,
4,"Maple Leaf Whole Chicken Wings, 10 Wings, 0.81...",$12.85,$12.98/1kg
10,Prime Whole Chicken Raised Without Antibiotics...,$17.58,$1.13/100g
11,"Maple Leaf Whole Chicken, 1 Whole Chicken, 1.4...",$19.96,$9.98/1kg


In [199]:
chicken_df.loc[cond1 & ~ cond2 & ~ cond3]

Unnamed: 0,prod_name,prod_price,prod_unit_price
0,"Prime Organic Whole Chicken, 1 Whole Chicken",$17,$1.12/100g
1,"Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...",$17.56,$9.98/1kg
10,Prime Whole Chicken Raised Without Antibiotics...,$17.58,$1.13/100g
11,"Maple Leaf Whole Chicken, 1 Whole Chicken, 1.4...",$19.96,$9.98/1kg


In [193]:
for name in chicken_df.loc[:,"prod_name"]:
    print(name)

Prime Organic Whole Chicken, 1 Whole Chicken
Mina Halal Whole Chicken, 1 Whole Chicken, 1.44 - 1.76 kg
Your Fresh Market Barbecue Flattened Whole Chicken, 1 chicken, 1.10 - 1.40 kg
La Grille, Montreal Steak Spice Seasoning, 188g
Maple Leaf Whole Chicken Wings, 10 Wings, 0.81 - 0.99 kg
Swanson Meals Chicken With Garlic Parm Seasoned Veg 879GR, Chicken Garlic Parm Veg 879GR
Deli Express Buffalo Chicken Wings, 505 g
Maple Leaf Natural Selections Shaved Deli Chicken Breast Oven Roasted Family Size, 375 g
PF Changs Chicken Pad Thai, Pad Thai
Vendre Gobblet Gobblers #1
Prime Whole Chicken Raised Without Antibiotics, 1 Whole Chicken
Maple Leaf Whole Chicken, 1 Whole Chicken, 1.44 - 2.00 kg
Butterball Stuffed Frozen Turkey - Raised Without Antibiotics, 5-7 kg, 5.00 - 7.00 kg
Butterball Easy Fresh Turkey Breast Roast, Turkey Breast Roast
Butterball Seasoned Frozen Young Turkey, 5-7 kg, 5.00 - 7.00 kg
Butterball Stuffed Frozen Young Turkey - Raised without Antibiotics., 7-9 kg, 7.00 - 9.00 kg
Bu

### Direct Approach

Start with a list of links, then just scrape from list of link.

In [203]:
ing_source_df = pd.read_csv("../11_raw_data/20231102-1410_ingredient_source.csv"
        )

In [204]:
ing_source_df

Unnamed: 0,ing_name,ing_url
0,garlic powder,https://www.walmart.ca/en/ip/Great-Value-Garli...
1,black pepper,https://www.walmart.ca/en/ip/Great-Value-Groun...
2,cayenne pepper,https://www.walmart.ca/en/ip/Great-Value-Groun...
3,salt,https://www.walmart.ca/en/ip/Great-Value-Iodiz...
4,onion powder,https://www.walmart.ca/en/ip/Great-Value-Onion...
5,oregano,https://www.walmart.ca/en/ip/Great-Value-Orega...
6,flour,https://www.walmart.ca/en/ip/Great-Value-Origi...
7,paprika,https://www.walmart.ca/en/ip/Great-Value-Papri...
8,peanut oil,https://www.walmart.ca/en/ip/Great-Value-Peanu...
9,rosemary,https://www.walmart.ca/en/ip/Great-Value-Rosem...


In [205]:
query_url = ing_source_df.loc[14,"ing_url"]
query_url

'https://www.walmart.ca/en/ip/mina-halal-whole-chicken/6000200224620?from=/search'

In [206]:
# Define default options for Selenium webdriver
chrome_options = webdriver.ChromeOptions()

# Define command_executor, access this through localhost:4444
command_executor = "http://172.18.0.2:4444"

# Initiate webdriver, with command executor found within the Selenium Grid docker container
# For this line to work, the `selenium/standalone-chrome:118.0` docker image
driver = webdriver.Remote(
    command_executor = command_executor,
    options          = chrome_options
)

# Use the driver to scrape the url
driver.get(query_url)

# Extract the text of the response into a variable
html = driver.page_source

# Parse the response text using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')

# Exit the driver
driver.quit()

In [208]:
soup.find("h1").text
soup.find("span",{"class":"mr2 gray"}).text
soup.find("span",{"itemprop":"price"}).text

'$17.56'

In [None]:
dict = {
    "prod_name"       : [],
    "prod_unit_price" : [],
    "prod_price"      : []
}

for index, row in enumerate(zip(ing_source_df["ing_name"],ing_source_df["ing_url"])):

    # Define query_url to scrape
    query_url = row[1]

    # Define default options for Selenium webdriver
    chrome_options = webdriver.ChromeOptions()
    
    # Define command_executor, access this through localhost:4444
    command_executor = "http://172.18.0.2:4444"
    
    # Initiate webdriver, with command executor found within the Selenium Grid docker container
    # For this line to work, the `selenium/standalone-chrome:118.0` docker image
    driver = webdriver.Remote(
        command_executor = command_executor,
        options          = chrome_options
    )
    
    # Use the driver to scrape the url
    driver.get(query_url)
    
    # Extract the text of the response into a variable
    html = driver.page_source
    
    # Parse the response text using Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    
    # Exit the driver
    driver.quit()

    # Append title
    try:
        dict["prod_name"].append(soup.find("h1").text)
    except:
        dict["prod_name"].append(np.NaN)

    # Append unit price
    try:
        dict["prod_unit_price"].append(soup.find("span",{"class":"mr2 gray"}).text)
    except:
        dict["prod_unit_price"].append(np.NaN)

    # Append price
    try:
        dict["prod_price"].append(soup.find("span",{"itemprop":"price"}).text)
    except:
        dict["prod_price"].append(np.NaN)




In [211]:
ing_source_df.loc[14,"prod_name"] = soup.find("h1").text
ing_source_df.loc[14,"prod_unit_price"] = soup.find("span",{"class":"mr2 gray"}).text
ing_source_df.loc[14,"prod_price"] = soup.find("span",{"itemprop":"price"}).text

In [212]:
ing_source_df

Unnamed: 0,ing_name,ing_url,prod_name,prod_unit_price,prod_price
0,garlic powder,https://www.walmart.ca/en/ip/Great-Value-Garli...,,,
1,black pepper,https://www.walmart.ca/en/ip/Great-Value-Groun...,,,
2,cayenne pepper,https://www.walmart.ca/en/ip/Great-Value-Groun...,,,
3,salt,https://www.walmart.ca/en/ip/Great-Value-Iodiz...,,,
4,onion powder,https://www.walmart.ca/en/ip/Great-Value-Onion...,,,
5,oregano,https://www.walmart.ca/en/ip/Great-Value-Orega...,,,
6,flour,https://www.walmart.ca/en/ip/Great-Value-Origi...,,,
7,paprika,https://www.walmart.ca/en/ip/Great-Value-Papri...,,,
8,peanut oil,https://www.walmart.ca/en/ip/Great-Value-Peanu...,,,
9,rosemary,https://www.walmart.ca/en/ip/Great-Value-Rosem...,,,


In [214]:
# Clean unit price
ing_source_df.loc[:,"prod_unit_price"].str.split("/")

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
5              NaN
6              NaN
7              NaN
8              NaN
9              NaN
10             NaN
11             NaN
12             NaN
13             NaN
14    [$9.98, 1kg]
15             NaN
16             NaN
Name: prod_unit_price, dtype: object

In [226]:
price = ing_source_df.loc[:,"prod_unit_price"].str.split("/")[14][0]
price = float(price.replace("$",""))
price

9.98

In [231]:
unit = ing_source_df.loc[:,"prod_unit_price"].str.split("/")[14][1]
unit_number = re.findall(r'\d+', unit)
unit_unit   = re.findall(r'\D+', unit)
unit_number, unit_unit

(['1'], ['kg'])

In [233]:
ing_source_df.loc[14,"up_price_cad"]   = price
ing_source_df.loc[14,"up_measurement"] = float(unit_number[0])
ing_source_df.loc[14,"up_unit"]        = unit_unit[0]

In [235]:
ing_source_df.loc[14]

ing_name                                                     chicken
ing_url            https://www.walmart.ca/en/ip/mina-halal-whole-...
prod_name          Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...
prod_unit_price                                            $9.98/1kg
prod_price                                                    $17.56
up_price_cad                                                    9.98
up_measurement                                                   1.0
up_unit                                                           kg
Name: 14, dtype: object

In [236]:
ing_source_df.loc[14,"up_measurement_metric"] = ing_source_df.loc[14,"up_measurement"]*1000
ing_source_df.loc[14,"up_unit_metric"]        = "g"
ing_source_df.loc[14,"up_metric"]             = ing_source_df.loc[14,"up_price_cad"]/ing_source_df.loc[14,"up_measurement_metric"]

In [237]:
ing_source_df.loc[14]

ing_name                                                           chicken
ing_url                  https://www.walmart.ca/en/ip/mina-halal-whole-...
prod_name                Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...
prod_unit_price                                                  $9.98/1kg
prod_price                                                          $17.56
up_price_cad                                                          9.98
up_measurement                                                         1.0
up_unit                                                                 kg
up_measurement_metric                                               1000.0
up_unit_metric                                                           g
up_metric                                                          0.00998
Name: 14, dtype: object

## Merge costs with ingredients

In [239]:
ing_df.head()

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric
0,Crispy Fried Chicken,1.0,pound,chicken,453.59233,g
1,Crispy Fried Chicken,1.0,cup,buttermilk,236.58824,mL
2,Crispy Fried Chicken,2.0,cup,flour,473.17648,mL
3,Crispy Fried Chicken,1.0,teaspoon,paprika,4.928922,mL
4,Crispy Fried Chicken,,,salt pepper,,


In [240]:
cond = ing_df["ing_name"] == "chicken"
ing_df.loc[cond]

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric
0,Crispy Fried Chicken,1.0,pound,chicken,453.59233,g
6,Chef John&#39;s Buttermilk Fried Chicken,1.0,pound,chicken,453.59233,g


In [248]:
final_df = ing_df.loc[cond].merge(
    ing_source_df,
    left_on = "ing_name",
    right_on = "ing_name"
)

In [250]:
final_df["cost"] = final_df["ing_amt_metric"] * final_df["up_metric"]

In [254]:
final_df.pivot(
    columns = ["recipe_name"],
    index = ["ing_name"],
    values = ["cost"]
)

Unnamed: 0_level_0,cost,cost
recipe_name,Chef John&#39;s Buttermilk Fried Chicken,Crispy Fried Chicken
ing_name,Unnamed: 1_level_2,Unnamed: 2_level_2
chicken,4.526851,4.526851


In [255]:
final_df.loc[:,["recipe_name", "ing_uom_metric"]]

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric,ing_url,prod_name,prod_unit_price,prod_price,up_price_cad,up_measurement,up_unit,up_measurement_metric,up_unit_metric,up_metric,cost
0,Crispy Fried Chicken,1.0,pound,chicken,453.59233,g,https://www.walmart.ca/en/ip/mina-halal-whole-...,"Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...",$9.98/1kg,$17.56,9.98,1.0,kg,1000.0,g,0.00998,4.526851
1,Chef John&#39;s Buttermilk Fried Chicken,1.0,pound,chicken,453.59233,g,https://www.walmart.ca/en/ip/mina-halal-whole-...,"Mina Halal Whole Chicken, 1 Whole Chicken, 1.4...",$9.98/1kg,$17.56,9.98,1.0,kg,1000.0,g,0.00998,4.526851
