# Load Data

Goal: Extract ingredients and amounts from web-scrape

In [2]:
import numpy as np
import pandas as pd
import ast

In [3]:
# Load the scraped data from allrecipes.com
df = pd.read_csv("../11_raw_data/20231031-2328_scraped_fc_recipes.csv", index_col = 0)

In [4]:
df.head()

Unnamed: 0,recipe_url,contents
0,https://www.allrecipes.com/recipe/8805/crispy-...,"{'@context': 'http://schema.org', '@type': ['R..."
1,https://www.allrecipes.com/recipe/8841/oven-fr...,"{'@context': 'http://schema.org', '@type': ['R..."
2,https://www.allrecipes.com/recipe/89268/triple...,"{'@context': 'http://schema.org', '@type': ['R..."
3,https://www.allrecipes.com/recipe/220128/chef-...,"{'@context': 'http://schema.org', '@type': ['R..."
4,https://www.allrecipes.com/recipe/150306/the-b...,"{'@context': 'http://schema.org', '@type': ['R..."


In [5]:
# Extract only 2 recipes to reduce scope for prototype speed
df = df.loc[[0,3],:]

In [6]:
# Examine the recipes 
for recipe in df["recipe_url"]:
    print(recipe)

https://www.allrecipes.com/recipe/8805/crispy-fried-chicken/
https://www.allrecipes.com/recipe/220128/chef-johns-buttermilk-fried-chicken/


Only 2 recipes were selected:
- Crispy fried chicken
- Chef John's Buttermilk Fried Chicken

In [7]:
# Examine keys in JSON dictionary containing data
sorted(list(ast.literal_eval(df.loc[0,"contents"]).keys()))

['@context',
 '@type',
 'about',
 'aggregateRating',
 'author',
 'cookTime',
 'dateModified',
 'datePublished',
 'description',
 'headline',
 'image',
 'mainEntityOfPage',
 'name',
 'nutrition',
 'prepTime',
 'publisher',
 'recipeCategory',
 'recipeCuisine',
 'recipeIngredient',
 'recipeInstructions',
 'recipeYield',
 'review',
 'totalTime',
 'video']

Ingredients are contained in the `recipeIngredient` key.

In [8]:
# Examine recipeIngredient
ast.literal_eval(df.loc[3,"contents"])["recipeIngredient"]

['1 (3 1/2) pound chicken, cut into 8 pieces',
 '1 teaspoon black pepper',
 '1 teaspoon salt',
 '1 teaspoon paprika',
 '0.5 teaspoon white pepper',
 '0.25 teaspoon dried rosemary',
 '0.25 teaspoon ground thyme',
 '0.25 teaspoon dried oregano',
 '0.25 teaspoon dried sage',
 '0.25 teaspoon cayenne pepper',
 '2 cups buttermilk',
 '2 cups flour',
 '1 teaspoon salt',
 '0.5 teaspoon paprika',
 '0.5 teaspoon cayenne pepper',
 '0.5 teaspoon garlic powder',
 '0.5 teaspoon white pepper',
 '0.5 teaspoon onion powder',
 '2.5 quarts peanut oil for frying']

Observing Chef John's Buttermilk Fried Chicken recipe's ingredient list, there appears to be no systematic way of separating the ingredients into batter and just the chicken. Thus, the ingredient amounts were aggregated by taking the sum of each ingredient.

# Extract Ingredients

## Extract Ingredient Amounts

In [9]:
# Store ingredients in a list
ing_list = ast.literal_eval(df.loc[3,"contents"])["recipeIngredient"]

for ing in ing_list:
    # print(str.split(ing," ")[0])
    print(str.split(ing," ")[0])

1
1
1
1
0.5
0.25
0.25
0.25
0.25
0.25
2
2
1
0.5
0.5
0.5
0.5
0.5
2.5


Ingredient amounts were extracted in decimal form.

## Extract Ingrdient Unit of Measurement

In [10]:
# Packages for pre-processing text
import nltk                       # Natural Language Tool Kit
nltk.download('stopwords')        # For processing stop words (words too common to hold significant meaning)
from nltk.corpus import stopwords # Import above downloaded stopwords
import re                         # Regular Expression
import string                     # For identifying punctuation

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
# Define common American measurements
measurements = [
    "teaspoon", 
    "tablespoon",
    "cup",
    "quart",
    "pound",
    "ounce"
]

# Define stopwords
eng_stopwords = stopwords.words("english")

# Define a stemmer
stemmer = nltk.stem.PorterStemmer()

for ing in ing_list:
    tokens = ing.split(" ")
    stemmed_tokens = []

    for token in tokens[1:]:
        if (not token in eng_stopwords) and token != "":
            stemmed_tokens.append(stemmer.stem(token))

    # print(stemmed_tokens)
    print([token for token in stemmed_tokens if token in measurements])

['pound']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['cup']
['cup']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['teaspoon']
['quart']


Measurements were successfully extracted.

## Extract Ingredient

In [12]:
# Define common American measurements
ingredient_list = [
    "chicken",
    "cayenne",
    "paprika",
    "rosemary",
    "thyme",
    "oregano",
    "sage",
    "buttermilk",
    "salt",
    "flour",
    "onion",
    "garlic",
    "vegetable",
    "peanut",
    "coconut",
    "white",
    "black"
]

for ing in ing_list:

    # remove punctuation and take lower case
    for punctuation_mark in string.punctuation:
        ing = ing.replace(punctuation_mark,"").lower()

    tokens = ing.split(" ")
    extracted_ingredients = []
    
    if "pepper" in tokens:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token + " pepper")
    elif "powder" in tokens:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token + " powder")
    elif "oil" in tokens:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token + " oil")
    else:
        for token in tokens:
            if token in ingredient_list:
                extracted_ingredients.append(token)
    print(extracted_ingredients)

['chicken']
['black pepper']
['salt']
['paprika']
['white pepper']
['rosemary']
['thyme']
['oregano']
['sage']
['cayenne pepper']
['buttermilk']
['flour']
['salt']
['paprika']
['cayenne pepper']
['garlic powder']
['white pepper']
['onion powder']
['peanut oil']


The ingredients were extracted.

## Combining Ingredient Amounts, UoM and Ingredients

In [13]:
# Initiate blank dictionary to store ingredients
dict = {
    "recipe_name":[],
    "ing_amt":[],
    "ing_uom":[],
    "ing_name":[]
}

for index, row in enumerate(zip(df["recipe_url"],df["contents"])):

    # Extract ingredients into a list from JSON dictionary
    ing_list = ast.literal_eval(row[1])["recipeIngredient"]
    recipe_name = ast.literal_eval(row[1])["name"]
        
    for ing in ing_list:
        dict["recipe_name"].append(recipe_name)
        
        # Extract ingredient amounts
        try:
            dict["ing_amt"].append(float(ing.split(" ")[0]))
        except:
            dict["ing_amt"].append(np.NaN)
        
        # remove punctuation and take lower case
        for punctuation_mark in string.punctuation:
            ing = ing.replace(punctuation_mark,"").lower()
        
        tokens = ing.split(" ")
        stemmed_tokens = []

        for token in tokens[1:]:
            if (not token in eng_stopwords) and token != "":
                stemmed_tokens.append(stemmer.stem(token))

        # Extract ingredient UoM
        try:
            dict["ing_uom"].append([uom for uom in stemmed_tokens if uom in measurements][0])
        except:
            dict["ing_uom"].append(np.NaN)

        # Extract ingredient name
        if "pepper" in tokens:
            dict["ing_name"].append([name + " pepper" for name in tokens if name in ingredient_list][0])
        elif "powder" in tokens:
            dict["ing_name"].append([name + " powder" for name in tokens if name in ingredient_list][0])
        elif "oil" in tokens:
            dict["ing_name"].append([name + " oil" for name in tokens if name in ingredient_list][0])
        else:
            dict["ing_name"].append([name for name in tokens if name in ingredient_list][0])

ing_df = pd.DataFrame(dict)
ing_df.head()

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name
0,Crispy Fried Chicken,1.0,pound,chicken
1,Crispy Fried Chicken,1.0,cup,buttermilk
2,Crispy Fried Chicken,2.0,cup,flour
3,Crispy Fried Chicken,1.0,teaspoon,paprika
4,Crispy Fried Chicken,,,salt pepper


Walah, the ingredients are extracted into a DataFrame where each row represents 1 ingredient with its amounts and unit of measurements.

# Data Enrichment

## Unit Conversion

In [14]:
# Define metric unit conversions
metric_conversion_rate = {
    # units regarding mass (metric unit gram)
    "pound"     : 453.59233, # https://www.metric-conversions.org/weight/pounds-to-grams.htm

    # units regarding volume (metric unit mL or cm3)
    "teaspoon"  : 4.9289215, # https://www.metric-conversions.org/volume/us-teaspoons-to-milliliters.htm#metricConversionTable?val=1
    "tablespoon": 14.786765, # https://www.metric-conversions.org/volume/us-tablespoons-to-milliliters.htm
    "quart"     : 946.35295, # https://www.metric-conversions.org/volume/us-liquid-quarts-to-milliliters.htm
    "cup"       : 236.58824  # https://www.metric-conversions.org/volume/us-cups-to-milliliters.htm 
}

In [15]:
# Define metric units
metric_uom = {
    # units regarding mass (metric unit gram)
    "pound"     : "g", # https://www.metric-conversions.org/weight/pounds-to-grams.htm

    # units regarding volume (metric unit mL or cm3)
    "teaspoon"  : "mL", # https://www.metric-conversions.org/volume/us-teaspoons-to-milliliters.htm#metricConversionTable?val=1
    "tablespoon": "mL", # https://www.metric-conversions.org/volume/us-tablespoons-to-milliliters.htm
    "quart"     : "mL", # https://www.metric-conversions.org/volume/us-liquid-quarts-to-milliliters.htm
    "cup"       : "mL"  # https://www.metric-conversions.org/volume/us-cups-to-milliliters.htm 
}

In [16]:
# # Temp fix, convert chicken ing_amt from 1 pound to 4 pound
cond = ing_df["ing_name"] == "chicken"
ing_df.loc[cond,"ing_amt"] = 4
# ing_df.loc[cond,"ing_name"] = "whole chicken"
ing_df.loc[cond]

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name
0,Crispy Fried Chicken,4.0,pound,chicken
6,Chef John&#39;s Buttermilk Fried Chicken,4.0,pound,chicken


In [17]:
# Map (VLOOKUP) the conversion rates
ing_df["ing_amt_metric"] = ing_df["ing_amt"] * ing_df["ing_uom"].map(metric_conversion_rate)
ing_df["ing_uom_metric"] = ing_df["ing_uom"].map(metric_uom)

In [18]:
# Examine the conversion result
ing_df

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric
0,Crispy Fried Chicken,4.0,pound,chicken,1814.36932,g
1,Crispy Fried Chicken,1.0,cup,buttermilk,236.58824,mL
2,Crispy Fried Chicken,2.0,cup,flour,473.17648,mL
3,Crispy Fried Chicken,1.0,teaspoon,paprika,4.928922,mL
4,Crispy Fried Chicken,,,salt pepper,,
5,Crispy Fried Chicken,2.0,quart,vegetable oil,1892.7059,mL
6,Chef John&#39;s Buttermilk Fried Chicken,4.0,pound,chicken,1814.36932,g
7,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,black pepper,4.928922,mL
8,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,salt,4.928922,mL
9,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,paprika,4.928922,mL


# Load Costs

In [19]:
cost_df = pd.read_csv("../11_raw_data/20231103-1016_ingredient_cost.csv")

In [20]:
cost_df.head()

Unnamed: 0,Material,Imperial\nMeasurement,Imperial\nUnit,Metric \nMeasurement,Metric \nUnit,Cost\n(CAD),Price\n(CAD),Unit,MOQ\nMeasurement,MOQ\nUnit,MOQ\nSource,Density\nMeasurement,Density\nUnit,Density\nSource
0,chicken,4,pound,1814.36,g,$18.11,0.00998,g,4.0,pound,https://www.walmart.ca/en/ip/mina-halal-whole-...,-,-,-
1,buttermilk,1,cup,250.0,mL,$1.00,0.004,mL,1000.0,mL,https://www.walmart.ca/en/ip/Sealtest-1-Butter...,-,-,-
2,all-purpose flour,2,cup,500.0,mL,$0.40,0.001508,g,2500.0,g,https://www.walmart.ca/en/ip/Great-Value-Origi...,0.53,g/mL,https://www.aqua-calc.com/page/density-table/s...
3,paprika,1,teaspoon,5.0,mL,$0.04,0.0175,g,130.0,g,https://www.walmart.ca/en/ip/Great-Value-Papri...,0.46,g/mL,https://www.aqua-calc.com/page/density-table/s...
4,pepper,to taste,-,13.48,g,$0.36,0.0267,g,85.0,g,https://www.walmart.ca/en/ip/Great-Value-Groun...,-,-,-


In [21]:
final_df = ing_df.merge(
    cost_df.loc[:,["Material", "Price\n(CAD)", "Unit", "Density\nMeasurement", "Density\nUnit"]],
    left_on = "ing_name",
    right_on = "Material"
)

In [22]:
final_df.head()

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric,Material,Price\n(CAD),Unit,Density\nMeasurement,Density\nUnit
0,Crispy Fried Chicken,4.0,pound,chicken,1814.36932,g,chicken,0.00998,g,-,-
1,Chef John&#39;s Buttermilk Fried Chicken,4.0,pound,chicken,1814.36932,g,chicken,0.00998,g,-,-
2,Crispy Fried Chicken,1.0,cup,buttermilk,236.58824,mL,buttermilk,0.004,mL,-,-
3,Chef John&#39;s Buttermilk Fried Chicken,2.0,cup,buttermilk,473.17648,mL,buttermilk,0.004,mL,-,-
4,Crispy Fried Chicken,2.0,cup,flour,473.17648,mL,flour,0.001508,g,0.53,g/mL


In [23]:
final_df["Density\nMeasurement"] = final_df["Density\nMeasurement"].str.replace("-","0").str.replace("","0")
final_df["Density\nMeasurement"] = final_df["Density\nMeasurement"].astype("float")

In [25]:
cost_list = []

for index, row in final_df.iterrows():
    if row["ing_uom_metric"] == row["Unit"]:
        cost_list.append(row["ing_amt_metric"] * row["Price\n(CAD)"])
    else:
        cost_list.append(row["ing_amt_metric"] * row["Price\n(CAD)"] * row["Density\nMeasurement"])

In [26]:
final_df.loc[:,"cost"] = cost_list
final_df

Unnamed: 0,recipe_name,ing_amt,ing_uom,ing_name,ing_amt_metric,ing_uom_metric,Material,Price\n(CAD),Unit,Density\nMeasurement,Density\nUnit,cost
0,Crispy Fried Chicken,4.0,pound,chicken,1814.36932,g,chicken,0.00998,g,0.0,-,18.107406
1,Chef John&#39;s Buttermilk Fried Chicken,4.0,pound,chicken,1814.36932,g,chicken,0.00998,g,0.0,-,18.107406
2,Crispy Fried Chicken,1.0,cup,buttermilk,236.58824,mL,buttermilk,0.004,mL,0.0,-,0.946353
3,Chef John&#39;s Buttermilk Fried Chicken,2.0,cup,buttermilk,473.17648,mL,buttermilk,0.004,mL,0.0,-,1.892706
4,Crispy Fried Chicken,2.0,cup,flour,473.17648,mL,flour,0.001508,g,0.0503,g/mL,0.035892
5,Chef John&#39;s Buttermilk Fried Chicken,2.0,cup,flour,473.17648,mL,flour,0.001508,g,0.0503,g/mL,0.035892
6,Crispy Fried Chicken,1.0,teaspoon,paprika,4.928922,mL,paprika,0.0175,g,0.0406,g/mL,0.003502
7,Chef John&#39;s Buttermilk Fried Chicken,1.0,teaspoon,paprika,4.928922,mL,paprika,0.0175,g,0.0406,g/mL,0.003502
8,Chef John&#39;s Buttermilk Fried Chicken,0.5,teaspoon,paprika,2.464461,mL,paprika,0.0175,g,0.0406,g/mL,0.001751
9,Crispy Fried Chicken,2.0,quart,vegetable oil,1892.7059,mL,vegetable oil,0.003511,mL,0.0,-,6.64529


In [27]:
final_df = final_df.groupby(
    by = ["recipe_name","ing_name"],
    as_index = False
).agg(
    cost = ("cost","sum")
)

In [29]:
pivot_df = final_df.pivot(
    columns = "recipe_name",
    index   = "ing_name",
    values = "cost"
)

In [30]:
pivot_df

recipe_name,Chef John&#39;s Buttermilk Fried Chicken,Crispy Fried Chicken
ing_name,Unnamed: 1_level_1,Unnamed: 2_level_1
black pepper,0.005369,
buttermilk,1.892706,0.946353
cayenne pepper,0.002054,
chicken,18.107406,18.107406
flour,0.035892,0.035892
garlic powder,0.00226,
onion powder,0.001518,
oregano,0.002684,
paprika,0.005253,0.003502
peanut oil,16.4902,


In [33]:
pivot_df.columns = ["recipe 1","recipe 2"]
pivot_df

Unnamed: 0_level_0,recipe 1,recipe 2
ing_name,Unnamed: 1_level_1,Unnamed: 2_level_1
black pepper,0.005369,
buttermilk,1.892706,0.946353
cayenne pepper,0.002054,
chicken,18.107406,18.107406
flour,0.035892,0.035892
garlic powder,0.00226,
onion powder,0.001518,
oregano,0.002684,
paprika,0.005253,0.003502
peanut oil,16.4902,


In [78]:
final_df.to_csv("../12_processed_data/recipes_pivot.csv")