# Data Pre-Processing Pipeline

In [154]:
import numpy as np
import pandas as pd
import re
import warnings

# Importing necessary libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string

warnings.filterwarnings("ignore")

## Combine All Recipe Data

In [155]:
import yaml

def load_params():
    with open("../params.yaml") as f:
        params = yaml.safe_load(f)
    return params

params = load_params()

In [156]:
import os
import ast
data_dir = "../" + params["data_preprocessing"]["raw_data_path"]
pickle_files = []
for root, _, files in os.walk(data_dir):
    for f in files:
        if f.endswith(".pkl"):
            pickle_files.append(os.path.join(root, f))

print("Combining recipe data...")
dfs = []
for path in pickle_files:
    print(f"Loading {path} ...")
    df = pd.read_pickle(path)
    dfs.append(df)

complete_cookbook = pd.concat(dfs, axis=0)
complete_cookbook.ingredients = complete_cookbook.ingredients.apply(lambda x: ast.literal_eval(x))
complete_cookbook = complete_cookbook.reset_index(drop=True)
print(f"Combined {len(pickle_files)} files with total {len(complete_cookbook)} recipes.")
complete_cookbook.head()

Combining recipe data...
Loading ../data/raw/omnivores_cookbook_recipes.pkl ...
Loading ../data/raw/daily_dish_recipes.pkl ...
Loading ../data/raw/woks_of_life_recipes.pkl ...
Loading ../data/raw/just_one_cookbook_recipes.pkl ...
Loading ../data/raw/minimalist_baker_recipes.pkl ...
Loading ../data/raw/recipe_tin_eats_recipes.pkl ...
Loading ../data/raw/love_and_lemons_recipes.pkl ...
Loading ../data/raw/spoon_fork_bacon_recipes.pkl ...
Combined 8 files with total 9223 recipes.


Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,1.2,0.8,46.0,3.0,,,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,3.0,7.3,229.0,3.0,,,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,,,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,3.9,11.1,96.0,4.0,,,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1.1,4.1,9.0,1.0,,,,,,


## General Filtering

In [157]:
cleaned_cookbook = complete_cookbook.copy()
cleaned_cookbook.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9223 entries, 0 to 9222
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   recipe_title         9223 non-null   string 
 1   recipe_url           9223 non-null   string 
 2   ingredients          9223 non-null   object 
 3   img_url              9223 non-null   string 
 4   num_steps            9223 non-null   float64
 5   prep_time            8345 non-null   float64
 6   cook_time            7047 non-null   float64
 7   total_time           7510 non-null   float64
 8   serving_size         3055 non-null   float64
 9   calories             7284 non-null   float64
 10  carbohydrates        7091 non-null   float64
 11  protein              7095 non-null   float64
 12  fat                  7055 non-null   float64
 13  saturated_fat        6874 non-null   float64
 14  cholesterol          5854 non-null   float64
 15  sodium               7013 non-null   f

### Clean up Recipe Title

In [158]:
def clean_recipe_title(title: string) -> string:
    """
    Cleans the recipe title by removing characters that are not digits or alphabets and extra spaces.
    Also removes certain words from the title and ensure capitalization.
    """
    line = re.sub("\s*\([^()]*\)\s*", "", title) # Remove text within parentheses
    line = re.sub("[^ \-a-zA-Z0-9()]*", "", line) # Replace unwanted characters with space
    line = re.sub(r"\s*\{[^{}]*\}\s*", "", line) # Remove text within brackets
    line = re.sub("\s*\([^()]*", "", line) # Remove uncomplete parentheses

    # Remove certain words from recipe titles
    line = re.sub("(recipe|how\s*to\s*\w*\s|best|easy)", "", line, flags=re.IGNORECASE).strip()
    line = line.strip()  # Remove leading and trailing spaces
    return line.title()  # Capitalize the first letter of each word

# cleaned_cookbook["recipe_title"] = cleaned_cookbook.recipe_title.apply(clean_recipe_title)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,1.2,0.8,46.0,3.0,,,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,3.0,7.3,229.0,3.0,,,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,,,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,3.9,11.1,96.0,4.0,,,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1.1,4.1,9.0,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9218,Vegan Swedish Meatballs over Mashed Potatoes a...,https://www.spoonforkbacon.com/vegan-swedish-m...,"[extra virgin olive oil, divided, sliced cremi...",https://www.spoonforkbacon.com/wp-content/uplo...,9.0,,,,,,...,,,,,,,,,,
9219,Roasted Chickpea Stuffed Sweet Potatoes with C...,https://www.spoonforkbacon.com/roasted-chickpe...,"[cans chickpeas, drained and pat dry, extra vi...",https://www.spoonforkbacon.com/wp-content/uplo...,6.0,,,,,791.0,...,24.0,22.0,194.0,10.0,,11.0,32188.0,,,
9220,Best Cauliflower Recipes,https://www.spoonforkbacon.com/cauliflower-rec...,"[head cauliflower, stem and outer leaves remov...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,20.0,16.0,36.0,,821.0,...,3.0,26.0,63.0,2.0,,57.0,220.0,,,
9221,Jalapeño and Cheddar Spoon Bread,https://www.spoonforkbacon.com/jalapeno-chedda...,"[unsalted butter, all purpose flour, yellow c...",https://www.spoonforkbacon.com/wp-content/uplo...,8.0,10.0,30.0,65.0,,361.0,...,2.0,15.0,282.0,1.0,,5.0,735.0,0.4,1.0,5.0


### Remove Compilation Recipe

In [159]:
# Drop compilation recipes and keep single recipes only
all_numbers_title_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.match("^\d+", x.lower()))) # All titles with numbers in front 
keep_numbers_title_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.match("^\d+( |-)+(ingredient(s)?|minute(s)?|hour(s)?|bowl(s)?|pot(s)?|layer(s)?)", x.lower()))) # Titles with numbers in front to keep

cleaned_cookbook = cleaned_cookbook.drop(cleaned_cookbook[all_numbers_title_idx & ~keep_numbers_title_idx].index)
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,1.2,0.8,46.0,3.0,,,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,3.0,7.3,229.0,3.0,,,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,,,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,3.9,11.1,96.0,4.0,,,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1.1,4.1,9.0,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9110,Vegan Swedish Meatballs over Mashed Potatoes a...,https://www.spoonforkbacon.com/vegan-swedish-m...,"[extra virgin olive oil, divided, sliced cremi...",https://www.spoonforkbacon.com/wp-content/uplo...,9.0,,,,,,...,,,,,,,,,,
9111,Roasted Chickpea Stuffed Sweet Potatoes with C...,https://www.spoonforkbacon.com/roasted-chickpe...,"[cans chickpeas, drained and pat dry, extra vi...",https://www.spoonforkbacon.com/wp-content/uplo...,6.0,,,,,791.0,...,24.0,22.0,194.0,10.0,,11.0,32188.0,,,
9112,Best Cauliflower Recipes,https://www.spoonforkbacon.com/cauliflower-rec...,"[head cauliflower, stem and outer leaves remov...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,20.0,16.0,36.0,,821.0,...,3.0,26.0,63.0,2.0,,57.0,220.0,,,
9113,Jalapeño and Cheddar Spoon Bread,https://www.spoonforkbacon.com/jalapeno-chedda...,"[unsalted butter, all purpose flour, yellow c...",https://www.spoonforkbacon.com/wp-content/uplo...,8.0,10.0,30.0,65.0,,361.0,...,2.0,15.0,282.0,1.0,,5.0,735.0,0.4,1.0,5.0


### Remove non-meal recipe (sauces, dressing, store direction, etc)

In [160]:
other_recipes_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.search("(sauce(s)?|dressing(s)?|store|cut|slice|clean|what)", x.lower())))

cleaned_cookbook = cleaned_cookbook.drop(cleaned_cookbook[other_recipes_idx].index)
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,1.2,0.8,46.0,3.0,,,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,3.0,7.3,229.0,3.0,,,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,,,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,3.9,11.1,96.0,4.0,,,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1.1,4.1,9.0,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8490,Spicy Vegan Chickpea and Cauliflower Curry,https://www.spoonforkbacon.com/spicy-vegan-chi...,"[cauliflower, stem removed and cut into bite s...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,,,,,,...,,,,,,,,,,
8491,Vegan Swedish Meatballs over Mashed Potatoes a...,https://www.spoonforkbacon.com/vegan-swedish-m...,"[extra virgin olive oil, divided, sliced cremi...",https://www.spoonforkbacon.com/wp-content/uplo...,9.0,,,,,,...,,,,,,,,,,
8492,Best Cauliflower Recipes,https://www.spoonforkbacon.com/cauliflower-rec...,"[head cauliflower, stem and outer leaves remov...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,20.0,16.0,36.0,,821.0,...,3.0,26.0,63.0,2.0,,57.0,220.0,,,
8493,Jalapeño and Cheddar Spoon Bread,https://www.spoonforkbacon.com/jalapeno-chedda...,"[unsalted butter, all purpose flour, yellow c...",https://www.spoonforkbacon.com/wp-content/uplo...,8.0,10.0,30.0,65.0,,361.0,...,2.0,15.0,282.0,1.0,,5.0,735.0,0.4,1.0,5.0


### Remove 1-2 ingredient recipes

Most likely to be tutorial and not an actual recipe

In [161]:
less_than_two_ingredients = cleaned_cookbook[cleaned_cookbook.ingredients.apply(lambda x: len(x)) <= 2]

cleaned_cookbook = cleaned_cookbook.drop(less_than_two_ingredients.index)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,1.2,0.8,46.0,3.0,,,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,3.0,7.3,229.0,3.0,,,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,,,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,3.9,11.1,96.0,4.0,,,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1.1,4.1,9.0,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8490,Spicy Vegan Chickpea and Cauliflower Curry,https://www.spoonforkbacon.com/spicy-vegan-chi...,"[cauliflower, stem removed and cut into bite s...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,,,,,,...,,,,,,,,,,
8491,Vegan Swedish Meatballs over Mashed Potatoes a...,https://www.spoonforkbacon.com/vegan-swedish-m...,"[extra virgin olive oil, divided, sliced cremi...",https://www.spoonforkbacon.com/wp-content/uplo...,9.0,,,,,,...,,,,,,,,,,
8492,Best Cauliflower Recipes,https://www.spoonforkbacon.com/cauliflower-rec...,"[head cauliflower, stem and outer leaves remov...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,20.0,16.0,36.0,,821.0,...,3.0,26.0,63.0,2.0,,57.0,220.0,,,
8493,Jalapeño and Cheddar Spoon Bread,https://www.spoonforkbacon.com/jalapeno-chedda...,"[unsalted butter, all purpose flour, yellow c...",https://www.spoonforkbacon.com/wp-content/uplo...,8.0,10.0,30.0,65.0,,361.0,...,2.0,15.0,282.0,1.0,,5.0,735.0,0.4,1.0,5.0


## Handling NA Values

In [162]:
cleaned_cookbook.isna().sum()

recipe_title              0
recipe_url                0
ingredients               0
img_url                   0
num_steps                 0
prep_time               809
cook_time              1884
total_time             1558
serving_size           5557
calories               1686
carbohydrates          1861
protein                1855
fat                    1885
saturated_fat          2037
cholesterol            2908
sodium                 1936
potassium              2556
fiber                  2076
sugar                  1951
calcium                2656
iron                   2677
custom_time            7353
vitamin_c              3529
vitamin_a              3308
trans_fat              5235
polyunsaturated_fat    5300
monounsaturated_fat    5355
dtype: int64

### Time

Fill missing total time based on other times in case it wasn't catched when scraping or error. Otherwise, drop and fill the rest time variables with 0s

In [163]:
cleaned_cookbook.total_time = cleaned_cookbook.total_time.fillna(cleaned_cookbook.prep_time + cleaned_cookbook.cook_time + cleaned_cookbook.custom_time)
cleaned_cookbook = cleaned_cookbook.dropna(subset="total_time")

Fill all missing time data with 0s and overwrite total time to ensure consistency

In [164]:
cleaned_cookbook[["prep_time", "cook_time", "custom_time"]] = cleaned_cookbook[["prep_time", "cook_time", "custom_time"]].fillna(0)
cleaned_cookbook.total_time = cleaned_cookbook.prep_time + cleaned_cookbook.cook_time + cleaned_cookbook.custom_time

In [165]:
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
0,Ginger Fried Rice,https://omnivorescookbook.com/ginger-fried-rice/,"[ground chicken, Shaoxing wine, soy sauce, sal...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,10.0,25.0,1.0,413.0,...,1.2,0.8,46.0,3.0,0.0,,,,,
1,Kung Pao Tofu (宫爆豆腐),https://omnivorescookbook.com/kung-pao-tofu/,"[firm tofu, soy sauce, maple syrup, cornstarch...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,20.0,40.0,1.0,302.0,...,3.0,7.3,229.0,3.0,0.0,,,,,
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,,,,,
3,Roast Pork Lo Mein (叉烧捞面),https://omnivorescookbook.com/pork-lo-mein/,"[fresh lo mein noodles, chicken stock, oyster ...",https://omnivorescookbook.com/wp-content/uploa...,7.0,15.0,15.0,30.0,1.0,457.0,...,3.9,11.1,96.0,4.0,0.0,,,,,
4,"Sichuan Dumplings (钟水饺, Zhong Shui Jiao)",https://omnivorescookbook.com/sichuan-dumplings/,"[soy sauce, scallions, ginger, bay leaf, brown...",https://omnivorescookbook.com/wp-content/uploa...,6.0,45.0,15.0,60.0,1.0,346.0,...,1.1,4.1,9.0,1.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8487,Spicy Black Bean Soup,https://www.spoonforkbacon.com/spicy-black-bea...,"[dry black beans, water, divided, sea salt, sp...",https://www.spoonforkbacon.com/wp-content/uplo...,5.0,25.0,155.0,360.0,,178.0,...,7.0,3.0,71.0,3.0,180.0,21.0,704.0,,,
8489,The BEST Balsamic Vinaigrette Recipe,https://www.spoonforkbacon.com/the-best-balsam...,"[roughly chopped shallot, garlic clove, light ...",https://www.spoonforkbacon.com/wp-content/uplo...,3.0,3.0,0.0,3.0,,183.0,...,0.1,5.0,8.0,0.2,0.0,0.3,1.0,,2.0,13.0
8492,Best Cauliflower Recipes,https://www.spoonforkbacon.com/cauliflower-rec...,"[head cauliflower, stem and outer leaves remov...",https://www.spoonforkbacon.com/wp-content/uplo...,10.0,20.0,16.0,36.0,,821.0,...,3.0,26.0,63.0,2.0,0.0,57.0,220.0,,,
8493,Jalapeño and Cheddar Spoon Bread,https://www.spoonforkbacon.com/jalapeno-chedda...,"[unsalted butter, all purpose flour, yellow c...",https://www.spoonforkbacon.com/wp-content/uplo...,8.0,10.0,30.0,40.0,,361.0,...,2.0,15.0,282.0,1.0,0.0,5.0,735.0,0.4,1.0,5.0


### Macros and Micros

Fill NAs with 0s

In [69]:
fill_idx = cleaned_cookbook.columns[8:-1]
cleaned_cookbook[fill_idx].isna().sum()

serving_size           646
calories                72
carbohydrates           86
protein                 79
fat                     88
saturated_fat           95
cholesterol            201
sodium                  79
potassium               85
fiber                  126
sugar                  103
calcium                 84
iron                    94
custom_time              0
vitamin_c              259
vitamin_a              208
trans_fat              517
polyunsaturated_fat    476
dtype: int64

In [70]:
cleaned_cookbook[fill_idx] = cleaned_cookbook[fill_idx].fillna(0)

In [71]:
cleaned_cookbook[fill_idx].isna().sum()

serving_size           0
calories               0
carbohydrates          0
protein                0
fat                    0
saturated_fat          0
cholesterol            0
sodium                 0
potassium              0
fiber                  0
sugar                  0
calcium                0
iron                   0
custom_time            0
vitamin_c              0
vitamin_a              0
trans_fat              0
polyunsaturated_fat    0
dtype: int64

## Handling Duplicated Values

In [72]:
cleaned_cookbook.duplicated(subset=["recipe_title"]).sum()

4

In [73]:
cleaned_cookbook = cleaned_cookbook.drop_duplicates(subset=["recipe_title"])
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,fiber,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat
2,Air Fryer Chinese Roast Chicken,https://omnivorescookbook.com/air-fryer-chines...,"[whole chicken, Shaoxing wine, salt, black pep...",https://omnivorescookbook.com/wp-content/uploa...,3.0,10.0,50.0,180.0,1.0,237.0,...,0.1,1.0,14.0,1.0,120.0,0.0,0.0,0.0,0.0,
13,Sichuan Crispy Beef (5-Ingredient),https://omnivorescookbook.com/sichuan-crispy-b...,"[beef flank steak, salt, baking soda, vegetabl...",https://omnivorescookbook.com/wp-content/uploa...,4.0,10.0,10.0,50.0,1.0,353.0,...,0.5,0.4,12.0,20.0,30.0,0.0,0.0,0.0,0.0,
14,Hong Kong Pan Fried Pork Chops (港式洋葱猪扒),https://omnivorescookbook.com/hong-kong-pan-fr...,"[boneless pork chops, Shaoxing wine, light soy...",https://omnivorescookbook.com/wp-content/uploa...,8.0,20.0,15.0,65.0,1.0,419.0,...,0.9,5.4,41.0,1.0,30.0,0.0,0.0,0.0,0.0,
15,Matcha Tiramisu,https://omnivorescookbook.com/matcha-tiramisu/,"[ceremonial grade matcha, very hot water, masc...",https://omnivorescookbook.com/wp-content/uploa...,7.0,50.0,10.0,180.0,1.0,224.0,...,0.2,12.6,89.0,1.0,120.0,0.0,0.0,0.0,0.0,
34,Taiwanese Fried Chicken (台式鸡排),https://omnivorescookbook.com/taiwanese-fried-...,"[boneless skinless chicken thighs, light soy s...",https://omnivorescookbook.com/wp-content/uploa...,8.0,30.0,20.0,50.0,1.0,366.0,...,0.4,4.5,28.0,2.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8462,Sesame Ginger Chicken Salad,https://www.spoonforkbacon.com/sesame-ginger-c...,"[neutral oil, cooked quinoa, vegetable oil, wo...",https://www.spoonforkbacon.com/wp-content/uplo...,5.0,15.0,20.0,95.0,0.0,589.0,...,6.0,3.0,67.0,2.0,60.0,24.0,3162.0,0.1,10.0,12.0
8465,Peach and Pita Panzanella Salad with Burrata a...,https://www.spoonforkbacon.com/peach-and-pita-...,"[shelled pistachios, shelled walnuts, sesame s...",https://www.spoonforkbacon.com/wp-content/uplo...,9.0,30.0,15.0,75.0,0.0,0.0,...,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,
8477,BEST Pumpkin Recipes,https://www.spoonforkbacon.com/favorite-fall-p...,"[baby pumpkins, tops removed and completely ho...",https://www.spoonforkbacon.com/wp-content/uplo...,6.0,20.0,30.0,200.0,0.0,255.0,...,1.0,22.0,85.0,1.0,150.0,4.0,6240.0,0.0,0.0,
8481,French Onion Soup,https://www.spoonforkbacon.com/french-onion-soup/,"[ unsalted butter, extra virgin olive oil, lar...",https://www.spoonforkbacon.com/wp-content/uplo...,7.0,20.0,40.0,120.0,0.0,566.0,...,2.0,6.0,644.0,2.0,60.0,6.0,1135.0,1.0,3.0,14.0


## Tokenizing, Lemmatizing, and Removing General Stop Words for Ingredients

Clean the ingredients using multiple NLP methods and filtering general stop words

Step-by-step:
1. Normalization
2. Surface-level Filtering (numbers, punctuations, brands, etc)
3. Tokenization
4. Lemmatization
5. Stop-Words Filtering

In [74]:
# Identify stop words specific to recipes
PREP_WORDS = ["thinly", "finely", "softened", "crushed", "skinned", "whole", "chopped", "minced"]

TEMP_WORDS = ["hot", "warm", "cold", "room", "chilled", "frozen", "cool", "boiling", "temperature"]

TASTE_WORDS = ["sweet", "sour", "bitter", "salty", "umami"]

QUALITY_WORDS = ["fresh", "freshly", "organic", "good", "quality", "extra", "premium", "best", "finest", "high", "filtered", "light", "sashimi-grade", "pure", "fine"]

SIZE_WORDS = ["large", "mini", "small", "medium", "jumbo", "extra", "extra-large", "extra-small", "sliced", "halved", "diced", "cubed", "peeled", "grated", "shredded", "quartered", "clove", "short-grain", "heaping"]

STATE_WORDS = ["powder", "sauce", "boneless", "skinless", "skin", "cooked", "ripe", "coarse", "coarsely", "filet", "fillet", "fillets", "canned",
                "homemade", "used", "dried", "reserved", "packet", "ground", "uncooked", "pasteurized", "bonein", "skinon", "preserved", "raw", "cooking"]

OTHER_WORDS = ["quality", "extra", "virgin", "extravirgin", "long", "high", "japanese", "combination", "diamond", "kosher", "stalk", "juice", "sea", "taste", "crystal", "choice"]

UNITS = ["cup", "cups", "tbsp", "tablespoon", "tsp", "teaspoon", "g", "kg", "oz", "ml", "l", "lb", "pound", "inch", "cm", "m", "handful", "bulb", "dollop", "pinch"]

EXTRA_WORDS = PREP_WORDS + TEMP_WORDS + TASTE_WORDS + QUALITY_WORDS + SIZE_WORDS + STATE_WORDS + OTHER_WORDS + UNITS

In [75]:
def clean_ingredients(ingredients: list) -> list:
    """
    A single ingredient may consist of several words, thus
    it takes each string of ingredient and process it using multiple NLP methods into a clean string.
    This function also splits ingredients that are combined by commas and "and" and processes them separately.
    """
    # Initialize functions for cleaning
    translator = str.maketrans('', '', '!"#$%&\'()*+,.:;<=>?@[\\]^_`{|}~') # Punctuations remover (exclue slash "/" and strip "-")
    lemmatizer = WordNetLemmatizer() # Word lemmatizer
    
    clean_ingredients = []
    for ingredient in ingredients:
        # Split by commas and "and" to handle combined ingredients
        split_ingredients = [ing.strip() for ing in re.split(r'\s+(and|&|,)\s+', ingredient, flags=re.IGNORECASE) if ing.strip()]
        split_ingredients = list(dict.fromkeys(split_ingredients))  # Remove duplicates from split ingredients

        # Process each split ingredient separately
        for sub_ingredient in split_ingredients:
            # Normalization
            line = sub_ingredient.lower() # Make sure string are lowercase
            line = re.sub("\s*\([^()]*\)\s*", "", line) # Remove text between parentheses
            
            # Surface level cleaning
            line = re.sub("(\s+(or|/){1}\s+)", "/", line) # Remove ingredient substitutes
            # line = re.sub("(diamond crystal|premium-quality)", "", line) # Remove brands
            # line = re.sub("(\w*\d\w*|½|¼|¾)", "", line) # Remove numbers

            # Remove punctuations
            line = line.translate(translator)

            # In-depth cleaning
            line_tokenized = word_tokenize(line) # Tokenize the ingredient first
            line_lemmatized = [lemmatizer.lemmatize(ing) for ing in line_tokenized] # Then lemmatize the ingredient
            line_split = [ing for ing in line_lemmatized if ing not in stopwords.words("english") + EXTRA_WORDS + [""]] # Remove stop words
            
            if line_split:  # Only add if there are remaining words after cleaning
                line_split = list(dict.fromkeys(line_split)) # Remove duplicates from the words in a single ingredient
                line = "_".join(line_split) # Rejoin the ingredient as a single string
                clean_ingredients.append(line) # Add back to the ingredients list
        
    clean_ingredients = list(dict.fromkeys(clean_ingredients)) # Remove duplicates
    
    return clean_ingredients

# Clean ingredients namings
cleaned_cookbook.ingredients = cleaned_cookbook.ingredients.apply(clean_ingredients)

In [76]:
from nltk.probability import FreqDist

f_dist = FreqDist(i for i in cleaned_cookbook.ingredients.explode())

f_dist.most_common(100)

[('salt', 500),
 ('garlic', 251),
 ('water', 240),
 ('sugar', 226),
 ('egg', 204),
 ('unsalted_butter', 201),
 ('soy', 157),
 ('ginger', 134),
 ('vanilla_extract', 130),
 ('black_pepper', 129),
 ('milk', 129),
 ('brown_sugar', 115),
 ('baking', 101),
 ('olive_oil', 99),
 ('onion', 89),
 ('pepper', 73),
 ('green_onion', 70),
 ('all-purpose_flour', 70),
 ('cinnamon', 70),
 ('lemon', 66),
 ('vegetable_oil', 65),
 ('purpose_flour', 62),
 ('honey', 61),
 ('sake', 59),
 ('neutral_oil', 58),
 ('baking_soda', 56),
 ('carrot', 55),
 ('granulated_sugar', 55),
 ('egg_yolk', 54),
 ('mirin', 54),
 ('shaoxing_wine', 50),
 ('cornstarch', 46),
 ('sesame_oil', 43),
 ('flour', 41),
 ('cooking/kosher_salt', 40),
 ('white_pepper', 38),
 ('white_sugar', 38),
 ('rice_vinegar', 34),
 ('cumin', 34),
 ('cream', 33),
 ('cream_cheese', 31),
 ('nutmeg', 31),
 ('heavy_cream', 30),
 ('salt/kosher_salt', 30),
 ('cake_flour', 29),
 ('cornflour/cornstarch', 29),
 ('lemon_zest', 28),
 ('oyster', 27),
 ('heavycream', 27

## Create Ingredients Count Variable

In [77]:
cleaned_cookbook["num_ingredients"] = cleaned_cookbook.ingredients.apply(lambda x: len(x))

## Save Cleaned Data

### Type Check

In [78]:
cleaned_cookbook[cleaned_cookbook.recipe_url == "https://www.justonecookbook.com/rice-cooker-japanese-ketchup-rice/"]

Unnamed: 0,recipe_title,recipe_url,ingredients,img_url,num_steps,prep_time,cook_time,total_time,serving_size,calories,...,sugar,calcium,iron,custom_time,vitamin_c,vitamin_a,trans_fat,polyunsaturated_fat,monounsaturated_fat,num_ingredients


In [27]:
cleaned_cookbook.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6865 entries, 0 to 8494
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   recipe_title         6865 non-null   object 
 1   recipe_url           6865 non-null   string 
 2   ingredients          6865 non-null   object 
 3   img_url              6865 non-null   string 
 4   num_steps            6865 non-null   float64
 5   prep_time            6865 non-null   float64
 6   cook_time            6865 non-null   float64
 7   total_time           6865 non-null   float64
 8   serving_size         6865 non-null   float64
 9   calories             6865 non-null   float64
 10  carbohydrates        6865 non-null   float64
 11  protein              6865 non-null   float64
 12  fat                  6865 non-null   float64
 13  saturated_fat        6865 non-null   float64
 14  cholesterol          6865 non-null   float64
 15  sodium               6865 non-null   float6

In [22]:
# Convert recipe_title to string
cleaned_cookbook.recipe_title = cleaned_cookbook.recipe_title.astype(str)

### Save

In [None]:
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook.to_pickle("processed_cookbook.pkl", protocol=4)

## Sanity Check

In [None]:
pd.read_pickle("processed_cookbook.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork_leg_bone, pork_hock, water, garlic, ging...",35.0,380.0,60.0,130.0,60.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken_tender, salt, black_pepper, kewpie_ma...",15.0,40.0,10.0,10.0,0.0,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,0.0,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[allpurpose_flour, water, salt, potato_starch]",36.0,120.0,60.0,0.0,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,0.0,0.0,21.0,5.0,0.0,https://www.justonecookbook.com/wp-content/upl...
3,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green_scallion, egg, chicken_broth, s...",10.0,30.0,5.0,10.0,0.0,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,0.0,https://www.justonecookbook.com/wp-content/upl...
4,Butter Ponzu Beef,https://www.justonecookbook.com/butter-ponzu-b...,"[beef, garlic, komatsuna, maitake_mushroom, ne...",13.0,40.0,10.0,10.0,0.0,386.0,8.0,...,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,0.0,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallion, ginger, peanut_oil, shaoxin...",4.0,15.0,10.0,5.0,0.0,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6266,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[allpurpose_flour, baking, baking_soda, salt, ...",4.0,60.0,30.0,30.0,0.0,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,0.0,97.0,2.2,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6267,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[mango, peach, lime, simple_syrup, tequila, tr...",3.0,10.0,10.0,0.0,0.0,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6268,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken_breast, onion, chinese_saltcured...",3.0,30.0,20.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...


In [None]:
complete_cookbook[complete_cookbook.recipe_url == "https://natashaskitchen.com/cucumber-radish-salad-recipe/"]

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
4337,Cucumber Radish Salad Recipe (VIDEO),https://natashaskitchen.com/cucumber-radish-sa...,"[English cucumber, radishes, chives or green o...",2.0,10.0,10.0,,,99.0,4.0,...,479.0,232.0,,2.0,565.0,8.3,69.0,0.4,,https://natashaskitchen.com/wp-content/uploads...


In [None]:
cleaned_cookbook[cleaned_cookbook.recipe_url == "https://natashaskitchen.com/cucumber-radish-salad-recipe/"]

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
3958,Cucumber Radish Salad,https://natashaskitchen.com/cucumber-radish-sa...,"[english_cucumber, radish, chive_onion, sour_c...",2.0,10.0,10.0,0.0,0.0,99.0,4.0,...,479.0,232.0,0.0,2.0,565.0,8.3,69.0,0.4,0.0,https://natashaskitchen.com/wp-content/uploads...
