# Data Pre-Processing Pipeline

In [38]:
import numpy as np
import pandas as pd
import re
import warnings

# Importing necessary libraries for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string

warnings.filterwarnings("ignore")

## Combine All Recipe Data

In [39]:
import yaml

def load_params():
    with open("../params.yaml") as f:
        params = yaml.safe_load(f)
    return params

params = load_params()

In [40]:
import os
data_dir = "../" + params["data_preprocessing"]["raw_data_path"]
pickle_files = []
for root, _, files in os.walk(data_dir):
    for f in files:
        if f.endswith(".pkl"):
            pickle_files.append(os.path.join(root, f))

print("Combining recipe data...")
dfs = []
for path in pickle_files:
    print(f"Loading {path} ...")
    df = pd.read_pickle(path)
    dfs.append(df)

complete_cookbook = pd.concat(dfs, axis=0)
complete_cookbook = complete_cookbook.reset_index(drop=True)
complete_cookbook.head()
print(f"Combined {len(pickle_files)} files with total {len(complete_cookbook)} recipes.")

Combining recipe data...
Loading ../data/raw/natashas_kitchen_recipes.pkl ...
Loading ../data/raw/woks_of_life_recipes.pkl ...
Loading ../data/raw/just_one_cookbook_recipes.pkl ...
Loading ../data/raw/minimalist_baker_recipes.pkl ...
Loading ../data/raw/recipe_tin_eats_recipes.pkl ...
Loading ../data/raw/love_and_lemons_recipes.pkl ...
Combined 6 files with total 7249 recipes.


## General Filtering

In [403]:
cleaned_cookbook = complete_cookbook.copy()
cleaned_cookbook.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7249 entries, 0 to 7248
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   recipe_title         7249 non-null   string 
 1   recipe_url           7249 non-null   string 
 2   ingredients          7249 non-null   object 
 3   num_steps            7249 non-null   float64
 4   total_time           6552 non-null   float64
 5   prep_time            7087 non-null   float64
 6   cook_time            5950 non-null   float64
 7   custom_time          691 non-null    float64
 8   calories             5653 non-null   float64
 9   carbohydrates        5462 non-null   float64
 10  protein              5460 non-null   float64
 11  fat                  5428 non-null   float64
 12  saturated_fat        5303 non-null   float64
 13  polyunsaturated_fat  2502 non-null   float64
 14  monounsaturated_fat  2445 non-null   float64
 15  trans_fat            2781 non-null   f

### Clean up Recipe Title

In [404]:
def clean_recipe_title(title: string) -> string:
    """
    Cleans the recipe title by removing characters that are not digits or alphabets and extra spaces.
    Also removes certain words from the title and ensure capitalization.
    """
    line = re.sub("\s*\([^()]*\)\s*", "", title) # Remove text within parentheses
    line = re.sub("[^ \-a-zA-Z0-9()]*", "", line) # Replace unwanted characters with space
    line = re.sub("\s*\([^()]*", "", line) # Remove uncomplete parentheses

    # Remove certain words from recipe titles
    line = re.sub("(recipe|how\s*to\s*\w*\s|best|easy)", "", line, flags=re.IGNORECASE).strip()
    line = line.strip()  # Remove leading and trailing spaces
    return line.title()  # Capitalize the first letter of each word

cleaned_cookbook["recipe_title"] = cleaned_cookbook.recipe_title.apply(clean_recipe_title)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,,,21.0,5.0,,https://www.justonecookbook.com/wp-content/upl...
3,Carrot Ginger Dressing,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
4,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7244,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,,https://thewoksoflife.com/wp-content/uploads/2...
7245,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,,https://thewoksoflife.com/wp-content/uploads/2...
7246,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,,https://thewoksoflife.com/wp-content/uploads/2...
7247,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,https://thewoksoflife.com/wp-content/uploads/2...


### Remove Compilation Recipe

In [405]:
# Drop compilation recipes and keep single recipes only
all_numbers_title_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.match("^\d+", x.lower()))) # All titles with numbers in front 
keep_numbers_title_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.match("^\d+( |-)+(ingredient(s)?|minute(s)?|hour(s)?|bowl(s)?|pot(s)?|layer(s)?)", x.lower()))) # Titles with numbers in front to keep

cleaned_cookbook = cleaned_cookbook.drop(cleaned_cookbook[all_numbers_title_idx & ~keep_numbers_title_idx].index)
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,,,21.0,5.0,,https://www.justonecookbook.com/wp-content/upl...
3,Carrot Ginger Dressing,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
4,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7143,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,,https://thewoksoflife.com/wp-content/uploads/2...
7144,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,,https://thewoksoflife.com/wp-content/uploads/2...
7145,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,,https://thewoksoflife.com/wp-content/uploads/2...
7146,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,https://thewoksoflife.com/wp-content/uploads/2...


### Remove non-meal recipe (sauces, dressing, store direction, etc)

In [406]:
other_recipes_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.search("(sauce(s)?|dressing(s)?|store|cut|slice|clean|what)", x.lower())))

cleaned_cookbook = cleaned_cookbook.drop(cleaned_cookbook[other_recipes_idx].index)
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,,,21.0,5.0,,https://www.justonecookbook.com/wp-content/upl...
3,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,,https://www.justonecookbook.com/wp-content/upl...
4,Meat Thinly,https://www.justonecookbook.com/how-to-slice-m...,[premium-quality meat],8.0,20.0,10.0,,90.0,,,...,,,,,,,,,,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6698,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,,https://thewoksoflife.com/wp-content/uploads/2...
6699,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,,https://thewoksoflife.com/wp-content/uploads/2...
6700,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,,https://thewoksoflife.com/wp-content/uploads/2...
6701,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,https://thewoksoflife.com/wp-content/uploads/2...


### Remove 1-2 ingredient recipes

Most likely to be tutorial and not an actual recipe

In [407]:
less_than_two_ingredients = cleaned_cookbook[cleaned_cookbook.ingredients.apply(lambda x: len(x)) <= 2]

cleaned_cookbook = cleaned_cookbook.drop(less_than_two_ingredients.index)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,,,21.0,5.0,,https://www.justonecookbook.com/wp-content/upl...
3,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,,https://www.justonecookbook.com/wp-content/upl...
5,Butter Ponzu Beef,https://www.justonecookbook.com/butter-ponzu-b...,"[thinly sliced beef (such as ribeye), garlic, ...",13.0,40.0,10.0,10.0,,386.0,8.0,...,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6698,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,,https://thewoksoflife.com/wp-content/uploads/2...
6699,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,,https://thewoksoflife.com/wp-content/uploads/2...
6700,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,,https://thewoksoflife.com/wp-content/uploads/2...
6701,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,https://thewoksoflife.com/wp-content/uploads/2...


## Handling NA Values

In [408]:
cleaned_cookbook.isna().sum()

recipe_title              0
recipe_url                0
ingredients               0
num_steps                 0
total_time              589
prep_time               108
cook_time              1036
custom_time            5875
calories               1341
carbohydrates          1517
protein                1517
fat                    1540
saturated_fat          1654
polyunsaturated_fat    4245
monounsaturated_fat    4296
trans_fat              3950
cholesterol            2310
sodium                 1604
potassium              2203
fiber                  1727
sugar                  1610
vitamin_a              2461
vitamin_c              2636
calcium                2298
iron                   2299
serving_size           4273
image_url                 0
dtype: int64

### Time

Fill NA total time based on other times in case it wasn't catched when scraping. Otherwise, drop and fill the rest time variables with 0s

In [409]:
cleaned_cookbook.total_time = cleaned_cookbook.total_time.fillna(cleaned_cookbook.prep_time + cleaned_cookbook.cook_time + cleaned_cookbook.custom_time)
cleaned_cookbook = cleaned_cookbook.dropna(subset="total_time")
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,,,21.0,5.0,,https://www.justonecookbook.com/wp-content/upl...
3,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,,https://www.justonecookbook.com/wp-content/upl...
5,Butter Ponzu Beef,https://www.justonecookbook.com/butter-ponzu-b...,"[thinly sliced beef (such as ribeye), garlic, ...",13.0,40.0,10.0,10.0,,386.0,8.0,...,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6698,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,,https://thewoksoflife.com/wp-content/uploads/2...
6699,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,,https://thewoksoflife.com/wp-content/uploads/2...
6700,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,,https://thewoksoflife.com/wp-content/uploads/2...
6701,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,https://thewoksoflife.com/wp-content/uploads/2...


In [410]:
cleaned_cookbook[["prep_time", "cook_time", "custom_time"]] = cleaned_cookbook[["prep_time", "cook_time", "custom_time"]].fillna(0)

In [411]:
cleaned_cookbook[["total_time", "prep_time", "cook_time", "custom_time"]].isna().sum()

total_time     0
prep_time      0
cook_time      0
custom_time    0
dtype: int64

### Macros and Micros

Fill NAs with 0s

In [412]:
fill_idx = cleaned_cookbook.columns[8:-1]
cleaned_cookbook[fill_idx].isna().sum()

calories               1154
carbohydrates          1325
protein                1325
fat                    1348
saturated_fat          1461
polyunsaturated_fat    3897
monounsaturated_fat    3949
trans_fat              3603
cholesterol            2065
sodium                 1412
potassium              2008
fiber                  1527
sugar                  1416
vitamin_a              2260
vitamin_c              2420
calcium                2102
iron                   2106
serving_size           3815
dtype: int64

In [413]:
cleaned_cookbook[fill_idx] = cleaned_cookbook[fill_idx].fillna(0)

In [414]:
cleaned_cookbook[fill_idx].isna().sum()

calories               0
carbohydrates          0
protein                0
fat                    0
saturated_fat          0
polyunsaturated_fat    0
monounsaturated_fat    0
trans_fat              0
cholesterol            0
sodium                 0
potassium              0
fiber                  0
sugar                  0
vitamin_a              0
vitamin_c              0
calcium                0
iron                   0
serving_size           0
dtype: int64

## Handling Duplicated Values

In [415]:
cleaned_cookbook.duplicated(subset=["recipe_title"]).sum()

313

In [416]:
cleaned_cookbook = cleaned_cookbook.drop_duplicates(subset=["recipe_title"])
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,0.0,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,0.0,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,0.0,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,0.0,0.0,21.0,5.0,0.0,https://www.justonecookbook.com/wp-content/upl...
3,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,0.0,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,0.0,https://www.justonecookbook.com/wp-content/upl...
5,Butter Ponzu Beef,https://www.justonecookbook.com/butter-ponzu-b...,"[thinly sliced beef (such as ribeye), garlic, ...",13.0,40.0,10.0,10.0,0.0,386.0,8.0,...,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,0.0,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6698,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,0.0,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6699,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,0.0,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,0.0,97.0,2.2,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6700,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,0.0,0.0,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6701,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...


## Tokenizing, Lemmatizing, and Removing General Stop Words for Ingredients

Clean the ingredients using multiple NLP methods and filtering general stop words

Step-by-step:
1. Normalization
2. Surface-level Filtering (numbers, punctuations, brands, etc)
3. Tokenization
4. Lemmatization
5. Stop-Words Filtering

In [None]:
# Identify stop words specific to recipes
PREP_WORDS = ["thinly", "finely", "softened", "crushed", "skinned", "whole", "chopped", "minced"]

TEMP_WORDS = ["hot", "warm", "cold", "room", "chilled", "frozen", "cool", "boiling", "temperature"]

TASTE_WORDS = ["sweet", "sour", "bitter", "salty", "umami"]

QUALITY_WORDS = ["fresh", "freshly", "organic", "good", "quality", "extra", "premium", "best", "finest", "high", "filtered", "light", "sashimi-grade", "pure", "fine"]

SIZE_WORDS = ["large", "mini", "small", "medium", "jumbo", "extra", "extra-large", "extra-small", "sliced", "halved", "diced", "cubed", "peeled", "grated", "shredded", "quartered", "clove", "short-grain", "heaping"]

STATE_WORDS = ["powder", "sauce", "boneless", "skinless", "skin", "cooked", "ripe", "coarse", "coarsely", "filet", "fillet", "fillets", "canned",
                "homemade", "used", "dried", "reserved", "packet", "ground", "uncooked", "pasteurized", "bonein", "skinon", "preserved", "raw", "cooking"]

OTHER_WORDS = ["quality", "extra", "virgin", "extravirgin", "long", "high", "japanese", "combination", "diamond", "kosher", "stalk", "juice", "sea", "taste", "crystal", "choice"]

UNITS = ["cup", "cups", "tbsp", "tablespoon", "tsp", "teaspoon", "g", "kg", "oz", "ml", "l", "lb", "pound", "inch", "cm", "m", "handful", "bulb", "dollop", "pinch"]

EXTRA_WORDS = PREP_WORDS + TEMP_WORDS + TASTE_WORDS + QUALITY_WORDS + SIZE_WORDS + STATE_WORDS + OTHER_WORDS + UNITS

In [None]:
def clean_ingredients(ingredients: list) -> list:
    """
    A single ingredient may consist of several words, thus
    it takes each string of ingredient and process it using multiple NLP methods into a clean string.
    This function also splits ingredients that are combined by commas and "and" and processes them separately.
    """
    # Initialize functions for cleaning
    translator = str.maketrans('', '', '!"#$%&\'()*+,.:;<=>?@[\\]^_`{|}~') # Punctuations remover (exclue slash "/" and strip "-")
    lemmatizer = WordNetLemmatizer() # Word lemmatizer
    
    clean_ingredients = []
    for ingredient in ingredients:
        # Split by commas and "and" to handle combined ingredients
        split_ingredients = [ing.strip() for ing in re.split(r'\s+(and|&|,)\s+', ingredient, flags=re.IGNORECASE) if ing.strip()]
        split_ingredients = list(dict.fromkeys(split_ingredients))  # Remove duplicates from split ingredients

        # Process each split ingredient separately
        for sub_ingredient in split_ingredients:
            # Normalization
            line = sub_ingredient.lower() # Make sure string are lowercase
            line = re.sub("\s*\([^()]*\)\s*", "", line) # Remove text between parentheses
            
            # Surface level cleaning
            line = re.sub("(\s+(or|/){1}\s+)", "/", line) # Remove ingredient substitutes
            # line = re.sub("(diamond crystal|premium-quality)", "", line) # Remove brands
            # line = re.sub("(\w*\d\w*|½|¼|¾)", "", line) # Remove numbers

            # Remove punctuations
            line = line.translate(translator)

            # In-depth cleaning
            line_tokenized = word_tokenize(line) # Tokenize the ingredient first
            line_lemmatized = [lemmatizer.lemmatize(ing) for ing in line_tokenized] # Then lemmatize the ingredient
            line_split = [ing for ing in line_lemmatized if ing not in stopwords.words("english") + EXTRA_WORDS + [""]] # Remove stop words
            
            if line_split:  # Only add if there are remaining words after cleaning
                line_split = list(dict.fromkeys(line_split)) # Remove duplicates from the words in a single ingredient
                line = "_".join(line_split) # Rejoin the ingredient as a single string
                clean_ingredients.append(line) # Add back to the ingredients list
        
    clean_ingredients = list(dict.fromkeys(clean_ingredients)) # Remove duplicates
    
    return clean_ingredients

# Clean ingredients namings
cleaned_cookbook.ingredients = cleaned_cookbook.ingredients.apply(clean_ingredients)

In [419]:
from nltk.probability import FreqDist

f_dist = FreqDist(i for i in cleaned_cookbook.ingredients.explode())

f_dist.most_common(100)

[('salt', 3831),
 ('garlic', 2151),
 ('water', 1651),
 ('black_pepper', 1390),
 ('sugar', 1117),
 ('olive_oil', 1014),
 ('egg', 997),
 ('soy', 915),
 ('onion', 752),
 ('ginger', 742),
 ('vanilla_extract', 691),
 ('unsalted_butter', 633),
 ('lemon', 624),
 ('carrot', 518),
 ('all-purpose_flour', 488),
 ('baking', 477),
 ('maple_syrup', 471),
 ('scallion', 460),
 ('sesame_oil', 454),
 ('milk', 430),
 ('parsley', 423),
 ('pepper', 403),
 ('cinnamon', 394),
 ('white_pepper', 387),
 ('oil', 365),
 ('neutral_oil', 361),
 ('cilantro', 350),
 ('shaoxing_wine', 348),
 ('cornstarch', 345),
 ('vegetable_oil', 330),
 ('lime', 320),
 ('extra-virgin_olive_oil', 308),
 ('baking_soda', 303),
 ('cumin', 298),
 ('tomato', 291),
 ('brown_sugar', 271),
 ('granulated_sugar', 266),
 ('red_onion', 262),
 ('sake', 241),
 ('oyster', 240),
 ('mirin', 239),
 ('dark_soy', 232),
 ('avocado', 220),
 ('red_pepper_flake', 219),
 ('butter', 208),
 ('honey', 205),
 ('cream', 190),
 ('potato', 187),
 ('rice_vinegar', 17

## Save Cleaned Data

### Type Check

In [423]:
cleaned_cookbook.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5750 entries, 0 to 6702
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   recipe_title         5750 non-null   object 
 1   recipe_url           5750 non-null   string 
 2   ingredients          5750 non-null   object 
 3   num_steps            5750 non-null   float64
 4   total_time           5750 non-null   float64
 5   prep_time            5750 non-null   float64
 6   cook_time            5750 non-null   float64
 7   custom_time          5750 non-null   float64
 8   calories             5750 non-null   float64
 9   carbohydrates        5750 non-null   float64
 10  protein              5750 non-null   float64
 11  fat                  5750 non-null   float64
 12  saturated_fat        5750 non-null   float64
 13  polyunsaturated_fat  5750 non-null   float64
 14  monounsaturated_fat  5750 non-null   float64
 15  trans_fat            5750 non-null   float6

In [424]:
# Convert recipe_title to string
cleaned_cookbook.recipe_title = cleaned_cookbook.recipe_title.astype(str)

### Save

In [None]:
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook.to_pickle("processed_cookbook.pkl", protocol=4)

## Sanity Check

In [None]:
pd.read_pickle("processed_cookbook.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
0,Tonkotsu Ramen,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork_leg_bone, pork_hock, water, garlic, ging...",35.0,380.0,60.0,130.0,60.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.justonecookbook.com/wp-content/upl...
1,Pan-Fried Curry Chicken,https://www.justonecookbook.com/pan-fried-curr...,"[chicken_tender, salt, black_pepper, kewpie_ma...",15.0,40.0,10.0,10.0,0.0,290.0,10.0,...,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,0.0,https://www.justonecookbook.com/wp-content/upl...
2,Homemade Udon Noodles,https://www.justonecookbook.com/udon-noodles/,"[allpurpose_flour, water, salt, potato_starch]",36.0,120.0,60.0,0.0,150.0,361.0,76.0,...,198.0,106.0,3.0,1.0,0.0,0.0,21.0,5.0,0.0,https://www.justonecookbook.com/wp-content/upl...
3,Tomato Egg Vermicelli Soup,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green_scallion, egg, chicken_broth, s...",10.0,30.0,5.0,10.0,0.0,123.0,16.0,...,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,0.0,https://www.justonecookbook.com/wp-content/upl...
4,Butter Ponzu Beef,https://www.justonecookbook.com/butter-ponzu-b...,"[beef, garlic, komatsuna, maitake_mushroom, ne...",13.0,40.0,10.0,10.0,0.0,386.0,8.0,...,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,0.0,https://www.justonecookbook.com/wp-content/upl...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,Scallion Ginger Shrimp,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallion, ginger, peanut_oil, shaoxin...",4.0,15.0,10.0,5.0,0.0,191.0,2.0,...,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6266,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[allpurpose_flour, baking, baking_soda, salt, ...",4.0,60.0,30.0,30.0,0.0,517.0,53.0,...,418.0,346.0,2.0,28.0,225.0,0.0,97.0,2.2,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6267,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[mango, peach, lime, simple_syrup, tequila, tr...",3.0,10.0,10.0,0.0,0.0,206.0,30.0,...,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...
6268,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken_breast, onion, chinese_saltcured...",3.0,30.0,20.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://thewoksoflife.com/wp-content/uploads/2...


In [None]:
complete_cookbook[complete_cookbook.recipe_url == "https://natashaskitchen.com/cucumber-radish-salad-recipe/"]

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
4337,Cucumber Radish Salad Recipe (VIDEO),https://natashaskitchen.com/cucumber-radish-sa...,"[English cucumber, radishes, chives or green o...",2.0,10.0,10.0,,,99.0,4.0,...,479.0,232.0,,2.0,565.0,8.3,69.0,0.4,,https://natashaskitchen.com/wp-content/uploads...


In [None]:
cleaned_cookbook[cleaned_cookbook.recipe_url == "https://natashaskitchen.com/cucumber-radish-salad-recipe/"]

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size,image_url
3958,Cucumber Radish Salad,https://natashaskitchen.com/cucumber-radish-sa...,"[english_cucumber, radish, chive_onion, sour_c...",2.0,10.0,10.0,0.0,0.0,99.0,4.0,...,479.0,232.0,0.0,2.0,565.0,8.3,69.0,0.4,0.0,https://natashaskitchen.com/wp-content/uploads...
