# Data Pre-Processing Pipeline

In [209]:
import numpy as np
import pandas as pd
import re
import warnings

warnings.filterwarnings("ignore")

In [210]:
complete_cookbook = pd.read_pickle("Data/complete_cookbook.pkl")
complete_cookbook = complete_cookbook.reset_index(drop=True)
complete_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe 豚骨ラーメン,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken (Video),https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles (Video) 手打ちうどん,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Carrot Ginger Dressing 人参ドレッシング,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,
4,Tomato Egg Vermicelli Soup (Video),https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7244,Scallion Ginger Shrimp Recipe (Redux!),https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
7245,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
7246,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
7247,Cantonese Chicken & Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


## General Filtering

### Remove Compilation Recipe

In [211]:
cleaned_cookbook = complete_cookbook.copy()

# Remove other characters that are not digits or alphabets
cleaned_cookbook.recipe_title = complete_cookbook.recipe_title.apply(lambda x: re.sub("[^ \-a-zA-Z0-9]*", "", x)) 

# Drop compilation recipes and keep single recipes only
all_numbers_title_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.match("^\d+", x.lower()))) # All titles with numbers in front 
keep_numbers_title_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.match("^\d+( |-)+(ingredient(s)?|minute(s)?|hour(s)?|bowl(s)?|pot(s)?|layer(s)?)", x.lower()))) # Titles with numbers in front to keep

cleaned_cookbook = cleaned_cookbook.drop(cleaned_cookbook[all_numbers_title_idx & ~keep_numbers_title_idx].index)
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken Video,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles Video,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Carrot Ginger Dressing,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,
4,Tomato Egg Vermicelli Soup Video,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7145,Scallion Ginger Shrimp Recipe Redux,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
7146,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
7147,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
7148,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


### Remove non-meal recipe (sauces, dressing, store direction, etc)

In [212]:
other_recipes_idx = ~pd.isna(cleaned_cookbook.recipe_title.apply(lambda x: re.search("(sauce(s)?|dressing(s)?|store|cut|slice|clean)", x.lower())))

cleaned_cookbook = cleaned_cookbook.drop(cleaned_cookbook[other_recipes_idx].index)
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook



Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken Video,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles Video,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Tomato Egg Vermicelli Soup Video,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
4,Butter Ponzu Beef Video,https://www.justonecookbook.com/butter-ponzu-b...,"[thinly sliced beef (such as ribeye), garlic, ...",13.0,40.0,10.0,10.0,,386.0,8.0,...,84.0,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6669,Scallion Ginger Shrimp Recipe Redux,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
6670,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
6671,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
6672,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


## Tokenizing, Lemmatizing, and General Stop Words

Clean the ingredients using multiple NLP methods and filtering general stop words

Step-by-step:
1. Normalization
2. Surface-level Filtering (numbers, punctuations, brands, etc)
3. Tokenization
4. Lemmatization
5. Stop-Words Filtering

In [213]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string

In [214]:
# Identify stop words specific to recipes
PREP_WORDS = ["thinly", "finely", "softened", "crushed", "skinned", "whole", "chopped", "minced"]

TEMP_WORDS = ["hot", "warm", "cold", "room", "chilled", "frozen", "cool", "boiling", "temparature"]

QUALITY_WORDS = ["fresh", "freshly", "organic", "good", "quality", "extra", "premium", "best", "finest", "high", "filtered", "light", "sashimi-grade", "pure", "fine"]

SIZE_WORDS = ["large", "small", "medium", "jumbo", "extra", "extra-large", "extra-small", "sliced", "halved", "diced", "cubed", "peeled", "grated", "shredded", "quartered", "clove", "short-grain", "heaping"]

STATE_WORDS = ["powder", "sauce", "boneless", "skinless", "skin", "cooked", "ripe", "coarse", "coarsely", "filet", "fillet", "fillets", "canned",
                "homemade", "used", "dried", "reserved", "packet", "ground", "uncooked", "pasteurized", "bonein", "skinon", "preserved", "raw", "cooking"]

OTHER_WORDS = ["quality", "extra", "virgin", "extravirgin", "long", "high", "japanese", "combination", "kosher", "stalk", "juice", "sea", "taste"]

UNITS = ["cup", "cups", "tbsp", "tablespoon", "tsp", "teaspoon", "g", "kg", "oz", "ml", "l", "lb", "pound", "inch", "cm", "m", "handful", "bulb", "dollop", "pinch"]

STOP_WORDS = PREP_WORDS + TEMP_WORDS + QUALITY_WORDS + SIZE_WORDS + STATE_WORDS + OTHER_WORDS + UNITS


def clean_ingredients(ingredients: list) -> list:
    """
    A single ingredient may consist of several words, thus
    it takes each string of ingredient and process it using multiple NLP methods into a clean string.
    This function also splits ingredients that are combined by commas and "and" and processes them separately.
    """
    # Initialize functions for cleaning
    translator = str.maketrans('', '', string.punctuation) # Punctuations remover
    lemmatizer = WordNetLemmatizer() # Word lemmatizer
    
    clean_ingredients = []
    for ingredient in ingredients:
        # Split by commas and "and" to handle combined ingredients
        split_ingredients = [ing.strip() for ing in re.split(r'\s+(and|&|,)\s+', ingredient, flags=re.IGNORECASE) if ing.strip()]
        split_ingredients = list(set(split_ingredients))  # Remove duplicates from split ingredients

        # Process each split ingredient separately
        for sub_ingredient in split_ingredients:
            # Normalization
            line = sub_ingredient.lower() # Make sure string are lowercase
            line = re.sub("\s*\([^()]*\)\s*", "", line) # Remove text between parentheses

            # Surface level cleaning
            line = re.sub("([a-z]*/|\s+(or){1}\s+[a-z]*)", "", line) # Remove ingredient substitutes
            line = re.sub("(diamond crystal|premium-quality)", "", line) # Remove brands
            line = re.sub("(\w*\d\w*|½|¼|¾)", "", line) # Remove numbers

            # Remove punctuations
            line = line.translate(translator)

            # In-depth cleaning
            line_tokenized = word_tokenize(line) # Tokenize the ingredient first
            line_lemmatized = [lemmatizer.lemmatize(ing) for ing in line_tokenized] # Then lemmatize the ingredient
            line_split = [ing for ing in line_lemmatized if ing not in stopwords.words("english") + STOP_WORDS + [""]] # Remove stop words

            if line_split:  # Only add if there are remaining words after cleaning
                line_split = list(set(line_split)) # Remove duplicates from the words in a single ingredient
                line = "_".join(line_split) # Rejoin the ingredient as a single string
                clean_ingredients.append(line) # Add back to the ingredients list
        
    clean_ingredients = list(set(clean_ingredients)) # Remove duplicates
    
    return clean_ingredients

# Clean ingredients namings
cleaned_cookbook.ingredients = cleaned_cookbook.ingredients.apply(clean_ingredients)

In [215]:
from nltk.probability import FreqDist

f_dist = FreqDist(i for i in cleaned_cookbook.ingredients.explode())

f_dist.most_common(100)

[('salt', 4508),
 ('garlic', 2544),
 ('water', 1906),
 ('olive_oil', 1676),
 ('pepper_black', 1655),
 ('sugar', 1186),
 ('egg', 1156),
 ('soy', 990),
 ('onion', 914),
 ('ginger', 821),
 ('butter_unsalted', 777),
 ('extract_vanilla', 775),
 ('lemon', 763),
 ('carrot', 619),
 ('allpurpose_flour', 559),
 ('parsley', 543),
 ('baking', 535),
 ('oil_sesame', 513),
 ('scallion', 510),
 ('syrup_maple', 509),
 ('milk', 500),
 ('pepper', 487),
 ('cinnamon', 452),
 ('oil_vegetable', 430),
 ('white_pepper', 427),
 ('oil', 416),
 ('cilantro', 412),
 ('cornstarch', 398),
 ('lime', 384),
 ('oil_neutral', 371),
 ('tomato', 370),
 ('wine_shaoxing', 363),
 ('baking_soda', 348),
 ('cumin', 347),
 ('onion_red', 330),
 ('granulated_sugar', 320),
 ('sugar_brown', 319),
 ('oyster', 272),
 ('butter', 263),
 ('avocado', 260),
 ('dark_soy', 257),
 ('mirin', 254),
 ('sake', 246),
 ('flake_pepper_red', 244),
 ('honey', 237),
 ('flour', 230),
 ('oil_avocado', 214),
 ('thigh_chicken', 212),
 ('dijon_mustard', 212),

In [216]:
cleaned_cookbook.isna().mean() * 100

recipe_title            0.000000
recipe_url              0.000000
ingredients             0.000000
num_steps               0.000000
total_time              9.109979
prep_time               1.992808
cook_time              16.451903
custom_time            90.035960
calories               21.111777
carbohydrates          23.823794
protein                23.808810
fat                    24.213365
saturated_fat          25.936470
polyunsaturated_fat    65.133353
monounsaturated_fat    65.897513
trans_fat              60.833084
cholesterol            36.230147
sodium                 25.067426
potassium              34.192388
fiber                  27.135151
sugar                  25.367096
vitamin_a              38.492658
vitamin_c              41.234642
calcium                35.585856
iron                   35.705724
serving_size           65.522925
dtype: float64

In [217]:
cleaned_cookbook.recipe_title = cleaned_cookbook.recipe_title.astype("string")
cleaned_cookbook = cleaned_cookbook.reset_index(drop=True)
cleaned_cookbook.to_pickle("processed_cookbook.pkl", protocol=4)

In [218]:
pd.read_pickle("processed_cookbook.pkl")

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe,https://www.justonecookbook.com/easy-tonkotsu-...,"[mirin, ginger, chashu, water, niboshi_boiled,...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken Video,https://www.justonecookbook.com/pan-fried-curr...,"[kewpie_mayonnaise, oil_neutral, allpurpose_fl...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles Video,https://www.justonecookbook.com/udon-noodles/,"[salt, starch_potato, allpurpose_flour, water]",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Tomato Egg Vermicelli Soup Video,https://www.justonecookbook.com/tomato-egg-ver...,"[toasted_sesame_oil, salt, white_pepper, tomat...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
4,Butter Ponzu Beef Video,https://www.justonecookbook.com/butter-ponzu-b...,"[oil_neutral, pepper_black, komatsuna, garlic,...",13.0,40.0,10.0,10.0,,386.0,8.0,...,84.0,751.0,684.0,2.0,1.0,194.0,17.0,90.0,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6669,Scallion Ginger Shrimp Recipe Redux,https://thewoksoflife.com/scallion-ginger-shri...,"[scallion, white_pepper, ginger, oil_peanut, s...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
6670,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[oil, butter_peanut, cream_cheese, baking, but...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
6671,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[peach, ice, wedge_lime, lime, mango, tequila,...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
6672,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[breast_chicken, oil, scallion, rice, onion, l...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,
