# Data Pre-Processing Pipeline

In [597]:
import numpy as np
import pandas as pd
import re
import warnings

# NOTE(review): "ignore" with no category or module filter suppresses every warning
# raised after this point; consider narrowing it so real problems are not hidden.
warnings.filterwarnings("ignore")

In [598]:
# Load the raw cookbook DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
complete_cookbook = pd.read_pickle("Data/complete_cookbook.pkl")
# Display the loaded frame as the cell output
complete_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe 豚骨ラーメン,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken (Video),https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles (Video) 手打ちうどん,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Carrot Ginger Dressing 人参ドレッシング,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,
4,Tomato Egg Vermicelli Soup (Video),https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7244,Scallion Ginger Shrimp Recipe (Redux!),https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
7245,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
7246,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
7247,Cantonese Chicken & Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


## General Filtering

Remove entries that are compilations of recipes rather than a single recipe

In [599]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run safely
cleaned_cookbook = complete_cookbook.copy()

# Keep only ASCII letters, digits, spaces and hyphens in the titles, then tidy up the
# whitespace left behind by the removed characters ("Chicken & Rice" -> "Chicken Rice").
# Stripping also matters for the filter below: a title whose leading symbols were removed
# would otherwise start with a space and never match the "starts with a number" patterns.
# The vectorised .str accessor propagates missing titles instead of raising a TypeError.
cleaned_cookbook["recipe_title"] = (
    complete_cookbook["recipe_title"]
    .str.replace(r"[^ \-a-zA-Z0-9]+", "", regex=True)
    .str.replace(r" {2,}", " ", regex=True)
    .str.strip()
)

# Drop compilation recipes and keep single recipes only
lowercase_titles = cleaned_cookbook["recipe_title"].str.lower()

# All titles with numbers in front (missing titles count as "no match")
all_numbers_title_idx = lowercase_titles.str.match(r"^\d+", na=False)

# Titles with numbers in front to keep, e.g. "5-ingredient ..." or "30 minute ..."
keep_numbers_title_idx = lowercase_titles.str.match(
    r"^\d+[ -]+(?:ingredients?|minutes?|hours?|bowls?|pots?|layers?)", na=False
)

# Filter with a boolean mask instead of dropping by index label, so that rows are removed
# positionally even if the index happens to contain repeated labels.
compilation_mask = all_numbers_title_idx & ~keep_numbers_title_idx
cleaned_cookbook = cleaned_cookbook.loc[~compilation_mask].copy()
cleaned_cookbook

Unnamed: 0,recipe_title,recipe_url,ingredients,num_steps,total_time,prep_time,cook_time,custom_time,calories,carbohydrates,...,cholesterol,sodium,potassium,fiber,sugar,vitamin_a,vitamin_c,calcium,iron,serving_size
0,Easy Tonkotsu Ramen Recipe,https://www.justonecookbook.com/easy-tonkotsu-...,"[pork leg bones, pork hock with skin, water, g...",35.0,380.0,60.0,130.0,60.0,,,...,,,,,,,,,,
1,Pan-Fried Curry Chicken Video,https://www.justonecookbook.com/pan-fried-curr...,"[chicken tenders, Diamond Crystal kosher salt,...",15.0,40.0,10.0,10.0,,290.0,10.0,...,76.0,989.0,441.0,0.5,4.0,46.0,1.0,11.0,1.0,
2,Homemade Udon Noodles Video,https://www.justonecookbook.com/udon-noodles/,"[all-purpose flour (plain flour), water, Diamo...",36.0,120.0,60.0,,150.0,361.0,76.0,...,,198.0,106.0,3.0,1.0,,,21.0,5.0,
3,Carrot Ginger Dressing,https://www.justonecookbook.com/carrot-ginger-...,"[carrot, onion, ginger, sugar, miso, Diamond C...",10.0,20.0,10.0,,,121.0,7.0,...,,170.0,96.0,1.0,5.0,3792.0,2.0,13.0,1.0,
4,Tomato Egg Vermicelli Soup Video,https://www.justonecookbook.com/tomato-egg-ver...,"[tomato, green onion/scallion, large egg (50 g...",10.0,30.0,5.0,10.0,,123.0,16.0,...,93.0,299.0,215.0,2.0,3.0,737.0,11.0,79.0,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7244,Scallion Ginger Shrimp Recipe Redux,https://thewoksoflife.com/scallion-ginger-shri...,"[shrimp, scallions, fresh ginger, peanut oil, ...",4.0,15.0,10.0,5.0,,191.0,2.0,...,286.0,1043.0,145.0,1.0,1.0,120.0,7.2,173.0,2.6,
7245,Classic Peanut Butter Cake,https://thewoksoflife.com/classic-peanut-butte...,"[all-purpose flour, baking powder, baking soda...",4.0,60.0,30.0,30.0,,517.0,53.0,...,61.0,418.0,346.0,2.0,28.0,225.0,,97.0,2.2,
7246,Frozen White Peach Mango Margaritas,https://thewoksoflife.com/frozen-white-peach-m...,"[ripe mango, ripe peaches, fresh lime juice, s...",3.0,10.0,10.0,,,206.0,30.0,...,,593.0,234.0,2.0,28.0,691.0,23.0,9.0,1.0,
7247,Cantonese Chicken Salted Fish Fried Rice,https://thewoksoflife.com/cantonese-chicken-sa...,"[oil, chicken breast, medium onion, Chinese sa...",3.0,30.0,20.0,10.0,,,,...,,,,,,,,,,


## Tokenizing, Lemmatizing, and General Stop Words

Clean the ingredients using several NLP methods and filter out general stop words

Step-by-step:
1. Normalization
2. Surface-level Filtering (numbers, punctuation, brands, etc.)
3. Tokenization
4. Lemmatization
5. Stop-Words Filtering

In [600]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string

In [601]:
# Recipe-specific stop words, grouped by the kind of noise they represent.
# They are removed from ingredient names in addition to the generic English stop words.

# How the ingredient was prepared
PREP_WORDS = [
    "thinly", "finely", "softened", "crushed",
    "skinned", "whole", "chopped", "minced",
]

# Form, condition or quality of the ingredient
# NOTE(review): "powder" and "sauce" are listed here, so names such as "baking powder" or
# "soy sauce" lose their second word during cleaning - confirm this is intended.
STATE_WORDS = [
    "powder", "sauce", "freshly", "fresh", "organic",
    "large", "small", "medium", "boneless", "skinless",
    "skin", "cooked", "sliced", "ripe", "coarsely",
    "hot", "cold", "filet", "fillet",
    "homemade", "used", "temperature", "filtered", "dried",
    "reserved", "packet", "ground", "uncooked", "good",
    "premium", "pasteurized", "boiling", "bonein", "skinon",
    "preserved",
]

# Miscellaneous descriptors
OTHER_WORDS = [
    "quality", "extra", "virgin", "extravirgin", "long", "high",
    "room", "japanese", "combination", "kosher", "stalk",
]

# Measurement units and informal quantities
UNITS = [
    "cup", "cups", "tbsp", "tablespoon", "tsp", "teaspoon",
    "g", "kg", "oz", "ml", "l", "lb", "pound",
    "inch", "cm", "m", "handful", "bulb", "dollop",
]

# Single flat list consumed by the ingredient cleaner
STOP_WORDS = [*PREP_WORDS, *STATE_WORDS, *OTHER_WORDS, *UNITS]



def clean_ingredients(ingredients: list) -> list:
    """
    Turn the raw ingredient strings of one recipe into normalized ingredient names.

    A single ingredient may consist of several words, so each string is lowercased,
    stripped of parenthesised text, substitutes, brand names, numbers and punctuation,
    then tokenized, lemmatized and filtered against the NLTK English stop words plus
    the recipe-specific STOP_WORDS. The surviving tokens are joined with "_".

    Args:
        ingredients: Raw ingredient strings of a single recipe.

    Returns:
        One cleaned string per input ingredient, in the same order. An ingredient
        whose tokens are all filtered out yields an empty string.
    """
    # Initialize helpers for cleaning
    translator = str.maketrans('', '', string.punctuation) # Punctuation remover
    lemmatizer = WordNetLemmatizer() # Word lemmatizer

    # Build the exclusion set once per call instead of re-creating (and linearly scanning)
    # the concatenated stop-word list for every single token.
    excluded_tokens = set(stopwords.words("english")).union(STOP_WORDS, [""])

    cleaned = []
    for ingredient in ingredients:
        # Normalization
        line = ingredient.lower() # Make sure strings are lowercase
        line = re.sub(r"\s*\([^()]*\)\s*", "", line) # Remove text between parentheses

        # Surface level cleaning
        # NOTE(review): for "a/b" this removes the word *before* the slash, while for
        # "a or b" it removes the word *after* "or" - confirm the asymmetry is intended.
        line = re.sub(r"([a-z]*/|\s+(or){1}\s+[a-z]*)", "", line) # Remove ingredient substitutes
        line = re.sub(r"(diamond crystal|premium-quality)", "", line) # Remove brands
        line = re.sub(r"(\w*\d\w*|½|¼|¾)", "", line) # Remove numbers (and any word containing a digit)

        # Remove punctuation (this also fuses hyphenated words, e.g. "all-purpose" -> "allpurpose")
        line = line.translate(translator)

        # In-depth cleaning
        tokens = word_tokenize(line) # Tokenize the ingredient first
        lemmas = [lemmatizer.lemmatize(token) for token in tokens] # Then lemmatize each token
        kept = [lemma for lemma in lemmas if lemma not in excluded_tokens] # Remove stop words

        cleaned.append("_".join(kept)) # Rejoin the ingredient as a single string

    return cleaned

# Replace every recipe's raw ingredient list with its cleaned counterpart.
# NOTE: the column is overwritten, so running this a second time feeds already-cleaned
# names back through clean_ingredients, whose punctuation removal strips the "_" separators.
cleaned_cookbook["ingredients"] = cleaned_cookbook["ingredients"].apply(clean_ingredients)

In [604]:
# Inspect the cleaned ingredient lists
cleaned_cookbook["ingredients"]

0       [pork_leg_bone, pork_hock, water, garlic, ging...
1       [chicken_tender, salt, black_pepper, kewpie_ma...
2          [allpurpose_flour, water, salt, potato_starch]
3       [carrot, onion, ginger, sugar, miso, salt, bla...
4       [tomato, green_scallion, egg, chicken_broth, s...
                              ...                        
7244    [shrimp, scallion, ginger, peanut_oil, shaoxin...
7245    [allpurpose_flour, baking, baking_soda, salt, ...
7246    [mango, peach, lime_juice, simple_syrup, tequi...
7247    [oil, chicken_breast, onion, chinese_saltcured...
7248    [sweetened_condensed_milk, vanilla_extract, bo...
Name: ingredients, Length: 7150, dtype: object

## Removing Specific Stop-Word Ingredients (Optional)

In [607]:
from nltk.probability import FreqDist

# Count how often each cleaned ingredient name occurs across all recipes:
# explode() flattens the per-recipe lists into one long sequence of names.
f_dist = FreqDist(cleaned_cookbook["ingredients"].explode().tolist())

# Show the 100 most frequent ingredient names with their counts
f_dist.most_common(100)

[('salt', 2659),
 ('water', 2219),
 ('olive_oil', 1970),
 ('sea_salt', 1668),
 ('garlic', 1629),
 ('black_pepper', 1547),
 ('sugar', 1406),
 ('garlic_clove', 1331),
 ('egg', 1225),
 ('onion', 1004),
 ('unsalted_butter', 915),
 ('ginger', 832),
 ('soy', 814),
 ('lemon_juice', 758),
 ('vanilla_extract', 690),
 ('maple_syrup', 619),
 ('allpurpose_flour', 605),
 ('sesame_oil', 605),
 ('parsley', 591),
 ('carrot', 585),
 ('scallion', 575),
 ('baking', 541),
 ('cornstarch', 533),
 ('milk', 518),
 ('vegetable_oil', 516),
 ('cinnamon', 499),
 ('white_pepper', 479),
 ('oil', 465),
 ('cilantro', 453),
 ('neutral_oil', 447),
 ('shaoxing_wine', 435),
 ('salt_pepper', 431),
 ('granulated_sugar', 414),
 ('light_soy', 403),
 ('cumin', 380),
 ('lime_juice', 367),
 ('sea_salt_black_pepper', 357),
 ('baking_soda', 353),
 ('oyster', 338),
 ('brown_sugar', 318),
 ('tomato', 307),
 ('butter', 300),
 ('dark_soy', 300),
 ('sake', 293),
 ('mirin', 287),
 ('avocado', 282),
 ('red_onion', 281),
 ('red_pepper_fl

In [615]:
# Percentage of missing values in each column
cleaned_cookbook.isna().mean().mul(100)

recipe_title            0.000000
recipe_url              0.000000
ingredients             0.000000
num_steps               0.000000
total_time              9.384615
prep_time               2.139860
cook_time              17.846154
custom_time            90.433566
calories               21.174825
carbohydrates          23.832168
protein                23.860140
fat                    24.293706
saturated_fat          26.041958
polyunsaturated_fat    65.132867
monounsaturated_fat    65.916084
trans_fat              61.202797
cholesterol            36.923077
sodium                 25.090909
potassium              34.013986
fiber                  27.328671
sugar                  25.384615
vitamin_a              38.321678
vitamin_c              40.979021
calcium                35.482517
iron                   35.566434
serving_size           65.818182
dtype: float64

In [612]:
# Store the titles with pandas' dedicated "string" dtype before saving
cleaned_cookbook["recipe_title"] = cleaned_cookbook["recipe_title"].astype("string")

# Persist the processed frame.
# NOTE(review): the input was read from "Data/", but this writes to the current
# working directory - confirm the output location is intended.
cleaned_cookbook.to_pickle("processed_cookbook.pkl")