# Data Cleaning

In [1]:
%load_ext kedro.ipython

In [6]:
%reload_kedro

In [60]:
import pandas as pd

## Load dataset

In [7]:
# `catalog` is not defined in the cells shown here; presumably it is injected by
# the kedro.ipython extension loaded above — TODO confirm.
recipes = catalog.load('recipes')
# NOTE(review): `images` is not referenced by any of the cells shown here.
images = catalog.load('recipe_images')

## Data Transformation Steps

Drop unnecessary columns and preserve the index as a separate column.

In [84]:
# Keep only the text columns used downstream. reset_index() (drop defaults to
# False) moves the current row labels into a regular 'index' column, so every
# recipe carries an explicit identifier.
recipes_df = recipes.loc[:, ['Title', 'Instructions', 'Cleaned_Ingredients']].reset_index()

Split the dataset into two separate dataframes:

1. `recipe_instructions`: contains the title and instructions for each recipe
2. `recipe_ingredients`: contains the cleaned ingredients for each recipe

This makes it easier to work with the ingredients data, which still needs to be parsed and exploded.

In [86]:
def split_to_instructions_and_ingredients(recipes_df:pd.DataFrame):
    """Separate the recipe text from the raw ingredient strings.

    Args:
        recipes_df: Frame with at least the columns 'index', 'Title',
            'Instructions' and 'Cleaned_Ingredients'.

    Returns:
        A ``(recipe_instructions, recipe_ingredients)`` tuple. The first frame
        holds the 'index', 'Title' and 'Instructions' columns. The second
        holds the same rows with 'index' renamed to 'recipes_index' and
        'Cleaned_Ingredients' renamed to 'ingredients'.
    """
    instruction_columns = ['index', 'Title', 'Instructions']
    # old name -> new name; the keys double as the column selection
    ingredient_column_names = {'index' : 'recipes_index', 'Cleaned_Ingredients' : 'ingredients'}

    instructions_part = recipes_df[instruction_columns]
    ingredients_part = recipes_df[list(ingredient_column_names)].rename(
        columns=ingredient_column_names
    )
    return instructions_part, ingredients_part

# Apply the split to the trimmed recipes frame; both results are used by the cells below.
recipe_instructions, recipe_ingredients = split_to_instructions_and_ingredients(recipes_df)

In [87]:
# Preview the first rows of the instructions frame.
recipe_instructions.head()

Unnamed: 0,index,Title,Instructions
0,0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ..."
1,1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...
2,2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...
3,3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...
4,4,Newton's Law,Stir together brown sugar and hot water in a c...


Each recipe's ingredients are stored as a single string whose contents are the text representation of a list of strings (for example `"['1 cup flour', '2 eggs']"`).

We reverse the process and split the string into a list of strings.

This step creates a list of strings, containing the ingredients but also some string artefacts like empty strings and stray commas that need to be removed.

In [89]:
def ingredients_str_to_list(ingredients_series: pd.Series) -> pd.Series:
    """Split stringified ingredient lists into lists of string fragments.

    Each element is expected to be the text form of a list of quoted strings,
    e.g. ``"['1 cup flour', '2 eggs']"``. The enclosing list brackets are
    removed and the remainder is split on every single quote.

    Args:
        ingredients_series: Series of stringified lists.

    Returns:
        A Series of lists. Besides the ingredients, each list still contains
        the text found between the quotes (empty strings and ", "
        separators), which callers are expected to filter out.

    Note:
        NOTE(review): the split happens on every ``'``, so an ingredient that
        itself contains an apostrophe is also broken at that apostrophe.
        If the stored strings are always valid Python list literals,
        ``ast.literal_eval`` would avoid this — confirm against the data.
    """
    return (
        ingredients_series
        # Strip only the enclosing list brackets. Removing every '[' / ']'
        # would also delete brackets that are part of an ingredient's text.
        .str.strip('[]')
        # A one-character pattern is always matched literally, so this does
        # not rely on pandas interpreting an escaped quote as a regex.
        .str.split("'")
    )

# Parse from the untouched source column in `recipes_df` rather than overwriting
# `recipe_ingredients['ingredients']` with a function of itself. The in-place
# version is not re-runnable: on a second execution the column already holds
# lists, the `.str` methods return NaN for them, and every row is silently lost.
# `assign` aligns on the row labels, which `recipe_ingredients` shares with
# `recipes_df`, so the result on a first run is unchanged.
recipe_ingredients = recipe_ingredients.assign(
    ingredients=ingredients_str_to_list(recipes_df['Cleaned_Ingredients'])
)

We explode the list of ingredients into separate rows, which makes it easier to filter out the artefacts.

In [104]:
def get_exploded_ingredients_per_recipe_index(recipe_ingredients:pd.DataFrame):
    """Return one row per ingredient, with string-split artefacts filtered out.

    Args:
        recipe_ingredients: Frame whose 'ingredients' column holds, per row,
            a list of string fragments.

    Returns:
        A new frame in which every list element has its own row (the other
        columns are repeated for each element), each fragment is stripped of
        surrounding whitespace, and rows whose fragment is "" or "," are
        removed.
    """
    per_ingredient = recipe_ingredients.explode('ingredients')

    # trim surrounding whitespace first so the artefact check below is exact
    per_ingredient.loc[:, 'ingredients'] = per_ingredient['ingredients'].str.strip()

    # "" and "," are leftovers of the upstream string split, not ingredients
    is_split_artefact = per_ingredient['ingredients'].isin(["", ","])
    return per_ingredient[~is_split_artefact]

# Explode the parsed ingredient lists into one row per ingredient.
ingredients_per_recipe = get_exploded_ingredients_per_recipe_index(recipe_ingredients)
# Display only: the first reset_index discards the row labels left by explode,
# the second exposes the fresh 0..n-1 labels as an 'index' column.
# NOTE(review): the re-indexed frame is not assigned, so `ingredients_per_recipe`
# itself keeps its original row labels — confirm this is intended.
ingredients_per_recipe.reset_index(drop=True).reset_index(drop=False)

Unnamed: 0,index,recipes_index,ingredients
0,0,0,1 (3½–4-lb.) whole chicken
1,1,0,"2¾ tsp. kosher salt, divided, plus more"
2,2,0,2 small acorn squash (about 3 lb. total)
3,3,0,2 Tbsp. finely chopped sage
4,4,0,1 Tbsp. finely chopped rosemary
...,...,...,...
148307,148307,13500,3 tablespoons olive oil
148308,148308,13500,"6 (6-inch) corn tortillas, halved"
148309,148309,13500,"1 cup cooked black beans, rinsed and drained i..."
148310,148310,13500,Garnish: toasted pine nuts


In [107]:
# Report the number of rows in the raw `recipes` frame loaded at the top.
print(f"Total number of recipes: {len(recipes)}")

Total number of recipes: 13501
