# Imports

In [303]:
import pandas as pd
import ast
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sqlalchemy import create_engine
from config import USERNAME, PASSWORD, HOST_PORT, DB_NAME
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
# create sqlalchemy engine for uploading data to postgresql db
engine = create_engine(f"postgresql+psycopg2://{USERNAME}:{PASSWORD}@localhost:{HOST_PORT}/{DB_NAME}")

# Data Cleaning

In [3]:
# read in recipes json file
data = pd.read_json("./data/foodnetwork_recipes5.json")

In [4]:
data.head().transpose()

Unnamed: 0,0,1,2,3,4
@context,http://schema.org,http://schema.org,http://schema.org,http://schema.org,http://schema.org
@type,Recipe,Recipe,Recipe,Recipe,Recipe
aggregateRating,"{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4...."
author,"[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Giada De Laurent..."
cookTime,0:40:00,,0:50:00,0:55:00,0:40:00
dateModified,2014-01-16T14:41:29.377-05:00,2014-01-23T01:58:35.392-05:00,2014-01-28T13:46:31.372-05:00,2014-01-04T04:16:01.997-05:00,2014-07-02T14:20:56.678-04:00
datePublished,2015-05-17T09:19:31.316-04:00,2015-05-16T08:11:53.938-04:00,2015-05-16T09:18:39.542-04:00,2015-05-17T08:07:06.398-04:00,2015-07-22T10:15:22.809-04:00
description,Make Paula Deen's Baked French Toast Casserole...,Paula Deen's Not Yo' Mama's Banana Pudding fro...,"For an autumnal treat, bake Paula Deen's Pumpk...","Try Paula Deen's creamy, cheesy Corn Casserole...","Giada De Laurentiis' Roman-Style Chicken, from..."
headline,Baked French Toast Casserole with Maple Syrup,Not Yo' Mama's Banana Pudding,Pumpkin Gooey Butter Cakes,Corn Casserole,Roman-Style Chicken
image,"{'@type': 'ImageObject', 'description': 'Baked...","{'@type': 'ImageObject', 'description': 'Not Y...","{'@type': 'ImageObject', 'description': 'Pumpk...","{'@type': 'ImageObject', 'description': 'Corn ...","{'@type': 'ImageObject', 'description': 'Roman..."


In [5]:
# remove extraneous columns
data = data.drop(["@context", "@type", "recipeYield", "review", "totalTime", "cookTime", "dateModified", "datePublished", "description", "headline", "image", "mainEntityOfPage", "nutrition", "prepTime", "publisher", "url", "video"], axis=1).reset_index()

In [6]:
data.head().transpose()

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
aggregateRating,"{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4....","{'@type': 'AggregateRating', 'ratingValue': 4...."
author,"[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","[{'@type': 'Person', 'name': 'Giada De Laurent..."
keywords,"Baking,Dessert,Make Ahead,American,French Toas...","Easy Dessert Recipes,Dessert,Easy,Banana Puddi...","Easy Dessert Recipes,Dessert,Easy,Easy Baking,...","Easy Casserole Recipes,Casserole,Easy,Easy Sid...","Easy Chicken,Chicken,Easy,Poultry,Easy Main Di..."
name,Baked French Toast Casserole with Maple Syrup,Not Yo' Mama's Banana Pudding,Pumpkin Gooey Butter Cakes,Corn Casserole,Roman-Style Chicken
recipeCategory,dessert,dessert,dessert,side-dish,main-dish
recipeCuisine,american,american,american,american,european
recipeIngredient,"[1 loaf French bread (13 to 16 ounces), 8 larg...","[1 (14-ounce) can sweetened condensed milk, 1 ...","[1 (18 1/4-ounce) package yellow cake mix, 1 e...","[1 (15 1/4-ounce) can whole kernel corn, drain...","[4 skinless chicken breast halves, with ribs, ..."
recipeInstructions,"[{'@type': 'HowToStep', 'text': 'Slice French ...","[{'@type': 'HowToStep', 'text': 'Line the bott...","[{'@type': 'HowToStep', 'text': 'Preheat oven ...","[{'@type': 'HowToStep', 'text': 'Preheat oven ...","[{'@type': 'HowToStep', 'text': 'Season the ch..."


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6652 entries, 0 to 6651
Data columns (total 9 columns):
index                 6652 non-null int64
aggregateRating       5994 non-null object
author                6652 non-null object
keywords              6632 non-null object
name                  6652 non-null object
recipeCategory        5518 non-null object
recipeCuisine         4891 non-null object
recipeIngredient      6629 non-null object
recipeInstructions    6649 non-null object
dtypes: int64(1), object(8)
memory usage: 467.8+ KB


In [8]:
# read in foodnetwork pickled data
pickled_data = pd.read_pickle("./data/foodnetwork_recipes.pkl")[["name", "recipeCategory", "recipeCuisine", "cook"]]

In [9]:
pickled_data.head().transpose()

Unnamed: 0,0,1,2,3,4
name,Baked French Toast Casserole with Maple Syrup,Not Yo' Mama's Banana Pudding,Pumpkin Gooey Butter Cakes,Corn Casserole,Roman-Style Chicken
recipeCategory,dessert,dessert,dessert,side-dish,main-dish
recipeCuisine,american,american,american,american,european
cook,Paula Deen,Paula Deen,Paula Deen,Paula Deen,Giada De Laurentiis


In [10]:
pickled_data.shape

(6652, 4)

In [11]:
# reset index of pickled dataframe to get index column for next step
pickled_data = pickled_data.reset_index()

In [12]:
# merge pickled data with the original dataframe
keywords_data = data.merge(pickled_data, how="left", on=["index", "name", "recipeCategory", "recipeCuisine"]).reset_index()[["index", "name", "cook", "keywords"]]

In [13]:
keywords_data.shape

(6652, 4)

In [14]:
keywords_data.head()

Unnamed: 0,index,name,cook,keywords
0,0,Baked French Toast Casserole with Maple Syrup,Paula Deen,"Baking,Dessert,Make Ahead,American,French Toas..."
1,1,Not Yo' Mama's Banana Pudding,Paula Deen,"Easy Dessert Recipes,Dessert,Easy,Banana Puddi..."
2,2,Pumpkin Gooey Butter Cakes,Paula Deen,"Easy Dessert Recipes,Dessert,Easy,Easy Baking,..."
3,3,Corn Casserole,Paula Deen,"Easy Casserole Recipes,Casserole,Easy,Easy Sid..."
4,4,Roman-Style Chicken,Giada De Laurentiis,"Easy Chicken,Chicken,Easy,Poultry,Easy Main Di..."


In [15]:
# convert index column by adding 1 to reflect recipe id
keywords_data["index"] = keywords_data["index"] + 1

In [16]:
keywords_data.head()

Unnamed: 0,index,name,cook,keywords
0,1,Baked French Toast Casserole with Maple Syrup,Paula Deen,"Baking,Dessert,Make Ahead,American,French Toas..."
1,2,Not Yo' Mama's Banana Pudding,Paula Deen,"Easy Dessert Recipes,Dessert,Easy,Banana Puddi..."
2,3,Pumpkin Gooey Butter Cakes,Paula Deen,"Easy Dessert Recipes,Dessert,Easy,Easy Baking,..."
3,4,Corn Casserole,Paula Deen,"Easy Casserole Recipes,Casserole,Easy,Easy Sid..."
4,5,Roman-Style Chicken,Giada De Laurentiis,"Easy Chicken,Chicken,Easy,Poultry,Easy Main Di..."


In [17]:
keywords_data.shape

(6652, 4)

In [18]:
keywords_data.isnull().sum()

index          0
name           0
cook        2471
keywords      20
dtype: int64

In [19]:
# fill null values in cook column as Unknown
keywords_data.loc[keywords_data.cook.isnull()==True, "cook"] = "Unknown"

In [20]:
keywords_data.isnull().sum()

index        0
name         0
cook         0
keywords    20
dtype: int64

In [21]:
# look at the recipes where there are no keywords available
keywords_data[keywords_data.keywords.isnull()==True]

Unnamed: 0,index,name,cook,keywords
1258,1259,Puerto Rican Rum Glazed Jumbo Shrimp with Cris...,Unknown,
1380,1381,"Pernil Asado (Roast Pork, Caribbean Style)",Unknown,
1660,1661,Pad Thai Chicken Burger,Unknown,
1669,1670,Shrimp Toasts,Unknown,
1927,1928,Chinese Checkerboard Cake,Unknown,
1944,1945,"Steamed Halibut Fillets, Chinese Style",Unknown,
2813,2814,Indian Pudding,Unknown,
2866,2867,Spicy Fried Chicken Thighs with Papads and Hot...,Unknown,
3845,3846,Deep-Fried Salmon Fillet with Mustard and Japa...,Unknown,
3849,3850,Pepper Crusted Rack of Lamb with Puree of Japa...,Unknown,


In [22]:
# drop rows where there are no keywords
keywords_data = keywords_data[keywords_data.keywords.isnull()!=True]

In [23]:
keywords_data.shape

(6632, 4)

In [24]:
# define query to pull in recipe name and id data from db
recipes_query = """SELECT recipes.recipe_id, recipes.title, users.user_name 
                            FROM food.recipes
                            INNER JOIN food.users ON users.user_id = recipes.user_id;
                            """

In [25]:
# pull data in from db
recipes = pd.read_sql_query(recipes_query, con=engine)

In [26]:
recipes.head()

Unnamed: 0,recipe_id,title,user_name
0,341,Vanilla Milkshake,Paula Deen
1,275,Spicy Cinnamon Cake,Paula Deen
2,336,Paula's Bananas Foster,Paula Deen
3,2489,Indian Succotash,Paula Deen
4,331,Fancy Green Beans,Paula Deen


In [27]:
recipes.shape

(6652, 3)

In [28]:
# merge recipes data with keywords data into one dataframe
recipes_keywords = keywords_data.merge(recipes, how="left", left_on=["index"], right_on=["recipe_id"])

In [29]:
recipes_keywords.shape

(6632, 7)

In [30]:
recipes_keywords.head()

Unnamed: 0,index,name,cook,keywords,recipe_id,title,user_name
0,1,Baked French Toast Casserole with Maple Syrup,Paula Deen,"Baking,Dessert,Make Ahead,American,French Toas...",1,Baked French Toast Casserole with Maple Syrup,Paula Deen
1,2,Not Yo' Mama's Banana Pudding,Paula Deen,"Easy Dessert Recipes,Dessert,Easy,Banana Puddi...",2,Not Yo' Mama's Banana Pudding,Paula Deen
2,3,Pumpkin Gooey Butter Cakes,Paula Deen,"Easy Dessert Recipes,Dessert,Easy,Easy Baking,...",3,Pumpkin Gooey Butter Cakes,Paula Deen
3,4,Corn Casserole,Paula Deen,"Easy Casserole Recipes,Casserole,Easy,Easy Sid...",4,Corn Casserole,Paula Deen
4,5,Roman-Style Chicken,Giada De Laurentiis,"Easy Chicken,Chicken,Easy,Poultry,Easy Main Di...",5,Roman-Style Chicken,Giada De Laurentiis


In [31]:
recipes_keywords.tail()

Unnamed: 0,index,name,cook,keywords,recipe_id,title,user_name
6627,6648,"Pat's Potato Pierogis ""Elegante""",Unknown,"Cheesy Potatoes,Cheese,Potato,Vegetable,Europe...",6648,"Pat's Potato Pierogis ""Elegante""",Unknown
6628,6649,Egg Baked in Acorn Squash,Unknown,"Acorn Squash,Vegetable,Egg Recipes,Main Dish,B...",6649,Egg Baked in Acorn Squash,Katie Lee
6629,6650,Grilled French Bread Pizza with Mushroom Pesto...,Bobby Flay,"Easy Appetizer,Appetizer,Easy,Easy Grilling Re...",6650,Grilled French Bread Pizza with Mushroom Pesto...,Bobby Flay
6630,6651,The Perfect Boiled Eggs,Unknown,"Egg Recipes,Appetizer,Main Dish,Brunch,Breakfa...",6651,The Perfect Boiled Eggs,Unknown
6631,6652,Fruit Cobbler,Jamie Oliver,"Easy Dessert Recipes,Dessert,Easy,Easy Baking,...",6652,Fruit Cobbler,Jamie Oliver


In [32]:
# check null values to check if merge was ok
recipes_keywords.isnull().sum()

index        0
name         0
cook         0
keywords     0
recipe_id    0
title        0
user_name    0
dtype: int64

In [33]:
recipes_keywords.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6632 entries, 0 to 6631
Data columns (total 7 columns):
index        6632 non-null int64
name         6632 non-null object
cook         6632 non-null object
keywords     6632 non-null object
recipe_id    6632 non-null int64
title        6632 non-null object
user_name    6632 non-null object
dtypes: int64(2), object(5)
memory usage: 414.5+ KB


In [34]:
# convert the string of a list of keywords to a list of keywords by splitting it at commas
recipes_keywords["keywords"] = recipes_keywords["keywords"].apply(lambda x: x.split(","))

In [35]:
recipes_keywords.head()

Unnamed: 0,index,name,cook,keywords,recipe_id,title,user_name
0,1,Baked French Toast Casserole with Maple Syrup,Paula Deen,"[Baking, Dessert, Make Ahead, American, French...",1,Baked French Toast Casserole with Maple Syrup,Paula Deen
1,2,Not Yo' Mama's Banana Pudding,Paula Deen,"[Easy Dessert Recipes, Dessert, Easy, Banana P...",2,Not Yo' Mama's Banana Pudding,Paula Deen
2,3,Pumpkin Gooey Butter Cakes,Paula Deen,"[Easy Dessert Recipes, Dessert, Easy, Easy Bak...",3,Pumpkin Gooey Butter Cakes,Paula Deen
3,4,Corn Casserole,Paula Deen,"[Easy Casserole Recipes, Casserole, Easy, Easy...",4,Corn Casserole,Paula Deen
4,5,Roman-Style Chicken,Giada De Laurentiis,"[Easy Chicken, Chicken, Easy, Poultry, Easy Ma...",5,Roman-Style Chicken,Giada De Laurentiis


In [36]:
# flatten keyword series to multiple rows
keywords_expand = pd.DataFrame(recipes_keywords.keywords.tolist(), index=recipes_keywords.recipe_id).stack().reset_index(name="keywords")[["keywords","recipe_id"]]


In [37]:
keywords_expand.head()

Unnamed: 0,keywords,recipe_id
0,Baking,1
1,Dessert,1
2,Make Ahead,1
3,American,1
4,French Toast Recipes,1


In [38]:
keywords_expand.shape

(75767, 2)

In [39]:
# assign each keyword with the type keyword
keywords_expand["type_tag"] = "keyword"

In [40]:
keywords_expand.head()

Unnamed: 0,keywords,recipe_id,type_tag
0,Baking,1,keyword
1,Dessert,1,keyword
2,Make Ahead,1,keyword
3,American,1,keyword
4,French Toast Recipes,1,keyword


In [41]:
# rename keywords column name to tag to prepare for insert into sql
keywords_expand.rename({"keywords":"tag"}, axis=1, inplace=True)

In [42]:
# drop duplicates before insert into db
unique_keywords = keywords_expand.drop_duplicates(subset="tag")

In [43]:
unique_keywords.shape

(691, 3)

In [44]:
unique_keywords.head()

Unnamed: 0,tag,recipe_id,type_tag
0,Baking,1,keyword
1,Dessert,1,keyword
2,Make Ahead,1,keyword
3,American,1,keyword
4,French Toast Recipes,1,keyword


# Insert Keywords Into PostgreSQL As Tags

In [45]:
# write to postgresql db tags table
unique_keywords[["tag", "type_tag"]].to_sql(name="tags", con=engine, schema="food", if_exists="append", index=False)

In [46]:
# define query to obtain tag data that was just inserted into db
tag_query = """SELECT * FROM food.tags
                        WHERE tags.type_tag = 'keyword';"""

In [47]:
# read in data from tags table to obtain tag_ids for next insert
tag_ids = pd.read_sql_query(tag_query, con=engine)

In [48]:
tag_ids.head()

Unnamed: 0,tag_id,tag,type_tag
0,31,Baking,keyword
1,32,Dessert,keyword
2,33,Make Ahead,keyword
3,34,American,keyword
4,35,French Toast Recipes,keyword


In [49]:
tag_ids.shape

(691, 3)

In [50]:
# merge tag_ids to keywords_expand table to obtain recipe ids to insert into recipes_tags table
recipe_tag_ids = keywords_expand.merge(tag_ids, how="left", on="tag")

In [51]:
recipe_tag_ids.shape

(75767, 5)

In [52]:
recipe_tag_ids.head()

Unnamed: 0,tag,recipe_id,type_tag_x,tag_id,type_tag_y
0,Baking,1,keyword,31,keyword
1,Dessert,1,keyword,32,keyword
2,Make Ahead,1,keyword,33,keyword
3,American,1,keyword,34,keyword
4,French Toast Recipes,1,keyword,35,keyword


In [53]:
# drop type_tag and tag columns for insert into recipes_tags table
recipes_tags_ids = recipe_tag_ids.drop(["tag", "type_tag_x", "type_tag_y"], axis=1)

In [54]:
recipes_tags_ids.head()

Unnamed: 0,recipe_id,tag_id
0,1,31
1,1,32
2,1,33
3,1,34
4,1,35


In [55]:
# write to postgresql db recipes_tags table
recipes_tags_ids[["tag_id", "recipe_id"]].to_sql(name="recipes_tags", con=engine, schema="food", if_exists="append", index=False)

In [56]:
# define query to obtain tag data that was just inserted into db
recipe_tag_query = """SELECT * FROM food.recipes_tags
                                  WHERE recipes_tags.tag_id >= 31;"""

In [57]:
# read in data from recipes_tags table to check insert ok
rt_ids = pd.read_sql_query(recipe_tag_query, con=engine)

In [58]:
rt_ids.head()

Unnamed: 0,recipe_id,tag_id
0,1,31
1,1,32
2,1,33
3,1,34
4,1,35


In [59]:
rt_ids.shape

(75767, 2)

# More Data Cleaning

## Recipe Instructions

In [60]:
# define query to obtain steps data and their corresponding recipe id
step_query = "SELECT steps.recipe_id, steps.steps FROM food.steps;"

In [61]:
# pull data in from postgres sql
steps = pd.read_sql_query(step_query, con=engine)

In [62]:
steps.head()

Unnamed: 0,recipe_id,steps
0,1,"Slice French bread into 20 slices, 1-inch each..."
1,1,"The next day, preheat oven to 350 degrees F."
2,1,Spread Praline Topping evenly over the bread a...
3,1,Combine all ingredients in a medium bowl and b...
4,2,Line the bottom of a 13 by 9 by 2-inch dish wi...


In [63]:
steps.shape

(26489, 2)

In [68]:
# group steps to recipe level and aggregate steps
steps_grouped = steps.groupby("recipe_id").agg({"steps": " ".join}).reset_index()

In [69]:
steps_grouped.head()

Unnamed: 0,recipe_id,steps
0,1,"Slice French bread into 20 slices, 1-inch each..."
1,2,Line the bottom of a 13 by 9 by 2-inch dish wi...
2,3,Preheat oven to 350 degrees F. Combine the cak...
3,4,Preheat oven to 350 degrees F. In a large bowl...
4,5,Season the chicken with 1/2 teaspoon salt and ...


In [70]:
steps_grouped.shape

(6652, 2)

In [71]:
steps_grouped.steps[0]

'Slice French bread into 20 slices, 1-inch each. (Use any extra bread for garlic toast or bread crumbs). Arrange slices in a generously buttered 9 by 13-inch flat baking dish in 2 rows, overlapping the slices. In a large bowl, combine the eggs, half-and-half, milk, sugar, vanilla, cinnamon, nutmeg and salt and beat with a rotary beater or whisk until blended but not too bubbly. Pour mixture over the bread slices, making sure all are covered evenly with the milk-egg mixture. Spoon some of the mixture in between the slices. Cover with foil and refrigerate overnight. The next day, preheat oven to 350 degrees F. Spread Praline Topping evenly over the bread and bake for 40 minutes, until puffed and lightly golden. Serve with maple syrup. Combine all ingredients in a medium bowl and blend well. Makes enough for Baked French Toast Casserole.'

In [72]:
# # pickle dataframe with aggregated steps
# steps_grouped.to_pickle("./data/foodnetwork_steps_grouped.pkl")

## Recipe Ingredients

In [312]:
data.head()

Unnamed: 0,index,aggregateRating,author,keywords,name,recipeCategory,recipeCuisine,recipeIngredient,recipeInstructions
0,0,"{'@type': 'AggregateRating', 'ratingValue': 4....","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","Baking,Dessert,Make Ahead,American,French Toas...",Baked French Toast Casserole with Maple Syrup,dessert,american,"[1 loaf French bread (13 to 16 ounces), 8 larg...","[{'@type': 'HowToStep', 'text': 'Slice French ..."
1,1,"{'@type': 'AggregateRating', 'ratingValue': 4....","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","Easy Dessert Recipes,Dessert,Easy,Banana Puddi...",Not Yo' Mama's Banana Pudding,dessert,american,"[1 (14-ounce) can sweetened condensed milk, 1 ...","[{'@type': 'HowToStep', 'text': 'Line the bott..."
2,2,"{'@type': 'AggregateRating', 'ratingValue': 4....","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","Easy Dessert Recipes,Dessert,Easy,Easy Baking,...",Pumpkin Gooey Butter Cakes,dessert,american,"[1 (18 1/4-ounce) package yellow cake mix, 1 e...","[{'@type': 'HowToStep', 'text': 'Preheat oven ..."
3,3,"{'@type': 'AggregateRating', 'ratingValue': 4....","[{'@type': 'Person', 'name': 'Paula Deen', 'ur...","Easy Casserole Recipes,Casserole,Easy,Easy Sid...",Corn Casserole,side-dish,american,"[1 (15 1/4-ounce) can whole kernel corn, drain...","[{'@type': 'HowToStep', 'text': 'Preheat oven ..."
4,4,"{'@type': 'AggregateRating', 'ratingValue': 4....","[{'@type': 'Person', 'name': 'Giada De Laurent...","Easy Chicken,Chicken,Easy,Poultry,Easy Main Di...",Roman-Style Chicken,main-dish,european,"[4 skinless chicken breast halves, with ribs, ...","[{'@type': 'HowToStep', 'text': 'Season the ch..."


In [313]:
# pull out the ingredients data with recipe name and index
ingredients = data[["index", "name", "recipeIngredient"]]

In [314]:
ingredients.head()

Unnamed: 0,index,name,recipeIngredient
0,0,Baked French Toast Casserole with Maple Syrup,"[1 loaf French bread (13 to 16 ounces), 8 larg..."
1,1,Not Yo' Mama's Banana Pudding,"[1 (14-ounce) can sweetened condensed milk, 1 ..."
2,2,Pumpkin Gooey Butter Cakes,"[1 (18 1/4-ounce) package yellow cake mix, 1 e..."
3,3,Corn Casserole,"[1 (15 1/4-ounce) can whole kernel corn, drain..."
4,4,Roman-Style Chicken,"[4 skinless chicken breast halves, with ribs, ..."


In [315]:
# change index column to reflect recipe_id
ingredients.loc[:, "index"] = ingredients["index"].apply(lambda x: x+1)

In [316]:
ingredients.head()

Unnamed: 0,index,name,recipeIngredient
0,1,Baked French Toast Casserole with Maple Syrup,"[1 loaf French bread (13 to 16 ounces), 8 larg..."
1,2,Not Yo' Mama's Banana Pudding,"[1 (14-ounce) can sweetened condensed milk, 1 ..."
2,3,Pumpkin Gooey Butter Cakes,"[1 (18 1/4-ounce) package yellow cake mix, 1 e..."
3,4,Corn Casserole,"[1 (15 1/4-ounce) can whole kernel corn, drain..."
4,5,Roman-Style Chicken,"[4 skinless chicken breast halves, with ribs, ..."


In [317]:
# rename column names to reflect those in the sql db
ingredients.rename({"index":"recipe_id", "name": "title", "recipeIngredient": "ingredient"}, axis=1, inplace=True)

In [318]:
ingredients.head()

Unnamed: 0,recipe_id,title,ingredient
0,1,Baked French Toast Casserole with Maple Syrup,"[1 loaf French bread (13 to 16 ounces), 8 larg..."
1,2,Not Yo' Mama's Banana Pudding,"[1 (14-ounce) can sweetened condensed milk, 1 ..."
2,3,Pumpkin Gooey Butter Cakes,"[1 (18 1/4-ounce) package yellow cake mix, 1 e..."
3,4,Corn Casserole,"[1 (15 1/4-ounce) can whole kernel corn, drain..."
4,5,Roman-Style Chicken,"[4 skinless chicken breast halves, with ribs, ..."


In [319]:
# look at first list of ingredients to understand the format of the data
ingredients.ingredient[0]

['1 loaf French bread (13 to 16 ounces)',
 '8 large eggs',
 '2 cups half-and-half',
 '1 cup milk',
 '2 tablespoons granulated sugar',
 '1 teaspoon vanilla extract',
 '1/4 teaspoon ground cinnamon',
 '1/4 teaspoon ground nutmeg',
 'Dash salt',
 'Praline Topping, recipe follows',
 'Maple syrup',
 '1/2 pound (2 sticks) butter',
 '1 cup packed light brown sugar',
 '1 cup chopped pecans',
 '2 tablespoons light corn syrup',
 '1/2 teaspoon ground cinnamon',
 '1/2 teaspoon ground nutmeg']

In [320]:
ingredients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6652 entries, 0 to 6651
Data columns (total 3 columns):
recipe_id     6652 non-null int64
title         6652 non-null object
ingredient    6629 non-null object
dtypes: int64(1), object(2)
memory usage: 156.0+ KB


In [321]:
# check for null values
ingredients.isnull().sum()

recipe_id      0
title          0
ingredient    23
dtype: int64

In [322]:
# fill null values in ingredient column with unavailable to proceed to next step
ingredients.loc[ingredients.ingredient.isnull()==True, "ingredient"] = "unavailable"

In [323]:
# flatten ingredient series to multiple rows
ingredients_expand = pd.DataFrame(ingredients.ingredient.tolist(), index=ingredients.recipe_id).stack().reset_index(name="ingredients")

In [324]:
ingredients_expand.head()

Unnamed: 0,recipe_id,level_1,ingredients
0,1,0,1 loaf French bread (13 to 16 ounces)
1,1,1,8 large eggs
2,1,2,2 cups half-and-half
3,1,3,1 cup milk
4,1,4,2 tablespoons granulated sugar


In [325]:
ingredients.ingredient[13]

['3 eggs',
 '1/3 cup water',
 'About 1 cup hot red pepper sauce (recommended: Texas Pete)',
 '2 cups self-rising flour',
 '1 teaspoon pepper',
 'House seasoning, recipe follows',
 '1 (1 to 2 1/2-pound) chicken, cut into pieces',
 'Oil, for frying, preferably peanut oil',
 '1 cup salt',
 '1/4 cup black pepper',
 '1/4 cup garlic powder']

In [461]:
# create function to pull out the ingredient quantity
def ingredients_counts(ingredient):
    # define list of units
    units = ["tablespoon", "tbsp", "tbs", "tbl", "teaspoon", "tsp", "ounce", "ounces", "fluid ounce", "fluid ounces", "oz", 
                "fluid oz", "fl oz", "gill", "cup", "c", "C", "pint", "pt", "fluid pint", "fl pt", "quart", "qt", "fluid quart", "fl qt", 
                "gallon", "liter", "litre", "L", "milliliter", "millilitre", "mL", "ml", "deciliter", "dl", "dL", "decilitre", "gal", "gram", 
                "gramme", "g", "pound", "lb", "milligram", "mg", "decigram", "dg", "kilogram", "kg", "kilogramme", "millimeter", 
                "millimetre", "mm", "decimeter", "decimetre", "dm", "meter", "metre", "m", "kilometer", "kilometre", "kilo", "km", 
                "centimeter", "centimetre", "cm", "inch", "in", "cubic meter", "cm3", "m3", "mm3", "km3", "celsius", "Celsius", "Fahrenheit", 
                "F", "pinch", "handful", "loaf", "dash", "Dash", "stick"]
    # join each unit with an or operator
    anyUnitRE = '|'.join(units)
    # remove all text between parentheses ingredients
    parsed = re.sub("([(][^(]+[$)])", "", ingredient)
    # remove punctuation
    parsed = re.sub('[%s]' % re.escape("!|#|$|%|&|'|(|)|*|+|,|-|.|:|;|<|=|>|?|@|[|]|^|_|`|{|}|~"), '', parsed)
    # define pattern to match with input
    m = re.match("((?:\d+.)?\d+(?:.\d+\s?\.?)?)", parsed)
    # set condition that if there is a match return it, else nothing
    if m:
        return m.group(1)
    else:
        return None

In [462]:
# create new column for ingredient quantity by parsing the ingredients using function above
ingredients_expand["ingredient_qty"] = ingredients_expand.ingredients.apply(lambda x: str(ingredients_counts(x)))

In [463]:
ingredients_expand.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf
1,1,1,8 large eggs,8,
2,1,2,2 cups half-and-half,2,cup
3,1,3,1 cup milk,1,cup
4,1,4,2 tablespoons granulated sugar,2,tablespoon


In [464]:
ingredients_expand.tail()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit
91834,6652,10,2 1/2 ounces (70 grams) sugar,2 1/2,ounce
91835,6652,11,A large pinch salt,,pinch
91836,6652,12,4 1/2 fluid ounces (130 milliliters) buttermilk,4 1/2,
91837,6652,13,"A little sugar, for dusting",,
91838,6652,14,"Vanilla ice cream, as an accompaniment",,


In [465]:
# create function to parse out units of ingredients
def ingredients_units(ingredient):
    # define list of units
    units = ["tablespoon", "tbsp", "tbs", "tbl", "teaspoon", "tsp", "ounce", "ounces", "fluid ounce", "fluid ounces", "oz", "fluid oz", "fl oz", "gill", "cup", "c", "C", "pint", "pt", "fluid pint", "fl pt", "quart", "qt", "fluid quart", "fl qt", "gallon", "liter", "litre", "L", "milliliter", "millilitre", "mL", "ml", "deciliter", "dl", "dL", "decilitre", "gal", "gram", "gramme", "g", "pound", "lb", "milligram", "mg", "decigram", "dg", "kilogram", "kg", "kilogramme", "millimeter", "millimetre", "mm", "decimeter", "decimetre", "dm", "meter", "metre", "m", "kilometer", "kilometre", "kilo", "km", "centimeter", "centimetre", "cm", "inch", "in", "cubic meter", "cm3", "m3", "mm3", "km3", "celsius", "Celsius", "Fahrenheit", "F", "pinch", "handful", "loaf", "dash", "Dash", "stick"]
    # remove all text between parentheses ingredients
    parsed = re.sub("([(][^(]+[$)])", "", ingredient)
    # remove the quantities
    parsed = re.sub("^(\d*\/?\d?\s?\d?\/?\d?)", " ", parsed)
    # remove punctuation
    parsed = re.sub('[%s]' % re.escape("!|#|$|%|&|'|(|)|*|+|,|-|.|:|;|<|=|>|?|@|[|]|^|_|`|{|}|~"), '', parsed)
    # add a underscore between fluid and ounces
    parsed = re.sub("((fluid)\s(ounces))", "fluid_ounces", parsed)
    # token words in the ingredients
    tokenized = word_tokenize(parsed)
    # instantiate Lemmatizer from NLTK to lemmatize text (return them to root words)
    wnl = WordNetLemmatizer()
    # lemmatize the words and place in new list
    lemmatized = [wnl.lemmatize(token.lower()) for token in tokenized]
    # remove punctuation again per word
    parsed_tokens = [re.sub('[%s]' % re.escape("!|#|$|%|&|'|(|)|*|+|,|-|.|:|;|<|=|>|?|@|[|]|^|_|`|{|}|~"), '', root) for root in lemmatized]
    # loop through tokens and return it if it is in the list of defined units
    for parsed_token in parsed_tokens:
        try:
            if parsed_token in units:
                return parsed_token
        except:
                return None

In [466]:
# create new column for ingredient unit by parsing the ingredients using function above
ingredients_expand["ingredient_unit"] = ingredients_expand.ingredients.apply(lambda x: ingredients_units(x))

In [467]:
ingredients_expand.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf
1,1,1,8 large eggs,8,
2,1,2,2 cups half-and-half,2,cup
3,1,3,1 cup milk,1,cup
4,1,4,2 tablespoons granulated sugar,2,tablespoon


In [468]:
ingredients_expand

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf
1,1,1,8 large eggs,8,
2,1,2,2 cups half-and-half,2,cup
3,1,3,1 cup milk,1,cup
4,1,4,2 tablespoons granulated sugar,2,tablespoon
5,1,5,1 teaspoon vanilla extract,1,teaspoon
6,1,6,1/4 teaspoon ground cinnamon,1/4,teaspoon
7,1,7,1/4 teaspoon ground nutmeg,1/4,teaspoon
8,1,8,Dash salt,,dash
9,1,9,"Praline Topping, recipe follows",,


In [486]:
# create function to parse out ingredients
def ingredients_parser(ingredient):
    # define list of units
    units = ["tablespoon", "tbsp", "tbs", "tbl", "teaspoon", "tsp", "ounce", "ounces", "fluid ounce", "fluid ounces", "oz", 
                "fluid oz", "fl oz", "gill", "cup", "c", "C", "pint", "pt", "fluid pint", "fl pt", "quart", "qt", "fluid quart", "fl qt", 
                "gallon", "liter", "litre", "L", "milliliter", "millilitre", "mL", "ml", "deciliter", "dl", "dL", "decilitre", "gal", "gram", 
                "gramme", "g", "pound", "lb", "milligram", "mg", "decigram", "dg", "kilogram", "kg", "kilogramme", "millimeter", 
                "millimetre", "mm", "decimeter", "decimetre", "dm", "meter", "metre", "m", "kilometer", "kilometre", "kilo", "km", 
                "centimeter", "centimetre", "cm", "inch", "in", "cubic meter", "cm3", "m3", "mm3", "km3", "celsius", "Celsius", "Fahrenheit", 
                "F", "pinch", "handful", "loaf", "dash", "Dash", "stick"]
    # join each unit with an or operator
    anyUnitRE = '|'.join(units)
    # remove all text between parentheses ingredients
    parsed = re.sub("([(][^(]+[$)])", "", ingredient)
    # remove punctuation
    parsed = re.sub('[%s]' % re.escape(string.punctuation), '', parsed)
    # define pattern to match with input
    m = re.match(
                        r'(?P<amount>\d{1,3})\s*'
                        r'(?P<unit>(' + anyUnitRE + r')?)\s*'
                        r'(?P<preposition>(of)?)\s*'
                        r'(?P<name>.*$)', parsed)
    if m:
        return m.groupdict()["name"]
    else:
        return None

In [487]:
# create new column for ingredient by parsing the ingredients using function above
ingredients_expand["ingredient"] = ingredients_expand.ingredients.apply(lambda x: ingredients_parser(x))

In [488]:
ingredients_expand.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread
1,1,1,8 large eggs,8,,large eggs
2,1,2,2 cups half-and-half,2,cup,s halfandhalf
3,1,3,1 cup milk,1,cup,milk
4,1,4,2 tablespoons granulated sugar,2,tablespoon,s granulated sugar


In [489]:
ingredients_expand

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread
1,1,1,8 large eggs,8,,large eggs
2,1,2,2 cups half-and-half,2,cup,s halfandhalf
3,1,3,1 cup milk,1,cup,milk
4,1,4,2 tablespoons granulated sugar,2,tablespoon,s granulated sugar
5,1,5,1 teaspoon vanilla extract,1,teaspoon,vanilla extract
6,1,6,1/4 teaspoon ground cinnamon,1/4,teaspoon,ground cinnamon
7,1,7,1/4 teaspoon ground nutmeg,1/4,teaspoon,ground nutmeg
8,1,8,Dash salt,,dash,
9,1,9,"Praline Topping, recipe follows",,,


In [502]:
# create function to further clean each ingredient string of the trailing s
def clean_ingredients(ingredient):
    if ingredient != None:
        return ingredient.replace("s ", "")

In [509]:
# further clean the ingredient strings in the ingredients column with function above
ingredients_expand.loc[:, "ingredient"] = ingredients_expand.ingredient.apply(lambda x: clean_ingredients(x))

In [510]:
ingredients_expand.head()

Unnamed: 0,recipe_id,level_1,ingredients,ingredient_qty,ingredient_unit,ingredient
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread
1,1,1,8 large eggs,8,,large eggs
2,1,2,2 cups half-and-half,2,cup,halfandhalf
3,1,3,1 cup milk,1,cup,milk
4,1,4,2 tablespoons granulated sugar,2,tablespoon,granulated sugar


In [511]:
ingredients_expand.shape

(91839, 6)

In [512]:
# drop duplicates to upload into postgresql
unique_ingredients = ingredients_expand.drop_duplicates("ingredient")[["ingredient"]]

In [513]:
# check shape of dataframe after dropping duplicates
unique_ingredients.shape

(26947, 1)

In [514]:
unique_ingredients.head()

Unnamed: 0,ingredient
0,French bread
1,large eggs
2,halfandhalf
3,milk
4,granulated sugar


In [515]:
# write to postgresql db ingredients table
unique_ingredients.ingredient.to_sql(name="ingredients", con=engine, schema="food", if_exists="append", index=False)

In [516]:
# define query to pull ingredient data
ingred_query = """SELECT * FROM food.ingredients;"""

In [517]:
# pull data in from postgres sql to pull ingredient id
ingred_id = pd.read_sql_query(ingred_query, con=engine)

In [518]:
ingred_id.head()

Unnamed: 0,ingredient_id,ingredient
0,1,French bread
1,2,large eggs
2,3,halfandhalf
3,4,milk
4,5,granulated sugar


In [519]:
ingred_id.shape

(26947, 2)

In [520]:
# merge ids to ingredients dataframe
ingredients_ids = ingredients_expand.merge(ingred_id, how="left", on="ingredient")

In [523]:
# rename ingredients column to match recipes_ingredients table in db
ingredients_ids.rename({"ingredients":"ingredient_comment"}, axis=1, inplace=True)

In [524]:
ingredients_ids.head()

Unnamed: 0,recipe_id,level_1,ingredient_comment,ingredient_qty,ingredient_unit,ingredient,ingredient_id
0,1,0,1 loaf French bread (13 to 16 ounces),1,loaf,French bread,1
1,1,1,8 large eggs,8,,large eggs,2
2,1,2,2 cups half-and-half,2,cup,halfandhalf,3
3,1,3,1 cup milk,1,cup,milk,4
4,1,4,2 tablespoons granulated sugar,2,tablespoon,granulated sugar,5


In [525]:
ingredients_ids.shape

(91839, 7)

In [527]:
# write to postgresql db recipes_ingredients table
ingredients_ids[["recipe_id", "ingredient_id", "ingredient_qty", "ingredient_unit", "ingredient_comment"]].to_sql(name="recipes_ingredients", con=engine, schema="food", if_exists="append", index=False)

In [528]:
# define query to pull recipe ingredient data
recipe_ingred_query = """SELECT * FROM food.recipes_ingredients;"""

In [529]:
# pull data in from postgres sql to check insert is ok
recipe_ingred = pd.read_sql_query(recipe_ingred_query, con=engine)

In [530]:
recipe_ingred.head()

Unnamed: 0,recipe_id,ingredient_id,ingredient_qty,ingredient_unit,ingredient_comment
0,1,1,1,loaf,1 loaf French bread (13 to 16 ounces)
1,1,2,8,,8 large eggs
2,1,3,2,cup,2 cups half-and-half
3,1,4,1,cup,1 cup milk
4,1,5,2,tablespoon,2 tablespoons granulated sugar


In [531]:
recipe_ingred.shape

(91839, 5)

In [533]:
# # pickle ingredients dataframe
# ingredients_expand.to_pickle("./data/foodnetwork_ingredients.pkl")
# ingredients_ids.to_pickle("./data/foodnetwork_ingredient_ids.pkl")