In [7]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('../data/recipes.csv', index_col='RecipeId')

In [3]:
df = df[df['Description'].str.startswith('Make and share this') == False]
df = df.sample(n=100000)

In [4]:
df.head()

Unnamed: 0_level_0,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,RecipeCategory,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65316,Crunch for Lunch Chicken Stir-Fry,21752,Mirj2338,PT20M,PT15M,PT35M,2003-06-20T20:04:00Z,Another winner from the New York Daily News. T...,character(0),Lunch/Snacks,...,1.1,54.9,191.6,3.9,1.4,1.7,23.8,6.0,,"c(""Snip tips off snow peas."", ""Wash them and s..."
508120,Brown Rice by a MAN,498595,MHulak,PT20M,PT5M,PT25M,2013-10-21T17:46:00Z,Is rice hard to make? It ain't hard mofo. It i...,character(0),Asian,...,0.5,0.0,13.6,71.5,3.2,0.8,7.3,4.0,4 bowls,"c(""Simply mang, measure 1 part brown rice to 2..."
304349,Southern Georgia Peachy Baked Beans,488441,pamela t.,PT25M,PT15M,PT40M,2008-05-20T17:57:00Z,This is a nice change of pace from usual brown...,character(0),Beans,...,0.8,3.3,42.9,18.8,5.1,3.4,6.3,10.0,,"c(""Cook bacon, drain and crumble."", ""Clean and..."
109199,Kerala Mutton Curry - Mild,186771,MariaBright,PT1H,PT30M,PT1H30M,2005-01-23T20:00:00Z,"The recipe is particularly good for those, lik...",character(0),Curries,...,11.9,124.0,130.7,38.5,6.2,10.1,38.1,4.0,,"c(""Mix spices for marinade with the yoghurt an..."
138395,Big Batch Beef Sauce,123871,adena mangis,PT1H,PT15M,PT1H15M,2005-09-21T18:24:00Z,This is a batch of beef that can be frozen and...,character(0),< 4 Hours,...,7.2,82.2,746.7,15.1,3.9,8.8,25.9,,15 cups,"c(""In a Dutch oven over medium heat cook beef,..."


In [5]:
def str_to_time(str_time):
    
    if pd.isnull(str_time):
        return None
    
    hours = re.findall('[0-9]*H', str_time)
    if hours:
        hours = int(hours[0][:-1])
    else:
        hours='00'
        
    minutes = re.findall('[0-9]*M', str_time)
    if minutes:
        minutes = minutes[0][:-1]
    else:
        minutes = '00'
        
    seconds = re.findall('[0-9]*S', str_time)
    if seconds:
        seconds = seconds[0][:-1]
    else:
        seconds='00'
        
    return f'{hours}:{minutes}:{seconds}'

In [8]:
timeCols = ['CookTime', 'PrepTime', 'TotalTime']
for col in timeCols:
    df[col] = df[col].apply(str_to_time)

In [9]:
def parse_instructions(instructions):
    if pd.isnull(instructions):
        return None
    instructions = instructions[3:len(instructions)-2]
    instructions = instructions.split("\", \"")
    instructions = '\n'.join(instructions)
    return instructions

In [10]:
df['RecipeInstructions'] = df['RecipeInstructions'].apply(parse_instructions)


In [11]:
def parse_ingredients(row):
    ingredients = row.RecipeIngredientParts
    ingredients = ingredients[3:len(ingredients)-2]
    ingredients = ingredients.split('\", \"')
    ingredients = [ingredient.title() for ingredient in ingredients]
    quantities = row.RecipeIngredientQuantities
    quantities = quantities[3:len(quantities)-2]
    quantities = quantities.split('\", \"')
    
    return dict(zip(ingredients, quantities))

In [12]:
df['Ingredients'] = [parse_ingredients(row) for row in df.itertuples(index=False)]


In [13]:
def recipe_yield_fix(quantity:str):
    if pd.isnull(quantity):
        return None
    return quantity.title()

df['RecipeYield'] = df['RecipeYield'].apply(recipe_yield_fix)

In [14]:
def keywords_fix(keywords: str):
    if pd.isnull(keywords):
        return None
    keywords = keywords[3:len(keywords)-2]
    keywords = keywords.split('\", \"')
    return keywords

df['Keywords'] = df['Keywords'].apply(keywords_fix)

In [15]:
cols_to_drop = ['AuthorId', 'AuthorName', 'DatePublished', 'Images', 'RecipeIngredientQuantities', 'RecipeIngredientParts', 'ReviewCount']
ndf = df.drop(columns=cols_to_drop)

In [16]:
ndf.head()

Unnamed: 0_level_0,Name,CookTime,PrepTime,TotalTime,Description,RecipeCategory,Keywords,AggregatedRating,Calories,FatContent,...,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions,Ingredients
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65316,Crunch for Lunch Chicken Stir-Fry,00:20:00,00:15:00,00:35:00,Another winner from the New York Daily News. T...,Lunch/Snacks,"[Chicken, Poultry, Meat, Asian, Kosher, < 60 M...",4.0,212.1,10.9,...,54.9,191.6,3.9,1.4,1.7,23.8,6.0,,Snip tips off snow peas.\nWash them and set as...,"{'Snow Peas': '2', 'Boneless Skinless Chicken ..."
508120,Brown Rice by a MAN,00:20:00,00:5:00,00:25:00,Is rice hard to make? It ain't hard mofo. It i...,Asian,"[Low Protein, Low Cholesterol, Healthy, < 30 M...",,342.2,2.7,...,0.0,13.6,71.5,3.2,0.8,7.3,4.0,4 Bowls,"Simply mang, measure 1 part brown rice to 2 pa...","{'Brown Rice': '2', 'Water': '4', 'Chicken Bou..."
304349,Southern Georgia Peachy Baked Beans,00:25:00,00:15:00,00:40:00,This is a nice change of pace from usual brown...,Beans,[60 Min],5.0,123.6,2.5,...,3.3,42.9,18.8,5.1,3.4,6.3,10.0,,"Cook bacon, drain and crumble.\nClean and pit ...","{'Great Northern Beans': '2', 'Onion': '1', 'B..."
109199,Kerala Mutton Curry - Mild,1:00:00,00:30:00,1:30:00,"The recipe is particularly good for those, lik...",Curries,"[Lamb/Sheep, Meat, Asian, Indian, < 4 Hours]",,640.9,37.9,...,124.0,130.7,38.5,6.2,10.1,38.1,4.0,,Mix spices for marinade with the yoghurt and m...,"{'Water': '1/2', 'Bay Leaf': '1/2', 'Yoghurt':..."
138395,Big Batch Beef Sauce,1:00:00,00:15:00,1:15:00,This is a batch of beef that can be frozen and...,< 4 Hours,,5.0,328.2,18.7,...,82.2,746.7,15.1,3.9,8.8,25.9,,15 Cups,"In a Dutch oven over medium heat cook beef, on...","{'Ground Beef': '4', 'Onions': '4', 'Celery Ri..."


In [17]:
ndf['ProteinPercentage'] = ndf['ProteinContent'] * 4 / ndf['Calories']
ndf['CarbohydratePercentage'] = ndf['CarbohydrateContent'] * 4 / ndf['Calories']
ndf['SugarPercentage'] = ndf['SugarContent'] * 4 / ndf['Calories']
ndf['FatPercentage'] = ndf['FatContent'] * 9 / ndf['Calories']

In [19]:
ndf.to_csv('../data/recipes_finalv1.csv', index=False)

In [20]:
##########################################################################################################################