## BEFORE YOU BEGIN

1. create a directory called 'raw_data' in the directory containing this notebook
2. add the data files to in `./raw_data` with the name "RAW_<type_of_data>.csv"
  - recipes: `RAW_recipes.csv`
    - https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data?select=RAW_recipes.csv
  - interactions: `RAW_interactions.csv`
    - https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions/data?select=RAW_interactions.csv
  - ingredients: `RAW_ingredients.csv`
    - https://www.kaggle.com/datasets/thedevastator/the-nutritional-content-of-food-a-comprehensive
  - allergens: `RAW_allergens.csv`
    - https://www.kaggle.com/datasets/boltcutters/food-allergens-and-allergies 
3. create a directory called 'cleaned_data' in the directory containing this notebook, this is where the output csvs will end up

In [199]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import ast
import re

recipes = pd.read_csv('./raw_data/RAW_recipes.csv')
recipes['ingredients'] = recipes['ingredients'].apply(ast.literal_eval)

## Cleaning Ingredients

In [200]:
ingredients = pd.read_csv('./raw_data/RAW_ingredients.csv')
ingredients['Shrt_Desc'] = ingredients['Shrt_Desc'].str.lower().apply(lambda x: re.sub(r'\s*,\s*', ' ', x))
basics = {"onion": "onions", "apples": "gala apples", "cream": "whipping cream", "apple": "gala apple", "nuts": "mixed nuts", "whipping cream": "heavy whipping cream", "strawberry": "strawberries", "oil": "canola oil", "milk": "whole milk", "yogurt": "yogurt plain", "flour": "all-purpose flour", "sugar": "granulated sugar", "pepper": "black pepper", "eggs": "whole egg", "egg": "whole egg", "butter": "unsalted butter", "tomato": "red tomato", "tomatoes": "red tomatoes"}

In [201]:
ingredients

Unnamed: 0,index,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),...,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Refuse_Pct
0,0,1001,butter with salt,15.87,717,0.85,81.11,2.11,0.06,0.0,...,7.0,51.368,21.021,3.043,215.0,5.00,"1 pat, (1"" sq, 1/3"" high)",14.2,1 tbsp,0.0
1,1,1002,butter whipped w/ salt,16.72,718,0.49,78.30,1.62,2.87,0.0,...,4.6,45.390,19.874,3.331,225.0,3.80,"1 pat, (1"" sq, 1/3"" high)",9.4,1 tbsp,0.0
2,2,1003,butter oil anhydrous,0.24,876,0.28,99.48,0.00,0.00,0.0,...,8.6,61.924,28.732,3.694,256.0,12.80,1 tbsp,205.0,1 cup,0.0
3,3,1004,cheese blue,42.41,353,21.40,28.74,5.11,2.34,0.0,...,2.4,18.669,7.778,0.800,75.0,28.35,1 oz,17.0,1 cubic inch,0.0
4,4,1005,cheese brick,41.11,371,23.24,29.68,3.18,2.79,0.0,...,2.5,18.764,8.598,0.784,94.0,132.00,"1 cup, diced",113.0,"1 cup, shredded",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8785,8785,83110,mackerel salted,43.00,305,18.50,25.10,13.40,0.00,0.0,...,7.8,7.148,8.320,6.210,95.0,80.00,"1 piece, (5-1/2"" x 1-1/2"" x 1/2"")",17.0,"1 cubic inch, boneless",0.0
8786,8786,90240,scallop (bay&sea) ckd stmd,70.25,111,20.54,0.84,2.97,5.41,0.0,...,0.0,0.218,0.082,0.222,41.0,85.00,3 oz,,,0.0
8787,8787,90480,syrup cane,26.00,269,0.00,0.00,0.86,73.14,0.0,...,0.0,0.000,0.000,0.000,0.0,21.00,1 serving,,,0.0
8788,8788,90560,snail raw,79.20,90,16.10,1.40,1.30,2.00,0.0,...,0.1,0.361,0.259,0.252,50.0,85.00,3 oz,,,0.0


In [202]:
remove_keywords = ["sugar sub", "entree", "ckd", "infant formula", "inf formula", "child formula", "meat only", "skin only", "applebee's", "t.g.i friday's", "restaurant", "denny's", "ckd", "babyfood", "fast foods", "burger king", "mcdonald's", "wendy's", "pizza hut", "kfc", "taco bell", "subway", "domino's", "papa john's", "little caesars", "chick-fil-a", "popeyes", "olive garden", "on the border", "carraba's", "cracker barrel"]
replace_raw = {'pork cured bacon': "bacon", 'commly prep': "", 'no chol': "", 'unenr': "", 'raw': "", 'frsh': "fresh", 'frz': "frozen", 'crm': "cream", 'pnut': "peanut", 'pln': "plain", 'whl': "whole", 'dk': "dark", 'broilers or fryers': "", 'drsng': "dressing", 'spread': "spread", 'flr': "flour", 'bf': "beef" ,'w/': "with", 'wo/': "without", 'hvy': "heavy", 'lt': "light", 'juc': "juice", 'wo/salt': "unsalted", 'wo/ salt': "unsalted", 'without salt': "unsalted", "salad or cooking": "", 'yel': "yellow", 'sau': "sauce", 'veg': "vegetable"}

mask = ingredients['Shrt_Desc'].str.contains('|'.join(remove_keywords))
ingredients = ingredients[~mask]

replace = {f'^{key}\\b': value + " " for key, value in replace_raw.items()}
replace.update({f'\\b{key}$': " " + value for key, value in replace_raw.items()})
replace.update({f'\\b{key}\\b': " " + value + " " for key, value in replace_raw.items()})
replace.update({'\\bmilkfat\\b.*$': ""})
replace.update({'\\((.*?)\\)': ""})

ingredients.reset_index(drop=True, inplace=True)
ingredients["Cleaned_Desc"] = ingredients["Shrt_Desc"].replace(replace, regex=True).replace(r'\s+', ' ', regex=True).str.strip()
ingredient_names = ingredients['Cleaned_Desc'].unique()

ingredient_names

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredients["Cleaned_Desc"] = ingredients["Shrt_Desc"].replace(replace, regex=True).replace(r'\s+', ' ', regex=True).str.strip()


array(['butter with salt', 'butter whipped w/ salt',
       'butter oil anhydrous', ..., 'syrup cane', 'snail', 'turtle green'],
      dtype=object)

In [203]:
def matchWord(target, tomatch):
  l = list(map(lambda x : x[0], process.extractBests(target, tomatch, scorer=fuzz.token_sort_ratio, score_cutoff=50, limit=20)))
  return process.extractOne(target, l, scorer=fuzz.token_set_ratio)

### Scraper to get ingredient quantities from Food.com

In [204]:
from bs4 import BeautifulSoup
from fractions import Fraction
import requests

ingredient_quantity_map = {
    'cups': ['cup', 'cups'],
    'teaspoons': ['teaspoon', 'teaspoons', 'tsp', 'tsps'],
    'tablespoons': ['tablespoon', 'tablespoons', 'tbsp', 'tbsps'],
    'pints': ['pint', 'pints', 'pt', 'pts'],
    'quarts': ['quart', 'quarts', 'qt', 'qts'],
    'gallons': ['gallon', 'gallons', 'gal', 'gals'],
    'milliliters': ['milliliter', 'milliliters', 'ml', 'mls'],
    'liters': ['liter', 'liters'],
    'grams': ['gram', 'grams', 'g'],
    'kilograms': ['kilogram', 'kilograms', 'kg', 'kgs'],
    'ounces': ['ounce', 'ounces', 'oz'],
    'pounds': ['pound', 'pounds', 'lb', 'lbs'],
}

quantity_map = {value: key for key, values in ingredient_quantity_map.items() for value in values}

url_base = "https://www.food.com/recipe/"

def delete_and_everything_after(string, substring):
  return string.partition(substring)[0]

def get_after(string, substring):
  index = string.find(substring)
  return string[index + len(substring):] if index != -1 else string

def match_quantity_type(input_string):
  if input_string == "":
    return None

  choices = quantity_map.keys()

  best_match, similarity = process.extractOne(input_string, choices)

  return quantity_map[best_match] if similarity >= 90 else None
  
def parse_mixed_fraction(mixed_fraction):
  parts = mixed_fraction.split()

  if len(parts) == 1:
    # If there is no space, it's just a whole number or a fraction
    return float(Fraction(parts[0]))
  elif len(parts) == 2:
    # If there is a space, it's a mixed fraction
    whole_number = float(parts[0])
    fraction_parts = parts[1].split('/')
    numerator = float(fraction_parts[0])
    denominator = float(fraction_parts[1]) if len(fraction_parts) > 1 else 1

    return whole_number + (numerator / denominator)
  else:
    raise ValueError("Invalid mixed fraction format")

def get_ingredient_quantities(recipe):
  url = url_base + recipe['name'].replace(" ", "-") + "-" + str(recipe['id'])
  print(url)
  page = requests.get(url)
  
  if page.status_code != 200:
    return None

  soup = BeautifulSoup(page.content, 'html.parser')
  
  if soup.find(class_='ingredient-list') is None:
    return None
  
  ingredient_list = soup.select('ul.ingredient-list li')
  
  if len(ingredient_list) != len(recipe['ingredients']):
    return None
  
  quantities = dict()
  for element, ingredient in list(zip(ingredient_list, recipe['ingredients'])):
    quantity_span = element.find('span', class_='ingredient-quantity')
    ingredient_quantity_match = re.compile(r'\d+\s+\d+\/\d+|\d+\/\d+|\d+').search(quantity_span.text) if quantity_span else None
    ingredient_quantity = ingredient_quantity_match.group() if ingredient_quantity_match else None


    text_span = element.find('span', class_='ingredient-text')
    ingredient_quantity_type = text_span.get_text(strip=True) if text_span else None
    
    if ingredient_quantity_type and re.compile(r'\d+\s+\d+\/\d+|\d+\/\d+|\d+').search(ingredient_quantity_type):
      ingredient_quantity = re.compile(r'\d+\s+\d+\/\d+|\d+\/\d+|\d+').search(ingredient_quantity_type).group()
      ingredient_quantity_type = get_after(ingredient_quantity_type, ingredient_quantity)

    quantity = parse_mixed_fraction(ingredient_quantity) if ingredient_quantity else None
    quantity_type = match_quantity_type(delete_and_everything_after(ingredient_quantity_type, ingredient)) if ingredient_quantity_type else None
    
    quantities[ingredient] = (quantity, quantity_type)
  return quantities

#### Get recipes that match cleaned ingredients

In [205]:
valid = dict()
valid_with_ingred = dict()
bad_match = dict()
no_match = []
for _, recipe in recipes.loc[:2000].iterrows():
  skip = False
  ingreds_in_recipe = dict()
  for ingred in recipe["ingredients"]:
    to_match = ingred
    if ingred in basics.keys():
      to_match = basics[ingred]

    res = matchWord(to_match, ingredient_names)
    if res is None:
      no_match.append(to_match)
      skip = True
      break
    match, percent = res
    ingreds_in_recipe[to_match] = match
    if percent < 90:
      bad_match[to_match] = (match, percent)
      skip = True
      break
  if not skip:
    quantity_info = get_ingredient_quantities(recipe)
    if quantity_info:
      result_dict = {key: (ingreds_in_recipe[key], quantity_info[key]) for key in set(ingreds_in_recipe) & set(quantity_info)}
      valid_with_ingred[recipe['name']] = (quantity_info)
      valid[recipe['name']] = result_dict
    skip = False
    

https://www.food.com/recipe/better-than-sex--strawberries-42198
https://www.food.com/recipe/chinese--chop-suey-8559
https://www.food.com/recipe/fried--potatoes-37073
https://www.food.com/recipe/momma-s-special--marinade-30131
https://www.food.com/recipe/munch-without-guilt--tomatoes-30300
https://www.food.com/recipe/say-what---banana-sandwich-95926
https://www.food.com/recipe/300-icing-208179
https://www.food.com/recipe/blepandekager---danish---apple-pancakes-503475
https://www.food.com/recipe/burek--or-feta-cheese--phyllo-pie-310570
https://www.food.com/recipe/german--barbecued-carrots-109818
https://www.food.com/recipe/no-name---beef-recipe---loco-moco-101115
https://www.food.com/recipe/pizza-pan--potato-skins-60938
https://www.food.com/recipe/puddingkuchen---custard-bake-353171
https://www.food.com/recipe/the--creamed-dried-beef-chipped-beef-recipe-163238
https://www.food.com/recipe/we-hate-zucchini-bread---zucchini-bread-130815
https://www.food.com/recipe/yummy-n--easiest--warm-blu

## Get needed ingredients

In [206]:
print("Valid recipes: ", len(valid))
unique_ingreds = list(set(ingred for recipe in valid.values() for ingred, _ in recipe.values()))
print("Unique ingredients in recipes: ", len(unique_ingreds))

mask = ingredients['Cleaned_Desc'].isin(unique_ingreds)
unneeded_cols = ["Shrt_Desc", "Refuse_Pct", "index"]
ingredients_working = ingredients[mask].drop(columns=unneeded_cols).drop_duplicates(subset=['Cleaned_Desc'], keep='first').reset_index(drop=True)
ingredients_working


Valid recipes:  111
Unique ingredients in recipes:  186


Unnamed: 0,NDB_No,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),Calcium_(mg),...,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Cleaned_Desc
0,1004,42.41,353,21.40,28.74,5.11,2.34,0.0,0.50,528.0,...,2.4,18.669,7.778,0.800,75.0,28.35,1 oz,17.0,1 cubic inch,cheese blue
1,1006,48.42,334,20.75,27.68,2.70,0.45,0.0,0.45,184.0,...,2.3,17.410,8.013,0.826,100.0,28.35,1 oz,144.0,"1 cup, sliced",cheese brie
2,1009,37.02,404,22.87,33.31,3.71,3.09,0.0,0.48,710.0,...,2.4,18.867,9.246,1.421,99.0,132.00,"1 cup, diced",244.0,"1 cup, melted",cheese cheddar
3,1015,81.24,81,10.45,2.27,1.27,4.76,0.0,4.00,111.0,...,0.0,1.235,0.516,0.083,12.0,113.00,4 oz,226.0,"1 cup, (not packed)",cheese cottage lowfat 2%
4,1017,52.62,350,6.15,34.44,1.27,5.52,0.0,3.76,97.0,...,2.1,20.213,8.907,1.483,101.0,14.50,1 tbsp,232.0,1 cup,cheese cream
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,42140,70.00,200,0.30,20.00,3.00,6.70,0.2,1.94,6.0,...,12.5,2.880,4.660,11.580,0.0,14.00,1 tbsp,216.0,1 cup,salad dressing italian dressing red cal
182,42291,1.10,590,24.00,49.90,3.17,21.83,6.6,9.29,41.0,...,0.6,7.716,23.582,14.363,0.0,16.00,1 tbsp,,,peanut butter red na
183,43312,90.20,37,1.40,0.20,0.89,7.31,3.1,2.41,21.0,...,18.2,0.039,0.022,0.098,0.0,182.00,1 cup,,,vegetables mxd cnd no salt
184,43598,21.70,688,0.00,77.80,0.40,0.30,0.0,0.30,7.0,...,24.7,10.784,18.026,45.539,0.0,15.00,1 tbsp,239.0,1 cup,mayonnaise dressing


In [207]:
volume_units = ['cups', 'teaspoons', 'tablespoons', 'fluid ounces', 'pints', 'quarts', 'gallons', 'milliliters', 'liters']
weight_units = ['grams', 'kilograms', 'ounces', 'pounds']

def get_number (string):
  return re.compile(r'\d?\.\d+|\d+').search(string).group()
def something(ingredient):
  quantities = {'volume': {'quantity': None, 'desc': None}, 'weight': {'quantity': None, 'desc': None}, 'unit': None}
  gm1 = ingredient['GmWt_Desc1']
  gm2 = ingredient['GmWt_Desc2']
  
  if isinstance(gm1, str):
    gm1_match = match_quantity_type(gm1)
    gm1_num = get_number(gm1)
    if gm1_match in volume_units:
      quantities['volume'] = {'quantity': gm1_num, 'desc': gm1_match}
    elif gm1_match in weight_units:
      quantities['weight'] = {'quantity': gm1_num, 'desc': gm1_match}
    else:
      quantities['unit'] = gm1_num
    
  if isinstance(gm2, str):
    gm2_match = match_quantity_type(gm2)
    gm2_num = get_number(gm2)
    if gm2_match in volume_units:
      quantities['volume'] = {'quantity': gm2_num, 'desc': gm2_match}
    elif gm2_match in weight_units:
      quantities['weight'] = {'quantity': gm2_num, 'desc': gm2_match}
    else:
      quantities['unit'] = gm2_num
  return quantities
  
ingredients_working['quantities'] = ingredients_working.apply(something, axis=1)
ingredients_csv = ingredients_working.join(ingredients_working['quantities'].apply(pd.Series))
ingredients_csv = ingredients_csv.join(ingredients_csv['volume'].apply(pd.Series).add_prefix('volume_'))
ingredients_csv = ingredients_csv.join(ingredients_csv['weight'].apply(pd.Series).add_prefix('weight_'))

to_keep = ["NDB_No", "Energ_Kcal", "Protein_(g)", "Carbohydrt_(g)", "Fiber_TD_(g)", "Sugar_Tot_(g)", "Calcium_(mg)", "Iron_(mg)", "Magnesium_(mg)", "Sodium_(mg)", "Cholestrl_(mg)", "Cleaned_Desc", "unit", "volume_quantity", "volume_desc", "weight_quantity", "weight_desc"]
to_drop = set(ingredients_csv.columns) - set(to_keep)

ingredients_csv = ingredients_csv.drop(columns=to_drop)

ingredients_csv

Unnamed: 0,NDB_No,Energ_Kcal,Protein_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),Calcium_(mg),Iron_(mg),Magnesium_(mg),Sodium_(mg),Cholestrl_(mg),Cleaned_Desc,unit,volume_quantity,volume_desc,weight_quantity,weight_desc
0,1004,353,21.40,2.34,0.0,0.50,528.0,0.31,23.0,1146.0,75.0,cheese blue,1,,,1,ounces
1,1006,334,20.75,0.45,0.0,0.45,184.0,0.50,20.0,629.0,100.0,cheese brie,,1,cups,1,ounces
2,1009,404,22.87,3.09,0.0,0.48,710.0,0.14,27.0,653.0,99.0,cheese cheddar,,1,cups,,
3,1015,81,10.45,4.76,0.0,4.00,111.0,0.13,9.0,308.0,12.0,cheese cottage lowfat 2%,,1,cups,4,ounces
4,1017,350,6.15,5.52,0.0,3.76,97.0,0.11,9.0,314.0,101.0,cheese cream,,1,cups,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,42140,200,0.30,6.70,0.2,1.94,6.0,0.13,2.0,1074.0,0.0,salad dressing italian dressing red cal,,1,cups,,
182,42291,590,24.00,21.83,6.6,9.29,41.0,1.90,159.0,203.0,0.0,peanut butter red na,,1,tablespoons,,
183,43312,37,1.40,7.31,3.1,2.41,21.0,0.65,15.0,26.0,0.0,vegetables mxd cnd no salt,,1,cups,,
184,43598,688,0.00,0.30,0.0,0.30,7.0,0.23,1.0,486.0,0.0,mayonnaise dressing,,1,cups,,


## Get valid recipes

In [208]:
valid_recipe_names = list(valid.keys())

recipes_working = recipes[recipes['name'].isin(valid_recipe_names)].drop_duplicates(subset=['name'], keep='first').reset_index(drop=True)

recipes_working['steps'] = recipes_working['steps'].apply(ast.literal_eval)

recipes_working

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,better than sex strawberries,42198,1460,41531,2002-10-03,"['weeknight', 'time-to-make', 'course', 'main-...","[734.1, 66.0, 199.0, 10.0, 10.0, 117.0, 28.0]",8,[crush vanilla wafers into fine crumbs and lin...,simple but sexy. this was in my local newspape...,"[vanilla wafers, butter, powdered sugar, eggs,...",7
1,chinese chop suey,8559,70,4481,2001-01-27,"['weeknight', 'time-to-make', 'course', 'main-...","[395.4, 31.0, 20.0, 29.0, 51.0, 33.0, 8.0]",8,"[brown ground meat and onion in a large pot, a...",easy one-pot dinner.,"[celery, onion, ground pork, soy sauce, beef b...",7
2,fried potatoes,37073,40,1533,2002-08-13,"['60-minutes-or-less', 'time-to-make', 'course...","[132.6, 8.0, 4.0, 3.0, 4.0, 5.0, 6.0]",14,"[preheat oven to 400 degrees, cut the potatoes...","my husband made these up last week, very tasty...","[red potatoes, margarine, rosemary]",3
3,momma s special marinade,30131,10,41480,2002-06-03,"['15-minutes-or-less', 'time-to-make', 'course...","[199.2, 31.0, 6.0, 10.0, 1.0, 13.0, 1.0]",4,[depending on how much meat i have i usually h...,we usually use with chicken. my neices ask for...,"[lemon juice, oil, worcestershire sauce, basil...",7
4,munch without guilt tomatoes,30300,10,6164,2002-06-04,"['15-minutes-or-less', 'time-to-make', 'course...","[3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",3,[put a slice of tomato on each biscuit / crack...,anytime munchies...another sweat free preparat...,"[tomatoes, crackers, mayonnaise, black pepper]",4
...,...,...,...,...,...,...,...,...,...,...,...,...
106,7 ingredient peanut butter cookies,250024,20,575052,2007-08-30,"['30-minutes-or-less', 'time-to-make', 'course...","[103.6, 6.0, 53.0, 1.0, 5.0, 4.0, 4.0]",11,"[preheat oven to 350f, sift together sugar , n...","i was trying to make simple, three ingredient ...","[peanut butter, flour, granulated sugar, egg, ...",6
107,a berry good banana smoothie,204114,10,359051,2007-01-08,"['15-minutes-or-less', 'time-to-make', 'course...","[303.9, 14.0, 108.0, 4.0, 21.0, 27.0, 16.0]",7,"[peel and slice the bananas, wash and cut off ...","easy, easy recipe for kids to make and the kid...","[bananas, milk, strawberries, plain yogurt]",4
108,a different tomato salad dressing,223242,5,461283,2007-04-17,"['15-minutes-or-less', 'time-to-make', 'course...","[127.4, 20.0, 0.0, 31.0, 2.0, 9.0, 0.0]",3,"[whisk all ingredients together, toss with qua...","for an easy, tasty, and different twist on the...","[olive oil, soy sauce, balsamic vinegar]",3
109,a kick in the asparagus,124597,186,35526,2005-06-06,"['time-to-make', 'course', 'main-ingredient', ...","[146.1, 17.0, 21.0, 29.0, 6.0, 9.0, 3.0]",8,[mix everything except asparagus and set aside...,"i got this off another site, not sure where. ...","[asparagus, italian salad dressing, cayenne pe...",4


##### Get recipe steps

In [209]:
steps_csv = recipes_working.explode('steps')

steps_csv['order'] = steps_csv.groupby('id').cumcount() + 1
steps_csv = steps_csv.reset_index(drop=True)

to_keep = ['id', 'steps', 'order']
to_drop = set(steps_csv) - set(to_keep)

steps_csv = steps_csv.drop(columns=to_drop)

steps_csv

Unnamed: 0,id,steps,order
0,42198,crush vanilla wafers into fine crumbs and line...,1
1,42198,mix butter or margarine and sugar,2
2,42198,add beaten eggs,3
3,42198,spread the mixture over the wafer crumbs,4
4,42198,"crush strawberries and spread over sugar , egg...",5
...,...,...,...
844,327059,"in a blender , combine yolks , , mayonnaise , ...",10
845,327059,"process until smooth , about 7 seconds",11
846,327059,spoon the filling into a pastry bag with a lar...,12
847,327059,pipe filling into egg white halves,13


### Get final recipes data

In [210]:
recipes_working

to_keep = ["name", "id", "contributor_id", "submitted", "tags", "description", "nutrition"]
to_drop = set(recipes_working.columns) - set(to_keep)

recipes_csv = recipes_working.drop(columns=to_drop)
recipes_csv['calories'] = recipes_working['nutrition'].apply(ast.literal_eval).apply(lambda arr: arr[0])
recipes_csv = recipes_csv.drop(columns=['nutrition'])

recipes_csv

Unnamed: 0,name,id,contributor_id,submitted,tags,description,calories
0,better than sex strawberries,42198,41531,2002-10-03,"['weeknight', 'time-to-make', 'course', 'main-...",simple but sexy. this was in my local newspape...,734.1
1,chinese chop suey,8559,4481,2001-01-27,"['weeknight', 'time-to-make', 'course', 'main-...",easy one-pot dinner.,395.4
2,fried potatoes,37073,1533,2002-08-13,"['60-minutes-or-less', 'time-to-make', 'course...","my husband made these up last week, very tasty...",132.6
3,momma s special marinade,30131,41480,2002-06-03,"['15-minutes-or-less', 'time-to-make', 'course...",we usually use with chicken. my neices ask for...,199.2
4,munch without guilt tomatoes,30300,6164,2002-06-04,"['15-minutes-or-less', 'time-to-make', 'course...",anytime munchies...another sweat free preparat...,3.0
...,...,...,...,...,...,...,...
106,7 ingredient peanut butter cookies,250024,575052,2007-08-30,"['30-minutes-or-less', 'time-to-make', 'course...","i was trying to make simple, three ingredient ...",103.6
107,a berry good banana smoothie,204114,359051,2007-01-08,"['15-minutes-or-less', 'time-to-make', 'course...","easy, easy recipe for kids to make and the kid...",303.9
108,a different tomato salad dressing,223242,461283,2007-04-17,"['15-minutes-or-less', 'time-to-make', 'course...","for an easy, tasty, and different twist on the...",127.4
109,a kick in the asparagus,124597,35526,2005-06-06,"['time-to-make', 'course', 'main-ingredient', ...","i got this off another site, not sure where. ...",146.1


## Make recipe-ingredient relations

In [211]:
def find_recipe_id(recipe_name):
  return recipes_csv[recipes_csv['name'] == recipe_name]['id'].item()

def find_ingredient_id(ingredient_name):
  return ingredients_csv[ingredients_csv['Cleaned_Desc'] == ingredient_name]["NDB_No"].item()

def create_relationship(recipe_name, ingredients):
  recipe_id = find_recipe_id(recipe_name)
  ingred_rows = [{"recipe_id": recipe_id, "ingredient_id": find_ingredient_id(name), "quantity": quantity, "quantity_desc": type, "variant": key} for key, (name, (quantity, type)) in ingredients.items()]
  return ingred_rows

relation_temp = [create_relationship(recipe_name, ingredients) for recipe_name, (ingredients) in valid.items()]
ingredient_in_csv = pd.DataFrame([row for rows in relation_temp for row in rows])

ingredient_in_csv

Unnamed: 0,recipe_id,ingredient_id,quantity,quantity_desc,variant
0,42198,19336,16.0,ounces,powdered sugar
1,42198,18609,12.0,ounces,vanilla wafers
2,42198,12155,,,walnuts
3,8559,28387,6.0,,hamburger buns
4,8559,11143,1.0,,celery
...,...,...,...,...,...
437,124597,42140,2.0,,italian salad dressing
438,327059,11282,2.0,,green onions
439,327059,2044,4.0,tablespoons,fresh basil
440,327059,43598,1.0,cups,mayonnaise


## Interactions

In [212]:
interactions = pd.read_csv('./raw_data/RAW_interactions.csv')
interactions_csv = interactions[interactions['recipe_id'].isin(recipes_csv['id'])].reset_index(drop=True)
interactions_csv

Unnamed: 0,user_id,recipe_id,date,rating,review
0,1237233,240311,2009-04-13,5,This bread pudding tastes just like the ones s...
1,424680,400243,2009-12-29,5,"What a fast, easy & interesting way to make a ..."
2,1544208,208179,2010-04-16,5,I have made a recipe just like this for years....
3,1612775,201750,2010-05-09,5,"This was great, thank you! I have an electric..."
4,431813,201750,2007-04-26,4,This was really nice. I would suggest watchin...
...,...,...,...,...,...
1057,198154,152441,2015-03-08,3,"I made these for DH, as is the only one who li..."
1058,2000335947,152441,2015-07-13,5,I loved the recipe the way it came out. Its wa...
1059,1949845,152441,2017-04-17,4,I have made these twice in the last two weeks ...
1060,377366,152441,2017-06-13,5,"As a full time working mom, wife, etc. no time..."


## Create User Data

In [213]:
user_ids = pd.concat([interactions_csv['user_id'], recipes_csv['contributor_id']], ignore_index=True).unique()
print("Unique users: ", len(user_ids))

Unique users:  1016


In [214]:
from faker import Faker

fake = Faker()

In [215]:
def generate_info():
  name = fake.first_name() + " " + fake.last_name()
  domain = fake.free_email_domain()
  split_name = name.lower().split(" ")
  rand = fake.random_int(min=0, max=100) % 7
  email = ""
  if rand == 1:
    email = split_name[0] + split_name[1] + "@" + domain
  elif rand == 2:
    email = split_name[0] + "." + split_name[1] + "@" + domain
  elif rand == 3:
    email = split_name[0][0] + split_name[1] + str(fake.random_int(min=1, max=100)) + "@" + domain
  elif rand == 4:
    email = split_name[0] + str(fake.random_int(min=1, max=100)) + "@" + domain
  elif rand == 5:
    email = split_name[1] + split_name[0] + "@" + domain
  elif rand == 6:
    email = split_name[1] + "." + split_name[0] + "@" + domain
  else:
    email = split_name[0] + "_" + split_name[1] + str(fake.random_int(min=1, max=100)) + "@" + domain
  return name, email
    

users_csv = pd.DataFrame([{"id": user_id, "name": name, "email": email} for user_id, (name, email) in zip(user_ids, [generate_info() for _ in user_ids])])

users_csv

Unnamed: 0,id,name,email
0,1237233,Allison Owens,owens.allison@hotmail.com
1,424680,Mark Washington,mark_washington87@yahoo.com
2,1544208,Brandon Browning,browningbrandon@hotmail.com
3,1612775,Samantha Frazier,sfrazier60@hotmail.com
4,431813,Rodney Pena,rodney.pena@gmail.com
...,...,...,...
1011,386585,Jennifer Oconnor,oconnorjennifer@gmail.com
1012,575052,Brandon Jones,jonesbrandon@yahoo.com
1013,359051,Laurie Hartman,lhartman31@gmail.com
1014,461283,Kristina Hampton,kristina32@hotmail.com


## Allergen Data

In [216]:
allergies_to_consider = ["nut", "lactose"]
allergens = pd.read_csv('./raw_data/RAW_allergens.csv')

allergens['Allergy'] = allergens['Allergy'].str.lower()
allergens['Food'] = allergens['Food'].str.lower()

allergens.dropna(subset=['Allergy'], inplace=True)

allergen_keywords = {allergen : allergens[allergens['Allergy'].str.contains(allergen)]["Food"].values for allergen in allergies_to_consider}

allergen_keywords

{'nut': array(['almond', 'chestnut', 'ginkgo nut', 'peanut', 'pecan', 'walnut'],
       dtype=object),
 'lactose': array(['butter', 'buttermilk', 'casein', 'cheese', 'cream', 'custard',
        'ice cream', 'lactose', 'milk', 'sour cream', 'whey', 'yogurt '],
       dtype=object)}

In [217]:
allergy_csv = pd.DataFrame([{"allergy": allergy, "allergy_id": hash(allergy) % 10000} for allergy in allergies_to_consider])

allergy_csv

Unnamed: 0,allergy,allergy_id
0,nut,7881
1,lactose,2286


In [218]:
allergens_list = []
for allergy in allergies_to_consider:
  keyword_list = np.append(allergen_keywords[allergy], allergy)
  allergens_list.extend([{"allergy_id": hash(allergy) % 10000, "ingredient_id": ingred_id} for ingred_id in ingredients_csv[ingredients_csv["Cleaned_Desc"].str.contains('|'.join(keyword_list), regex=True)]["NDB_No"]])

allergens_csv = pd.DataFrame(allergens_list)

allergens_csv

Unnamed: 0,allergy_id,ingredient_id
0,7881,2025
1,7881,12142
2,7881,12155
3,7881,16399
4,7881,42291
5,2286,1004
6,2286,1006
7,2286,1009
8,2286,1015
9,2286,1017


## Output

In [219]:
recipes_csv.to_csv('./cleaned_data/recipes.csv', index=False)
ingredients_csv.to_csv('./cleaned_data/ingredients.csv', index=False)
steps_csv.to_csv('./cleaned_data/steps.csv', index=False)
ingredient_in_csv.to_csv('./cleaned_data/ingredient_in.csv', index=False)
interactions_csv.to_csv('./cleaned_data/interactions.csv', index=False)
users_csv.to_csv('./cleaned_data/users.csv', index=False)
allergy_csv.to_csv('./cleaned_data/allergies.csv', index=False)
allergens_csv.to_csv('./cleaned_data/allergens.csv', index=False)