## BEFORE YOU BEGIN

1. add the data files in a directory called "raw_data" in the folder containing this notebook
2. the data files should be named "RAW_<type_of_data>.csv"
  - recipes: RAW_recipes.csv
  - interactions: RAW_interactions.csv
  - ingredients: RAW_ingredients.csv

In [87]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import ast
import re

recipes = pd.read_csv('./raw_data/RAW_recipes.csv')
interactions = pd.read_csv('./raw_data/RAW_interactions.csv')
recipes['ingredients'] = recipes['ingredients'].apply(ast.literal_eval)

## Cleaning Ingredients

In [128]:
ingredients = pd.read_csv('./raw_data/RAW_ingredients.csv')
ingredients['Shrt_Desc'] = ingredients['Shrt_Desc'].str.lower().apply(lambda x: re.sub(r'\s*,\s*', ' ', x))
basics = {"onion": "onions", "apples": "gala apples", "cream": "whipping cream", "apple": "gala apple", "nuts": "mixed nuts", "whipping cream": "heavy whipping cream", "strawberry": "strawberries", "oil": "canola oil", "milk": "whole milk", "yogurt": "yogurt plain", "flour": "all-purpose flour", "sugar": "granulated sugar", "pepper": "black pepper", "eggs": "whole egg", "egg": "whole egg", "butter": "unsalted butter", "tomato": "red tomato", "tomatoes": "red tomatoes"}

In [129]:
ingredients

Unnamed: 0,index,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),...,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Refuse_Pct
0,0,1001,butter with salt,15.87,717,0.85,81.11,2.11,0.06,0.0,...,7.0,51.368,21.021,3.043,215.0,5.00,"1 pat, (1"" sq, 1/3"" high)",14.2,1 tbsp,0.0
1,1,1002,butter whipped w/ salt,16.72,718,0.49,78.30,1.62,2.87,0.0,...,4.6,45.390,19.874,3.331,225.0,3.80,"1 pat, (1"" sq, 1/3"" high)",9.4,1 tbsp,0.0
2,2,1003,butter oil anhydrous,0.24,876,0.28,99.48,0.00,0.00,0.0,...,8.6,61.924,28.732,3.694,256.0,12.80,1 tbsp,205.0,1 cup,0.0
3,3,1004,cheese blue,42.41,353,21.40,28.74,5.11,2.34,0.0,...,2.4,18.669,7.778,0.800,75.0,28.35,1 oz,17.0,1 cubic inch,0.0
4,4,1005,cheese brick,41.11,371,23.24,29.68,3.18,2.79,0.0,...,2.5,18.764,8.598,0.784,94.0,132.00,"1 cup, diced",113.0,"1 cup, shredded",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8785,8785,83110,mackerel salted,43.00,305,18.50,25.10,13.40,0.00,0.0,...,7.8,7.148,8.320,6.210,95.0,80.00,"1 piece, (5-1/2"" x 1-1/2"" x 1/2"")",17.0,"1 cubic inch, boneless",0.0
8786,8786,90240,scallop (bay&sea) ckd stmd,70.25,111,20.54,0.84,2.97,5.41,0.0,...,0.0,0.218,0.082,0.222,41.0,85.00,3 oz,,,0.0
8787,8787,90480,syrup cane,26.00,269,0.00,0.00,0.86,73.14,0.0,...,0.0,0.000,0.000,0.000,0.0,21.00,1 serving,,,0.0
8788,8788,90560,snail raw,79.20,90,16.10,1.40,1.30,2.00,0.0,...,0.1,0.361,0.259,0.252,50.0,85.00,3 oz,,,0.0


In [131]:
remove_keywords = ["sugar sub", "entree", "ckd", "infant formula", "inf formula", "child formula", "meat only", "skin only", "applebee's", "t.g.i friday's", "restaurant", "denny's", "ckd", "babyfood", "fast foods", "burger king", "mcdonald's", "wendy's", "pizza hut", "kfc", "taco bell", "subway", "domino's", "papa john's", "little caesars", "chick-fil-a", "popeyes", "olive garden", "on the border", "carraba's", "cracker barrel"]
replace_raw = {'pork cured bacon': "bacon", 'commly prep': "", 'no chol': "", 'unenr': "", 'raw': "", 'frsh': "fresh", 'frz': "frozen", 'crm': "cream", 'pnut': "peanut", 'pln': "plain", 'whl': "whole", 'dk': "dark", 'broilers or fryers': "", 'drsng': "dressing", 'spread': "spread", 'flr': "flour", 'bf': "beef" ,'w/': "with", 'wo/': "without", 'hvy': "heavy", 'lt': "light", 'juc': "juice", 'wo/salt': "unsalted", 'wo/ salt': "unsalted", 'without salt': "unsalted", "salad or cooking": "", 'yel': "yellow", 'sau': "sauce", 'veg': "vegetable"}

mask = ingredients['Shrt_Desc'].str.contains('|'.join(remove_keywords))
ingredients = ingredients[~mask]

replace = {f'^{key}\\b': value + " " for key, value in replace_raw.items()}
replace.update({f'\\b{key}$': " " + value for key, value in replace_raw.items()})
replace.update({f'\\b{key}\\b': " " + value + " " for key, value in replace_raw.items()})
replace.update({'\\bmilkfat\\b.*$': ""})
replace.update({'\\((.*?)\\)': ""})

ingredients.reset_index(drop=True, inplace=True)
ingredients["Cleaned_Desc"] = ingredients["Shrt_Desc"].replace(replace, regex=True).replace(r'\s+', ' ', regex=True).str.strip()
ingredient_names = ingredients['Cleaned_Desc'].unique()

ingredient_names

array(['butter with salt', 'butter whipped w/ salt',
       'butter oil anhydrous', ..., 'syrup cane', 'snail', 'turtle green'],
      dtype=object)

In [132]:
def matchWord(target, tomatch):
  l = list(map(lambda x : x[0], process.extractBests(target, tomatch, scorer=fuzz.token_sort_ratio, score_cutoff=50, limit=20)))
  return process.extractOne(target, l, scorer=fuzz.token_set_ratio)

#### Get recipes that match cleaned ingredients

In [133]:
valid = dict()
bad_match = dict()
no_match = []
for _, recipe in recipes.loc[:10000].iterrows():
  skip = False
  ingreds_in_recipe = dict()
  for ingred in recipe["ingredients"]:
    to_match = ingred
    if ingred in basics.keys():
      to_match = basics[ingred]

    res = matchWord(to_match, ingredient_names)
    if res is None:
      no_match.append(to_match)
      skip = True
      break
    match, percent = res
    ingreds_in_recipe[to_match] = match
    if percent < 90:
      bad_match[to_match] = (match, percent)
      skip = True
      break
  if not skip:
    # print(ingreds_in_recipe)
    valid[recipe['name']] = ingreds_in_recipe
    skip = False

{'vanilla wafers': 'keebler vanilla wafers', 'unsalted butter': 'butter unsalted', 'powdered sugar': 'sugars powdered', 'whole egg': 'egg whole fresh', 'heavy whipping cream': 'cream fluid heavy whipping', 'strawberries': 'strawberries', 'walnuts': 'walnuts english'}
{'celery': 'celery', 'onions': 'onions', 'ground pork': 'pork fresh ground', 'soy sauce': 'soy sauce made from soy', 'beef broth': 'soup beef broth cubed dry', 'cooking oil': 'oil pam cooking spray original', 'hamburger buns': 'pepperidge farm hamburger buns with sesame'}
{'red potatoes': 'potatoes red flesh & skn', 'margarine': 'margarine reg hard soybn', 'rosemary': 'rosemary dried'}
{'lemon juice': 'lemon juice', 'canola oil': 'oil canola', 'worcestershire sauce': 'sauce worcestershire', 'basil': 'basil fresh', 'soy sauce': 'soy sauce made from soy', 'garlic powder': 'garlic powder', 'black pepper': 'pepper black'}
{'red tomatoes': 'tomatoes red ripe cnd stwd', 'crackers': 'crackers milk', 'mayonnaise': 'mayonnaise dres

## Get needed ingredients

In [195]:
print("Valid recipes: ", len(valid))
unique_ingreds = list(set(ingred for recipe in valid.values() for ingred in recipe.values()))
print("Unique ingredients in recipes: ", len(unique_ingreds))

mask = ingredients['Cleaned_Desc'].isin(unique_ingreds)
unneeded_cols = ["Shrt_Desc", "Refuse_Pct", "index"]
ingredients_csv = ingredients[mask].drop(columns=unneeded_cols).drop_duplicates(subset=['Cleaned_Desc'], keep='first').reset_index(drop=True)
ingredients_csv


Valid recipes:  62
Unique ingredients in recipes:  128


Unnamed: 0,NDB_No,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),Calcium_(mg),...,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Cleaned_Desc
0,1004,42.41,353,21.40,28.74,5.11,2.34,0.0,0.50,528.0,...,2.4,18.669,7.778,0.800,75.0,28.35,1 oz,17.00,1 cubic inch,cheese blue
1,1009,37.02,404,22.87,33.31,3.71,3.09,0.0,0.48,710.0,...,2.4,18.867,9.246,1.421,99.0,132.00,"1 cup, diced",244.00,"1 cup, melted",cheese cheddar
2,1015,81.24,81,10.45,2.27,1.27,4.76,0.0,4.00,111.0,...,0.0,1.235,0.516,0.083,12.0,113.00,4 oz,226.00,"1 cup, (not packed)",cheese cottage lowfat 2%
3,1019,55.22,264,14.21,21.28,5.20,4.09,0.0,4.09,493.0,...,1.8,14.946,4.623,0.591,89.0,150.00,"1 cup, crumbled",28.35,1 oz,cheese feta
4,1033,29.16,392,35.75,25.83,6.04,3.22,0.0,0.80,1184.0,...,1.7,16.410,7.515,0.569,68.0,28.35,1 oz,10.30,1 cubic inch,cheese parmesan hard
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,28162,3.00,367,7.50,0.40,,84.40,1.2,9.30,,...,,0.100,0.100,0.200,0.0,33.00,6 tbsp,,,kellogg's corn flakes crumbs
124,28387,33.20,280,0.01,4.08,1.92,49.90,3.1,7.21,151.0,...,,1.000,0.870,2.020,1.0,53.00,1 serving,,,pepperidge farm hamburger buns with sesame
125,42291,1.10,590,24.00,49.90,3.17,21.83,6.6,9.29,41.0,...,0.6,7.716,23.582,14.363,0.0,16.00,1 tbsp,,,peanut butter red na
126,43598,21.70,688,0.00,77.80,0.40,0.30,0.0,0.30,7.0,...,24.7,10.784,18.026,45.539,0.0,15.00,1 tbsp,239.00,1 cup,mayonnaise dressing


## Get valid recipes

In [198]:
valid_recipe_names = list(valid.keys())

recipes_csv = recipes[recipes['name'].isin(valid_recipe_names)].drop_duplicates(subset=['name'], keep='first').reset_index(drop=True)

recipes_csv

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,better than sex strawberries,42198,1460,41531,2002-10-03,"['weeknight', 'time-to-make', 'course', 'main-...","[734.1, 66.0, 199.0, 10.0, 10.0, 117.0, 28.0]",8,['crush vanilla wafers into fine crumbs and li...,simple but sexy. this was in my local newspape...,"[vanilla wafers, butter, powdered sugar, eggs,...",7
1,chinese chop suey,8559,70,4481,2001-01-27,"['weeknight', 'time-to-make', 'course', 'main-...","[395.4, 31.0, 20.0, 29.0, 51.0, 33.0, 8.0]",8,"['brown ground meat and onion in a large pot',...",easy one-pot dinner.,"[celery, onion, ground pork, soy sauce, beef b...",7
2,fried potatoes,37073,40,1533,2002-08-13,"['60-minutes-or-less', 'time-to-make', 'course...","[132.6, 8.0, 4.0, 3.0, 4.0, 5.0, 6.0]",14,"['preheat oven to 400 degrees', 'cut the potat...","my husband made these up last week, very tasty...","[red potatoes, margarine, rosemary]",3
3,momma s special marinade,30131,10,41480,2002-06-03,"['15-minutes-or-less', 'time-to-make', 'course...","[199.2, 31.0, 6.0, 10.0, 1.0, 13.0, 1.0]",4,['depending on how much meat i have i usually ...,we usually use with chicken. my neices ask for...,"[lemon juice, oil, worcestershire sauce, basil...",7
4,munch without guilt tomatoes,30300,10,6164,2002-06-04,"['15-minutes-or-less', 'time-to-make', 'course...","[3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",3,['put a slice of tomato on each biscuit / crac...,anytime munchies...another sweat free preparat...,"[tomatoes, crackers, mayonnaise, black pepper]",4
...,...,...,...,...,...,...,...,...,...,...,...,...
57,100 year old souffle,20238,40,19060,2002-02-20,"['60-minutes-or-less', 'time-to-make', 'course...","[203.8, 24.0, 1.0, 16.0, 19.0, 46.0, 1.0]",10,"['melt butter in a saucepan', 'sprinkle in flo...",this is a simple recipe passed down for over 1...,"[butter, flour, milk, eggs, salt, sharp chedda...",6
58,100 whole wheat bread abm,292147,220,379836,2008-03-17,"['time-to-make', 'course', 'preparation', 'for...","[2154.8, 86.0, 333.0, 112.0, 152.0, 66.0, 122.0]",6,"[""microwave buttermilk for 30secs to a minute ...",i wanted 100% which is light and fluffy so i s...,"[buttermilk, eggs, whole wheat flour, salt, gl...",8
59,100 stroke banana bread,213681,75,19185,2007-02-24,"['time-to-make', 'course', 'main-ingredient', ...","[270.9, 15.0, 98.0, 15.0, 7.0, 4.0, 14.0]",4,"['mash bananas with sugar , oil , and egg', 'a...",contributed to the little church of the desert...,"[bananas, sugar, canola oil, egg, flour, salt,...",9
60,12 hour tomatoes,449768,750,1072593,2011-02-28,"['course', 'main-ingredient', 'preparation', '...","[267.5, 31.0, 51.0, 1.0, 10.0, 13.0, 7.0]",19,"['preheat the oven to 200 degrees', 'line a la...",from joe yonan's cookbook.,"[tomatoes, salt, fresh ground black pepper, ol...",5


## Make recipe-ingredient relations

In [208]:
def find_recipe_id(recipe_name):
  return recipes_csv[recipes_csv['name'] == recipe_name]['id'].item()

def find_ingredient_id(ingredient_name):
  return ingredients_csv[ingredients_csv['Cleaned_Desc'] == ingredient_name]["NDB_No"].item()

def create_relationship(recipe_name, ingredients):
  recipe_id = find_recipe_id(recipe_name)
  ingred_rows = [{"recipe_id": recipe_id, "ingredient_id": find_ingredient_id(value), "variant": key} for key, value in ingredients.items()]
  return ingred_rows

relation_temp = [create_relationship(recipe_name, ingredients) for recipe_name, ingredients in valid.items()]
ingredient_in_csv = pd.DataFrame([row for rows in relation_temp for row in rows])

ingredient_in_csv

Unnamed: 0,recipe_id,ingredient_id,variant
0,42198,18609,vanilla wafers
1,42198,1145,unsalted butter
2,42198,19336,powdered sugar
3,42198,1123,whole egg
4,42198,1053,heavy whipping cream
...,...,...,...
368,449768,2014,cumin seeds
369,302815,20420,pasta
370,302815,11215,garlic clove
371,302815,1145,unsalted butter


## Interactions

In [212]:
interactions_csv = interactions[interactions['recipe_id'].isin(recipes_csv['id'])].reset_index(drop=True)
interactions_csv

Unnamed: 0,user_id,recipe_id,date,rating,review
0,1544208,208179,2010-04-16,5,I have made a recipe just like this for years....
1,2710499,310570,2013-02-25,5,great recipe - very much like the real thing f...
2,563669,310570,2009-05-05,4,I have been looking for a cheese burek (or bur...
3,1593428,310570,2011-07-01,5,this was so yummy and not to difficult to make...
4,2000743062,310570,2016-11-12,4,"Very nice recipe, but missing a few things for..."
...,...,...,...,...,...
465,44757,101115,2004-10-03,0,"This is known as a ""Loco Moco"". It has been sa..."
466,537188,126072,2008-01-27,5,I usually always manage to make my pastry go h...
467,476326,126072,2009-12-24,4,"Soft, tender crust, but a pain to try to roll...."
468,1701315,126072,2012-03-30,5,I've never made a pie crust using milk before....


## Create User Data

In [228]:
user_ids = pd.concat([interactions_csv['user_id'], recipes_csv['contributor_id']], ignore_index=True).unique()
print("Unique users: ", len(user_ids))

Unique users:  487


In [303]:
from faker import Faker

fake = Faker()

In [305]:
def generate_info():
  name = fake.first_name() + " " + fake.last_name()
  domain = fake.free_email_domain()
  split_name = name.lower().split(" ")
  rand = fake.random_int(min=0, max=100) % 7
  email = ""
  if rand == 1:
    email = split_name[0] + split_name[1] + "@" + domain
  elif rand == 2:
    email = split_name[0] + "." + split_name[1] + "@" + domain
  elif rand == 3:
    email = split_name[0][0] + split_name[1] + str(fake.random_int(min=1, max=100)) + "@" + domain
  elif rand == 4:
    email = split_name[0] + str(fake.random_int(min=1, max=100)) + "@" + domain
  elif rand == 5:
    email = split_name[1] + split_name[0] + "@" + domain
  elif rand == 6:
    email = split_name[1] + "." + split_name[0] + "@" + domain
  else:
    email = split_name[0] + "_" + split_name[1] + str(fake.random_int(min=1, max=100)) + "@" + domain
  return name, email
    

users_csv = pd.DataFrame([{"id": user_id, "name": name, "email": email} for user_id, (name, email) in zip(user_ids, [generate_info() for _ in user_ids])])

users_csv

Unnamed: 0,id,name,email
0,1544208,Shannon Sparks,shannon_sparks9@gmail.com
1,2710499,Tony Hoffman,tony.hoffman@gmail.com
2,563669,Lisa Gamble,gamble.lisa@hotmail.com
3,1593428,Krystal Estrada,estradakrystal@yahoo.com
4,2000743062,Carlos George,carlosgeorge@hotmail.com
...,...,...,...
482,205983,Clarence Lopez,lopez.clarence@hotmail.com
483,19060,Amy Koch,amy.koch@gmail.com
484,379836,Dawn Baker,dbaker70@yahoo.com
485,19185,Linda Ferguson,linda_ferguson26@hotmail.com


## TODO Allergen Data

In [330]:
allergies_to_consider = ["nut", "lactose"]
allergens = pd.read_csv('./raw_data/RAW_allergens.csv')

allergens['Allergy'] = allergens['Allergy'].str.lower()
allergens['Food'] = allergens['Food'].str.lower()

allergens.dropna(subset=['Allergy'], inplace=True)

allergen_keywords = {allergen : allergens[allergens['Allergy'].str.contains(allergen)]["Food"].values for allergen in allergies_to_consider}

allergen_keywords

{'nut': array(['almond', 'chestnut', 'ginkgo nut', 'peanut', 'pecan', 'walnut'],
       dtype=object),
 'lactose': array(['butter', 'buttermilk', 'casein', 'cheese', 'cream', 'custard',
        'ice cream', 'lactose', 'milk', 'sour cream', 'whey', 'yogurt '],
       dtype=object)}

In [339]:
allergy_csv = pd.DataFrame([{"allergy": allergy, "allergy_id": hash(allergy) % 10000} for allergy in allergies_to_consider])

allergy_csv

Unnamed: 0,allergy,allergy_id
0,nut,7500
1,lactose,1416


In [366]:
allergens_list = []
for allergy in allergies_to_consider:
  keyword_list = np.append(allergen_keywords[allergy], allergy)
  allergens_list.extend([{"allergy_id": hash(allergy) % 10000, "ingredient_id": ingred_id} for ingred_id in ingredients_csv[ingredients_csv["Cleaned_Desc"].str.contains('|'.join(keyword_list), regex=True)]["NDB_No"]])

allergens_csv = pd.DataFrame(allergens_list)

allergens_csv

Unnamed: 0,allergy_id,ingredient_id
0,7500,2025
1,7500,12142
2,7500,12155
3,7500,16399
4,7500,42291
5,1416,1004
6,1416,1009
7,1416,1015
8,1416,1019
9,1416,1033


## Output

In [367]:
recipes_csv.to_csv('./cleaned_data/recipes.csv', index=False)
ingredients_csv.to_csv('./cleaned_data/ingredients.csv', index=False)
ingredient_in_csv.to_csv('./cleaned_data/ingredient_in.csv', index=False)
interactions_csv.to_csv('./cleaned_data/interactions.csv', index=False)
users_csv.to_csv('./cleaned_data/users.csv', index=False)
allergy_csv.to_csv('./cleaned_data/allergies.csv', index=False)
allergens_csv.to_csv('./cleaned_data/allergens.csv', index=False)