## Data Cleaning

In [1]:
#Dependencies
import pandas as pd
import numpy as np
import os

## Step 1. Import csv files, convert to df and concatentate them into one file. 

In [2]:
recipe_df1 = pd.read_csv('ScrapeData/csv_recipes/recipe_df1.csv',low_memory=False)
recipe_df2 = pd.read_csv('ScrapeData/csv_recipes/recipe_df2.csv',low_memory=False)
recipe_df3 = pd.read_csv('ScrapeData/csv_recipes/recipe_df3.csv',low_memory=False)
recipe_df4 = pd.read_csv('ScrapeData/csv_recipes/recipe_df4.csv',low_memory=False)
recipe_df5 = pd.read_csv('ScrapeData/csv_recipes/recipe_df5.csv',low_memory=False)

# Add merge all df into one df. 
all_recipes = pd.concat([recipe_df1, recipe_df2,recipe_df3, recipe_df4, recipe_df5],ignore_index=True)

In [3]:
all_recipes.head()

Unnamed: 0.1,Unnamed: 0,bestrating,id,imagesrc,ingredients,makeitagainscore,preparation,rating,reviews,tags,title
0,0.0,4.0,,https://assets.epicurious.com/photos/56df1d8cb...,"['2 cups whole-wheat flour', '2 cups all-purpo...",90%,Put oven rack in middle position and preheat o...,3.5,50.0,"['Irish', 'Bread', 'Bake', 'Vegetarian', 'Quic...",Irish Brown Bread
1,1.0,4.0,,https://assets.epicurious.com/photos/5a78b5919...,"['1 Tbsp. crushed red pepper flakes', '1 Tbsp....",91%,"Coarsely grind red pepper flakes, peppercorns,...",3.0,11.0,"['Pepper', 'Fennel', 'Pork', 'Garlic', 'White ...",Slow-Cooked Pork with Chickpeas
2,2.0,4.0,,https://assets.epicurious.com/photos/5a68f33b3...,"['1/4 cup extra-virgin olive oil', '12 oz. dry...",67%,Pour oil into cooker insert and select “Sauté....,2.0,15.0,"['Instant Pot', 'Kale', 'Potato', 'Onion', 'Ga...",Instant Pot Caldo Verde
3,3.0,4.0,,https://assets.epicurious.com/photos/5a9971b01...,"['1 tablespoon olive oil', '2 garlic cloves, t...",81%,Preheat oven to 350°F. Heat oil in a large ski...,3.5,26.0,"['Side', 'Bake', 'Christmas', 'Easter', 'Thank...",Parmesan Bread Pudding with Broccoli Rabe and ...
4,4.0,4.0,,https://assets.epicurious.com/photos/54b71d9a2...,"['1 1/2 pounds boneless, skinless chicken brea...",96%,"Place chicken, garlic, bay leaves, allspice, a...",3.5,23.0,"['Asian', 'Soup/Stew', 'Chicken', 'Low Carb', ...",Brothy Poached Chicken with Mushrooms and Fres...


In [5]:
# Check for missing rows
all_recipes.count()

Unnamed: 0          4858
bestrating          6662
id                  1816
imagesrc            5016
ingredients         6674
makeitagainscore    6662
preparation         6672
rating              6662
reviews             6662
tags                6646
title               6673
dtype: int64

In [6]:
# Drop 'Unnamed 0:' and 'id 'columns before we dropping rows with empty cells.
all_recipes.drop('Unnamed: 0',axis=1, inplace=True)
all_recipes.drop('id',axis=1, inplace=True)

all_recipes.count()

bestrating          6662
imagesrc            5016
ingredients         6674
makeitagainscore    6662
preparation         6672
rating              6662
reviews             6662
tags                6646
title               6673
dtype: int64

In [8]:
# Drop empty rows and convert into a new df.
all_clean_recipes = all_recipes.dropna(axis=0, how="any")
all_clean_recipes.count()

bestrating          4990
imagesrc            4990
ingredients         4990
makeitagainscore    4990
preparation         4990
rating              4990
reviews             4990
tags                4990
title               4990
dtype: int64

## Step 2. Get all the tags from each recipe to use for UI selection.

In [13]:
# Getting the tags from the first recipe
all_clean_recipes['tags'].iloc[0]

'[\'Irish\', \'Bread\', \'Bake\', \'Vegetarian\', \'Quick & Easy\', "St. Patrick\'s Day", \'Spring\', \'Gourmet\']'

In [14]:
# ast - Abstract Syntax Tree - processes trees of the Python abstract syntax grammar.
import ast

# The "ast.literal_eval()" evaluates an expression node or a Unicode
# or Latin-1 encoded string containing a Python literal. 
# The string or node provided may only consist of the following 
# Python literal structures: strings, numbers, tuples, lists, dicts, booleans, and None.
# Returns words that have proper syntax/grammar.
all_tags = []
for index, row in all_clean_recipes.iterrows():
    tags = ast.literal_eval(row["tags"])
    for tag in tags:
        all_tags.append(tag)

In [15]:
# What the tags look like after using ast 
all_tags[0:8]

['Irish',
 'Bread',
 'Bake',
 'Vegetarian',
 'Quick & Easy',
 "St. Patrick's Day",
 'Spring',
 'Gourmet']

In [16]:
# How many tags from all the recipes?
len(all_tags)

75836

In [18]:
# Get unigue tags and remove duplicates. Convert it to a list for indexing. 
unique_tags = list(set(all_tags))
len(unique_tags)

633

In [19]:
# Convert the list to a dictionary
tags_dict = {k: v for k, v in enumerate(unique_tags)}
tags_dict

{0: 'Fry',
 1: 'Wild Rice',
 2: 'One-Pot Meal',
 3: 'Kidney Friendly',
 4: 'Poppy',
 5: 'Sukkot',
 6: 'Radicchio',
 7: 'White Wine',
 8: 'Japanese',
 9: 'Instant Pot',
 10: 'Fat Free',
 11: 'Portland',
 12: 'Lamb',
 13: 'Wok',
 14: 'Asian',
 15: 'Argentine',
 16: 'Low Carb',
 17: 'House Cocktail',
 18: 'Marzipan',
 19: 'Poultry Sausage',
 20: 'Winter',
 21: 'Virginia',
 22: 'Casserole/Gratin',
 23: 'Apple Juice',
 24: 'Olive',
 25: 'Chocolate',
 26: 'Tuna',
 27: 'Graduation',
 28: 'Organic',
 29: 'Seattle',
 30: 'Rum',
 31: 'Central/South American',
 32: 'Spritzer',
 33: 'Wasabi',
 34: 'Edible Gift',
 35: 'Duck',
 36: 'Pistachio',
 37: 'Wedding',
 38: 'Cottage Cheese',
 39: 'Cumin',
 40: 'Middle Eastern',
 41: 'Slow Cooker',
 42: 'Thyme',
 43: 'Pork',
 44: 'Cuban',
 45: 'Dairy',
 46: 'No-Cook',
 47: 'Bronx',
 48: 'Rutabaga',
 49: 'Salmon',
 50: 'Meat',
 51: 'Herb',
 52: 'Pork Rib',
 53: 'Cocktail',
 54: 'Quail',
 55: 'White Chocolate',
 56: 'Sour Cream',
 57: 'Engagement Party',
 58: '

## Summary for recipe tags.
--- 
- There were 75,836 tags for all the recipes. 
- These were reduced to unique 633 tags (0.8%). 
- However, some tags can need to be removed. like, "Kidney Friendly", "Bon App��tit","#CAKEWEEK".

## Step 3.  Get all the ingredients from each recipe to use for UI selection.

In [20]:
# Getting the ingredients from the first recipe
all_clean_recipes['ingredients'].iloc[0]

"['2 cups whole-wheat flour', '2 cups all-purpose flour plus additional for kneading', '1/2 cup toasted wheat germ', '2 teaspoons salt', '2 teaspoons sugar', '1 teaspoon baking soda', '1/2 teaspoon cream of tartar', '1 stick (1/2 cup) cold unsalted butter, cut into 1/2-inch cubes', '2 cups well-shaken buttermilk']"

In [21]:
# Each character and letter is indexed. 
len(all_clean_recipes['ingredients'].iloc[0])

314

### We need to separate out each step and then separate each word in each step to get all the unique ingredients for all the recipes.

In [24]:
# Using ast we can separate out the indvidual steps.
all_ingredients = []
for index, row in all_clean_recipes.iterrows():
    tags = ast.literal_eval(row["ingredients"])
    for tag in tags:
        all_ingredients.append(tag)

In [25]:
# The first 8 ingredients
all_ingredients[0:9]

['2 cups whole-wheat flour',
 '2 cups all-purpose flour plus additional for kneading',
 '1/2 cup toasted wheat germ',
 '2 teaspoons salt',
 '2 teaspoons sugar',
 '1 teaspoon baking soda',
 '1/2 teaspoon cream of tartar',
 '1 stick (1/2 cup) cold unsalted butter, cut into 1/2-inch cubes',
 '2 cups well-shaken buttermilk']

In [26]:
# The number of steps in all the ingridients
len(all_ingredients)

56256

In [27]:
# Remove characters and numbers in the ingredients using regular expression (re) module.
import re
ingredients = []
pattern = "([^\d\\.\\,\\\;\\*\\]\\[\\%\\½\\<\\>\\:\\°\\?\\�\\}\\{\)\(\"\/\s]+)"
for ingredient in set(all_ingredients):
    p = re.compile(pattern)
    result = p.findall(ingredient) 
    ingredients.append(result)

In [28]:
# The first 8 ingredients
ingredients[0:9]

[['Garnish', 'pomegranate', 'seeds', 'from', 'pomegranate'],
 ['tablespoons', 'dried', 'chanterelle', 'or', 'shiitake', 'mushrooms'],
 ['small', 'pork', 'tenderloins', 'to', 'pounds', 'total'],
 ['cups', 'or', 'more', 'flour'],
 ['pound',
  'boneless',
  'skinless',
  'chicken',
  'breasts',
  'cut',
  'into',
  '-inch',
  'strips'],
 ['cup', 'slivered', 'almonds'],
 ['Peanut', 'oil', 'for', 'frying'],
 ['mango', 'cut', 'into', 'thin', 'strips'],
 ['cup',
  'chopped',
  'fresh',
  'basil',
  'plus',
  'whole',
  'large',
  'basil',
  'leaves']]

In [29]:
# So we have a list of words and lists. 
# We need to iterated through the lists to get single words in one list.
separate_ingridents = []
for food in ingredients:
    for item in food:
        separate_ingridents.append(item)

In [30]:
# The first 8 ingredients
separate_ingridents[0:9]

['Garnish',
 'pomegranate',
 'seeds',
 'from',
 'pomegranate',
 'tablespoons',
 'dried',
 'chanterelle',
 'or']

In [31]:
# How many ingredients
len(separate_ingridents)

182402

In [32]:
# Get the unique ingredients and put them in a list. 
unique_ingredients = list(set(separate_ingridents))

In [33]:
# The first 8 ingredients
unique_ingredients[0:9]

['rim',
 'quick-rising',
 'High-speed',
 'Popovers',
 'delicata',
 'Porcini',
 'acini',
 'depending',
 'thickly']

In [34]:
# How many ingredients
len(unique_ingredients)

5592

### We reduced out 182,402 words in the ingridients to  5,592. This is still a lot.  Using a json food file that has food labels we can add these food labels to stopwords and use the Natural Language Tool Kit (nltk) to remove these words from our list. 

In [36]:
import json
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [37]:
food_data = json.load(open('validfood.json'))
food_data

{'hints': [{'food': {'label': 'Lima beans, immature seeds, raw',
    'uri': 'http://www.edamam.com/ontologies/edamam.owl#Food_11031'},
   'measures': [{'label': 'Pound',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_pound'},
    {'label': 'Kilogram',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_kilogram'},
    {'label': 'Ounce',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_ounce'},
    {'label': 'Gram',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_gram'},
    {'label': 'Cup',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_cup'},
    {'label': 'Salt spoon',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_salt_spoon'},
    {'label': 'Quart',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_quart'},
    {'label': 'Teaspoon',
     'uri': 'http://www.edamam.com/ontologies/edamam.owl#Measure_teaspoon'},
    {'label': 'Smidgen',
     'uri': 'http://www.edamam.com/o

In [38]:
# Get the food label from the json file and add them to a list
labels = []
for i in range(len(food_data["hints"][0]["measures"])):
    label = food_data["hints"][0]["measures"][i]["label"]
    labels.append(label.lower())

In [39]:
labels

['pound',
 'kilogram',
 'ounce',
 'gram',
 'cup',
 'salt spoon',
 'quart',
 'teaspoon',
 'smidgen',
 'drop',
 'gallon',
 'dash',
 'handful',
 'scoop',
 'bowl',
 'cubic inch',
 'pinch',
 'liter',
 'milliliter',
 'tablespoon',
 'fluid ounce',
 'bottle',
 'tad',
 'pint',
 'dessert spoon']

In [40]:
# Add the food labels to the stopwords
stopwords = nltk.corpus.stopwords.words('english')
for label in labels:
    stopwords.append(label)

In [46]:
# print the stopwords to make sure our words are there. 
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [47]:
# Remove stopword from the ingridients 

filtered_foods = []
for food in unique_ingredients:
    if food not in stopwords:
        filtered_foods.append(food)

In [51]:
# Get the first 8 foods
print(filtered_foods)

['rim', 'quick-rising', 'High-speed', 'Popovers', 'delicata', 'Porcini', 'acini', 'depending', 'thickly', 'Pattypan', 'heated', 'kirsch', 'Asian', 'k', 'speck', 'fluke', 'three', 'keeping', 'Pecans', 'prime', 'oblong', 'sheep', 'annatto', 'shiny', 'Test', 'chicharrón', 'core', 'fully', 'lepicerie', 'matchstick', 'vinaigrette', 'seasoning', 'Herb', 'one-cup', 'gutted', 'pickled', 'PHILADELPHIA', 'citrus-cranberry', 'flax', 'tomatoes', 'young', '¼', 'katsuobushi', 'Point', 'attachment', 'Chihuahua', 'walnuts', 'single-serve', 'Sautéed', 'bell-shaped', 'sauté', 'Sierra', 'crusts', 'oxtail', 'Mix', 'Maltaise', 'per', 'snapped', 'quills', 'Choy', 'framboise', 'eyes', 'peel', 'zest', 'uniform', 'pita', 'liverwurst', 'juniper', 'Garlic', 'feta', 'fry', 'Baby', 'plump', 'standing', 'poultry', 'Semi-pearled', 'tissue', 'applejack', 'Few', 'multigrain', 'omega-', 'Nocello', 'coast', 'meatloaf', '-g', 'tablespoons', 'sectioned', 'damson', 'Simple', 'Tangy', 'complete', 'Abruzzo', 'caciocavallo', 

In [52]:
# How many did we get?
len(filtered_foods)

5476

In [54]:
# Read in the some common foods csv to match the foods in our list. These could've been added
# to the stopwords are we could do this before the stopwords. 
foods_df = pd.read_csv('foodb_2017_06_29_csv/foods.csv',encoding='latin-1', low_memory=False)

In [55]:
foods_df.head()

Unnamed: 0,id,name,name_scientific,description,itis_id,wikipedia_id,wikipedia_id.1,picture_content_type,picture_file_size,picture_updated_at,...,food_subgroup,food_type,created_at,updated_at,creator_id,updater_id,export_to_afcdb,category,ncbi_taxonomy_id,export_to_foodb
0,1,Angelica,Angelica keiskei,Angelica is a genus of about 60 species of tal...,,Angelica,1.jpg,image/jpeg,111325.0,2012-04-20 09:29:57 UTC,...,Herbs,Type 1,2011-02-09 00:37:14 UTC,2017-06-27 17:13:48 UTC,,2.0,False,specific,357850.0,True
1,2,Savoy cabbage,Brassica oleracea var. sabauda,Savoy cabbage (Brassica oleracea convar. capit...,,Savoy cabbage,2.jpg,image/jpeg,155178.0,2012-04-20 09:39:54 UTC,...,Cabbages,Type 1,2011-02-09 00:37:15 UTC,2017-03-20 21:26:22 UTC,,,False,specific,1216010.0,True
2,3,Silver linden,Tilia argentea,Tilia tomentosa (Silver Lime in the UK and Sil...,,Tilia tomentosa,3.jpg,image/jpeg,56367.0,2012-04-20 09:41:25 UTC,...,Herbs,Type 1,2011-02-09 00:37:15 UTC,2015-10-02 21:24:39 UTC,,,False,specific,,True
3,4,Kiwi,Actinidia chinensis,"The kiwifruit, often shortened to kiwi in many...",506775.0,Kiwifruit,4.jpg,image/jpeg,110661.0,2012-04-20 09:32:21 UTC,...,Tropical fruits,Type 1,2011-02-09 00:37:15 UTC,2017-03-20 21:26:22 UTC,,,False,specific,3625.0,True
4,5,Allium (Onion),Allium,Allium haematochiton is a species of wild onio...,42634.0,Allium haematochiton,5.jpg,image/jpeg,341911.0,2012-04-20 09:37:44 UTC,...,Onion-family vegetables,Type 1,2011-02-09 00:37:15 UTC,2017-03-20 21:26:22 UTC,,,False,specific,4678.0,True


In [57]:
# Convert the food "name" column to a list
foods = foods_df['name'].tolist()
print(foods)

['Angelica', 'Savoy cabbage', 'Silver linden', 'Kiwi', 'Allium (Onion)', 'Garden onion', 'Leek', 'Garlic', 'Chives', 'Lemon verbena', 'Cashew nut', 'Pineapple', 'Dill', 'Custard apple', 'Wild celery', 'Peanut', 'Burdock', 'Horseradish', 'Tarragon', 'Mugwort', 'Asparagus', 'Oat', 'Star fruit', 'Brazil nut', 'Common beet', 'Borage', 'Chinese mustard', 'Swede', 'Rape', 'Common cabbage', 'Cauliflower', 'Brussel sprouts', 'Kohlrabi', 'Broccoli', 'Chinese cabbage', 'Turnip', 'Pigeon pea', 'Tea', 'Capers', 'Pepper (C. annuum)', 'Papaya', 'Safflower', 'Caraway', 'Pecan nut', 'Chestnut', 'Roman camomile', 'Chickpea', 'Endive', 'Chicory', 'Chinese cinnamon', 'Ceylon cinnamon', 'Watermelon', 'Lime', 'Lemon', 'Pummelo', 'Mandarin orange (Clementine, Tangerine)', 'Sweet orange', 'Coffee', 'Arabica coffee', 'Robusta coffee', 'Coriander', 'Common hazelnut', 'Saffron', 'Muskmelon', 'Cucumber', 'Cucurbita (Gourd)', 'Cumin', 'Turmeric', 'Quince', 'Lemon grass', 'Globe artichoke', 'Wild carrot', 'Japanes

In [58]:
# How many?
len(foods)

907

In [59]:
# Some of the compound words can be split because they would be part of the ingridents.
food_list = []
for food in foods:
    food_list.append(food.split())

In [61]:
# Iterate through to get each in a list.
common_foods = []
for foods in food_list:
    for food in foods:
        common_foods.append(food.lower())

In [62]:
len(common_foods)

1548

In [64]:
# Get only the unique foods.
separate_common_foods = list(set(common_foods))

In [65]:
len(separate_common_foods)

952

In [66]:
# Now, iterate through the 'filtered_foods'and if a food matches one that is in the
# 'separate_common_foods' then put that food in a list.

new_foods = []
for i in range(len(separate_common_foods)):
    for j in range(len(filtered_foods)):
        if separate_common_foods[i] == filtered_foods[j]:
            new_foods.append(filtered_foods[j])

In [67]:
# How many foods now?
len(new_foods)

458

In [68]:
print(new_foods)

['nectarine', 'jicama', 'snow', 'pudding', 'tostada', 'savory', 'products', 'guava', 'plantain', 'ground', 'domestic', 'pepper', 'scarlet', 'sugar', 'bar', 'cooking', 'cucumber', 'kelp', 'brown', 'sheep', 'kiwi', 'chip', 'mixed', 'filling', 'crab', 'raisin', 'grits', 'ravioli', 'citrus', 'dried', 'mint', 'de', 'tamarind', 'yam', 'custard', 'sorrel', 'barley', 'pecan', 'bamboo', 'gooseberry', 'amaranth', 'pot', 'rhubarb', 'dandelion', 'catfish', 'lingonberry', 'sorghum', 'salad', 'yellow', 'vegetable', 'baby', 'burdock', 'rum', 'mousse', 'black-eyed', 'apricot', 'kale', 'celeriac', 'turkey', 'cracker', 'flour', 'vitamin', 'dog', 'heart', 'plum', 'banana', 'boysenberry', 'cardamom', 'agave', 'maitake', 'dish', 'fruits', 'beech', 'buttermilk', 'corn', 'fig', 'strawberry', 'food', 'sherry', 'bread', 'cumin', 'hen', 'oregano', 'arepa', 'savoy', 'cereal', 'pan', 'quail', 'chives', 'parsnip', 'radish', 'dough', 'mandarin', 'juice', '+', 'arctic', 'grouper', 'almond', 'tomato', 'artichoke', 'p

In [70]:
# Convert the list to a dictionary
food_dict = {k: v for k, v in enumerate(new_foods)}
print(food_dict)

{0: 'nectarine', 1: 'jicama', 2: 'snow', 3: 'pudding', 4: 'tostada', 5: 'savory', 6: 'products', 7: 'guava', 8: 'plantain', 9: 'ground', 10: 'domestic', 11: 'pepper', 12: 'scarlet', 13: 'sugar', 14: 'bar', 15: 'cooking', 16: 'cucumber', 17: 'kelp', 18: 'brown', 19: 'sheep', 20: 'kiwi', 21: 'chip', 22: 'mixed', 23: 'filling', 24: 'crab', 25: 'raisin', 26: 'grits', 27: 'ravioli', 28: 'citrus', 29: 'dried', 30: 'mint', 31: 'de', 32: 'tamarind', 33: 'yam', 34: 'custard', 35: 'sorrel', 36: 'barley', 37: 'pecan', 38: 'bamboo', 39: 'gooseberry', 40: 'amaranth', 41: 'pot', 42: 'rhubarb', 43: 'dandelion', 44: 'catfish', 45: 'lingonberry', 46: 'sorghum', 47: 'salad', 48: 'yellow', 49: 'vegetable', 50: 'baby', 51: 'burdock', 52: 'rum', 53: 'mousse', 54: 'black-eyed', 55: 'apricot', 56: 'kale', 57: 'celeriac', 58: 'turkey', 59: 'cracker', 60: 'flour', 61: 'vitamin', 62: 'dog', 63: 'heart', 64: 'plum', 65: 'banana', 66: 'boysenberry', 67: 'cardamom', 68: 'agave', 69: 'maitake', 70: 'dish', 71: 'fru

## Summary for ingredients.
--- 
- There were 182,402 are words associated with the ingredients.
- Using regex qnd getting unique words these were reduced to 5,592 words associated with the ingredients.
- Using nltk, a common food list, and a food measurement list these 5,592 were reduced to 458 words associated with the ingredients. 