In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import zipfile
import csv

###Loading Data

In [None]:
!wget https://eightportions.com/recipes_raw.zip
!unzip /content/recipes_raw.zip -d /content/datasets/

--2023-03-17 16:50:54--  https://eightportions.com/recipes_raw.zip
Resolving eightportions.com (eightportions.com)... 104.21.4.85, 172.67.131.221, 2606:4700:3033::6815:455, ...
Connecting to eightportions.com (eightportions.com)|104.21.4.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53355492 (51M) [application/zip]
Saving to: ‘recipes_raw.zip’


2023-03-17 16:50:54 (127 MB/s) - ‘recipes_raw.zip’ saved [53355492/53355492]

Archive:  /content/recipes_raw.zip
  inflating: /content/datasets/recipes_raw_nosource_ar.json  
  inflating: /content/datasets/recipes_raw_nosource_epi.json  
  inflating: /content/datasets/recipes_raw_nosource_fn.json  
  inflating: /content/datasets/LICENSE  


In [None]:
# read data files
file_names = ['recipes_raw_nosource_ar.json', 'recipes_raw_nosource_epi.json', 'recipes_raw_nosource_fn.json']

data_keys = None
recipe_dataset = []
for file_name in file_names:
  file_path = '/content/datasets/' + file_name
  data_file = json.load(open(file_path))
  data_items = list(data_file.values())
  recipe_dataset += data_items

  if data_keys == None: 
    data_keys = list(data_items[0].keys())

In [None]:
print(data_keys)
print("Length of recipe dataset: ", len(recipe_dataset))

['title', 'ingredients', 'instructions', 'picture_link']
Length of recipe dataset:  125164


### Data Processing


*  Remove incomplete data
*  Remove recipes whose ingredient lists does not contain targeted vegetables (recipes not related to this project)
*  Remove junk words in the data (i.e., the word "ADVERTISEMENT")
*  Generating a list of tags for each recipe for faster query search. Tags will be the targeted vegetables in ingredients of the recipe


In [None]:
# target vegetables
vegetable_classes = ['asparagus', 'bell pepper', 'broccoli', 'cabbage', 'carrot', 'celery', 'chilli pepper', 'corn', 'cucumber', 'eggplant', 
                     'lettuce', 'mushroom', 'onion', 'peas', 'potato', 'pumpkin', 'raddish', 'spinach', 'sweet potato', 'tomato']


units = ['tablespoons', 'teaspoons', 'cans', 'packages', 'ounces', 'pieces', 'cups', 'pounds', 'inches', 'slices', 'halves']

action_words = ['torned', 'crushed', 'chopped', 'softened', 'beaten', 'mashed', 'peeled', 'cut', 'divided', 'minced', 'tasted', 'sliced', 'melted', 
                'packed', 'diced', 'shredded', 'grated', 'cooked', 'rinsed', 'drained', 'prepared']

common_words = ['of', 'finely', 'and', 'into', 'to', 'or', 'for', 'only', 'part', 'large', 'small', 'cube', 'optional', 'with', 'cooking']

non_keywords = units + action_words + common_words
print(non_keywords)


['tablespoons', 'teaspoons', 'cans', 'packages', 'ounces', 'pieces', 'cups', 'pounds', 'inches', 'slices', 'halves', 'torned', 'crushed', 'chopped', 'softened', 'beaten', 'mashed', 'peeled', 'cut', 'divided', 'minced', 'tasted', 'sliced', 'melted', 'packed', 'diced', 'shredded', 'grated', 'cooked', 'rinsed', 'drained', 'prepared', 'of', 'finely', 'and', 'into', 'to', 'or', 'for', 'only', 'part', 'large', 'small', 'cube', 'optional', 'with', 'cooking']


In [None]:
# seasonings and spices
# seasoning = ['salt', 'vinegar', 'black pepper', 'garlic powder', 'oregano', 'basil', 'parsley', 'vanilla extract', 'thyme', 'rosemary',
#              'curry powder', 'ground ginger', 'ground cumin', 'ground cinnamon', 'red wine', 'almond extract']

# common ingredient 
# common_ingredients = ['buttter', 'cheese', 'cream cheese', 'brown sugar', 'vinegar', 'soy sauce', 'salt', 'white sugar', 'milk', 'sour cream',
#                       'egg', 'honey', 'plain yogurt', 'lemon juice', 'whole wheat flour', 'all-purpose flour', 'white rice', 'bread crumbs',
#                       'applesauce', 'spaghetti sauce', 'baking powder', 'baking soda', 'peanut butter', ]

# meat = ['shrimp', 'chicken', 'beef', 'pork', 'salmon', 'turkey', 'bacon']

Remove incomplete data

In [None]:
def is_incomplete(recipe):
  title = recipe.get('title')
  ingredients = recipe.get('ingredients')
  instructions = recipe.get('instructions')

  if title == None or ingredients == None or instructions == None or len(recipe['ingredients']) == 0:
    return False
  
  return True

In [None]:
num_incomplete = 0

# loop 3 times to make sure all the incomplete recipes are removed
for i in range(0,3):
  for recipe in recipe_dataset:
    if is_incomplete(recipe) == False:
      recipe_dataset.remove(recipe)
      num_incomplete += 1

print("Number of incomplete recipes: ", num_incomplete)
print("Length of recipe dataset: ", len(recipe_dataset))

Number of incomplete recipes:  2211
Length of recipe dataset:  122953


Remove recipes whose ingredient lists does not contain targeted vegetables

In [None]:
def contains_ingredient(recipe, target_list):
  for ingredient in recipe.get('ingredients'):
    for target in target_list:
      if target in ingredient:
        return True

  return False

In [None]:
print("Length of recipe dataset (before processing): ", len(recipe_dataset))

num_missing_ingredients = 0
for i, recipe in enumerate(recipe_dataset):
  if i % 10000 == 0:
    print("Completed: ", i, "/122953")
  if contains_ingredient(recipe, vegetable_classes) == False:
    recipe_dataset.remove(recipe)
    num_missing_ingredients += 1

print("Number of recipes that does not contain targeted vegetables: ", num_missing_ingredients)
print("Length of recipe dataset (after processing): ", len(recipe_dataset))

Length of recipe dataset (before processing):  122953
Completed:  0 /122953
Completed:  10000 /122953
Completed:  20000 /122953
Completed:  30000 /122953
Completed:  40000 /122953
Completed:  50000 /122953
Completed:  60000 /122953
Completed:  70000 /122953
Completed:  80000 /122953
Number of recipes that does not contain targeted vegetables:  38220
Length of recipe dataset (after processing):  84733


Remove junk words in the data (ex. ADVERTISEMENT) and for the vegetables that can take on multiple names, replace the names with the our "standard" ones for consistency

In [None]:
target_word = "ADVERTISEMENT"
alternate_names = {
  "chile pepper": "chili pepper",
  "capsicum": "bell pepper",
  "brinjal": "eggplant",
  "fungi" : "mushroom"
}

def remove_junk(recipe):
  updated_recipe = []
  for ingredient in recipe.get('ingredients'):
    ingredient = ingredient.replace(target_word, "")
    for key in alternate_names:
      ingredient = ingredient.replace(key, alternate_names[key])
    updated_recipe.append(ingredient)
  # print(updated_recipe)
  return updated_recipe

In [None]:
i = 0
for recipe in recipe_dataset:
  new_recipe = remove_junk(recipe)
  recipe_dataset[i]["ingredients"] = new_recipe
  i = i + 1
print("Updated ingredients field for first object: ", recipe_dataset[0]["ingredients"])


Updated ingredients field for first object:  ['4 skinless, boneless chicken breast halves ', '2 tablespoons butter ', '2 (10.75 ounce) cans condensed cream of chicken soup ', '1 onion, finely diced ', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ', '']


# Adding an Additional Veggies Field to Help with Later Querying

In [None]:
import re

for recipe in recipe_dataset:
  veggies_list = ""
  keywords = ""

  for ingredient in recipe.get('ingredients'):
    for target in vegetable_classes:
      if target in ingredient and target not in veggies_list:
        if not veggies_list:
          veggies_list = target
        else:
          veggies_list = veggies_list + "\n" + target
        continue
  recipe["veggies"] = veggies_list

  for ingredient in recipe.get('ingredients'):
    temp = re.findall(r'\w+', ingredient)
    for item in temp:
      if item.isalpha() and item not in keywords and item not in vegetable_classes:
        for word in non_keywords:
          match_found = False
          if item.lower() in word:
            match_found = True
            break
        if match_found == False:
          if not keywords:
            keywords = item
          else:
            keywords = keywords + "\n" + item     
      # if item is of type char and not in the non_keywords list
      # if item.isalpha() and item not in keywords and item.lower() not in non_keywords:
        
  recipe["keywords"] = keywords

print(recipe_dataset[0])
print(recipe_dataset[20])

{'title': 'Slow Cooker Chicken and Dumplings', 'ingredients': ['4 skinless, boneless chicken breast halves ', '2 tablespoons butter ', '2 (10.75 ounce) cans condensed cream of chicken soup ', '1 onion, finely diced ', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ', ''], 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n', 'picture_link': '55lznCYBbs2mT8BTx6BTkLhynGHzM.S', 'veggies': 'onion', 'keywords': 'skinless\nboneless\nchicken\nbreast\nbutter\ncondensed\ncream\nsoup\nrefrigerated\nbiscuit\ndough'}
{'title': 'Baked Teriyaki Chicken', 'ingredients': ['1 tablespoon cornstarch ', '1 tablespoon cold water ', '1/2 cup white sugar ', '1/2 cup soy sauce ', '1/4 cup cider vinegar ', '1 clove garlic, minced ', '1/2 teaspoon grou

In [None]:
print(recipe_dataset[158])

{'title': "Cha Cha's White Chicken Chili", 'ingredients': ['1 tablespoon vegetable oil ', '1 onion, chopped ', '3 cloves garlic, crushed ', '1 (4 ounce) can diced jalapeno peppers ', '1 (4 ounce) can chopped green chilli peppers ', '2 teaspoons ground cumin ', '1 teaspoon dried oregano ', '1 teaspoon ground cayenne pepper ', '2 (14.5 ounce) cans chicken broth ', '3 cups chopped cooked chicken breast ', '3 (15 ounce) cans white beans ', '1 cup shredded Monterey Jack cheese ', ''], 'instructions': 'Heat the oil in a large saucepan over medium-low heat. Slowly cook and stir the onion until tender. Mix in the garlic, jalapeno, green chile peppers, cumin, oregano and cayenne. Continue to cook and stir the mixture until tender, about 3 minutes. Mix in the chicken broth, chicken and white beans. Simmer 15 minutes, stirring occasionally.\nRemove the mixture from heat. Slowly stir in the cheese until melted. Serve warm.\n', 'picture_link': 'CAL0XavDT2gYTpAMyCDQARVTZ4DflD.', 'veggies': 'onion', 

### Saving as CSV file


*   Ingredients are given as a list, need to reformat before saving to csv file



In [None]:
headers = ['title', 'veggies', 'keywords', 'ingredients', 'instructions', 'picture_link']
num_recipes = 300

csv_path = "/content/drive/MyDrive/APS360 Project/recipe_data.csv"

count = 0
with open(csv_path, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()

    for row in recipe_dataset:
      if count > num_recipes:
        break
      # only write row if veggies list is non-empty
      if row["veggies"]:
        writer.writerow(row)
        count = count + 1
