In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import platform
import time
import pathlib
import os
import json
import zipfile

In [3]:
# Working directory for the downloaded and extracted dataset files.
cache_dir = './tmp'
# parents=True keeps this cell working if cache_dir is changed to a nested path;
# exist_ok=True makes the cell safe to re-run.
pathlib.Path(cache_dir).mkdir(parents=True, exist_ok=True)

In [8]:
# File name to store the downloaded archive under, and the URL it is fetched from.
dataset_file_name = 'recipes_raw.zip'
dataset_file_origin = 'https://storage.googleapis.com/recipe-box/recipes_raw.zip'

In [9]:
# Download the archive from dataset_file_origin into cache_dir and unpack it as a zip.
# NOTE(review): the directory listing in the following cell shows the extracted JSON
# files next to the zip under ./tmp/datasets/; the extraction layout may differ between
# Keras versions - confirm before upgrading.
dataset_file_path = tf.keras.utils.get_file(
    fname=dataset_file_name,
    origin=dataset_file_origin,
    cache_dir=cache_dir,
    extract=True,
    archive_format='zip'
)

Downloading data from https://storage.googleapis.com/recipe-box/recipes_raw.zip


In [10]:
# List the download directory to confirm the archive and the extracted files are present.
!ls -la ./tmp/datasets/

total 251968
drwxr-xr-x 2 root root     4096 Nov 26 10:09 .
drwxr-xr-x 3 root root     4096 Nov 26 10:08 ..
-rw-r--r-- 1 root root    20437 Nov 26 10:09 LICENSE
-rw-r--r-- 1 root root 49784325 Nov 26 10:08 recipes_raw_nosource_ar.json
-rw-r--r-- 1 root root 61133971 Nov 26 10:08 recipes_raw_nosource_epi.json
-rw-r--r-- 1 root root 93702755 Nov 26 10:09 recipes_raw_nosource_fn.json
-rw-r--r-- 1 root root 53355492 Nov 26 10:08 recipes_raw.zip


In [12]:
def load_dataset(silent=False, dataset_dir=None):
  """Load all recipes from the extracted JSON files into one flat list.

  Each file is parsed as a JSON object; its values are appended to the result
  in file order and its keys are discarded.

  Args:
    silent: Accepted for backward compatibility; it currently has no effect.
    dataset_dir: Directory that contains the JSON files. When None, the
      files are read from f'{cache_dir}/datasets'.

  Returns:
    A list with the values of every JSON object, concatenated in the order
    the files are listed below. A file holding an empty object contributes
    nothing.

  Raises:
    FileNotFoundError: If one of the expected files is missing.
    json.JSONDecodeError: If a file does not contain valid JSON.
  """
  dataset_file_names = [
      'recipes_raw_nosource_ar.json',
      'recipes_raw_nosource_epi.json',
      'recipes_raw_nosource_fn.json'
  ]

  if dataset_dir is None:
    dataset_dir = f'{cache_dir}/datasets'

  dataset = []

  for file_name in dataset_file_names:
    file_path = os.path.join(dataset_dir, file_name)

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_path, encoding='utf-8') as dataset_file:
      json_data_dict = json.load(dataset_file)

    dataset.extend(json_data_dict.values())

  return dataset


In [13]:
# Read all source files into one list of raw recipe records.
dataset_raw = load_dataset()

In [14]:
# Look at the first record to see which fields a raw recipe carries.
dataset_raw[0]

{'ingredients': ['4 skinless, boneless chicken breast halves ADVERTISEMENT',
  '2 tablespoons butter ADVERTISEMENT',
  '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT',
  '1 onion, finely diced ADVERTISEMENT',
  '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT',
  'ADVERTISEMENT'],
 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n',
 'picture_link': '55lznCYBbs2mT8BTx6BTkLhynGHzM.S',
 'title': 'Slow Cooker Chicken and Dumplings'}

In [15]:
# Total number of raw records loaded.
len(dataset_raw)

125164

In [24]:
def recipe_validate_required_fields(recipe):
  """Check that a recipe has a non-empty title, ingredients and instructions.

  Args:
    recipe: A recipe dict, or a falsy value (None, {}) for a missing record.

  Returns:
    True if every required key is present and maps to a truthy value;
    False if the recipe itself is falsy, or if any required key is missing
    or maps to an empty value (None, '', [], ...).
  """
  required_keys = ['title', 'ingredients', 'instructions']

  if not recipe:
    return False

  # .get() treats a missing key like an empty value instead of raising KeyError.
  # The truthiness test already rejects empty lists, so no separate length check is needed.
  return all(recipe.get(required_key) for required_key in required_keys)

In [29]:
# Keep only the records that pass the required-fields check.
valid_datasets = list(filter(recipe_validate_required_fields, dataset_raw))

In [31]:
# Number of records that remain after dropping incomplete recipes.
len(valid_datasets)

122938

In [32]:
# Marker strings that recipe_to_str places in front of the title, the ingredients
# section and the instructions section of each rendered recipe.
STOP_WORD_TITLE = '📗 '
STOP_WORD_INGREDIENTS = '\n🥕\n\n'
STOP_WORD_INSTRUCTIONS = '\n📝\n\n'

In [33]:
def recipe_to_str(recipe):
  """Render a recipe dict as a single text block.

  The result is the title, then the ingredients section, then the
  instructions section, each introduced by its STOP_WORD_* marker. Every
  ingredient and every instruction line becomes one '* ' bullet line.

  Args:
    recipe: Dict with 'title' (str), 'ingredients' (list of str) and
      'instructions' (str with one step per line).

  Returns:
    The rendered recipe string.

  Raises:
    KeyError: If one of the three keys is missing from recipe.
  """
  # Residue found in the raw data (see the sample record) that must not reach the output.
  noise_string = 'ADVERTISEMENT'

  def _clean(lines):
    # Remove the noise marker, trim the whitespace it leaves behind, drop empty lines.
    stripped = (line.replace(noise_string, '').strip() for line in lines)
    return [line for line in stripped if line]

  title = recipe['title']
  ingredients = _clean(recipe['ingredients'])
  instructions = _clean(recipe['instructions'].split('\n'))

  ingredients_str = ''.join(f'* {ingredient}\n' for ingredient in ingredients)
  # Same bullet format as the ingredients, for a consistent layout.
  instructions_str = ''.join(f'* {instruction}\n' for instruction in instructions)

  return f'{STOP_WORD_TITLE}{title}\n{STOP_WORD_INGREDIENTS}{ingredients_str}{STOP_WORD_INSTRUCTIONS}{instructions_str}'

