In [1]:
import json
import os
import pathlib
import pickle
import tensorflow as tf
import re
import importlib
import zipfile
from tabulate import tabulate as Tabulate

In [7]:
database_dir = 'database_raw'


def get_recipe_parse_ready(recipe, f):
    return ((f + recipe).strip()).split("\n")


# function to read file and parse
def read_raw_file(filename: str):
    with open(f'./{database_dir}/{filename}', 'r') as file:
        file_read = file.read().strip().replace("ADVERTISEMENT", '')

        # get the separator
        current_f = file_read.split("\n")[0]
        split = file_read.split(current_f)[1:]

        return [(get_recipe_parse_ready(recipe, current_f)) for recipe in split]


files = (os.listdir(f'./{database_dir}/'))
data = []

for filename in files:
    parsed_file = read_raw_file(filename)
    data.extend(parsed_file)

print(len(data))


122930


In [8]:
print(data[1])

['MMMMM----- Recipe via Meal-Master (tm) v8.05', '', 'Title: Blackend Medallions of Salmon Over Blue Home Fries and Blueberry and Citrus Vinaigrette', '', '1/4 cup kosher salt', '1/4 cup cayenne pepper', '1/4 cup paprika', '1/4 cup garlic powder', '1/4 cup ground black pepper', '2 tablespoons onion powder', '2 tablespoons dried oregano', '2 tablespoons dried thyme', '4 ( 6 ounce) pieces of salmon', '6 tablespoons Cajun spice mix', '1 tablespoon peanut oil or vegetable oil', '1 pint blueberries, juiced', '1 orange, zested and juiced', '1 lemon, zested and juiced', '1 large shallot, diced', '1 cup olive oil', 'Salt and fresh ground pepper, to taste', '1 pound Blue Peruvian potatoes, peeled and diced', '1 medium onion, diced', '2 jalapenos, seeded and diced', '1 red pepper, seeded and diced', '2 tablespoons fresh thyme, picked', '1 tablespoon garlic, chopped', '1 1/2 cups peanut or vegetable oil', 'Salt, to taste', 'Fresh ground pepper to taste', 'For the Cajun spice mix, combine ingredie

In [13]:

import parse_recipe as RecipeParser
importlib.reload(RecipeParser)

recipes = [RecipeParser.parse_recipe(x) for x in data]
print(len(recipes))

122930


In [15]:
print(recipes[1].directions)

['Salt and fresh ground pepper, to taste 1 pound Blue Peruvian potatoes, peeled and diced 1 medium onion, diced 2 jalapenos, seeded and diced 1 red pepper, seeded and diced 2 tablespoons fresh thyme, picked 1 tablespoon garlic, chopped 1 1/2 cups peanut or vegetable oil Salt, to taste Fresh ground pepper to taste For the Cajun spice mix, combine ingredients and store in a jar until ready to use. For the salmon, heat a nonstick skillet to almost smoking. Coat 2 sides of salmon in spice mix. Add oil to skillet and place salmon, on a spiced side down in skillet, turn salmon to the other side when spices start to blacken. If you like well done, transfer skillet to a 375 degree preheated oven for 3 minutes. For the blueberry vinaigrette, In a small pan reduce the juices of the blueberries and orange by half. Let cool, add lemon juice and shallots, 1/2 teaspoon each, chopped zest of orange and lemon. Stir in olive oil and season with salt and pepper to taste. Bring vinaigrette to room temper

In [16]:
TITLE_STOP_WORD = '🆃🅸🆃🅻🅴\n'
INGREDIENTS_STOP_WORD = '🅸🅽🅶🆁🅴🅳🅸🅴🅽🆃🆂\n'
INSTRUCTIONS_STOP_WORD = '🅸🅽🆂🆃🆁🆄🅲🆃🅸🅾🅽🆂\n'


def recipe_to_string(recipe: RecipeParser.Recipe):
    # This string is presented as a part of recipes so we need to clean it up.

    title = recipe.title
    ingredients = recipe.ingredients
    instructions = recipe.directions

    ingredients_string = ''
    for ingredient in ingredients:
        if ingredient:
            ingredients_string += f'• {ingredient}\n'

    instructions_string = ''
    for instruction in instructions:
        if instruction:
            instructions_string += f'▪︎ {instruction}\n'

    return f'{TITLE_STOP_WORD}{title}\n\n{INGREDIENTS_STOP_WORD}{ingredients_string}\n{INSTRUCTIONS_STOP_WORD}{instructions_string}'


string_recipes = [recipe_to_string(recipe) for recipe in recipes]
print(len(string_recipes))


122930


In [24]:
print(len(string_recipes[1].split(" ")))


425


In [25]:
MAX_WORD_LENGTH = 300


def filter_recipes_by_length(recipe_test):
    return len((recipe_test.split(" "))) <= MAX_WORD_LENGTH


max_len_recipes = [
    recipe_text for recipe_text in string_recipes if filter_recipes_by_length(recipe_text)]
print(len(max_len_recipes))


91545


In [26]:
import nltk
nltk.download('punkt')
for i in range(len(max_len_recipes)):
    sentence = max_len_recipes[i]
    tokens = nltk.word_tokenize(sentence)
    nltk_data = " ".join(tokens)
    max_len_recipes[i] = nltk_data


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isonlaxman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
STOP_SIGN = '<STOP>'
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='\n.*',
    lower=True,
    split=' ',
    oov_token="<OOV>",
    char_level=False
)

# Stop word is not a part of recipes, but tokenizer must know about it as well.
tokenizer.fit_on_texts([STOP_SIGN])
tokenizer.fit_on_texts(max_len_recipes)
# tokenizer.get_config()


In [28]:
VOCABULARY_SIZE = len(tokenizer.word_counts) + 1
print(VOCABULARY_SIZE)


53573


In [36]:
# data_vectorized = tokenizer.texts_to_sequences(max_len_recipes)
# print(re.sub(' (?! )', '' , as_string))
print(max_len_recipes[1])


🆃🅸🆃🅻🅴 Peas and Pancetta 🅸🅽🅶🆁🅴🅳🅸🅴🅽🆃🆂 • 1/4 pou d ancetta , diced ( 1 cup ) • 1 shall t , sliced • 1 teasp on minced garlic 🅸🅽🆂🆃🆁🆄🅲🆃🅸🅾🅽🆂 ▪︎ Grated zest and juice of 1 orange 3 cups shelled fresh peas ( about 1 pound ) 1/4 cup chopped fresh parsley 1 tablespoon unsalted butter Kosher salt Cook the pancetta in a large skillet over medium-low heat until crispy , about 5 minutes . Add the shallot and garlic , cook 1 minute and then add the orange juice . Increase the heat to medium , add the peas and cook until tender , about 3 minutes . Remove the pan from the heat and fold in the orange zest , parsley and butter . Taste for seasoning and add salt if necessary , though you should n't need much , if any , because the pancetta adds a natural saltiness to the dish . Photograph by Yunhee Kim -- -- --


In [35]:
data_vectorized = tokenizer.texts_to_sequences(max_len_recipes)
# print(' '.join(tokenizer.sequences_to_texts(data_vectorized)))

In [37]:

data_padded = tf.keras.preprocessing.sequence.pad_sequences(
    data_vectorized,
    padding='post',
    truncating='post',
    maxlen=MAX_WORD_LENGTH - 1,
    value=tokenizer.texts_to_sequences([STOP_SIGN])[0]
)

data_padded = tf.keras.preprocessing.sequence.pad_sequences(
    data_padded,
    padding='post',
    truncating='post',
    maxlen=MAX_WORD_LENGTH + 1,
    value=tokenizer.texts_to_sequences([STOP_SIGN])[0]
)


In [41]:
# print(data_padded[2])
print(tokenizer.sequences_to_texts([data_padded[2]]))


['🆃🅸🆃🅻🅴 roasted sarandeado-style whole snapper 🅸🅽🅶🆁🅴🅳🅸🅴🅽🆃🆂 • 8 table po ns ( 1 stick ) cold unsalted but • 1 1/2 t 2 pounds whole thai snapper , cle • ter , cu ed divided • aned , h ad and tail intact 🅸🅽🆂🆃🆁🆄🅲🆃🅸🅾🅽🆂 ▪︎ salt and freshly ground black pepper 1 tablespoon garlic powder 1 tablespoon tomato bouillon 1 teaspoon bottled hot sauce ( recommended : huichol ) 2 limes , halved , plus additional for serving 1 orange 6 ( 6-inch ) corn tortillas preheat the oven to 350 degrees f line a large baking sheet with foil and dot with 2 tablespoons butter butterfly the fish by cutting almost in half horizontally and opening like a book carefully cut and remove the spine place the fish , skin-side down , on the foil roll up the edges of the foil to come close to the edge of the fish to prevent the juices from spilling onto the baking sheet sprinkle the fish with salt and pepper sprinkle the fish with the garlic powder and then with the tomato bouillon drizzle the fish with the hot sauce and then

In [44]:
# os.makedirs('./saved_data')
with open('./saved_data/tokenizer.pickle', 'wb+') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('./saved_data/data_padded.pickle', 'wb+') as handle:
    pickle.dump(data_padded, handle, protocol=pickle.HIGHEST_PROTOCOL)