In [None]:
import json
with open('/app/data/epirecipes/full_format_recipes.json') as json_data:
    recipe_data = json.load(json_data)
filtered_data = [
    'Recipe for ' + x['title']+ ' | ' + ' '.join(x['directions'])
 for x in recipe_data
 if 'title' in x
 and x['title'] is not None
 and 'directions' in x
 and x['directions'] is not None
]

In [20]:
# Tokenization
import string 
import re 
import tensorflow as tf
from tensorflow.keras import layers
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r' \1 ', s)
    s= re.sub(' +', '', s)
    return s 

#Pad the punctuation marks, to treat them as separate words
text_data = [pad_punctuation(x) for x in filtered_data]
#Convert to a TensorFlow Dataset
text_dataset = tf.data.Dataset.from_tensor_slices(text_data).batch(32).shuffle(1000)

"""Create a Keras TextVectorization layer to convert text to lowercase, give the
most prevalent 10,000 words a corresponding integer token, and trim or pad the
sequence to 201 tokens long.
"""
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=10000,
    output_mode="int",
    output_sequence_length=200 + 1,
)
# Apply the TextVectorization layer to the training data.
vectorize_layer.adapt(text_dataset)
#The vocab variable stores a list of the word tokens.
vocab = vectorize_layer.get_vocabulary()


# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")


0: 
1: [UNK]
2: recipeforpâtebrisée|inalargebowlblendtheflour,thebutter,thevegetableshortening,andthesaltuntilthemixtureresemblesmeal.add3tablespoonsicewater,tossthemixtureuntilthewaterisincorporated,andformthedoughintoaball.kneadthedoughlightlywiththeheelofthehandagainstasmoothsurfaceforafewsecondstodistributethefatevenlyandre-formitintoaball.dustthedoughwithflourandchillit,wrappedinwaxpaper,for1hour.
3: coolingtemperature:80°f
4: recipeforyogurtandlemondressing|inthejar,combinetheyogurt,lemonjuice,andsalt.coverwiththelidandshaketoblend.tasteforseasoning.thedressingcanbeusedimmediately.(storethedressingintherefrigeratorforupto1week.shaketoblendagainbeforeusing.)
5: recipeforvanillaandalmondfrosting|usingelectricmixer,beatbutterinlargebowluntilfluffy.graduallybeatinsugar,thencreamandvanilla.dividebetween2bowls.mix1teaspoonvanillainto1bowlofbasefrosting.mixalmondextractintosecondbowlofbasefrosting.mixinyellowfoodcoloring,1dropatatime,untildesiredshadeisreached.
6: recipeforturkeygravy|a

In [None]:
# Display an example of a recipe
example_data = text_data[9]
print(example_data)
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

In [19]:
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_dataset.map(prepare_inputs)

In [13]:
import tensorflow as tf 
from tensorflow.keras import layers ,losses ,models 

#The Input layer does not need us to specify the sequence length in advance 
#(it can be flexible), so we use None as a placeholder.
inputs = layers.Input(shape=(None,) , dtype = 'int32')
#The Embedding layer requires two parameters, the size of the vocabulary 
#(10,000 tokens) and the dimensionality of the embedding vector (100).
x = layers.Embedding(10000,100)(inputs)
#The LSTM layers require us to specify the dimensionality of the
#hidden vector (128) , and we choose to return the full sequence of the
#hidden state.
x = layers.LSTM(128,return_sequences = True)(x)
# The Dense layer transforms the hidden states at each timestep into a
#vector of probabilities for the next token.
outputs = layers.Dense(10000, activation = 'softmax')(x)

lstm = models.Model(inputs,outputs)

loss_fn = losses.SparseCategoricalCrossentropy()
#The model is compiled with SparseCategoricalCrossentropy loss—this is the
#same as categorical cross-entropy, but is used when the labels are
# integers rather than one-hot encoded vectors.
lstm.compile("adam", loss_fn)


In [15]:
import numpy as np
from tensorflow.keras import callbacks
# The TextGenerator callback function
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [16]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")



In [17]:

# Tokenize starting prompt
text_generator = TextGenerator(vocab, lstm)


In [21]:
lstm.fit(
    train_ds,
    epochs=25,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)


Epoch 1/25
generated text:
recipe for 

Epoch 2/25
generated text:
recipe for 

Epoch 3/25
generated text:
recipe for 

Epoch 4/25
generated text:
recipe for 

Epoch 5/25
generated text:
recipe for 

Epoch 6/25
generated text:
recipe for 

Epoch 7/25
generated text:
recipe for 

Epoch 8/25
generated text:
recipe for 

Epoch 9/25
generated text:
recipe for 

Epoch 10/25
generated text:
recipe for 

Epoch 11/25
generated text:
recipe for 

Epoch 12/25
generated text:
recipe for 

Epoch 13/25
generated text:
recipe for 

Epoch 14/25
generated text:
recipe for 

Epoch 15/25
generated text:
recipe for 

Epoch 16/25
generated text:
recipe for 

Epoch 17/25
generated text:
recipe for 

Epoch 18/25
generated text:
recipe for 

Epoch 19/25
generated text:
recipe for 

Epoch 20/25
generated text:
recipe for 

Epoch 21/25
generated text:
recipe for 

Epoch 22/25
generated text:
recipe for 

Epoch 23/25
generated text:
recipe for [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [U

<keras.callbacks.History at 0x7fba769f4070>

In [None]:

stop = False 
recipe_for = 'recipe for'
while not stop:
    text = input('Which recipe do you want today?')
    if text == 'stop' or text == 'exit' :
        stop = True
    else:
        recipe = recipe_for + ' ' + text
        info = text_generator.generate(start_prompt =recipe,
                                       max_tokens = 100
                                       , temperature = 0.2)
        print('With the probabilities : \n ')
        print_probs(info ,vocab)
        print('Done!')
        

In [None]:
#Overall Stacked LSTMS 
text_in = layers.Input(shape = (None,))
embedding = layers.Embedding(total_words,embedding_size)(text_in)
x = layers.LSTM(n_units,return_sequences = True)(x)
x = layers.LSTM(n_units,return_sequences = True)(x)
probabilities = layers.Dense(10000, activation = 'softmax')(x)

model = models.Model(text_in , probavilities)

In [24]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | [UNK]


PROMPT: recipe for chocolate ice cream |
[UNK]:   	52.78%
:   	29.8%
removestringandthinlyslicepork;coverandsetaside.reheatstockandcooknoodles:   	1.48%
scatteroverthemintorbasiltoserve.:   	1.16%
smoothaspossibletogiveacreamymixture.oncethetartbaseshavesetandarefeelingfirm,:   	1.04%
--------



In [25]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | [UNK]


PROMPT: recipe for chocolate ice cream |
[UNK]:   	94.57%
:   	5.43%
removestringandthinlyslicepork;coverandsetaside.reheatstockandcooknoodles:   	0.0%
scatteroverthemintorbasiltoserve.:   	0.0%
smoothaspossibletogiveacreamymixture.oncethetartbaseshavesetandarefeelingfirm,:   	0.0%
--------

