In [1]:
#### load the data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json
with open("/content/drive/MyDrive/Colab Notebooks/Autoregressive/full_format_recipes.json") as json_data:
    recipe_data = json.load(json_data)

In [4]:
filtered_data =[
    'Recipe for' + x['title'] + '|' + ''.join(x['directions'])
    for x in recipe_data
    if 'title' in x
    and x['title'] is not None
    and 'directions' in x
    and x['directions'] is not None
]

In [5]:
filtered_data[1]

'Recipe forBoudin Blanc Terrine with Red Onion Confit |Combine first 9 ingredients in heavy medium saucepan. Add 3 shallots. Bring to simmer. Remove from heat, cover and let stand 30 minutes. Chill overnight.Preheat oven to 325°F. Line 7-cup pâté or bread pan with plastic wrap. Melt butter in heavy small skillet over low heat. Add remaining 5 shallots. Cover and cook until very soft, stirring occasionally, about 15 minutes. Transfer to processor. Add pork, eggs, flour and Port and puree. Strain cream mixture, pressing on solids to extract as much liquid as possible. With processor running, add cream through feed tube and process just until combined with pork. Transfer to large bowl. Mix in currants.Spoon mixture into prepared pan. Cover with foil. Place pan in large pan. Add boiling water to larger pan to within 1/2 inch of top of terrine. Bake until terrine begins to shrink from sides of pan and knife inserted into center comes out clean, about 1 1/2 hours. Uncover and cool on rack. C

In [6]:
#### tokenization

In [7]:
import re
import string
import tensorflow as tf
from tensorflow.keras import layers, models,losses


def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r' \1 ', s)
    s = re.sub(' +', ' ', s)
    return s

text_data = [pad_punctuation(x) for x in filtered_data]

text_ds = tf.data.Dataset.from_tensor_slices(text_data).batch(32).shuffle(1000)
vectorize_layer = layers.TextVectorization(
    standardize = 'lower',
    max_tokens = 10000,
    output_mode = "int",
    output_sequence_length = 200 + 1,
)

vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [8]:
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

train_ds = text_ds.map(prepare_inputs)

In [9]:
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(10000, 100)(inputs)
x = layers.LSTM(128, return_sequences=True)(x)
outputs = layers.Dense(10000, activation = 'softmax')(x)
lstm = models.Model(inputs, outputs)

loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)
lstm.fit(train_ds, epochs=25)

Epoch 1/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 43ms/step - loss: 4.9771
Epoch 2/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 43ms/step - loss: 3.0238
Epoch 3/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - loss: 2.4947
Epoch 4/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - loss: 2.2273
Epoch 5/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 45ms/step - loss: 2.0668
Epoch 6/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - loss: 1.9737
Epoch 7/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 45ms/step - loss: 1.8998
Epoch 8/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 45ms/step - loss: 1.8415
Epoch 9/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - loss: 1.7946
Epoch 10/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29

<keras.src.callbacks.history.History at 0x7f9d03cf41f0>

In [13]:

import numpy as np
from tensorflow.keras import callbacks

class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({'prompt': start_prompt , 'word_probs': probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + ' ' + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens = 100, temperature = 1.0)

In [None]:
# Instantiate the TextGenerator callback
text_gen_callback = TextGenerator(index_to_word=vocab)

# 6. Train the Model with the Callback
lstm.fit(train_ds, epochs=25, callbacks=[text_gen_callback])

Epoch 1/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



[1m428/629[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m9s[0m 46ms/step - loss: 1.4235Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-5336f993bbbd>", line 5, in <cell line: 5>
    lstm.fit(train_ds, epochs=25, callbacks=[text_gen_callback])
  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit
    callbacks.on_train_batch_end(step, logs)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/callbacks/callback_list.py", line 106, in on_train_batch_end
    callback.on_train_batch_end(batch, logs=logs)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/cal


KeyboardInterrupt

