# Importing the important libraries

In [None]:
# Packages for training the model and working with the dataset.
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

# Utility/helper packages.
import platform
import time
import pathlib
import os
import re

In [None]:
# Load the recipes CSV and keep only its first 10 rows.
# NOTE(review): the path is absolute and machine-specific — confirm it exists
# before running this cell on another machine.
recipes_csv_path = r'C:\Users\Ayush Mourya\OneDrive\Desktop\IIITD\Novel Recipe Generation\All CSVs\recipe_over_2m.csv'
dataset_raw = pd.read_csv(recipes_csv_path).iloc[:10]
dataset_raw.head()

In [None]:
dataset_raw.info()

In [None]:
# Materialize the (index, row) pairs yielded by iterrows() into a list.
dataset_validated = list(dataset_raw.iterrows())

In [None]:
STOP_WORD_TITLE = '📕 '
STOP_WORD_INGREDIENTS = '\n🥩\n\n'
STOP_WORD_INSTRUCTIONS = '\n✍️\n\n'

In [None]:
def recipe_to_string(recipe):
    """Render one recipe row as a single training string.

    Args:
        recipe: An ``(index, row)`` pair as produced by
            ``DataFrame.iterrows()``. The row must provide ``title``,
            ``ingredients`` and ``directions``; the latter two hold a list
            serialized as text (e.g. ``"['a', 'b']"``).

    Returns:
        The title, the ingredient bullets and the instruction bullets joined
        with the ``STOP_WORD_*`` section markers.
    """
    # Local import: ``ast`` is only needed by this function.
    import ast

    def parse_items(raw):
        # Parse the serialized list properly so that commas and apostrophes
        # inside one item (e.g. "1 c. flour, sifted") stay intact.
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(item) for item in parsed if item]
        # Fallback for text that is not a valid list literal: strip the square
        # brackets, split on ', ' and drop single quotes.
        return [item.replace("'", "") for item in raw.strip('[]').split(', ') if item]

    row = recipe[1]
    title = row['title']

    ingredients_string = ''.join(
        f'• {ingredient}\n' for ingredient in parse_items(row['ingredients'])
    )
    instructions_string = ''.join(
        f'▪︎ {instruction}\n' for instruction in parse_items(row['directions'])
    )

    return f'{STOP_WORD_TITLE}{title}\n{STOP_WORD_INGREDIENTS}{ingredients_string}{STOP_WORD_INSTRUCTIONS}{instructions_string}'

# Convert every validated row into its flattened text form.
dataset_stringified = list(map(recipe_to_string, dataset_validated))

# Preview the first five stringified recipes, numbered from 1.
for recipe_number, recipe_string in enumerate(dataset_stringified[:5], start=1):
    print(f'Recipe #{recipe_number}\n---------')
    print(recipe_string)
    print('\n')

This line prints the 4th element (index 3) of the `dataset_stringified` list to the console.

In [None]:
print(dataset_stringified[3])

In [None]:
# Length of every stringified recipe, in the same order as dataset_stringified.
recipes_lengths = list(map(len, dataset_stringified))

In [None]:
plt.hist(recipes_lengths, bins=50)
plt.show()

In [None]:
plt.hist(recipes_lengths,range=(0,2000), bins=50)
plt.show()

In [None]:
MAX_RECIPE_LENGTH = 750

In [None]:
def filter_max_recipes_by_length(recipe_test):
    """Return True when len(recipe_test) does not exceed MAX_RECIPE_LENGTH."""
    return len(recipe_test) <= MAX_RECIPE_LENGTH

# Keep only the recipes that pass the maximum-length check.
dataset_max_filtered = list(filter(filter_max_recipes_by_length, dataset_stringified))

size_before_max_filter = len(dataset_stringified)
size_after_max_filter = len(dataset_max_filtered)
print('Dataset size BEFORE filtering length: ', size_before_max_filter)
print('Dataset size AFTER filtering length: ', size_after_max_filter)
print('Number of eliminated recipes length: ', size_before_max_filter - size_after_max_filter)

In [None]:
plt.hist(recipes_lengths,range=(0,750), bins=50)
plt.show()

In [None]:
MIN_RECIPE_LENGTH = 250

In [None]:
def filter_min_recipes_by_length(recipe_test):
    """Return True when len(recipe_test) is at least MIN_RECIPE_LENGTH."""
    return len(recipe_test) >= MIN_RECIPE_LENGTH

# Keep only the recipes that pass the minimum-length check.
dataset_filtered = list(filter(filter_min_recipes_by_length, dataset_max_filtered))

size_before_min_filter = len(dataset_max_filtered)
size_after_min_filter = len(dataset_filtered)
print('Dataset size BEFORE filtering length: ', size_before_min_filter)
print('Dataset size AFTER filtering length: ', size_after_min_filter)
print('Number of eliminated recipes length: ', size_before_min_filter - size_after_min_filter)

In [None]:
plt.hist(recipes_lengths,range=(250,750), bins=50)
plt.show()

In [None]:
# Indicator of the end of the recipe.
STOP_SIGN = '␣'

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,
    filters='',
    lower=False,
    split=''
)

In [None]:
# Stop word is not a part of recipes, but tokenizer must know about it as well.
tokenizer.fit_on_texts([STOP_SIGN])

In [None]:
tokenizer.fit_on_texts(dataset_filtered)

In [None]:
tokenizer.get_config()

In [None]:
# Adding +1 to take into account a special unassigned 0 index.
VOCABULARY_SIZE = len(tokenizer.word_counts) + 1

print('VOCABULARY_SIZE: ', VOCABULARY_SIZE)

In [None]:
print(tokenizer.index_word[21])

This code looks up the index of the character `m` in the tokenizer's vocabulary.

In [None]:
tokenizer.word_index['m']

In [None]:
# For the demo application we need the vocabulary as an array of characters:
# decode every index from 0 to VOCABULARY_SIZE - 1 as its own one-token sequence.
single_index_sequences = [[word_index] for word_index in range(VOCABULARY_SIZE)]
array_vocabulary = tokenizer.sequences_to_texts(single_index_sequences)
print(list(array_vocabulary))

In [None]:
tokenizer.texts_to_sequences(['🥩 meat'])

In [None]:
dataset_vectorized = tokenizer.texts_to_sequences(dataset_filtered)

print('Vectorized dataset size', len(dataset_vectorized))

In [None]:
def recipe_sequence_to_string(recipe_sequence):
    """Decode a sequence of token indices with the tokenizer and print the text.

    Args:
        recipe_sequence: Sequence of token indices for one recipe.
    """
    # sequences_to_texts works on a batch, so a one-element batch is passed
    # and its only result is taken with [0].
    decoded_batch = tokenizer.sequences_to_texts([recipe_sequence])
    spaced_text = decoded_batch[0]
    # Drop every single whitespace character that sits directly between two
    # non-whitespace characters (presumably the separators the tokenizer
    # inserts between characters — TODO confirm) ...
    without_separators = re.sub(r'(?<=\S)\s(?=\S)', '', spaced_text)
    # ... then turn each run of three spaces into a single space.
    print(without_separators.replace("   ", " "))

In [None]:
# Preview one vectorized recipe. Index 99 is clamped to the last available
# recipe so the preview also works when fewer than 100 recipes were loaded.
preview_index = min(99, len(dataset_vectorized) - 1)
recipe_sequence_to_string(dataset_vectorized[preview_index])

### Add padding to sequences

This code iterates over the first 20 elements of the list `dataset_vectorized` and prints the length of each one.

In [None]:
for recipe_index, recipe in enumerate(dataset_vectorized[:20]):
    print('Recipe #{} length: {}'.format(recipe_index + 1, len(recipe)))

This code displays the value of `MAX_RECIPE_LENGTH`, the maximum allowed length of a recipe.

In [None]:
MAX_RECIPE_LENGTH

In [None]:
# Pad (or truncate) every recipe at the end to MAX_RECIPE_LENGTH - 1 tokens,
# filling with the stop sign.
# texts_to_sequences returns one sequence per input text; [0][0] picks the
# single index of STOP_SIGN so that `value` is a scalar, not a list.
dataset_vectorized_padded_without_stops = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized,
    padding='post',
    truncating='post',
    maxlen=MAX_RECIPE_LENGTH-1,
    value=tokenizer.texts_to_sequences([STOP_SIGN])[0][0]
)

We pad twice, first to `MAX_RECIPE_LENGTH - 1` above and then to `MAX_RECIPE_LENGTH + 1` below, to make sure that every recipe has at least one stop sign at the end.

In [None]:
# Pad again at the end, this time to MAX_RECIPE_LENGTH + 1 tokens, filling
# with the stop sign.
# texts_to_sequences returns one sequence per input text; [0][0] picks the
# single index of STOP_SIGN so that `value` is a scalar, not a list.
dataset_vectorized_padded = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized_padded_without_stops,
    padding='post',
    truncating='post',
    maxlen=MAX_RECIPE_LENGTH+1,
    value=tokenizer.texts_to_sequences([STOP_SIGN])[0][0]
)

This code iterates over the first 20 elements of `dataset_vectorized_padded` and prints the length of each one.

In [None]:
# Print the padded length of the first 20 recipes, numbered from 1 like the
# equivalent listing of the unpadded recipes.
for recipe_index, recipe in enumerate(dataset_vectorized_padded[:20]):
    print('Recipe #{} length: {}'.format(recipe_index + 1, len(recipe)))

### Create TensorFlow dataset

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(dataset_vectorized_padded)

This line prints a summary of the dataset object, such as the shape and data type of its elements.

In [None]:
print(dataset)

In [None]:
# Show the first recipe three ways: the dataset object, the raw token
# indices, and the decoded text.
first_recipe_view = dataset.take(1)
print('Recipe in tensorflow:\n', first_recipe_view, '\n\n\n')
for recipe in first_recipe_view:
    recipe_values = recipe.numpy()
    print('Raw recipe:\n', recipe_values, '\n\n\n')
    print('Stringified recipe:\n')
    recipe_sequence_to_string(recipe_values)


In [None]:
def split_input_target(recipe):
    """Split a sequence into an (input, target) pair shifted by one position.

    Args:
        recipe: Any sliceable sequence.

    Returns:
        A tuple ``(recipe[:-1], recipe[1:])``: the sequence without its last
        element, and the sequence without its first element.
    """
    return recipe[:-1], recipe[1:]

In [None]:
dataset_targeted = dataset.map(split_input_target)

print(dataset_targeted)

In [None]:
def split_train_test_data(dataset, train_ratio):
    """Split a dataset into leading training elements and trailing test elements.

    Args:
        dataset: Object supporting ``len()``, ``take()`` and ``skip()`` whose
            results expose ``as_numpy_iterator()``.
        train_ratio: Fraction of the elements that goes to the training part;
            the element count is truncated to an integer.

    Returns:
        A tuple ``(data_train, data_test)`` of lists: the first
        ``int(len(dataset) * train_ratio)`` elements and the remaining ones.
    """
    train_count = int(len(dataset) * train_ratio)
    train_part = dataset.take(train_count).as_numpy_iterator()
    test_part = dataset.skip(train_count).as_numpy_iterator()
    return list(train_part), list(test_part)

In [None]:
data_text_list = list(dataset_targeted)

data_train, data_test = split_train_test_data(dataset_targeted, 0.7)

In [None]:
data_train_subset = data_train[:1]  # Take the first element from the list

for input_example, target_example in data_train_subset:
    print('Input sequence size:', repr(len(input_example)))
    print('Target sequence size:', repr(len(target_example)))
    print()

    input_stringified = tokenizer.sequences_to_texts([input_example[:50]])[0]
    target_stringified = tokenizer.sequences_to_texts([target_example[:50]])[0]

    print('Input:  ', repr(''.join(input_stringified)))
    print('Target: ', repr(''.join(target_stringified)))

In [None]:
len( data_test)

In [None]:
len( data_train)

In [None]:
def transform_element(input_target):
    """Unpack an (input, target) pair and drop trailing axes of size 1.

    Args:
        input_target: Indexable object whose items 0 and 1 are the input and
            target sequences; each must expose ``.shape``.

    Returns:
        A tuple ``(input_sequence, target_sequence)``. A sequence is squeezed
        along its last axis only when that axis has size 1.
    """
    def drop_trailing_unit_axis(sequence):
        # tf.squeeze is applied only when the shape is compatible.
        if sequence.shape[-1] == 1:
            return tf.squeeze(sequence, axis=-1)
        return sequence

    input_sequence = drop_trailing_unit_axis(input_target[0])
    target_sequence = drop_trailing_unit_axis(input_target[1])
    return input_sequence, target_sequence

In [None]:
#for input_example, target_example in dataset_targeted.take(1):
 #   print('Input sequence size:', repr(len(input_example.numpy())))
  #  print('Target sequence size:', repr(len(target_example.numpy())))
   # print()

    #input_stringified = tokenizer.sequences_to_texts([input_example.numpy()[:50]])[0]
    #target_stringified = tokenizer.sequences_to_texts([target_example.numpy()[:50]])[0]

    
    #print('Input:  ', repr(''.join(input_stringified)))
    
    #print('Target: ', repr(''.join(target_stringified)))

In [None]:
#for i, (input_idx, target_idx) in enumerate(zip(input_example[:10], target_example[:10])):
 #   print('Step {}:'.format(i + 1))
  #  print('  input: {} ({:s})'.format(input_idx, repr(tokenizer.sequences_to_texts([[input_idx.numpy()]])[0])))
   # print('  expected output: {} ({:s})'.format(target_idx, repr(tokenizer.sequences_to_texts([[target_idx.numpy()]])[0])))

### Split up the dataset into batches

 This displays a summary of the targeted dataset, such as the shapes and data types of its elements.

In [None]:
print(dataset_targeted)

In [None]:
# Batch size.
BATCH_SIZE = 64

# Buffer size to shuffle the dataset (TF data is designed to work
# with possibly infinite sequences, so it doesn't attempt to shuffle
# the entire sequence in memory. Instead, it maintains a buffer in
# which it shuffles elements).
SHUFFLE_BUFFER_SIZE = 1000

# Train only on the leading len(data_train) elements, i.e. the same elements
# that split_train_test_data put into the training split. Using the whole of
# dataset_targeted here would leak the held-out data_test recipes into training.
dataset_train = (
    dataset_targeted
    .take(len(data_train))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .repeat()
)

print(dataset_train)

In [None]:
for input_text, target_text in dataset_train.take(1):
    print('1st batch: input_text:', input_text,'\n')
    print('1st batch: target_text:', target_text,'\n')

## Build the model

This prints the size of the vocabulary.

In [None]:
print(VOCABULARY_SIZE)

The first line sets `vocab_size` to the size of the vocabulary (`VOCABULARY_SIZE`).

The second line sets the embedding dimension (`embedding_dim`) to 256.

The third line sets `rnn_units` to 1024.

In [None]:
vocab_size = VOCABULARY_SIZE
embedding_dim = 256
rnn_units = 1024

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the Embedding -> stateful LSTM -> Dense sequential model.

    Args:
        vocab_size: Embedding input dimension and number of Dense output units.
        embedding_dim: Embedding output dimension.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch size of the input; the second input dimension
            is left as None.

    Returns:
        The assembled ``tf.keras`` Sequential model.
    """
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None]
    )
    lstm_layer = tf.keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer=tf.keras.initializers.GlorotNormal()
    )
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.models.Sequential([embedding_layer, lstm_layer, output_layer])

In [None]:
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

model.summary()

In [None]:
tf.keras.utils.plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    to_file='model.png'
)

## Trying The Model

In [None]:
for input_example_batch, target_example_batch in dataset_train.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
sampled_indices = tf.random.categorical(
    logits=example_batch_predictions[0],
    num_samples=1
)

sampled_indices.shape

 This code displays the first 10 elements of the `sampled_indices` tensor.

In [None]:
sampled_indices[:10]

We use `tf.squeeze` to remove any dimensions of size 1 from the `sampled_indices` tensor, convert it into a NumPy array, and then display the shape of the resulting array.

The purpose of this step is to make code simpler

In [None]:
sampled_indices = tf.squeeze(sampled_indices).numpy()
sampled_indices.shape

This code displays the first 10 elements of the `sampled_indices` array after it has been squeezed.

In [None]:
sampled_indices[:10]

In [None]:
print('Input:\n', repr(''.join(tokenizer.sequences_to_texts([input_example_batch[0].numpy()[:50]]))))
print()
print('Next char prediction:\n', repr(''.join(tokenizer.sequences_to_texts([sampled_indices[:50]]))))

### Trying the model with variable input

In [None]:
for input_example_batch_custom, target_example_batch_custom in dataset_train.take(1):
    random_input = np.zeros(shape=(BATCH_SIZE, 10))
    example_batch_predictions_custom = model(random_input)
    print('Prediction shape: ', example_batch_predictions_custom.shape, "# (batch_size, sequence_length, vocab_size)\n")
    print('Custom length input: ')
    print(random_input)

## Training the model

### Attach an optimizer, and a loss function

In [None]:
# An objective function.
# The function is any callable with the signature scalar_loss = fn(y_true, y_pred).
def loss(labels, logits):
    """Return the sparse categorical cross-entropy of ``logits`` against ``labels``.

    ``logits`` are treated as unnormalized scores (``from_logits=True``).
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True
    )

example_batch_loss = loss(target_example_batch, example_batch_predictions)

print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss.shape:      ", example_batch_loss.shape)
print("scalar_loss:      ", example_batch_loss.numpy().mean())

In [None]:
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(
    optimizer=adam_optimizer,
    loss=loss,
    metrics=['accuracy']
)

### Creating a checkpoints directory

In [None]:
checkpoint_dir = 'tmp/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

### Configuring callbacks

In [None]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    patience=5,
    monitor='loss',
    restore_best_weights=True,
    verbose=1
)

In [None]:
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True
)

### Execute the training

In [None]:
# Passed to model.fit as `epochs`.
EPOCHS = 60
# Passed to model.fit as `initial_epoch`.
# NOTE(review): Keras treats `initial_epoch` as 0-based and `epochs` as the
# final epoch, so starting at 1 runs EPOCHS - 1 epochs — confirm this is intended.
INITIAL_EPOCH = 1
# Passed to model.fit as `steps_per_epoch` (the training dataset repeats indefinitely).
STEPS_PER_EPOCH = 700

In [None]:

history = model.fit(
    x=dataset_train,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    initial_epoch=INITIAL_EPOCH,
    callbacks=[
        checkpoint_callback,
        early_stopping_callback
    ]
)

### Saving the trained model to file

In [None]:
model_name = 'recipe_generation_rnn_raw_' + str(INITIAL_EPOCH) + '.h5'
model.save(model_name, save_format='h5')

### Visualizing training progress

In [None]:
def render_training_history_loss(training_history):
    """Plot the values stored under ``history['loss']`` and show the figure.

    Args:
        training_history: Object whose ``history`` attribute is a mapping
            containing a ``'loss'`` series.
    """
    loss_values = training_history.history['loss']

    plt.plot(loss_values, label='Training set')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(linestyle='--', linewidth=1, alpha=0.5)
    plt.legend()
    plt.show()

In [None]:
render_training_history_loss(history)

In [None]:
def render_training_history_accuracy(training_history):
    """Plot the values stored under ``history['accuracy']`` and show the figure.

    Args:
        training_history: Object whose ``history`` attribute is a mapping
            containing an ``'accuracy'`` series.
    """
    accuracy_values = training_history.history['accuracy']

    plt.plot(accuracy_values, label='Training set')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid(linestyle='--', linewidth=1, alpha=0.5)
    plt.legend()
    plt.show()

In [None]:
render_training_history_accuracy(history)

# Test the model

In [None]:
# prompt: test data using data_test

test_data = tf.data.Dataset.from_tensor_slices(data_test)

# Apply the transformation function to each element in the dataset
test_data = test_data.map(transform_element)

# Shuffle and batch the dataset
test_data = test_data.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

model.evaluate(test_data)

## Generating text

### Restore the latest checkpoint

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)

#### Creating simplified version of the model

This code sets the `simplified_batch_size` to 1

In [None]:
simplified_batch_size = 1

In [None]:
# Rebuild the network with batch size 1 for generation, reusing the
# hyper-parameters and the checkpoint directory defined for training instead
# of repeating their literal values.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
    # Fail with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(
        f'No checkpoint found in {checkpoint_dir!r}; train the model first.'
    )

model_simplified = build_model(
    vocab_size=VOCABULARY_SIZE,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=simplified_batch_size
)
model_simplified.load_weights(latest_checkpoint_path)
model_simplified.build(tf.TensorShape([simplified_batch_size, None]))

This code  prints a summary of the simplified model's architecture. This provides information about the model's layers, their shapes, and the number of trainable parameters.

In [None]:
model_simplified.summary()

This shows the input shape the simplified model expects: the batch size followed by the sequence length.

In [None]:
model_simplified.input_shape

In [None]:
def generate_text(model, start_string, num_generate = 1000, temperature=1.0):
    """Generate text one character at a time with the given model.

    Args:
        model: Model called on index arrays of shape ``(1, length)``; its
            states are reset before generation starts.
        start_string: Text the generation starts from. ``STOP_WORD_TITLE``
            is prepended to it.
        num_generate: Number of characters to sample.
        temperature: Positive factor the logits are divided by before
            sampling. 1.0 leaves them unchanged; lower values make sampling
            more predictable, higher values more varied.

    Returns:
        The prefixed start string followed by the generated characters.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    padded_start_string = STOP_WORD_TITLE + start_string

    # Converting our start string to numbers (vectorizing).
    input_indices = np.array(tokenizer.texts_to_sequences([padded_start_string]))

    # Generated characters, joined into one string at the end.
    text_generated = []

    # Here batch size == 1.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_indices)
        # Remove the batch dimension.
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits; dividing by 1.0 keeps them as they are.
        predictions = predictions / temperature

        # Sample from a categorical distribution and keep the sample drawn
        # for the last position of the input.
        predicted_id = tf.random.categorical(
            predictions,
            num_samples=1
        )[-1, 0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state.
        input_indices = tf.expand_dims([predicted_id], 0)

        next_character = tokenizer.sequences_to_texts(input_indices.numpy())[0]
        text_generated.append(next_character)

    return padded_start_string + ''.join(text_generated)

This code defines a function `generate_combinations` that generates text for a set of input strings. Here's a breakdown of what it does:

**recipe_length = 751**: This line sets the length of the generated text to 751 characters.

**ingredients = input("Enter your ingredients (One or two ingredients): ")**: This line prompts the user to enter one or two ingredients using the input function and assigns the user's input to the variable ingredients.



**generated_text = generate_text(model, start_string=ingredients, num_generate = recipe_length)**: This line generates text using the `generate_text` function. The `model` argument is the trained machine learning model, `start_string` is the ingredients string, and `num_generate` is the length of the generated text.

**print(f'Attempt: "{ingredients}"')**: This line prints a message indicating the current input string.

**print('-----------------------------------')**: This line prints a separator line.

**print(generated_text)**: This line prints the generated text.

**print('\n\n')**: This line prints a blank line to separate the generated text from the next iteration.


In [None]:
def generate_combinations(model):
    """Generate and print one recipe for each ingredient in a fixed list.

    Args:
        model: Model passed through to ``generate_text``.
    """
    recipe_length = 751
    # Each prompt ends with a space so generation continues with a new word.
    ingredients = ['rice ', 'potato ', 'pasta ', 'onion ', 'chicken ', 'meat ', 'salt ',
                   'sugar ', 'fish ', 'cheese ', 'tomato ', 'chocolate ', 'vanilla ']
    for ingredient in ingredients:
        generated_text = generate_text(
            model,
            start_string=ingredient,
            num_generate=recipe_length,
        )
        print(f'Attempt: "{ingredient}"')
        print('-----------------------------------')
        print(generated_text)
        print('\n\n')

In [None]:
generate_combinations(model_simplified)

In [None]:
def generate_combinations(model):
    """Ask the user for ingredients, then generate and print one recipe.

    Args:
        model: Model passed through to ``generate_text``.
    """
    ingredients = input("Enter your ingredients (One or two ingredients): ")
    generated_text = generate_text(model, start_string=ingredients, num_generate=751)

    report_lines = (
        f'Attempt: "{ingredients}"',
        '-----------------------------------',
        generated_text,
        '\n\n',
    )
    for report_line in report_lines:
        print(report_line)

In [None]:
generate_combinations(model_simplified)