In [None]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
import pickle
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Directory holding the tokenized CSV files. Set the RECIPE_DATA_DIR
# environment variable to point somewhere else; the default is the folder
# this notebook was originally run against.
train_data_dir = os.environ.get(
    "RECIPE_DATA_DIR",
    r"C:\Users\Ayush Mourya\OneDrive\Desktop\IIITD\Novel Recipe Generation\All CSVs",
)

# Read the file as raw lines (readlines keeps each line's trailing newline).
with open(os.path.join(train_data_dir, "train_tokenized.csv"), "r") as file:
    data = file.readlines()

# Convert to a single-column DataFrame, one row per line of the file
train_df = pd.DataFrame(data, columns=["Training Data"])
print(train_df.head())

In [None]:
# Directory holding the tokenized CSV files. Set the RECIPE_DATA_DIR
# environment variable to point somewhere else; the default is the folder
# this notebook was originally run against.
test_data_dir = os.environ.get(
    "RECIPE_DATA_DIR",
    r"C:\Users\Ayush Mourya\OneDrive\Desktop\IIITD\Novel Recipe Generation\All CSVs",
)

# Read the file as raw lines (readlines keeps each line's trailing newline).
with open(os.path.join(test_data_dir, "test_tokenized.csv"), "r") as file:
    data = file.readlines()

# Convert to a single-column DataFrame, one row per line of the file
test_df = pd.DataFrame(data, columns=["Testing Data"])
print(test_df.head())

In [None]:
# Keep only a tiny slice of each frame: the first two training rows and the
# first test row. Everything downstream runs on this subset.
train_df = train_df.head(2)
test_df = test_df.head(1)

In [None]:
# Pull the single text column out of each DataFrame as a plain Python list
train_recipes = train_df.loc[:, 'Training Data'].to_list()
test_recipes = test_df.loc[:, 'Testing Data'].to_list()

In [None]:
# Word-level tokenizer with no character filtering and no lower-casing, so
# tokens are kept exactly as they appear in the text. Words not seen during
# fitting map to the '<OOV>' token.
tokenizer = Tokenizer(
    filters='',
    lower=False,
    char_level=False,
    oov_token='<OOV>',
)

# Build the vocabulary from the training recipes only
tokenizer.fit_on_texts(train_recipes)

# Mapping token -> integer id
word_index = tokenizer.word_index

In [None]:
#word_index

In [None]:
# Show the integer ids assigned to the recipe start and end markers.
# Note the end marker is looked up with a trailing newline as part of the key.
print(word_index['<RECIPE_START>'])
print(word_index['<RECIPE_END>\n'])

In [None]:
# Save the fitted tokenizer to 'tokenizer.pickle' (in the current working
# directory) so the same vocabulary can be reloaded later.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Convert each training recipe string to a list of integer token ids
train_sequences = tokenizer.texts_to_sequences(train_recipes)

In [None]:
# Inspect the token ids of the first training recipe
print(train_sequences[0])

In [None]:
# Length (in tokens) of the longest training recipe; used as the padded width
max_sequence_length = max(map(len, train_sequences))

In [None]:
# Expand every recipe into all of its prefixes of length >= 2, in order:
# [t0, t1], [t0, t1, t2], ... up to the full sequence.
input_sequences = [
    tokens[:end]
    for tokens in train_sequences
    for end in range(2, len(tokens) + 1)
]

In [None]:
# Pad sequences to a fixed length.
# padding='pre' puts the padding on the left, so each prefix's last token
# ends up in the final column.
train_sequences_padded = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

In [None]:
# Inspect the first padded prefix
train_sequences_padded[0]

In [None]:
# -----------------------------
# 3. Create Input/Target Pairs for Sequence Generation
# -----------------------------

# Next-token prediction: for every padded prefix, the model input is all
# columns except the last one, and the target is the token id in the last
# column.
X, y = train_sequences_padded[:, :-1], train_sequences_padded[:, -1]

In [None]:
# Inspect the model inputs
X

In [None]:
# Inspect the integer targets (before one-hot encoding)
y

In [None]:
# Sanity check: X and y should have the same number of rows
print(X.shape)
print(y.shape)

In [None]:
# Number of output classes / embedding rows: every token in the tokenizer's
# word index plus one extra slot for id 0, which is used for padding.
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token
print(vocab_size)

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer targets into vocab_size classes.
# Note this rebinds `y`, so re-running this cell on its own would encode the
# already-encoded array again.
y = to_categorical(y,num_classes=vocab_size)

In [None]:
# Inspect the one-hot encoded targets
y

In [None]:
# Shape of the one-hot encoded targets
y.shape

In [None]:
# -----------------------------
# 4. Building the LSTM Model
# -----------------------------

# Layer sizes
embedding_dim = 256
lstm_units = 320

# Embedding -> single LSTM (final state only) -> softmax over the vocabulary.
# The declared input length is one less than max_sequence_length.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length-1))
model.add(LSTM(lstm_units, return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# -----------------------------
# 5. Training the Model
# -----------------------------

# Checkpoint callback: writes 'recipe_lstm_model.h5' whenever the monitored
# training loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(
    filepath='recipe_lstm_model.h5',
    monitor='loss',
    save_best_only=True,
    verbose=1,
)

# Fit on the prefix/target pairs
model.fit(x=X, y=y, batch_size=64, epochs=5, callbacks=[checkpoint])

In [None]:
# Save the model using pickle (written to 'recipe_lstm_model.pkl' in the
# current working directory).
# NOTE(review): whether a Keras model round-trips through pickle depends on
# the installed TensorFlow/Keras version — confirm, or rely on the .h5
# checkpoint written during training.
with open('recipe_lstm_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [None]:
def generate_recipe(seed_text, tokenizer, model, max_sequence_length, num_words=200):
    """Greedily extend ``seed_text`` one token at a time using ``model``.

    Args:
        seed_text: Space-separated prompt text to start from.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        model: Trained model whose ``predict`` returns one row of scores over
            the vocabulary for the next token.
        max_sequence_length: Length of the longest training sequence. The
            model input is one token shorter than this, because the last
            token of every padded training sequence was used as the target.
        num_words: Maximum number of tokens to append.

    Returns:
        The seed text followed by the generated tokens, separated by spaces.
        Generation stops early when the padding id (0) or the
        ``<RECIPE_END>`` token is predicted.
    """
    # The model was trained on inputs of width max_sequence_length - 1, so
    # inference must pad/truncate to that same width.
    input_length = max(max_sequence_length - 1, 1)
    end_token = "<RECIPE_END>"

    generated_text = seed_text
    for _ in range(num_words):
        # Tokenize everything generated so far
        token_ids = tokenizer.texts_to_sequences([generated_text])[0]
        # Left-pad (or left-truncate) to the width the model was trained on
        model_input = pad_sequences([token_ids], maxlen=input_length, padding='pre')
        # Greedy decoding: take the highest-scoring next token
        scores = model.predict(model_input, verbose=0)
        next_id = int(np.argmax(scores[0, :]))
        # Id 0 is the padding slot and has no word; stop generating
        if next_id == 0:
            break
        next_word = tokenizer.index_word.get(next_id, "")
        generated_text += " " + next_word
        # The end marker is stored in the vocabulary with a trailing newline
        # (it is the last token on each input line), so compare the stripped
        # form; an exact match against "<RECIPE_END>" would never succeed.
        if next_word.strip() == end_token:
            break
    return generated_text

In [None]:
# Example seed text: a recipe start marker followed by an ingredient list
# wrapped in <INPUT_START> ... <INPUT_END>. The model is asked to continue
# from here.
seed_text = "<RECIPE_START> <INPUT_START> mustard <NEXT_INPUT> clove garlic <NEXT_INPUT> sugar <NEXT_INPUT> corn oil <NEXT_INPUT> salt <NEXT_INPUT> mayonnaise <NEXT_INPUT> red wine vinegar <NEXT_INPUT> paprika <INPUT_END>"

# Generate up to 200 additional tokens; the result includes the seed text.
generated_recipe = generate_recipe(seed_text, tokenizer, model, max_sequence_length, num_words=200)
print("Generated Recipe:\n", generated_recipe)

In [None]:
# -----------------------------
# 7. Extracting Test Recipes and Computing BLEU Score
# -----------------------------

def extract_input_section(recipe):
    """
    Return the first ``<INPUT_START> ... <INPUT_END>`` span found in ``recipe``.

    The span includes both markers and may cross line breaks. The match is
    non-greedy, so it ends at the first ``<INPUT_END>`` after the start
    marker. An empty string is returned when no such span exists.
    """
    found = re.search(r'(<INPUT_START>.*?<INPUT_END>)', recipe, re.DOTALL)
    return found.group(1) if found else ""

In [None]:
# Extract the input section of every test recipe, keeping only recipes where
# one was found. Each recipe is scanned once; the extracted value is reused
# for both the filter and the result.
test_inputs = [
    section
    for section in map(extract_input_section, test_recipes)
    if section
]
print("Number of test recipes with INPUT section:", len(test_inputs))

In [None]:
# BLEU is computed against a single reference: the first extracted test
# input section, split on whitespace. The candidate is the generated recipe,
# split the same way. To evaluate more than one recipe, loop over the test
# set and average the per-recipe scores.
if not test_inputs:
    print("No test recipe inputs were extracted for BLEU score computation.")
else:
    reference = test_inputs[0].split()  # reference tokens
    generated_tokens = generated_recipe.split()  # candidate tokens

    # sentence_bleu takes a list of references, hence the wrapping list
    bleu_score = sentence_bleu([reference], generated_tokens)
    print("BLEU score:", bleu_score)