In [None]:
import torch 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import gc

In [None]:
# Read the tokenized training file; every line of the file becomes one row.
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# it matches how the file was written.
with open(r"C:\Users\Ayush Mourya\OneDrive\Desktop\IIITD\Novel Recipe Generation\All CSVs\train_tokenized.csv", "r") as file:
    data = list(file)

# Single-column frame holding the raw lines, followed by a quick preview.
df = pd.DataFrame({"Training Data": data})
print(df.head())

In [None]:
# Keep only the first 100 rows for the rest of the notebook.
df = df.head(100)

In [None]:
from transformers import(
    AutoTokenizer, 
    AutoModelForCausalLM,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments)

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# Extra tokens handed to tokenizer.add_special_tokens(): the structural markers
# of the recipe format, grouped by section, followed by a padding token.
special_tokens = {
    "additional_special_tokens": (
        ['<RECIPE_START>']
        + ['<INPUT_START>', '<NEXT_INPUT>', '<INPUT_END>']
        + ['<INGR_START>', '<NEXT_INGR>', '<INGR_END>']
        + ['<INSTR_START>', '<NEXT_INSTR>', '<INSTR_END>']
        + ['<TITLE_START>', '<TITLE_END>']
        + ['<RECIPE_END>']
        + ['<PAD>']
    )
}

In [None]:
# Load the pretrained tokenizer and causal-LM weights, then move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
model.to(device)

# Register the recipe markers, then make "<PAD>" the tokenizer's actual pad token.
# Listing it under "additional_special_tokens" alone does not change
# tokenizer.pad_token / tokenizer.pad_token_id, which is what padding-aware
# components consult.
tokenizer.add_special_tokens(special_tokens)
tokenizer.pad_token = "<PAD>"
model.config.pad_token_id = tokenizer.pad_token_id

# Grow the embedding matrix so the newly added token ids have rows.
model.resize_token_embeddings(len(tokenizer))


In [None]:
# Encode every training line into its list of token ids.
vectorized_dataset = df["Training Data"].apply(tokenizer.encode)

In [None]:
# Show the token ids of the row labelled 45 as a spot check.
print(vectorized_dataset.loc[45])

In [None]:
# Display the id the tokenizer currently reports as its padding token.
tokenizer.pad_token_id

In [None]:
# Spot-check which tokens sit at a few ids, and which id "<RECIPE_END>" received.
# NOTE(review): 50270 and 50256 are the same literals used as the default
# pad/EOS ids in pad_and_add_eos and generate_recipe; they look like ids from a
# GPT-2-sized vocabulary — confirm they mean "<PAD>"/EOS for the tokenizer
# loaded in this notebook.
print(tokenizer.convert_ids_to_tokens([198]))
print(tokenizer.convert_ids_to_tokens([50270]))
print(tokenizer.convert_ids_to_tokens([50256]))
print(tokenizer.convert_tokens_to_ids("<RECIPE_END>"))

In [None]:
# Token count per recipe. The rows were already encoded into
# `vectorized_dataset`, so measure those lists instead of tokenizing again.
recipe_len_list = vectorized_dataset.apply(len)

In [None]:
# Summary statistics of the per-recipe token counts.
recipe_len_list.describe()

In [None]:
# Three side-by-side views of the token-count distribution.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Left: box plot.
axes[0].boxplot(recipe_len_list)
# Middle: histogram with 100 bins.
axes[1].hist(recipe_len_list, bins=100)
# Right: Q-Q plot with a standardized reference line.
sm.qqplot(recipe_len_list, ax=axes[2], line='s')

plt.show()

In [None]:
# Keep only rows whose tokenized length is between 100 and 320 tokens (inclusive).
# The lengths come from the already-encoded `vectorized_dataset` (same index as
# `df`), so no row is tokenized again here.
df_filtered = df[vectorized_dataset.apply(lambda ids: 100 <= len(ids) <= 320)]
print(df_filtered)

In [None]:
# Same length window applied to the token lists: keep 100 <= length <= 320.
vectorized_dataset_filtered = vectorized_dataset[
    vectorized_dataset.apply(lambda ids: 100 <= len(ids) <= 320)
]
print(vectorized_dataset_filtered)

In [None]:
# Token count per remaining recipe, measured on the already-encoded and
# filtered token lists rather than by tokenizing the text again.
recipe_len_list = vectorized_dataset_filtered.apply(len)

In [None]:
# Summary statistics of the token counts after filtering.
recipe_len_list.describe()

In [None]:
# Re-draw the three distribution views for the filtered token counts.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Left: box plot.
axes[0].boxplot(recipe_len_list)
# Middle: histogram with 100 bins.
axes[1].hist(recipe_len_list, bins=100)
# Right: Q-Q plot with a standardized reference line.
sm.qqplot(recipe_len_list, ax=axes[2], line='s')

plt.show()

In [None]:
#del df
#del vectorized_dataset
#gc.collect()

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [None]:
# Print the id the tokenizer reports as its padding token.
print(tokenizer.pad_token_id)

In [None]:
def pad_and_add_eos(recipe, max_length=322, pad_token_id=50270, eos_token_id=50256):
    """Truncate a token-id list, append EOS, and left-pad it.

    Args:
        recipe: List of token ids. It is sliced, not modified in place.
        max_length: Target length. The ids are cut to ``max_length - 2`` before
            the EOS id is appended.
        pad_token_id: Id used to fill the left side.
        eos_token_id: Id appended after the (possibly truncated) recipe.

    Returns:
        A new list: padding ids, then the recipe ids, then the EOS id.

    NOTE(review): the default ids are hard-coded literals — confirm they match
    the tokenizer in use, or pass the ids explicitly.
    """
    body = recipe[:max_length - 2]  # slicing copies, so the caller's list is untouched
    body.append(eos_token_id)
    pad_count = max_length - len(body)
    left_padding = [pad_token_id] * pad_count
    return left_padding + body


In [None]:
# Pad every filtered recipe and terminate it with EOS.
# The pad/EOS ids are looked up from the loaded tokenizer instead of relying on
# pad_and_add_eos's hard-coded default ids, which are not tied to this
# tokenizer's vocabulary. Series.apply forwards the keyword arguments to
# pad_and_add_eos.
vectorized_dataset_padded = vectorized_dataset_filtered.apply(
    pad_and_add_eos,
    pad_token_id=tokenizer.convert_tokens_to_ids("<PAD>"),
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Show the first padded recipe. The Series keeps the row labels that survived
# the length filter, so use positional access: a label lookup with [0] raises
# KeyError whenever row 0 was filtered out.
print(vectorized_dataset_padded.iloc[0])

In [None]:
class RecipeDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves pre-tokenized recipes as tensors.

    Args:
        vectorized_data: A pandas Series (or any object exposing ``.iloc``)
            or a plain sequence such as a list, where each element is a list
            of token ids.
    """

    def __init__(self, vectorized_data):
        self.vectorized_data = vectorized_data

    def __len__(self):
        """Return the number of stored recipes."""
        return len(self.vectorized_data)

    def __getitem__(self, idx):
        """Return the recipe at position ``idx`` as a 1-D int64 tensor."""
        store = self.vectorized_data
        # pandas objects need positional .iloc (plain [] is label-based and the
        # labels may have gaps after filtering); lists are indexed directly.
        data = store.iloc[idx] if hasattr(store, "iloc") else store[idx]
        # Token ids must be integers; pin the dtype so an empty recipe does not
        # silently become a float tensor.
        return torch.tensor(data, dtype=torch.long)

# Wrap the padded token lists so the Trainer can index them.
dataset = RecipeDataset(vectorized_data=vectorized_dataset_padded)

In [None]:
# Batch collator for causal language modelling (mlm=False turns masked-LM off).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Read the tokenized test file; every line of the file becomes one row.
with open(r"/kaggle/input/muah-muah/test_tokenized.csv", "r") as file:
    data = list(file)

# Single-column frame limited to the first 4000 rows, followed by a preview.
test_df = pd.DataFrame({"Testing Data": data}).iloc[:4000]
print(test_df.head())

In [None]:
# Function to extract input portion for recipe generation
def extract_inputs(recipe_text):
    """Return the prompt part of a recipe: "<RECIPE_START>" through "<INPUT_END>".

    Args:
        recipe_text: One tokenized recipe line.

    Returns:
        The substring from "<RECIPE_START>" up to and including the first
        "<INPUT_END>" that follows it, with surrounding whitespace stripped.
        If "<RECIPE_START>" is absent the slice starts at the beginning of the
        text; if "<INPUT_END>" is absent the rest of the text is returned.
    """
    start_marker = "<RECIPE_START>"
    end_marker = "<INPUT_END>"

    start_idx = recipe_text.find(start_marker)
    if start_idx == -1:
        # str.find signals "not found" with -1, which would otherwise be used
        # as a negative slice index.
        start_idx = 0

    end_idx = recipe_text.find(end_marker, start_idx)
    if end_idx == -1:
        return recipe_text[start_idx:].strip()

    # Stop exactly after the end marker; one more character would leak the
    # first character of whatever follows it.
    return recipe_text[start_idx:end_idx + len(end_marker)].strip()

In [None]:
# Build the generation prompt for every test recipe.
test_inputs = test_df["Testing Data"].map(extract_inputs)

In [None]:
# Show the prompt extracted from the row labelled 0.
print(test_inputs.loc[0])

In [None]:
def generate_recipe(model, tokenizer, input_text, max_length=150):
    """Generate a recipe continuation for ``input_text`` and return it as text.

    Args:
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        input_text: Prompt string.
        max_length: Passed both as the tokenizer's truncation limit and as
            ``generate``'s ``max_length``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length)
    input_ids = inputs["input_ids"].to(model.device)

    # Forward the attention mask when the tokenizer provides one, so generate()
    # does not have to guess which positions are padding.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(model.device)

    # Take the pad id from the tokenizer rather than a hard-coded literal;
    # fall back to EOS when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Function to compute BLEU score
def compute_bleu_score(generated_recipes, reference_recipes):
    """Average sentence-level BLEU over paired generated/reference recipes.

    Both texts are split on whitespace before scoring. Pairs are formed with
    ``zip``, so surplus items in the longer input are ignored.

    Args:
        generated_recipes: Iterable of generated recipe strings.
        reference_recipes: Iterable of reference recipe strings.

    Returns:
        The mean of the per-pair ``sentence_bleu`` scores, or ``0.0`` when
        there are no pairs to score.
    """
    scores = [
        sentence_bleu([ref.split()], gen.split())  # one reference per candidate
        for gen, ref in zip(generated_recipes, reference_recipes)
    ]
    if not scores:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(scores) / len(scores)

In [None]:
# Trainer configuration used for every trainer.train() call in this notebook.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Epochs per trainer.train() call
    per_device_train_batch_size=2,
    save_steps=50000,  # Checkpoint interval, in steps
    #save_total_limit=2,
    logging_dir="./logs",  # Directory for logs
    logging_strategy="epoch",  # Log loss at each epoch
    evaluation_strategy="no",  # No evaluation is run by the Trainer itself
    report_to=["tensorboard"],  # Enables TensorBoard logging
)


In [None]:
# Wire the model, training data, collator and arguments into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Number of train -> generate -> score -> save rounds run by the loop below.
num_epochs = 5

In [None]:
# Train, then score and checkpoint the model, once per epoch.
for epoch in range(num_epochs):
    trainer.train()

    # Generate in evaluation mode so train-only behaviour such as dropout does
    # not perturb the outputs.
    # NOTE(review): assumes the next trainer.train() call switches the model
    # back to training mode — confirm for the installed transformers version.
    model.eval()

    # Generate a recipe for every test prompt, reporting progress every 100 items.
    generated_recipes = []
    for count, inp in enumerate(test_inputs, start=1):
        generated_recipes.append(generate_recipe(model, tokenizer, inp))
        if count % 100 == 0:
            print(count)

    # Score the generations against the full test lines.
    bleu_score = compute_bleu_score(generated_recipes, test_df["Testing Data"].tolist())

    print(f"Epoch {epoch+1} - BLEU Score: {bleu_score:.4f}")

    # Save the model and tokenizer after each epoch
    save_path = f"./saved_model_epoch_{epoch+1}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model saved at {save_path}")

In [None]:
# Tabulate what the Trainer logged. The frame gets its own name so the training
# DataFrame `df` is not overwritten, which would break re-running earlier cells.
history = trainer.state.log_history
history_df = pd.DataFrame(history)
print(history_df)

In [None]:
# Disabled snippet, kept as a string literal so it is never executed: reload a
# saved epoch checkpoint and put the model in evaluation mode.
# NOTE(review): the text imports AutoModel/AutoTokenizer but then calls
# GPT2LMHeadModel/GPT2Tokenizer, which it does not import — reconcile before
# re-enabling.
'''from transformers import AutoModel, AutoTokenizer

epoch_to_load = 2  # Change this to the desired epoch
load_path = f"./saved_model_epoch_{epoch_to_load}"

# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained(load_path)
tokenizer = GPT2Tokenizer.from_pretrained(load_path)

# Set model to evaluation mode
model.eval()

print(f"Loaded model from {load_path}")'''


In [None]:
def generate_recipe(model, tokenizer, input_text, max_length=150):
    """Generate a recipe continuation for ``input_text`` and return it as text.

    Args:
        model: Model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        input_text: Prompt string.
        max_length: Passed both as the tokenizer's truncation limit and as
            ``generate``'s ``max_length``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length)
    input_ids = inputs["input_ids"].to(model.device)

    # Forward the attention mask when the tokenizer provides one, so generate()
    # does not have to guess which positions are padding.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(model.device)

    # Take the pad id from the tokenizer rather than a hard-coded literal;
    # fall back to EOS when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage: generate from a free-form prompt and show the result.
test_input = "Make a spicy pasta recipe."
generated_recipe = generate_recipe(model, tokenizer, input_text=test_input)
print("Generated Recipe:", generated_recipe)


In [None]:
# Disabled snippet, kept as a string literal so it is never executed: resume
# training from a saved epoch checkpoint and repeat the train/score/save loop.
# NOTE(review): the text loads the checkpoint with AutoModel, whereas this
# notebook loads its model with AutoModelForCausalLM, and it indexes
# test_df["testing_data"] although the frame's column is named "Testing Data" —
# fix both before re-enabling.
'''from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

# Choose which epoch to resume from
resume_epoch = 2  # Change this to the desired epoch
load_path = f"./saved_model_epoch_{resume_epoch}"

# Load the model and tokenizer from the saved checkpoint
model = AutoModel.from_pretrained(load_path)
tokenizer = AutoTokenizer.from_pretrained(load_path)

print(f"Resuming training from {load_path}")

# Reinitialize Trainer with the loaded model
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5 - resume_epoch,  # Continue from where it left off
    per_device_train_batch_size=32,
    save_steps=50000,
    logging_dir="./logs",
    logging_strategy="epoch",
    evaluation_strategy="no",
    report_to=["tensorboard"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# Continue training from the next epoch
for epoch in range(resume_epoch, training_args.num_train_epochs + resume_epoch):
    trainer.train()
    
    # Generate recipes for test set
    generated_recipes = [generate_recipe(model, tokenizer, inp) for inp in test_inputs]
    
    # Compute BLEU score
    bleu_score = compute_bleu_score(generated_recipes, test_df["testing_data"])
    
    print(f"Epoch {epoch+1} - BLEU Score: {bleu_score:.4f}")

    # Save model again
    save_path = f"./saved_model_epoch_{epoch+1}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model saved at {save_path}")'''
