# In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, AdamW


#  Load Pre-trained Model and Tokenizer

model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

#  Load and Preprocess Data

train_path = "bard_recipes.csv"  # Update the file name
df = pd.read_csv(train_path)

# Build one "<item> <recipe>" string per CSV row. Rows missing either field
# are skipped and the remaining cells are cast to str: pandas represents an
# empty cell as float NaN, and str.join raises TypeError on non-string items.
recipe_columns = df[['Item', 'Recipes']].dropna().astype(str)
texts = [' '.join(row) for row in recipe_columns.itertuples(index=False)]

# encode(..., return_tensors="pt") returns shape (1, n_tokens). squeeze(0)
# removes only that leading batch axis, so a one-token text stays 1-D; a bare
# squeeze() would collapse it to 0-D, which torch.cat cannot concatenate.
tokenized_texts = [
    tokenizer.encode(text, return_tensors="pt").squeeze(0) for text in texts
]

# One flat tensor holding every row's token ids, in row order.
# NOTE(review): the training loop further down rebinds `input_ids` per batch
# and reads its data through TextDataset(file_path=train_path), so this tensor
# is not what the model is trained on — confirm whether it is still needed.
input_ids = torch.cat(tokenized_texts)

# Data Collator

# mlm=False selects the causal language-modelling objective: the collator
# derives `labels` from `input_ids` instead of applying random token masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Training Configuration

# Plain dict of hyper-parameters. The code visible in this file reads only
# "output_dir", "num_train_epochs" and "per_device_train_batch_size"; the
# remaining keys are kept for whatever else may consume this dict.
training_args = {
    "output_dir": "./recipe_finetuned",
    "overwrite_output_dir": True,
    "num_train_epochs": 1,
    "per_device_train_batch_size": 4,
    "save_steps": 10_000,
    "save_total_limit": 2
}

# Initialize Optimizer

# Use PyTorch's AdamW rather than the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers class used
# by default (1e-6 and 0.0) because torch.optim.AdamW defaults to 1e-8 and
# 0.01, which would otherwise silently change the optimisation.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Training Loop

from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

# TextDataset reads the file at train_path as plain text, tokenizes it and
# serves it in fixed-size blocks (block_size tokens each).
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_path,
    block_size=128
)

train_dataloader = DataLoader(
    train_dataset,
    collate_fn=data_collator,
    batch_size=training_args["per_device_train_batch_size"],
    shuffle=True
)

# from_pretrained() hands the model back in evaluation mode (dropout off).
# Switch to training mode explicitly so fine-tuning runs with dropout enabled.
model.train()

for epoch in range(training_args["num_train_epochs"]):
    for step, batch in enumerate(train_dataloader):
        # Keep the batch on the same device as the model's weights.
        input_ids = batch["input_ids"].to(model.device)
        labels = batch["labels"].to(model.device)

        # Clear gradients left over from the previous step before the new
        # backward pass accumulates into them.
        optimizer.zero_grad()

        # Passing labels makes the model compute the language-modelling loss.
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Progress report every 1000 optimisation steps (including step 0).
        if step % 1000 == 0:
            print(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}")

#Save Fine-tuned Model

# Weights and tokenizer files go to the same directory so that both can be
# reloaded from training_args["output_dir"].
model.save_pretrained(training_args["output_dir"])
tokenizer.save_pretrained(training_args["output_dir"])

# Load Fine-tuned Model for Recipe Generation

model = GPT2LMHeadModel.from_pretrained(training_args["output_dir"])
tokenizer = GPT2Tokenizer.from_pretrained(training_args["output_dir"])

# Recipe Generation

# Strip stray whitespace so it does not end up inside the prompt tokens.
user_input = input("Enter the name of the item for which you want a recipe: ").strip()
# NOTE(review): the fine-tuning step in this file feeds the raw CSV text to
# TextDataset, so the model may never have seen this "Item:/Recipe:" layout —
# confirm the prompt format matches the training data.
prompt = f"Item: {user_input}\nRecipe:"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

# Ensure attention_mask is set
# A single unpadded prompt attends to every token, hence all ones. ones_like
# gives an integer mask on the same device as input_ids (torch.ones(shape)
# would have produced a float tensor).
attention_mask = torch.ones_like(input_ids)

# Set pad_token_id to eos_token_id for open-end generation
model.config.pad_token_id = model.config.eos_token_id

# Deterministic beam search. top_k / top_p were dropped: they only take effect
# when do_sample=True, so with sampling off they changed nothing and merely
# triggered configuration warnings. pad_token_id is also passed explicitly so
# generate() does not depend on where the library looks the default up.
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=200,  # upper bound on prompt + generated tokens
    num_beams=5,
    no_repeat_ngram_size=2,
    pad_token_id=model.config.eos_token_id,
)
generated_recipe = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"\nGenerated Recipe for {user_input}:\n{generated_recipe}")