# In [None]:
import os
import re
import ast
import torch
import pandas as pd
from PIL import Image
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
from torchvision import transforms
import matplotlib.pyplot as plt

# Load your dataset
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
csv_path = r"C:\Users\Alaaeddin\Downloads\preprocessed_legal_data.csv"
df = pd.read_csv(csv_path)

# Filter and format the data
# Keep only the two columns used below and drop rows missing either one.
df = df[['NER', 'directions']].dropna()

# 'NER' cells are parsed as Python literals (presumably stringified lists).
# ast.literal_eval is used instead of eval so that text coming from the CSV
# can never execute code; a cell that is not a valid literal still raises,
# exactly as the unparsable case did before.
ner_lengths = df['NER'].apply(lambda x: len(ast.literal_eval(x)))

# Keep rows whose NER entry holds at least 5 items and renumber the index.
df = df[ner_lengths >= 5].reset_index(drop=True)

def format_ingredients(ner_str):
    """Render a stringified list of names as one lowercase, comma-separated line.

    Args:
        ner_str: Text holding a Python list/tuple literal, e.g.
            ``'["Milk", "brown sugar"]'``.

    Returns:
        The string items, each stripped and lowercased, joined with ``", "``.
        Non-string items are skipped. An empty string is returned when
        ``ner_str`` cannot be parsed as a literal or does not evaluate to a
        list or tuple.
    """
    try:
        # literal_eval accepts only Python literals, so CSV-supplied text
        # cannot run arbitrary code the way eval() would.
        items = ast.literal_eval(ner_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best-effort: an unparsable cell yields no ingredients.
        return ""

    # A bare string or number is not an ingredient list; without this guard a
    # string literal would be split into single characters.
    if not isinstance(items, (list, tuple)):
        return ""

    return ", ".join(i.strip().lower() for i in items if isinstance(i, str))

def _row_to_training_text(row):
    """Build one training example string from a row's 'NER' and 'directions'."""
    ingredient_line = format_ingredients(row['NER'])
    instruction_text = row['directions'].strip()
    return f"<|startoftext|>Ingredients: {ingredient_line}\nInstructions: {instruction_text}<|endoftext|>"


# One formatted example per row, wrapped in start/end-of-text markers.
df['formatted'] = df.apply(_row_to_training_text, axis=1)

# Plain Python list of the formatted examples.
texts = df['formatted'].to_list()

# In-memory dataset for GPT-2
class InMemoryTextDataset(Dataset):
    """Dataset that tokenizes all texts once at construction and serves rows by index.

    Args:
        tokenizer: Callable invoked with ``texts`` plus ``truncation=True``,
            ``padding="max_length"``, ``max_length=block_size`` and
            ``return_tensors="pt"``; its result must provide ``'input_ids'``
            and ``'attention_mask'``.
        texts: The texts to tokenize.
        block_size: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, texts, block_size=512):
        # Single tokenizer call over the whole corpus; results stay in memory.
        encoded = tokenizer(
            texts,
            return_tensors="pt",
            max_length=block_size,
            padding="max_length",
            truncation=True,
        )
        self.input_ids, self.attn_mask = encoded['input_ids'], encoded['attention_mask']

    def __len__(self):
        """Return the number of tokenized examples."""
        example_count = len(self.input_ids)
        return example_count

    def __getitem__(self, idx):
        """Return the example at ``idx``; 'labels' carries the same ids as 'input_ids'."""
        token_ids = self.input_ids[idx]
        mask = self.attn_mask[idx]
        return dict(input_ids=token_ids, attention_mask=mask, labels=token_ids)

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 has no pad token. Reusing EOS as the pad token makes
# DataCollatorForLanguageModeling (mlm=False) replace the label of every
# pad-id position with -100 -- which then includes the real <|endoftext|> at
# the end of each example, so the model is never trained to stop. A dedicated
# pad token keeps the EOS label in the loss while padding is still ignored.
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

model = GPT2LMHeadModel.from_pretrained('gpt2')
# The added token needs its own embedding row, and the model config should
# agree with the tokenizer about which id is padding.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Prepare dataset
dataset = InMemoryTextDataset(tokenizer, texts)
# mlm=False selects causal-LM style batches (labels derived from input_ids).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration
# Single location used for checkpoints, the final model and the tokenizer.
model_dir = "./gpt2-recipes"
# Mixed precision is enabled only when a CUDA device is reported available.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=250,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=use_fp16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Train, then write the model and tokenizer side by side so both can be
# loaded from the same directory later.
trainer.train()
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

print("✅ Training completed and model saved.")

# Inference: Recipe generation
from transformers import pipeline

# Load the saved model and tokenizer from the training output directory.
generator = pipeline("text-generation", model="./gpt2-recipes", tokenizer="./gpt2-recipes")

# Example usage
example_ingredients = ["milk", "brown sugar", "vanilla", "butter"]
ingredient_line = ', '.join(example_ingredients)
# Same layout as the training examples, stopping right after "Instructions:".
prompt = f"<|startoftext|>Ingredients: {ingredient_line}\nInstructions:"

output = generator(prompt, max_new_tokens=200, do_sample=True, temperature=0.8)

# Keep only the text before the first end-of-text marker, if one is present.
first_candidate = output[0]['generated_text']
generated_text = first_candidate.partition("<|endoftext|>")[0].strip()

print("🧾 Generated Recipe:\n")
print(generated_text)
