## Install Libraries

In [24]:
# %%capture
# 1. INSTALL NECESSARY LIBRARIES
!pip install transformers[torch] datasets accelerate evaluate rouge_score nltk scikit-learn

# 2. IMPORTS
import torch
import pandas as pd
import numpy as np
import ast  # For safely evaluating string representations of lists
import evaluate  # For ROUGE and BLEU
import math # For calculating steps per epoch
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Features
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline
)
import warnings
warnings.filterwarnings("ignore")

# 3. GLOBAL CONFIGURATION AND CONSTANTS
# --- Parameters You Can Tune ---
MODEL_NAME = 'gpt2'
FILE_PATH = '/kaggle/input/3a2mext/3A2M_EXTENDED.csv'

# Training parameters
TRAIN_EPOCHS = 3
FP16_TRAINING = True
BATCH_SIZE = 4
GRAD_ACC_STEPS = 8
MAX_LENGTH = 512

# Data parameters
DATA_SUBSAMPLE = 50000
VAL_SIZE = 0.1  # 10% for validation
TEST_SIZE = 0.1  # 10% for test (80% for train)
# ----------------------------------

print("--- Configuration Loaded ---")
print(f"Model: {MODEL_NAME}")
print(f"Data Subsample: {DATA_SUBSAMPLE} rows")
print(f"Split: 80% Train, 10% Validation, 10% Test")
print(f"Effective Batch Size: {BATCH_SIZE * GRAD_ACC_STEPS}")
print("----------------------------")

# 4. LOAD MODEL AND TOKENIZER
print("\n--- Loading Model and Tokenizer ---")
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))
print("Model and Tokenizer loaded successfully.")

# 5. LOAD, CLEAN, AND DEDUPLICATE DATASET
print("\n--- Loading, Cleaning, and Deduplicating Dataset ---")
try:
    raw_dataset = load_dataset('csv', data_files=FILE_PATH, split='train')
    print(f"Original dataset size: {len(raw_dataset)}")
except Exception as e:
    print(f"Error loading dataset: {e}. Please check the file path.")
    data = {'title': ['Test 1', 'Test 2', 'Test 3', 'Test 4'],
            'Extended_NER': ["['a']", "['b']", "['c']", "['d']"],
            'directions': ["['step 1']", "['step 2']", "['step 3']", "['step 4']"],
            'genre': ['vegetables', 'sides', 'nonveg', 'vegetables']}
    raw_dataset = Dataset.from_dict(data)

if DATA_SUBSAMPLE > 0 and DATA_SUBSAMPLE < len(raw_dataset):
    raw_dataset = raw_dataset.shuffle(seed=42).select(range(DATA_SUBSAMPLE))
    print(f"Subsampled dataset to {len(raw_dataset)} rows.")

original_count = len(raw_dataset)
raw_dataset = raw_dataset.filter(
    lambda x: x['title'] is not None and \
              x['Extended_NER'] is not None and \
              x['directions'] is not None and \
              x['genre'] is not None
)
print(f"Filtered {original_count - len(raw_dataset)} rows with null values.")

def clean_genre(example):
    example['genre'] = str(example['genre']).lower().strip()
    return example
raw_dataset = raw_dataset.map(clean_genre)
print("Normalized 'genre' column.")

df = raw_dataset.to_pandas()
original_count = len(df)
df.drop_duplicates(subset=['title', 'Extended_NER', 'directions'], inplace=True, keep='first')
print(f"Removed {original_count - len(df)} duplicate recipes.")
raw_dataset = Dataset.from_pandas(df)
print(f"Final cleaned dataset size: {len(raw_dataset)}")

# 6. STRATIFIED 80/10/10 SPLIT
print("\n--- Performing Stratified 80/10/10 Split ---")

# --- FIX: Cast 'genre' to ClassLabel for stratification ---
unique_genres = raw_dataset.unique("genre")
print(f"Found {len(unique_genres)} unique genres for stratification.")
current_features = raw_dataset.features
new_features = Features({
    **current_features,
    "genre": ClassLabel(names=unique_genres)
})
print("Casting 'genre' column to ClassLabel type...")
stratify_ready_dataset = raw_dataset.cast(new_features)
print("Casting complete.")
# --- End of Fix ---

train_test_split = stratify_ready_dataset.train_test_split(
    test_size=(VAL_SIZE + TEST_SIZE),
    seed=42,
    stratify_by_column="genre"
)
val_test_split = train_test_split['test'].train_test_split(
    test_size=0.5,
    seed=42,
    stratify_by_column="genre"
)
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})
print(f"Train split size: {len(split_dataset['train'])}")
print(f"Validation split size: {len(split_dataset['validation'])}")
print(f"Test split size: {len(split_dataset['test'])}")

# 7. DEFINE FORMATTING FUNCTION
def format_and_normalize_record(example):
    try:
        ingredients_list = ast.literal_eval(example['Extended_NER'])
        if not isinstance(ingredients_list, list): ingredients_list = []
    except:
        ingredients_list = []
    ingredients_str = ", ".join(filter(None, [ing.lower().strip() for ing in ingredients_list]))

    try:
        directions_list = ast.literal_eval(example['directions'])
        if not isinstance(directions_list, list): directions_list = []
    except:
        directions_list = []
    directions_str = " ".join(f"{i+1}. {step.lower().strip()}" for i, step in enumerate(directions_list))

    title_str = str(example['title']).strip().lower()

    if not title_str or not ingredients_str or not directions_str:
        return {"text": None}

    text = (
        f"TITLE: {title_str}\n"
        f"INGREDIENTS: {ingredients_str}\n"
        f"RECIPE: {directions_str}"
        f"{tokenizer.eos_token}"
    )
    return {"text": text}

# 8. APPLY FORMATTING
print("\nApplying formatting and normalization to all splits...")
formatted_dataset = split_dataset.map(
    format_and_normalize_record,
    num_proc=4,
    remove_columns=split_dataset['train'].column_names
)
formatted_dataset = formatted_dataset.filter(
    lambda example: example['text'] is not None
)

# 9. DEFINE TOKENIZATION FUNCTION (WITH PADDING FIX)
print("\n--- Defining Tokenization ---")

def tokenize_function(examples):
    # --- FIX: Padding is applied here to avoid collator errors ---
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # Pad all to max_length
        max_length=MAX_LENGTH,
    )
    
# 10. APPLY TOKENIZATION AND SET LABELS
print("\n--- Tokenizing Dataset (all splits) ---")
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"]
)

def set_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples
tokenized_dataset = tokenized_dataset.map(set_labels, batched=True, num_proc=4)
print("Tokenization and label setting complete.")

# 11. CONFIGURE TRAINER (WITH ALL VERSIONING FIXES)
print("\n--- Configuring Trainer ---")

# --- FIX: DataCollator MUST NOT have padding arguments ---
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
    # Padding is now handled in the tokenizer (Step 9)
)

# --- Calculate steps per epoch for old args ---
try:
    total_train_samples = len(tokenized_dataset["train"])
    effective_batch_size = BATCH_SIZE * GRAD_ACC_STEPS
    STEPS_PER_EPOCH = math.ceil(total_train_samples / effective_batch_size)
    print(f"Calculated steps per epoch: {STEPS_PER_EPOCH}")
except:
    print("Warning: Could not calculate steps_per_epoch. Defaulting to 500.")
    STEPS_PER_EPOCH = 500

# --- FIX: TrainingArguments uses old names and workarounds ---
training_args = TrainingArguments(
    output_dir="./gpt2-recipe-generator",
    overwrite_output_dir=True,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    fp16=FP16_TRAINING,
    
    # --- Arguments for old library versions ---
    do_eval=True,
    eval_steps=STEPS_PER_EPOCH,
    save_steps=STEPS_PER_EPOCH,
    logging_steps=STEPS_PER_EPOCH,
    
    # --- Fix for Kaggle hanging ---
    dataloader_num_workers=0,
    
    # --- Removed load_best_model_at_end to avoid final error ---
    
    learning_rate=5e-5,
    warmup_steps=200,
    weight_decay=0.01,
    report_to="none",
)

# 12. INITIALIZE TRAINER
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)
print("Trainer initialized.")

# 13. TRAIN THE MODEL (WITH BETTER ERROR REPORTING)
print("\n--- Starting Model Training ---")
try:
    trainer.train()
    print("--- Training Finished ---")

    print("Saving final model...")
    trainer.save_model("./final_model")
    tokenizer.save_pretrained("./final_model")
    print("Model saved to ./final_model")

except Exception as e:
    # --- FIX: More accurate error reporting ---
    print(f"\n--- AN ERROR OCCURRED DURING TRAINING ---")
    print(f"ERROR: {e}")
    print("\nThis error is often due to:")
    print("1. An Out-of-Memory (OOM) error. Try reducing BATCH_SIZE or MAX_LENGTH.")
    print("2. A data collation error (check tokenizer padding).")
    print("3. A mismatch in CUDA/PyTorch versions.")

# 14. LOAD FINE-TUNED MODEL FOR INFERENCE
print("\n--- DELIVERABLE: Sample Generations (Qualitative) ---")
try:
    final_model = GPT2LMHeadModel.from_pretrained("./final_model")
    final_tokenizer = GPT2Tokenizer.from_pretrained("./final_model")
    print("Loaded final-trained model from ./final_model")
except:
    print("Could not load fine-tuned model. Using base model for generation.")
    final_model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    final_tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    final_tokenizer.pad_token = final_tokenizer.eos_token

recipe_generator = pipeline(
    "text-generation",
    model=final_model,
    tokenizer=final_tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# 15. DEFINE GENERATION PROMPTS
prompt1 = (
    "title: spicy chicken pasta\n"
    "ingredients: chicken breast, pasta, cayenne pepper, olive oil, garlic, tomatoes\n"
    "recipe:"
)
prompt2 = (
    "title: black bean and turkey chili\n"
    "ingredients:\n"
    "recipe:"
)
prompt3 = (
    "title:\n"
    "ingredients: arugula, pomegranate arils, persimmon, feta cheese, walnuts, vinaigrette\n"
    "recipe:"
)
prompts = [prompt1, prompt2, prompt3]

# 16. RUN QUALITATIVE GENERATION EXAMPLES
for i, prompt in enumerate(prompts):
    print(f"\n--- Generation Example {i+1} ---")
    print(f"PROMPT:\n{prompt}\n")
    
    generated_text = recipe_generator(
        prompt,
        max_length=300,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        pad_token_id=final_tokenizer.eos_token_id
    )
    
    print("GENERATED RECIPE:")
    full_text = generated_text[0]['generated_text']
    generated_part = full_text[len(prompt):]
    print(generated_part)

# 17. SETUP QUANTITATIVE EVALUATION (ON TEST SET)
print("\n--- DELIVERABLE: Metric Evaluation (ROUGE, BLEU) ---")
print("*** Running evaluation on the 10% held-out TEST set ***")

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

eval_sample_size = 100
if len(tokenized_dataset["test"]) > eval_sample_size:
    eval_sample = tokenized_dataset["test"].shuffle(seed=42).select(range(eval_sample_size))
else:
    eval_sample = tokenized_dataset["test"]
    
print(f"Running quantitative evaluation on {len(eval_sample)} test samples...")
predictions = []
references = []

# 18. RUN QUANTITATIVE EVALUATION
for example in eval_sample:
    token_ids = example['input_ids']
    full_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    
    try:
        parts = full_text.split("recipe:")
        prompt_text = parts[0] + "recipe:"
        reference_text = parts[1].strip()
    except:
        continue

    generated_output = recipe_generator(
        prompt_text,
        max_length=len(token_ids) + 50,
        pad_token_id=final_tokenizer.eos_token_id
    )
    
    full_gen_text = generated_output[0]['generated_text']
    predicted_text = full_gen_text[len(prompt_text):].strip()
    
    predictions.append(predicted_text)
    references.append(reference_text)

print("Evaluation generation complete. Calculating scores...")

# 19. DISPLAY EVALUATION RESULTS AND FINAL NOTES
if predictions:
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bleu_scores = bleu.compute(predictions=predictions, references=references)

    print("\n--- Final Evaluation Results (on TEST set) ---")
    print("\nROUGE Scores:")
    print(f"  ROUGE-1: {rouge_scores['rouge1'] * 100:.2f}")
    print(f"  ROUGE-2: {rouge_scores['rouge2'] * 100:.2f}")
    print(f"  ROUGE-L: {rouge_scores['rougeL'] * 100:.2f}")

    print("\nBLEU Score:")
    print(f"  BLEU: {bleu_scores['bleu'] * 100:.2f}")

    print("\n--- Human Evaluation Notes (for your report) ---")
    print("1. **Coherence:** Does the recipe make logical sense?")
    print("2. **Relevance:** Does it use the ingredients from the prompt?")
    print("3. **Fluency:** Is the text grammatically correct and readable?")
else:
    print("No predictions were generated. Skipping metric evaluation.")

print("\n--- SCRIPT COMPLETE ---")

--- Configuration Loaded ---
Model: gpt2
Data Subsample: 50000 rows
Split: 80% Train, 10% Validation, 10% Test
Effective Batch Size: 32
----------------------------

--- Loading Model and Tokenizer ---
Model and Tokenizer loaded successfully.

--- Loading, Cleaning, and Deduplicating Dataset ---
Original dataset size: 2231143
Subsampled dataset to 50000 rows.
Filtered 0 rows with null values.
Normalized 'genre' column.
Removed 0 duplicate recipes.
Final cleaned dataset size: 50000

--- Performing Stratified 80/10/10 Split ---
Found 9 unique genres for stratification.
Casting 'genre' column to ClassLabel type...


Casting the dataset:   0%|          | 0/50000 [00:00<?, ? examples/s]

Casting complete.
Train split size: 40000
Validation split size: 5000
Test split size: 5000

Applying formatting and normalization to all splits...


Map (num_proc=4):   0%|          | 0/40000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]


--- Defining Tokenization ---

--- Tokenizing Dataset (all splits) ---


Map (num_proc=4):   0%|          | 0/35141 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4453 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4396 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/35141 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4453 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4396 [00:00<?, ? examples/s]

Tokenization and label setting complete.

--- Configuring Trainer ---
Calculated steps per epoch: 1099


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Trainer initialized.

--- Starting Model Training ---


Step,Training Loss
1099,2.2813
2198,2.0676
3297,2.0111


--- Training Finished ---
Saving final model...


Device set to use cuda:0


Model saved to ./final_model

--- DELIVERABLE: Sample Generations (Qualitative) ---
Loaded final-trained model from ./final_model


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Generation Example 1 ---
PROMPT:
title: spicy chicken pasta
ingredients: chicken breast, pasta, cayenne pepper, olive oil, garlic, tomatoes
recipe:



Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


GENERATED RECIPE:
 1. put chicken breasts and garlic in a large skillet. 2. add tomatoes, oil and cumin and cook over medium heat, stirring occasionally, until tomatoes are tender. 3. stir in chicken. 4. transfer to a bowl and cover with a tight fitting lid. 5. let stand until ready to use. 6. serve over pasta. 7. serves 8. 8 ounces. 9. makes 12 servings. 10. yields 6 servings, about 12 to 14 servings each. 11. calories: 175. 12. fat: 15. sodium: 7g. 13. cholesterol: 14. carbohydrates: 2g 14 mg. 15 mg 16. fiber: 12g 17. protein: 4g 18. carbohydrate: 3g 19. sugar: 13mg 20. potassium: 5mg 21. calcium: 8mg 22. iron: 6mg 23. vitamin: 20mg 24. zinc: 9mg 25. folate: 10mg 26. selenium: 11mg 27. folic acid: 24mg 28. copper: 25mg 29. vitamins: 30mg 31. polyunsaturated: 35mg 32. dietary fiber of 1g 33. total cholesterol of 2mg 34. blood sugar of 3mg 35. glucose of 20g 36. car

--- Generation Example 2 ---
PROMPT:
title: black bean and turkey chili
ingredients:
recipe:



Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


GENERATED RECIPE:
 cook, black beans, ground cumin, 2, white rice, about 15 minutes, garlic, tomatoes, 15 to 20 minutes
RECIPE: 1. cook the black and ground black pepper in a heavy-bottomed saucepan over medium heat, stirring occasionally, until the peppers are golden brown, 10 minutes. 2. remove the pan from the heat and add the garlic and 1 teaspoon salt. 3. bring the mixture to a boil, then reduce the sauce to medium-low and simmer for 15-20 minutes or until most of the liquid is absorbed. 4. drain the beans and reserve for another use. 5. to make the chili: combine the remaining 2 teaspoons salt and black peppercorns in an electric blender and puree until smooth. 6. add 1 tablespoon oil and continue to pureer until you have a coarse powder. 7. strain the oil through a fine mesh strainer or a food mill. 8. when the spice mixture is thoroughly mixed, add it to the stew and cook until heated through, adding more oil if necessary. 9. serve with tortilla chips or other condiments. 10. i

In [27]:
# %%capture
# 1. INSTALL NECESSARY LIBRARIES
!pip install transformers[torch] datasets accelerate evaluate rouge_score nltk scikit-learn

# 2. IMPORTS
import torch
import pandas as pd
import numpy as np
import ast  # For safely evaluating string representations of lists
import evaluate  # For ROUGE and BLEU
import math # For calculating steps per epoch
import os   # For checking if model exists
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Features
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline
)
import warnings
warnings.filterwarnings("ignore")

# 3. GLOBAL CONFIGURATION AND CONSTANTS
# --- Parameters You Can Tune ---
MODEL_NAME = 'gpt2'
FILE_PATH = '/kaggle/input/3a2mext/3A2M_EXTENDED.csv'
FINAL_MODEL_PATH = "./final_model" # Path to save/load the trained model

# Training parameters
TRAIN_EPOCHS = 3
FP16_TRAINING = True
BATCH_SIZE = 4       # Small batch size for training and eval
GRAD_ACC_STEPS = 8
MAX_LENGTH = 512

# Data parameters
DATA_SUBSAMPLE = 50000
VAL_SIZE = 0.1  # 10% for validation
TEST_SIZE = 0.1  # 10% for test (80% for train)
# ----------------------------------

print("--- Configuration Loaded ---")
print(f"Model: {MODEL_NAME}")
print(f"Data Subsample: {DATA_SUBSAMPLE} rows")
print(f"Split: 80% Train, 10% Validation, 10% Test")
print(f"Effective Batch Size: {BATCH_SIZE * GRAD_ACC_STEPS}")
print("----------------------------")

# 4. LOAD MODEL AND TOKENIZER (FOR TRAINING)
print("\n--- Loading Model and Tokenizer ---")
# This tokenizer uses default RIGHT padding, which is correct for training
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))
print("Model and Tokenizer loaded successfully.")

# 5. LOAD, CLEAN, AND DEDUPLICATE DATASET
print("\n--- Loading, Cleaning, and Deduplicating Dataset ---")
try:
    raw_dataset = load_dataset('csv', data_files=FILE_PATH, split='train')
    print(f"Original dataset size: {len(raw_dataset)}")
except Exception as e:
    print(f"Error loading dataset: {e}. Please check the file path.")
    data = {'title': ['Test 1', 'Test 2', 'Test 3', 'Test 4'],
            'Extended_NER': ["['a']", "['b']", "['c']", "['d']"],
            'directions': ["['step 1']", "['step 2']", "['step 3']", "['step 4']"],
            'genre': ['vegetables', 'sides', 'nonveg', 'vegetables']}
    raw_dataset = Dataset.from_dict(data)

if DATA_SUBSAMPLE > 0 and DATA_SUBSAMPLE < len(raw_dataset):
    raw_dataset = raw_dataset.shuffle(seed=42).select(range(DATA_SUBSAMPLE))
    print(f"Subsampled dataset to {len(raw_dataset)} rows.")

original_count = len(raw_dataset)
raw_dataset = raw_dataset.filter(
    lambda x: x['title'] is not None and \
              x['Extended_NER'] is not None and \
              x['directions'] is not None and \
              x['genre'] is not None
)
print(f"Filtered {original_count - len(raw_dataset)} rows with null values.")

def clean_genre(example):
    example['genre'] = str(example['genre']).lower().strip()
    return example
raw_dataset = raw_dataset.map(clean_genre)
print("Normalized 'genre' column.")

df = raw_dataset.to_pandas()
original_count = len(df)
df.drop_duplicates(subset=['title', 'Extended_NER', 'directions'], inplace=True, keep='first')
print(f"Removed {original_count - len(df)} duplicate recipes.")
raw_dataset = Dataset.from_pandas(df)
print(f"Final cleaned dataset size: {len(raw_dataset)}")

# 6. STRATIFIED 80/10/10 SPLIT
print("\n--- Performing Stratified 80/10/10 Split ---")
unique_genres = raw_dataset.unique("genre")
print(f"Found {len(unique_genres)} unique genres for stratification.")
current_features = raw_dataset.features
new_features = Features({
    **current_features,
    "genre": ClassLabel(names=unique_genres)
})
print("Casting 'genre' column to ClassLabel type...")
stratify_ready_dataset = raw_dataset.cast(new_features)
print("Casting complete.")
train_test_split = stratify_ready_dataset.train_test_split(
    test_size=(VAL_SIZE + TEST_SIZE),
    seed=42,
    stratify_by_column="genre"
)
val_test_split = train_test_split['test'].train_test_split(
    test_size=0.5,
    seed=42,
    stratify_by_column="genre"
)
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': val_test_split['train'],
    'test': val_test_split['test']
})
print(f"Train split size: {len(split_dataset['train'])}")
print(f"Validation split size: {len(split_dataset['validation'])}")
print(f"Test split size: {len(split_dataset['test'])}")

# 7. DEFINE FORMATTING FUNCTION (WITH ENHANCED CLEANING)
def format_and_normalize_record(example):
    try:
        ingredients_list = ast.literal_eval(example['Extended_NER'])
        if not isinstance(ingredients_list, list): ingredients_list = []
    except:
        ingredients_list = []
    ingredients_str = ", ".join(filter(None, [ing.lower().strip() for ing in ingredients_list]))

    try:
        directions_list = ast.literal_eval(example['directions'])
        if not isinstance(directions_list, list): directions_list = []
    except:
        directions_list = []
    
    directions_list_clean = [str(s).lower().strip() for s in directions_list]
    filtered_steps = [
        step for step in directions_list_clean if not 
        (step.startswith("calories:") or
         step.startswith("fat:") or
         step.startswith("sodium:") or
         step.startswith("cholesterol:") or
         step.startswith("protein:") or
         step.startswith("carbohydrates:") or
         "serving size" in step or
         step.isnumeric())
    ]
    
    directions_str = " ".join(f"{i+1}. {step}" for i, step in enumerate(filtered_steps))
    title_str = str(example['title']).strip().lower()

    if not title_str or not ingredients_str or not directions_str:
        return {"text": None}

    text = (
        f"TITLE: {title_str}\n"
        f"INGREDIENTS: {ingredients_str}\n"
        f"RECIPE: {directions_str}"
        f"{tokenizer.eos_token}"
    )
    return {"text": text}

# 8. APPLY FORMATTING
print("\nApplying formatting and normalization to all splits...")
formatted_dataset = split_dataset.map(
    format_and_normalize_record,
    num_proc=4,
    remove_columns=split_dataset['train'].column_names
)
formatted_dataset = formatted_dataset.filter(
    lambda example: example['text'] is not None
)

# 9. DEFINE TOKENIZATION FUNCTION (WITH PADDING FIX)
print("\n--- Defining Tokenization ---")
def tokenize_function(examples):
    # This uses the default RIGHT padding, which is correct for training.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    
# 10. APPLY TOKENIZATION AND SET LABELS
print("\n--- Tokenizing Dataset (all splits) ---")
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"]
)
def set_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples
tokenized_dataset = tokenized_dataset.map(set_labels, batched=True, num_proc=4)
print("Tokenization and label setting complete.")

# 11. CONFIGURE TRAINER (WITH ALL VERSIONING FIXES)
print("\n--- Configuring Trainer ---")
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
try:
    total_train_samples = len(tokenized_dataset["train"])
    effective_batch_size = BATCH_SIZE * GRAD_ACC_STEPS
    STEPS_PER_EPOCH = math.ceil(total_train_samples / effective_batch_size)
    print(f"Calculated steps per epoch: {STEPS_PER_EPOCH}")
except:
    print("Warning: Could not calculate steps_per_epoch. Defaulting to 500.")
    STEPS_PER_EPOCH = 500
training_args = TrainingArguments(
    output_dir="./gpt2-recipe-generator",
    overwrite_output_dir=True,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEPS,
    fp16=FP16_TRAINING,
    do_eval=True,
    eval_steps=STEPS_PER_EPOCH,
    save_steps=STEPS_PER_EPOCH,
    logging_steps=STEPS_PER_EPOCH,
    dataloader_num_workers=0,
    learning_rate=5e-5,
    warmup_steps=200,
    weight_decay=0.01,
    report_to="none",
)

# 12. INITIALIZE TRAINER
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)
print("Trainer initialized.")

# 13. TRAIN THE MODEL (WITH CHECKPOINTING)
print("\n--- Starting Model Training ---")
if not os.path.exists(FINAL_MODEL_PATH):
    print(f"No model found at {FINAL_MODEL_PATH}. Starting training...")
    try:
        trainer.train()
        print("--- Training Finished ---")
        print("Saving final model...")
        trainer.save_model(FINAL_MODEL_PATH)
        tokenizer.save_pretrained(FINAL_MODEL_PATH)
        print(f"Model saved to {FINAL_MODEL_PATH}")
    except Exception as e:
        print(f"\n--- AN ERROR OCCURRED DURING TRAINING ---")
        print(f"ERROR: {e}")
        print("\nThis error is often due to OOM or data collation.")
else:
    print(f"--- Model already found at {FINAL_MODEL_PATH}. Skipping training. ---")

# 14. LOAD FINE-TUNED MODEL FOR INFERENCE
print("\n--- DELIVERABLE: Loading Model for Evaluation ---")
try:
    final_model = GPT2LMHeadModel.from_pretrained(FINAL_MODEL_PATH)
    
    # --- FIX: Load tokenizer and set padding_side='left' for generation ---
    final_tokenizer = GPT2Tokenizer.from_pretrained(FINAL_MODEL_PATH)
    final_tokenizer.padding_side = 'left' 
    # --- End of Fix ---
    
    print(f"Loaded final-trained model from {FINAL_MODEL_PATH}")
except Exception as e:
    print(f"Error loading model from {FINAL_MODEL_PATH}: {e}")
    print("Loading base model as a fallback.")
    final_model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    
    # --- FIX: Load tokenizer and set padding_side='left' for generation ---
    final_tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    final_tokenizer.padding_side = 'left'
    final_tokenizer.pad_token = final_tokenizer.eos_token # Set pad token for base tokenizer
    # --- End of Fix ---

# We must ensure the pad token is set for the inference tokenizer
final_tokenizer.pad_token = final_tokenizer.eos_token

recipe_generator = pipeline(
    "text-generation",
    model=final_model,
    tokenizer=final_tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# 15. --- NEW: QUALITATIVE GENERATION FROM TEST SET ---
print("\n--- DELIVERABLE: Sample Generations (from Test Set) ---")

# Select 5 random samples from the test set
try:
    sample_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(5))
except:
    print("Test set is smaller than 5. Using all available samples.")
    sample_dataset = tokenized_dataset["test"]

for i, example in enumerate(sample_dataset):
    # Decode, and also clean up any padding tokens
    full_text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    
    prompt_text = ""
    reference_text = ""
    
    try:
        parts = full_text.split("RECIPE:") 
        prompt_text = parts[0] + "RECIPE:"
        reference_text = parts[1].strip()
    except:
        print(f"Error splitting sample {i+1}. Skipping.")
        continue 
    
    # Generate a recipe
    generated_output = recipe_generator(
        prompt_text,
        max_new_tokens=150,
        eos_token_id=final_tokenizer.eos_token_id,
        pad_token_id=final_tokenizer.eos_token_id,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_k=50
    )
    
    # Clean the output
    full_gen_text = generated_output[0]['generated_text'] 
    predicted_text = full_gen_text[len(prompt_text):].strip()
    if final_tokenizer.eos_token in predicted_text:
        predicted_text = predicted_text.split(final_tokenizer.eos_token)[0]

    # Print the comparison
    print(f"\n--- SAMPLE {i+1} ---")
    print(f"PROMPT:\n{prompt_text}\n")
    print(f"GENERATED:\n{predicted_text}\n")
    print(f"REFERENCE (Actual Recipe):\n{reference_text}\n")
    print("-" * 30)

# 16. SETUP QUANTITATIVE EVALUATION (ON TEST SET)
print("\n--- DELIVERABLE: Metric Evaluation (ROUGE, BLEU) ---")
print("*** Running evaluation on the 10% held-out TEST set ***")
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
eval_sample_size = 100
if len(tokenized_dataset["test"]) > eval_sample_size:
    eval_sample = tokenized_dataset["test"].shuffle(seed=42).select(range(eval_sample_size))
else:
    eval_sample = tokenized_dataset["test"]
print(f"Preparing {len(eval_sample)} samples for batch evaluation...")

# 17. RUN QUANTITATIVE EVALUATION (EFFICIENT BATCH MODE)
print("Building prompts and references list...")
prompts_list = []
references_list = []

for example in eval_sample:
    token_ids = example['input_ids']
    full_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    try:
        parts = full_text.split("RECIPE:") 
        prompt_text = parts[0] + "RECIPE:"
        reference_text = parts[1].strip()
        prompts_list.append(prompt_text)
        references_list.append(reference_text)
    except:
        continue 

print(f"Generating {len(prompts_list)} predictions in a single batch...")
generated_outputs = recipe_generator(
    prompts_list,
    max_new_tokens=150,
    eos_token_id=final_tokenizer.eos_token_id,
    pad_token_id=final_tokenizer.eos_token_id,
    no_repeat_ngram_size=2,
    batch_size=BATCH_SIZE
)

print("Processing generated outputs...")
predictions = []
for i, output in enumerate(generated_outputs):
    full_gen_text = output[0]['generated_text'] 
    prompt_text = prompts_list[i]
    predicted_text = full_gen_text[len(prompt_text):].strip()
    if final_tokenizer.eos_token in predicted_text:
        predicted_text = predicted_text.split(final_tokenizer.eos_token)[0]
    predictions.append(predicted_text.strip())

references = references_list

print(f"Evaluation generation complete. Processed {len(predictions)} samples.")

# 18. DISPLAY EVALUATION RESULTS AND FINAL NOTES
if predictions:
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bleu_scores = bleu.compute(predictions=predictions, references=references)

    print("\n--- Final Evaluation Results (on TEST set) ---")
    print("\nROUGE Scores:")
    print(f"  ROUGE-1: {rouge_scores['rouge1'] * 100:.2f}")
    print(f"  ROUGE-2: {rouge_scores['rouge2'] * 100:.2f}")
    print(f"  ROUGE-L: {rouge_scores['rougeL'] * 100:.2f}")

    print("\nBLEU Score:")
    print(f"  BLEU: {bleu_scores['bleu'] * 100:.2f}")

else:
    print("No predictions were generated. Skipping metric evaluation.")

print("\n--- SCRIPT COMPLETE ---")

--- Configuration Loaded ---
Model: gpt2
Data Subsample: 50000 rows
Split: 80% Train, 10% Validation, 10% Test
Effective Batch Size: 32
----------------------------

--- Loading Model and Tokenizer ---
Model and Tokenizer loaded successfully.

--- Loading, Cleaning, and Deduplicating Dataset ---
Original dataset size: 2231143
Subsampled dataset to 50000 rows.
Filtered 0 rows with null values.
Normalized 'genre' column.
Removed 0 duplicate recipes.
Final cleaned dataset size: 50000

--- Performing Stratified 80/10/10 Split ---
Found 9 unique genres for stratification.
Casting 'genre' column to ClassLabel type...


Casting the dataset:   0%|          | 0/50000 [00:00<?, ? examples/s]

Casting complete.
Train split size: 40000
Validation split size: 5000
Test split size: 5000

Applying formatting and normalization to all splits...


Map (num_proc=4):   0%|          | 0/40000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]


--- Defining Tokenization ---

--- Tokenizing Dataset (all splits) ---


Map (num_proc=4):   0%|          | 0/35139 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4453 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4396 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/35139 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4453 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4396 [00:00<?, ? examples/s]

Tokenization and label setting complete.

--- Configuring Trainer ---
Calculated steps per epoch: 1099


Device set to use cuda:0


Trainer initialized.

--- Starting Model Training ---
--- Model already found at ./final_model. Skipping training. ---

--- DELIVERABLE: Loading Model for Evaluation ---
Loaded final-trained model from ./final_model

--- DELIVERABLE: Sample Generations (from Test Set) ---

--- SAMPLE 1 ---
PROMPT:
TITLE: joyce's mexican corn bread
INGREDIENTS: 40-45 minutes, bell pepper, grease, salt, 8 inch square, 350 degrees, cornmeal, bake, peppers, creamstyle corn, 12, grated cheese, sour cream, 14, cooking oil, eggs
RECIPE:

GENERATED:
1. preheat oven to 350°. grease and flour 8 inches square baking pan. 2. combine cream cheese and sour creamed corn. 3. add eggs, one at a time, beating well after each addition. 4. stir in grates and pepper. 5. pour mixture into pan and bake for 40 to 45 minutes or until golden brown. 6. cool slightly. 7. for the filling, combine sour and cream and spread over corn mixture. 8. bake until filling is bubbly and corn is golden, about 5-7 minutes. 9. serve warm or at 