<a href="https://colab.research.google.com/github/acram002/AI-Driven-Recipe-Suggestion-System/blob/main/testSmallFlan2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install dependencies
# The leading `!` runs the line as a shell command from the notebook cell
# (IPython syntax, not plain Python); -q keeps pip's output quiet.
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
# Exposes Drive under /content/drive; the dataset is read from there and the
# fine-tuned model is written back there later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load and preprocess dataset
import pandas as pd
import ast

# Columns used by the rest of the notebook: NER feeds the prompt,
# ingredients + directions feed the model response.
DATASET_COLUMNS = ['NER', 'ingredients', 'directions']
SAMPLE_SIZE = 30000  # increased from 5 to 10 to 30k

# usecols makes read_csv skip every other column while parsing instead of
# loading the whole file and discarding most of it afterwards.
df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv', usecols=DATASET_COLUMNS)
df = df[DATASET_COLUMNS].dropna()  # fixed column order; drop rows with any missing field

# Cap the request at the number of rows actually available: DataFrame.sample
# raises ValueError when n exceeds the row count (sampling without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)


# Fix stringified list formats and clean ingredient quantities
def clean_ingredient_list(ingredient_str):
    """Parse an ingredient list and normalise some of its quantities.

    Args:
        ingredient_str: Either a string containing a Python list literal
            (e.g. ``'["2 eggs", "1 c. flour"]'``) or an already-parsed
            iterable of ingredient strings.

    Returns:
        A new list of ingredient strings in which:

        * items mentioning tortilla/egg/clove/slice lose their first
          whitespace-separated token when that token starts with a digit
          (``"2 eggs"`` -> ``"eggs"``);
        * items mentioning lb/pound/lbs/kg have a standalone ``"5 lbs"`` or
          ``"6 lbs"`` rewritten to ``"1.5 lbs"``;
        * every other item is passed through unchanged.

        An empty list is returned when the input cannot be parsed or
        processed (best-effort cleaning; the caller applies this per row).
    """
    # Local import so the function does not depend on a file-level `re` import.
    import re

    try:
        ingredients = ast.literal_eval(ingredient_str) if isinstance(ingredient_str, str) else ingredient_str
        cleaned = []
        for item in ingredients:
            lowered = item.lower()  # computed once, used by both keyword checks
            if any(unit in lowered for unit in ('tortilla', 'egg', 'clove', 'slice')):
                parts = item.split()
                if len(parts) > 1 and parts[0][0].isdigit():
                    cleaned.append(" ".join(parts[1:]))
                else:
                    cleaned.append(item)
            elif any(unit in lowered for unit in ('lb', 'pound', 'lbs', 'kg')):
                # Single-pass substitution. The previous chained
                # .replace("6 lbs", ...).replace("5 lbs", ...) re-matched its
                # own output ("6 lbs" -> "1.5 lbs" -> "1.1.5 lbs") and also
                # rewrote the tail of larger numbers such as "16 lbs" or
                # "1.5 lbs". The lookbehind only accepts a 5 or 6 that is not
                # preceded by another digit or a decimal point.
                cleaned.append(re.sub(r"(?<![\d.])[56] lbs", "1.5 lbs", item))
            else:
                cleaned.append(item)
        return cleaned
    except Exception:
        # Deliberately broad: malformed literals, non-iterable input and
        # non-string items all degrade to "no ingredients" for that row.
        return []

# Parse stringified NER lists (e.g. "['salt', 'flour']") into real lists;
# values that do not look like a list literal are left untouched.
df['NER'] = df['NER'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x)
df['ingredients'] = df['ingredients'].apply(clean_ingredient_list)


def build_response(row):
    """Build the training target text for one DataFrame row.

    Args:
        row: A row with an ``ingredients`` list of strings and a
            ``directions`` value.

    Returns:
        When ``directions`` is a string starting with ``[``: an
        "Ingredients:" section (one ingredient per line) followed by an
        "Instructions:" section (one parsed direction per line).
        Otherwise: ``str(directions)`` on its own.
    """
    directions = row['directions']
    if isinstance(directions, str) and directions.startswith("["):
        # The joins are done outside the f-string: a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        ingredient_lines = "\n".join(row['ingredients'])
        instruction_lines = "\n".join(ast.literal_eval(directions))
        return f"Ingredients:\n{ingredient_lines}\n\nInstructions:\n{instruction_lines}"
    # NOTE(review): this fallback omits the ingredients section, exactly as the
    # original conditional expression did - confirm that is intended.
    return str(directions)


df['response'] = df.apply(build_response, axis=1)

df['prompt'] = df['NER'].apply(lambda x: f"Generate a recipe:\nIngredients: {', '.join(x) if isinstance(x, list) else str(x)}")

# Show sample
print("🔎 FIRST ROW PROMPT:\n", df.iloc[0]['prompt'])
print("🔎 FIRST ROW RESPONSE:\n", df.iloc[0]['response'])

# STEP 4: Hugging Face Dataset
from datasets import Dataset

# df was row-sampled, so its index is no longer a plain range; without
# preserve_index=False that index would be stored as an extra
# "__index_level_0__" column alongside prompt/response.
dataset = Dataset.from_pandas(df[['prompt', 'response']], preserve_index=False)

# STEP 5: Tokenize
from transformers import AutoTokenizer

model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 512  # token budget applied to both prompts and responses in preprocess()

def preprocess(example):
    """Tokenize one prompt/response pair into model inputs with labels.

    Both texts are truncated and padded to the module-level ``max_length``.
    Padding positions in the target are replaced by -100 and the result is
    stored under ``"labels"`` next to the tokenized prompt fields.

    Args:
        example: A single (non-batched) record with ``prompt`` and
            ``response`` strings.

    Returns:
        The tokenizer output for the prompt, extended with a ``labels`` list.
    """
    shared_kwargs = dict(max_length=max_length, padding="max_length", truncation=True)

    model_inputs = tokenizer(example['prompt'], **shared_kwargs)
    target_ids = tokenizer(example['response'], **shared_kwargs)["input_ids"]

    # Swap every pad token in the target for the -100 sentinel.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [-100 if token == pad_id else token for token in target_ids]
    return model_inputs

# Tokenize every record, then expose only the model-facing columns as torch tensors.
tokenized = dataset.map(preprocess)
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# STEP 6: Confirm valid labels
# Sanity check on the first record: count and decode the label tokens that
# were not masked out with -100.
sample = tokenized[0]
kept_label_ids = [token for token in sample["labels"].tolist() if token != -100]
non_masked = len(kept_label_ids)
print(f"\n✅ Non-masked label tokens: {non_masked} out of {len(sample['labels'])}")
print("🎯 Decoded target:\n", tokenizer.decode(kept_label_ids))

# STEP 7: Manual PyTorch Training Loop
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM
from torch.optim import AdamW  # ✅ Correct import for newer transformers
from tqdm import tqdm

# Pick the GPU when one is available, then load the model onto that device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)
print(f"\n📦 Model loaded to: {device}")

# Shuffled mini-batches of two tokenized examples each.
train_loader = DataLoader(tokenized, batch_size=2, shuffle=True)

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop: a single pass over the data.
model.train()
for epoch in range(1):
    progress = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in progress:
        # Move every tensor of the batch to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The forward pass returns the loss because the batch carries "labels".
        loss = model(**batch).loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        # Show the latest batch loss in the progress bar.
        progress.set_postfix(loss=loss.item())

# STEP 8: Save model to Drive
save_path = "/content/drive/MyDrive/flan_recipe_manual_final3"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"\n✅ Model and tokenizer saved to: {save_path}")


# TODO: increase the training sample size.
# TODO: consider moving to flan-t5-large (or another model), possibly only once the training script is ready.
# TODO: increase the sampling temperature used for generated output.


In [None]:

# Prompt the user for ingredients
user_input = input("Enter ingredients (comma-separated):\n")
# Skip blank entries so stray or trailing commas do not end up in the prompt.
ingredients = [part.strip() for part in user_input.split(",") if part.strip()]
prompt = f"Generate a recipe:\nIngredients: {', '.join(ingredients)}"

# Tokenize and generate
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The training cell leaves the model in train() mode; switch to eval() so
# training-only layers such as dropout are disabled while generating.
model.eval()
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_length=512,  # was 256 havent tested new
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.2,  # was 0.8 havent tested new
        repetition_penalty=1.2,  # 🔥 NEW: discourages repetition
        num_return_sequences=1,
    )

# Print the recipe
print("\n=== Generated Recipe ===\n")
print(tokenizer.decode(output[0], skip_special_tokens=True))


In [None]:
from google.colab import drive
drive.mount('/content/drive')
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# STEP 1: Load model and tokenizer
model_path = "/content/drive/MyDrive/flan_recipe_manual_final3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the GPU when one is attached and fall back to CPU otherwise, instead of
# failing on a hard-coded "cuda" when the runtime has no GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

# STEP 2: Define test cases
test_prompts = [
    ["chicken", "rice", "broccoli", "soy sauce", "garlic"],
    ["ice cream", "banana", "chocolate syrup", "whipped cream"],
    ["eggs", "cheese", "spinach", "tomato", "tortilla"],
    ["beef", "potatoes", "carrots", "onion", "broth", "bay leaf"],
    ["tofu", "mushrooms", "soy sauce", "garlic", "sesame oil"],
    ["chicken"],  # Minimal input
    ["ground beef", "tomato sauce", "spaghetti"],
    ["ground beef", "tomato sauce", "tortillas"],
]

# STEP 3: Generate and print recipes
for idx, ingredients in enumerate(test_prompts):
    print(f"\n🧪 Test Case {idx + 1}: {', '.join(ingredients)}\n")

    # Same prompt template as the one used to build the training prompts.
    prompt = f"Generate a recipe:\nIngredients: {', '.join(ingredients)}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            do_sample=True,  # 🔥 Enable actual sampling!
            max_length=350,
            temperature=1.0,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.2,
            num_beams=1,
        )

    recipe = tokenizer.decode(output[0], skip_special_tokens=True)
    print("=== Generated Recipe ===")
    print(recipe)
    print("=" * 50)
