<a href="https://colab.research.google.com/github/acram002/AI-Driven-Recipe-Suggestion-System/blob/main/testSmallFlanColabPro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load and sample your dataset
import pandas as pd

import ast

df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv')
df = df[['NER', 'directions']].dropna()
df = df.sample(n=5000, random_state=42)  # Reduced for fast training


def _parse_list_cell(cell):
    """Decode a stringified Python list from a CSV cell.

    Args:
        cell: Raw cell value read from the CSV.

    Returns:
        The decoded list when ``cell`` is a string starting with ``[``;
        otherwise ``cell`` unchanged.
    """
    if isinstance(cell, str) and cell.startswith('['):
        return ast.literal_eval(cell)
    return cell


def _join_items(cell, separator):
    """Join a decoded list cell into plain text; stringify anything else."""
    items = _parse_list_cell(cell)
    if isinstance(items, list):
        return separator.join(map(str, items))
    return str(items)


# STEP 4: Format for training
# NER and directions are stored as stringified lists (e.g. '["a", "b"]').
# Decode them first so the model trains on plain text instead of list syntax:
# ingredients become a comma-separated line, directions one step per line.
df['prompt'] = df['NER'].apply(
    lambda cell: 'Generate a recipe:\nIngredients: ' + _join_items(cell, ', ')
)
df['response'] = df['directions'].apply(lambda cell: _join_items(cell, '\n'))

# Convert to Hugging Face Dataset
from datasets import Dataset
dataset = Dataset.from_pandas(df[['prompt', 'response']])

# STEP 5: Tokenize the data
from transformers import AutoTokenizer

model_name = 'google/flan-t5-base'
max_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Tokenize one prompt/response pair into model inputs with labels.

    Both texts are truncated and padded to ``max_length``. Pad positions in
    the target are replaced by -100 so they are ignored in the loss.

    Args:
        example: Mapping with a 'prompt' and a 'response' text field.

    Returns:
        The tokenizer output for the prompt, with an added 'labels' entry
        holding the masked target token ids.
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=max_length)
    encoded_prompt = tokenizer(example['prompt'], **encode_options)
    target_ids = tokenizer(example['response'], **encode_options)['input_ids']

    pad_id = tokenizer.pad_token_id
    encoded_prompt['labels'] = [
        -100 if token_id == pad_id else token_id
        for token_id in target_ids
    ]
    return encoded_prompt


tokenized_dataset = dataset.map(tokenize)

# STEP 6: Load model
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# STEP 7: Set up training arguments
from transformers import TrainingArguments, Trainer

import torch

# fp16 mixed precision is a known source of overflow with T5-family
# checkpoints, which matches the constant 0.0 training loss this cell logged.
# Use bfloat16 where the GPU supports it and fall back to full precision
# otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/flan_recipe_model_output",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=False,
    bf16=use_bf16,
    report_to="none"
)

# STEP 8: Initialize trainer
trainer = Trainer(
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    args=training_args,
    model=model,
)

# STEP 9: Run fine-tuning
trainer.train()

# STEP 10: Write the model and the tokenizer to the same Drive folder
save_path = "/content/drive/MyDrive/flan_recipe_model_final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0
400,0.0
450,0.0
500,0.0


✅ Model and tokenizer saved to: /content/drive/MyDrive/flan_recipe_model_final


In [None]:
# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load and preprocess dataset
import pandas as pd
import ast

def _looks_like_list(cell):
    """Return True when the cell is a string that starts with '['."""
    return isinstance(cell, str) and cell.startswith("[")


def _decode_ingredients(cell):
    """Parse a stringified list cell; leave any other value untouched."""
    return ast.literal_eval(cell) if _looks_like_list(cell) else cell


def _directions_to_text(cell):
    """Turn a stringified list of steps into one step per line."""
    if _looks_like_list(cell):
        return "\n".join(ast.literal_eval(cell))
    return str(cell)


def _ingredients_to_prompt(items):
    """Build the training prompt from decoded ingredients."""
    listing = ', '.join(items) if isinstance(items, list) else str(items)
    return f"Generate a recipe:\nIngredients: {listing}"


# Keep only the two columns used for training, drop incomplete rows and
# draw a fixed, reproducible sample.
df = (
    pd.read_csv('/content/drive/MyDrive/full_dataset.csv')[['NER', 'directions']]
    .dropna()
    .sample(n=5000, random_state=42)
)

# Fix stringified list formats
df['NER'] = df['NER'].apply(_decode_ingredients)
df['response'] = df['directions'].apply(_directions_to_text)
df['prompt'] = df['NER'].apply(_ingredients_to_prompt)

# Show sample
first_row = df.iloc[0]
print("🔎 FIRST ROW PROMPT:\n", first_row['prompt'])
print("🔎 FIRST ROW RESPONSE:\n", first_row['response'])

# STEP 4: Hugging Face Dataset
from datasets import Dataset
dataset = Dataset.from_pandas(df[['prompt', 'response']])

# STEP 5: Tokenize
from transformers import AutoTokenizer

model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 512


def preprocess(example):
    """Encode one prompt/response pair for seq2seq training.

    Args:
        example: Mapping with a 'prompt' and a 'response' text field.

    Returns:
        The tokenized prompt with a 'labels' entry: the tokenized response
        where every pad token id has been replaced by -100.
    """
    def _encode(text):
        # Same truncation/padding settings for the source and target text.
        return tokenizer(text, max_length=max_length, padding="max_length", truncation=True)

    model_inputs = _encode(example['prompt'])
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        -100 if token_id == pad_id else token_id
        for token_id in _encode(example['response'])["input_ids"]
    ]
    return model_inputs


tokenized = dataset.map(preprocess)
# Expose only the tensors the model consumes, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "labels"]
tokenized.set_format(type="torch", columns=tensor_columns)

# STEP 6: Confirm valid labels
# Sanity check on the first example: count and decode the label tokens that
# are not masked with -100.
sample = tokenized[0]
kept_label_ids = [token for token in sample["labels"] if token != -100]
non_masked = len(kept_label_ids)
print(f"\n✅ Non-masked label tokens: {non_masked} out of {len(sample['labels'])}")
print("🎯 Decoded target:\n", tokenizer.decode(kept_label_ids))

# STEP 7: Manual PyTorch Training Loop
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AdamW
from tqdm import tqdm

# Load model and move to GPU
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"\n📦 Model loaded to: {device}")

# Create DataLoader
train_loader = DataLoader(tokenized, batch_size=2, shuffle=True)

# Optimizer
# Use PyTorch's AdamW rather than the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers
# implementation used by default, so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Training loop
num_epochs = 1
model.train()
for epoch in range(num_epochs):
    # Label the progress bar from the loop index so it stays correct if
    # num_epochs is raised.
    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for batch in loop:
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch carries `labels`, so the model output exposes the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        loop.set_postfix(loss=loss.item())

# STEP 8: Save model to Drive
# Write the model and the tokenizer side by side into one Drive folder.
save_path = "/content/drive/MyDrive/flan_recipe_manual_final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"\n✅ Model and tokenizer saved to: {save_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔎 FIRST ROW PROMPT:
 Generate a recipe:
Ingredients: flank steak, green onions, red wine, soy sauce, salad oil, sesame seeds, brown sugar, grnd black pepper, grnd ginger, clove garlic
🔎 FIRST ROW RESPONSE:
 Remove tenderloin from steak.
Score meat.
Combine remaining ingredients and pour over meat.
Let marinate 24 hrs.
Preheat grill.
Broil or possibly grill.
Slice thinly on an angle against the grain.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]


✅ Non-masked label tokens: 48 out of 512
🎯 Decoded target:
 Remove tenderloin from steak. Score meat. Combine remaining ingredients and pour over meat. Let marinate 24 hrs. Preheat grill. Broil or possibly grill. Slice thinly on an angle against the grain.</s>





📦 Model loaded to: cuda


Epoch 1: 100%|██████████| 2500/2500 [12:58<00:00,  3.21it/s, loss=3.06]



✅ Model and tokenizer saved to: /content/drive/MyDrive/flan_recipe_manual_final


In [None]:
# Prompt the user for ingredients
def parse_ingredients(user_input):
    """Split comma-separated text into a list of ingredient names.

    Surrounding whitespace is stripped and blank entries (from empty input,
    doubled commas or a trailing comma) are dropped.

    Args:
        user_input: Raw comma-separated text typed by the user.

    Returns:
        List of non-empty ingredient strings, in input order.
    """
    return [item.strip() for item in user_input.split(",") if item.strip()]


def build_prompt(ingredients):
    """Format ingredient names into the prompt layout used for training.

    Args:
        ingredients: Iterable of ingredient name strings.

    Returns:
        The prompt text: the instruction line followed by the
        comma-separated ingredient list.
    """
    return f"Generate a recipe:\nIngredients: {', '.join(ingredients)}"


# In a notebook __name__ is "__main__", so the cell runs as before; the guard
# only lets the helpers above be imported without prompting for input.
if __name__ == "__main__":
    user_input = input("Enter ingredients (comma-separated):\n")
    ingredients = parse_ingredients(user_input)
    prompt = build_prompt(ingredients)

    # The training loop leaves the model in train mode; switch to eval mode
    # so dropout is disabled while generating.
    model.eval()

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.8,
            # Forbid repeated 3-grams: without it the output loops on the
            # same sentence over and over.
            no_repeat_ngram_size=3,
        )

    # Print the recipe
    print("\n=== Generated Recipe ===\n")
    print(tokenizer.decode(output[0], skip_special_tokens=True))


Enter ingredients (comma-separated):
sausage, pepper, chicken, rice

=== Generated Recipe ===

In a large skillet, saute sausage, pepper and chicken. Add rice and stir. Cook on low for 10 minutes. Add rice and stir. Cook on low for 10 minutes. Add rice and stir. Cook on low for 10 minutes. Add chicken and rice and stir. Cook on low for 10 minutes. Add rice and stir. Cook on low for 10 minutes. Add rice and stir. Cook on low for 10 minutes. Add rice and stir. Cook on low for 10 minutes.


In [None]:
# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load and preprocess dataset
import pandas as pd
import ast

df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv')
df = df[['NER', 'directions']].dropna()
df = df.sample(n=5000, random_state=42)

# 🧼 Fix stringified lists in NER and directions
df['NER'] = df['NER'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith("[") else x)
df['response'] = df['directions'].apply(lambda x: "\n".join(ast.literal_eval(x)) if isinstance(x, str) and x.startswith("[") else str(x))

# 🧠 Format prompt using NER
df['prompt'] = df['NER'].apply(lambda x: f"Generate a recipe:\nIngredients: {', '.join(x) if isinstance(x, list) else str(x)}")

# Print prompt and response example
print("🔎 FIRST ROW PROMPT:\n", df.iloc[0]['prompt'])
print("🔎 FIRST ROW RESPONSE:\n",


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔎 FIRST ROW PROMPT:
 Generate a recipe:
Ingredients: flank steak, green onions, red wine, soy sauce, salad oil, sesame seeds, brown sugar, grnd black pepper, grnd ginger, clove garlic
🔎 FIRST ROW RESPONSE:
 ["Remove tenderloin from steak.", "Score meat.", "Combine remaining ingredients and pour over meat.", "Let marinate 24 hrs.", "Preheat grill.", "Broil or possibly grill.", "Slice thinly on an angle against the grain."]


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]


✅ Non-masked label tokens: 70 out of 512
🎯 Decoded label text:
 ["Remove tenderloin from steak.", "Score meat.", "Combine remaining ingredients and pour over meat.", "Let marinate 24 hrs.", "Preheat grill.", "Broil or possibly grill.", "Slice thinly on an angle against the grain."]</s>


  trainer = Trainer(


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


KeyboardInterrupt: 

In [None]:
# Fixed smoke-test prompt; the ingredient list is written in the raw
# stringified-list form.
prompt = 'Generate a recipe:\nIngredients: ["chicken", "rice", "broccoli", "garlic", "soy sauce"]'

# The preceding training run was interrupted, so the model may still be in
# training mode. Switch to eval mode so dropout does not perturb generation.
model.eval()

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Inference only: no gradients are needed.
with torch.no_grad():
    output = model.generate(**inputs, max_length=256)

print("\n=== Generated Recipe ===\n")
print(tokenizer.decode(output[0], skip_special_tokens=True))
