<a href="https://colab.research.google.com/github/amit20kr/LLM-Food-Disease-Chatbot/blob/main/Model_Training_SLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
import torch
from google.colab import drive
import random
import re
import os

def augment_prompt(prompt, error_rate=0.4):
    """
    Inject one realistic, human-like typo into a user prompt.

    With probability ``error_rate`` a single word of the prompt (longer than
    three characters and not a stop word) is corrupted using one randomly
    chosen technique: vowel swap, vowel drop, truncation, missing letters,
    adjacent-letter transposition, doubled letter, or random upper-casing.

    Args:
        prompt: The prompt text. Non-string values (e.g. ``None`` or a NaN
            cell coming from pandas) are returned unchanged.
        error_rate: Probability in [0, 1] that the prompt is considered for
            augmentation at all.

    Returns:
        The original ``prompt`` when it is skipped, otherwise the prompt with
        one word modified and its words re-joined by single spaces.
    """
    # Missing cells reach this function via DataFrame.apply; they have no
    # .split(), so pass them through instead of raising AttributeError.
    if not isinstance(prompt, str):
        return prompt
    if random.random() > error_rate:
        return prompt
    words = prompt.split()
    if len(words) < 3:
        return prompt
    stop_words = {'is', 'a', 'to', 'for', 'the', 'in', 'of', 'what', 'and', 'my', 'on', 'do', 'it'}
    augmentable_indices = [i for i, w in enumerate(words) if len(w) > 3 and w.lower() not in stop_words]
    if not augmentable_indices:
        return prompt
    idx = random.choice(augmentable_indices)
    word = words[idx]
    chars = list(word)
    vowels = 'aeiou'

    # Only offer techniques that can actually apply to this word. Picking an
    # inapplicable one would silently leave the prompt untouched and lower the
    # effective error rate below what the caller asked for.
    techniques = ['vowel_swap', 'vowel_drop', 'transposition', 'double_letter', 'random_case']
    if len(word) > 5:
        techniques.append('truncation')
    if len(word) > 4:
        techniques.append('missing_letter')
    t = random.choice(techniques)

    augmented = word
    if t == 'vowel_swap':
        for i, ch in enumerate(chars):
            if ch.lower() in vowels and random.random() < 0.5:
                replacement = random.choice(vowels.replace(ch.lower(), ''))
                # Keep the original letter case so "Apple" does not become "epple".
                chars[i] = replacement.upper() if ch.isupper() else replacement
        augmented = ''.join(chars)
    elif t == 'vowel_drop':
        dropped = ''.join(ch for ch in chars if not (ch.lower() in vowels and random.random() < 0.3))
        # Never shrink the word below two characters.
        augmented = dropped if len(dropped) >= 2 else word
    elif t == 'truncation':
        cut = random.randint(1, 3)
        augmented = word[:-cut]
    elif t == 'missing_letter':
        # Drop up to two interior letters; the first and last letter are kept.
        drop_positions = set(random.sample(range(1, len(chars) - 1), k=min(2, len(chars) - 2)))
        augmented = ''.join(ch for i, ch in enumerate(chars) if i not in drop_positions)
    elif t == 'transposition':
        j = random.randint(0, len(chars) - 2)
        chars[j], chars[j + 1] = chars[j + 1], chars[j]
        augmented = ''.join(chars)
    elif t == 'double_letter':
        j = random.randint(1, len(chars) - 2)
        chars.insert(j, chars[j])
        augmented = ''.join(chars)
    elif t == 'random_case':
        augmented = ''.join(ch.upper() if random.random() < 0.3 else ch for ch in word)
    words[idx] = augmented
    return " ".join(words)

def _clean_cell(value):
    """Return a cell as a stripped string, mapping missing values (None/NaN/NA) to ''."""
    # str(NaN) is the literal text "nan"; without this check that text would
    # be written into the training targets.
    if pd.isna(value):
        return ""
    return str(value).strip()


def create_reasoned_response_pattern(df):
    """
    Add a structured yet conversational 'target_text' column to ``df``.

    Each target teaches the model to explain reasons, mention biomarkers, and
    give recommendations. Rows whose ``status`` is "Good" or "Avoid"
    (case-insensitive) get a templated explanation built around the row's
    ``response`` and ``Recommended food``; every other row keeps the plain
    ``response``. Template phrases are drawn with the global ``random`` module.

    Args:
        df: DataFrame read for the columns ``response``, ``Recommended food``
            and ``status``. Missing columns or cells are treated as empty text.

    Returns:
        The same DataFrame object, modified in place with the new
        ``target_text`` column.
    """
    print("Generating enriched, model-friendly responses...")
    intros = ["Let's look at this step by step.", "Here's how this relates to your condition:", "Breaking it down scientifically:", "From a nutrition science point of view:"]
    reason_starters = ["Reasoning:", "Why this matters:", "Scientific insight:", "Underlying mechanism:"]
    biomarker_starters = ["Key biomarkers affected:", "Physiological impact:", "What it improves in your body:"]
    rec_starters = ["Recommended foods:", "Practical dietary guidance:", "You can focus on:"]
    responses = []
    for _, row in df.iterrows():
        response = _clean_cell(row.get("response", ""))
        rec = _clean_cell(row.get("Recommended food", ""))
        status = _clean_cell(row.get("status", "")).capitalize()
        # Strip the dataset's "Avoid:" / "Instead try:" labels and trailing
        # punctuation; the templates below add their own wording and period.
        rec = re.sub(r"(avoid:|instead try:)\s*", "", rec, flags=re.I).rstrip(" .")
        intro, reason, biomarker, rec_intro = random.choice(intros), random.choice(reason_starters), random.choice(biomarker_starters), random.choice(rec_starters)
        if status == "Good":
            text = (f"{response} {intro} {reason} These foods help by improving nutrient balance, reducing oxidative stress, and supporting healthy organ function. "
                    f"{biomarker} They may positively influence blood pressure, cholesterol, glucose metabolism, or thyroid hormone activity. "
                    f"{rec_intro} {rec}. Together, these choices strengthen long-term metabolic and cardiovascular stability.")
        elif status == "Avoid":
            text = (f"{response} {intro} {reason} Certain foods can worsen the condition by increasing inflammation or disrupting metabolic balance. "
                    f"{biomarker} They may elevate sodium, glucose, or lipid biomarkers beyond safe ranges. "
                    f"{rec_intro} Instead, consider: {rec}. Choosing these alternatives helps maintain steady biomarkers and reduce disease progression.")
        else:
            text = response
        responses.append(text)
    df["target_text"] = responses
    print(f"Created {len(df)} model-friendly target samples.")
    return df

def main():
    """
    Run the end-to-end fine-tuning pipeline on Google Colab.

    Steps: mount Google Drive, load the CSV dataset, append typo-augmented
    copies of 80% of the rows, sample 50% of the result, build the target
    texts, tokenize, fine-tune the seq2seq model, and save the model,
    tokenizer and a small model card under the Drive output directory.

    Returns:
        None. Prints an error and returns early if the dataset CSV is missing.
    """
    SEED = 42
    # augment_prompt and create_reasoned_response_pattern both draw from the
    # global `random` module; seed it so the generated dataset is reproducible
    # alongside the pandas random_state values used below.
    random.seed(SEED)

    print("Mounting Google Drive...")
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
    DRIVE_BASE_PATH = "/content/drive/MyDrive/HackathonProject/"
    MODEL_NAME = f"{DRIVE_BASE_PATH}food-disease-chat-model-stable1-60"
    DATASET_PATH = f"{DRIVE_BASE_PATH}filtered_dataset_for_training.csv"
    OUTPUT_DIR = f"{DRIVE_BASE_PATH}food-chatbot-model-finetuned2-50"

    # Keep the try body minimal: only reading the CSV is expected to raise
    # FileNotFoundError here.
    try:
        df = pd.read_csv(DATASET_PATH)
    except FileNotFoundError:
        print(f"Error: The file '{DATASET_PATH}' was not found.")
        return
    original_rows = len(df)
    print(f"Original dataset loaded with {original_rows} rows.")
    print("Applying data augmentation to 80% of the dataset...")
    df_to_augment = df.sample(frac=0.8, random_state=SEED)
    augmented_rows_count = len(df_to_augment)
    print(f"Selected {augmented_rows_count} rows for augmentation.")
    df_augmented = df_to_augment.copy()
    df_augmented['prompt'] = df_augmented['prompt'].apply(augment_prompt)
    # NOTE(review): originals and their augmented copies are concatenated
    # before the train/eval split below, so near-duplicates of one row can land
    # on both sides and make the validation loss optimistic. Consider splitting
    # before augmenting.
    df_final_for_training = pd.concat([df, df_augmented]).reset_index(drop=True)
    total_augmented_rows = len(df_final_for_training)
    print(f"Dataset size after augmentation: {total_augmented_rows} rows.")
    print("\nSampling 50% of the resultant large dataset for faster training...")
    df_sampled = df_final_for_training.sample(frac=0.5, random_state=SEED)
    sampled_rows_count = len(df_sampled)
    print(f"Total rows for this training run: {sampled_rows_count} (50% of {total_augmented_rows}).\n")
    df = create_reasoned_response_pattern(df_sampled)
    df['input_text'] = "Provide a detailed dietary recommendation for the following query: " + df['prompt']
    hf_dataset = Dataset.from_pandas(df)

    print(f"Loading tokenizer and model from: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    model.gradient_checkpointing_enable() # Reduce VRAM usage
    for param in model.parameters():
        param.requires_grad = True
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, pad_to_multiple_of=8)

    def preprocess_function(examples):
        # Do not pad here. DataCollatorForSeq2Seq pads every batch dynamically
        # and fills label padding with -100, which the loss ignores. Padding
        # labels to max_length with the tokenizer would put real pad-token ids
        # into the labels and train the model on padding.
        model_inputs = tokenizer(examples["input_text"], max_length=128, truncation=True)
        labels = tokenizer(text_target=examples["target_text"], max_length=512, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing the dataset...")
    tokenized_dataset = hf_dataset.map(preprocess_function, batched=True, num_proc=2, load_from_cache_file=True)
    # Seed the split so the train/eval partition is the same on every run.
    train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=SEED)
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
    print(f"Dataset prepared. Training samples: {len(train_dataset)}, Evaluation samples: {len(eval_dataset)}")
    training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        weight_decay=0.01,
        learning_rate=3e-5,
        # NOTE(review): T5-family models are often reported to be numerically
        # unstable in fp16; if the loss turns NaN or very large, try bf16 or
        # full precision - TODO confirm on the target GPU.
        fp16=True,
        logging_dir=f"{OUTPUT_DIR}/logs",
        logging_strategy="epoch",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        # NOTE(review): the recorded run of this cell failed while reloading
        # the best checkpoint (pytorch_model.bin not found under
        # checkpoint-118); the cause is not determinable from this file.
        load_best_model_at_end=True,
        max_grad_norm=1.0, # Gradient clipping for stability
        report_to="none",
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        # Patience 2 needs two consecutive non-improving evaluations, so with
        # only 2 epochs it will not fire; it matters once epochs are increased.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)], # Stop if no improvement
    )
    print("Starting final model fine-tuning...")
    trainer.train()
    print("Fine-tuning complete.")
    print(f"Saving the final model to '{OUTPUT_DIR}'")
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)
    # --- NEW: Save Model Card for Deployment ---
    model_card_content = f"""
model_name: food-disease-chatbot-finetuned
base_model: {MODEL_NAME}
fine_tuned_by: Amit Kumar
intended_use: dietary reasoning and food recommendation for chronic conditions
"""
    # Explicit encoding keeps the file identical regardless of the host locale.
    with open(os.path.join(OUTPUT_DIR, "model_card.md"), "w", encoding="utf-8") as f:
        f.write(model_card_content)
    print("Final fine-tuned model, tokenizer, and model card saved successfully.")
    print("Ready for handler.py and .mar export to Google Cloud.")

# Run the training pipeline only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.
Original dataset loaded with 2313 rows.
Applying data augmentation to 80% of the dataset...
Selected 1850 rows for augmentation.
Dataset size after augmentation: 4163 rows.

Sampling 50% of the resultant large dataset for faster training...
Total rows for this training run: 2082 (50% of 4163).

Generating enriched, model-friendly responses...
Created 2082 model-friendly target samples.
Loading tokenizer and model from: /content/drive/MyDrive/HackathonProject/food-disease-chat-model-stable1-60


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizing the dataset...


Map (num_proc=2):   0%|          | 0/2082 [00:00<?, ? examples/s]

Dataset prepared. Training samples: 1873, Evaluation samples: 209


  trainer = Seq2SeqTrainer(


Starting final model fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,26.0488,6.755293




Epoch,Training Loss,Validation Loss
1,26.0488,6.755293
2,5.6221,4.229208


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/HackathonProject/food-chatbot-model-finetuned2-50/checkpoint-118/pytorch_model.bin'

In [None]:
# --- Robust Final Model Saving Script ---
# This script should be run in a NEW CELL and can be run even if the Colab session was lost.
# It reloads the model from a checkpoint directory saved during training
# and saves it to a clean, final destination.

import os
import shutil
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def _list_numbered_checkpoints(base_dir):
    """Return the 'checkpoint-<step>' directory names in base_dir, sorted by ascending step."""
    prefix = "checkpoint-"
    numbered = []
    for name in os.listdir(base_dir):
        step = name[len(prefix):]
        # Skip entries such as "checkpoint-tmp" or stray files: int() on a
        # non-numeric suffix would abort the whole recovery.
        if name.startswith(prefix) and step.isdecimal() and os.path.isdir(os.path.join(base_dir, name)):
            numbered.append((int(step), name))
    return [name for _, name in sorted(numbered)]


def _best_checkpoint_from_state(latest_checkpoint_path, base_dir):
    """Return the best-checkpoint directory recorded in trainer_state.json, or None if unavailable."""
    import json  # local import so this cell stays runnable on its own

    state_file = os.path.join(latest_checkpoint_path, "trainer_state.json")
    try:
        with open(state_file, encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError):
        return None
    best = state.get("best_model_checkpoint") if isinstance(state, dict) else None
    if not isinstance(best, str) or not best:
        return None
    # Re-anchor on base_dir: the recorded path is the one used at training
    # time and may not be valid in the current session.
    candidate = os.path.join(base_dir, os.path.basename(os.path.normpath(best)))
    return candidate if os.path.isdir(candidate) else None


def save_model_from_checkpoint(checkpoint_base_dir=None, final_output_dir=None):
    """
    Recover a trained model from its checkpoints and save it to a clean directory.

    Finds the numbered ``checkpoint-<step>`` directories, prefers the one the
    Hugging Face Trainer recorded as ``best_model_checkpoint`` in the latest
    checkpoint's ``trainer_state.json`` (falling back to the latest checkpoint
    when that record is missing or the directory no longer exists), loads the
    model and tokenizer from it, and writes them plus a ``model_card.md`` to
    the final directory. This is a stateless operation and does not depend on
    the previous training session's memory.

    Args:
        checkpoint_base_dir: Directory holding the ``checkpoint-*`` folders.
            Defaults to the training script's output directory on Drive.
        final_output_dir: Destination for the final model. Defaults to
            ``food-chatbot-production-model`` under the Drive project folder.

    Returns:
        None. Problems are reported with printed messages and an early return.
    """
    print("--- Starting Final Model Saving Process from Checkpoint ---")

    # --- Configuration ---
    # These paths must match the paths used in your training script.
    DRIVE_BASE_PATH = "/content/drive/MyDrive/HackathonProject/"

    # The directory where all training outputs and checkpoints were saved
    CHECKPOINT_BASE_DIR = checkpoint_base_dir or f"{DRIVE_BASE_PATH}food-chatbot-model-finetuned2-50"

    # A new, clean directory for the final, production-ready model
    FINAL_OUTPUT_DIR = final_output_dir or f"{DRIVE_BASE_PATH}food-chatbot-production-model"

    # --- Find the latest checkpoint ---
    try:
        checkpoints = _list_numbered_checkpoints(CHECKPOINT_BASE_DIR)
    except (FileNotFoundError, NotADirectoryError):
        print(f"\n❌ ERROR: Checkpoint directory '{CHECKPOINT_BASE_DIR}' not found.")
        return
    if not checkpoints:
        print(f"\n❌ ERROR: No checkpoint directories found in '{CHECKPOINT_BASE_DIR}'.")
        print("Please ensure the 'OUTPUT_DIR' in the training script matches this path.")
        return

    latest_checkpoint_path = os.path.join(CHECKPOINT_BASE_DIR, checkpoints[-1])
    print(f"Found latest checkpoint: '{latest_checkpoint_path}'")

    # --- Prefer the best checkpoint when the trainer recorded one ---
    # The latest checkpoint is not necessarily the best-scoring one.
    checkpoint_path = _best_checkpoint_from_state(latest_checkpoint_path, CHECKPOINT_BASE_DIR) or latest_checkpoint_path
    if checkpoint_path != latest_checkpoint_path:
        print(f"Using best checkpoint recorded in trainer_state.json: '{checkpoint_path}'")

    # --- Load the model and tokenizer from the selected checkpoint ---
    print(f"Loading model and tokenizer from '{checkpoint_path}'...")
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
        print("Model and tokenizer loaded successfully.")
    except Exception as e:
        # Broad on purpose: this is the user-facing boundary of a recovery
        # script, and any load failure should be reported, not raised.
        print(f"\n❌ ERROR: Failed to load model from checkpoint. Error: {e}")
        return

    # --- Save to the final, clean directory ---
    print(f"Saving the final model to a clean directory: '{FINAL_OUTPUT_DIR}'")
    os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
    model.save_pretrained(FINAL_OUTPUT_DIR)
    tokenizer.save_pretrained(FINAL_OUTPUT_DIR)
    print("Final model and tokenizer saved successfully.")

    # --- Create the Model Card for Deployment ---
    print("Creating model_card.md for deployment metadata...")
    model_card_content = """
model_name: food-disease-chatbot-finetuned
base_model: food-disease-chat-model-stable (fine-tuned)
fine_tuned_by: Amit Kumar
intended_use: dietary reasoning and food recommendation for chronic conditions
"""
    try:
        with open(os.path.join(FINAL_OUTPUT_DIR, "model_card.md"), "w", encoding="utf-8") as f:
            f.write(model_card_content)
        print("Model card created successfully.")
    except OSError as e:
        # The model card is optional metadata; failing to write it must not
        # invalidate the model that was already saved.
        print(f"Could not create model card. Error: {e}")

    print(f"\n✅ Process complete. Your final, production-ready model is in: '{FINAL_OUTPUT_DIR}'")
    print("You are now ready to create the handler.py and .mar file for deployment.")

# Run the checkpoint recovery only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    save_model_from_checkpoint()



--- Starting Final Model Saving Process from Checkpoint ---
Found latest checkpoint: '/content/drive/MyDrive/HackathonProject/food-chatbot-model-finetuned2-50/checkpoint-118'
Loading model and tokenizer from the latest checkpoint...
Model and tokenizer loaded successfully.
Saving the final model to a clean directory: '/content/drive/MyDrive/HackathonProject/food-chatbot-production-model'
Final model and tokenizer saved successfully.
Creating model_card.md for deployment metadata...
Model card created successfully.

✅ Process complete. Your final, production-ready model is in: '/content/drive/MyDrive/HackathonProject/food-chatbot-production-model'
You are now ready to create the handler.py and .mar file for deployment.


In [None]:
# --- handler.py for FLAN-T5 Food Recommender ---
# This script defines how TorchServe loads the model and handles inference requests.
# It is optimized for use with Google Cloud Platform's AI Platform Prediction.

import logging
import os
from abc import ABC
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)

class FlanT5Handler(BaseHandler, ABC):
    """
    Custom TorchServe handler for the fine-tuned FLAN-T5 model.

    It prefixes every incoming query with the training-time instruction,
    tokenizes the whole request batch, generates with beam search, and
    decodes one response string per request.
    """

    # Instruction prefix; must be identical to the one used during training.
    PROMPT_PREFIX = "Provide a detailed dietary recommendation for the following query: "
    # Token budget for the prefixed query; 128 mirrors the input length the
    # training script tokenized with.
    MAX_INPUT_LENGTH = 128
    # Should match or exceed the target length used during training.
    MAX_OUTPUT_LENGTH = 512
    NUM_BEAMS = 5

    def __init__(self):
        super().__init__()
        self.initialized = False

    def initialize(self, context):
        """
        Load the tokenizer and model from the saved artifacts.

        Called once when the model is loaded on the server.

        Args:
            context: TorchServe context; ``manifest`` and ``system_properties``
                (``model_dir``, ``gpu_id``) are read from it.
        """
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")

        # Use the assigned GPU only when one was actually assigned: with CUDA
        # available but gpu_id missing, "cuda:None" is not a valid device.
        gpu_id = properties.get("gpu_id")
        if torch.cuda.is_available() and gpu_id is not None:
            self.device = torch.device(f"cuda:{gpu_id}")
        else:
            self.device = torch.device("cpu")

        # Load tokenizer and model
        logger.info("Loading model from %s", model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
        self.model.to(self.device)
        self.model.eval()  # Set the model to evaluation mode

        logger.info("Transformer model and tokenizer loaded successfully.")
        self.initialized = True

    def preprocess(self, data):
        """
        Turn a batch of raw requests into tokenized model inputs.

        Args:
            data: List of request dicts; the text is read from each request's
                ``"data"`` field, falling back to ``"body"``. Bytes are decoded
                as UTF-8.

        Returns:
            The tokenizer output (padded to the longest query in the batch and
            truncated to ``MAX_INPUT_LENGTH``) moved to ``self.device``.

        Raises:
            TypeError: If a request carries no text payload.
        """
        texts = []
        for request in data:
            payload = request.get("data") or request.get("body")
            if isinstance(payload, (bytes, bytearray)):
                payload = payload.decode("utf-8")
            if not isinstance(payload, str):
                # Fail with a clear message instead of an opaque
                # str-concatenation error further down.
                raise TypeError(
                    "Expected a text payload in the request's 'data' or 'body' field, "
                    f"got {type(payload).__name__}"
                )
            logger.info("Received raw text: '%s'", payload)
            # CRITICAL: Apply the exact same instructional prefix used during training.
            texts.append(self.PROMPT_PREFIX + payload)

        # Tokenize the whole batch at once; padding makes the tensors
        # rectangular, and the attention mask hides the padding from the model.
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.MAX_INPUT_LENGTH,
        )
        return inputs.to(self.device)

    def inference(self, inputs):
        """
        Generate output token IDs for the tokenized batch.

        Args:
            inputs: Tokenizer output providing ``input_ids`` and ``attention_mask``.

        Returns:
            The generated token IDs, one sequence per input row.
        """
        # Inference should not track gradients, which saves memory and computation
        with torch.no_grad():
            outputs = self.model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=self.MAX_OUTPUT_LENGTH,
                num_beams=self.NUM_BEAMS,   # Beam search: deterministic decoding
                early_stopping=True,        # Stop once num_beams finished candidates exist
                # Sampling knobs (temperature, top_k) are deliberately omitted:
                # they only take effect with do_sample=True, which is not set.
            )
        return outputs

    def postprocess(self, inference_output):
        """
        Decode the generated token IDs back into human-readable strings.

        Args:
            inference_output: Generated token IDs, one sequence per request.

        Returns:
            A list with one decoded response string per request, in order.
        """
        results = self.tokenizer.batch_decode(inference_output, skip_special_tokens=True)
        for result in results:
            logger.info("Generated response: '%s'", result)
        # TorchServe expects a list with exactly one entry per request
        return results



ModuleNotFoundError: No module named 'ts'