<a href="https://colab.research.google.com/github/amit20kr/LLM-Food-Disease-Chatbot/blob/main/Model_Training_SLM1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from google.colab import drive
import random
import re

In [None]:
def augment_prompt(prompt, error_rate=0.4, rng=None):
    """Inject at most one human-like typo into ``prompt``.

    With probability ``error_rate`` one word that is longer than three
    characters and is not a stop word is picked and distorted with a single,
    randomly chosen technique (vowel swap, vowel drop, truncation, missing
    letters, transposition, doubled letter or random capitalisation). When the
    chosen technique does not apply to the picked word (e.g. truncation on a
    word of five characters or fewer) the word is left as it is.

    Args:
        prompt: Text to perturb. Non-string values (``None``, ``NaN`` from a
            pandas column, ...) are returned unchanged instead of raising.
        error_rate: Probability in ``[0, 1]`` that an augmentation is attempted.
        rng: Optional ``random.Random``-compatible object (must provide
            ``random``, ``choice``, ``randint`` and ``sample``). Defaults to
            the global ``random`` module, so ``random.seed`` still controls
            the outcome when no generator is passed.

    Returns:
        ``prompt`` itself when no augmentation is attempted (trigger not fired,
        fewer than three words, or no eligible word). Otherwise the words of
        the prompt re-joined with single spaces, with one word possibly altered.
    """
    if rng is None:
        rng = random

    # Missing values from a DataFrame column are not strings; pass them through
    # rather than failing on `.split()`.
    if not isinstance(prompt, str):
        return prompt

    # Probabilistic trigger
    if rng.random() > error_rate:
        return prompt

    words = prompt.split()
    if len(words) < 3:
        return prompt

    # Exclude unhelpful words
    stop_words = {'is', 'a', 'to', 'for', 'the', 'in', 'of', 'what', 'and', 'my', 'on', 'do', 'it'}
    augmentable_indices = [i for i, w in enumerate(words) if len(w) > 3 and w.lower() not in stop_words]
    if not augmentable_indices:
        return prompt

    idx = rng.choice(augmentable_indices)
    word = words[idx]

    # Human-like error patterns
    techniques = [
        'vowel_swap',       # a → e, e → i, etc.
        'vowel_drop',       # remove a vowel
        'truncation',       # cut off last few chars
        'missing_letter',   # delete 1–2 chars mid-word
        'transposition',    # swap adjacent chars
        'double_letter',    # add repeated letter
        'random_case',      # mimic accidental caps
    ]
    t = rng.choice(techniques)

    vowels = 'aeiou'
    chars = list(word)
    augmented = word  # Default to original word if a technique does not apply

    if t == 'vowel_swap':
        for i, ch in enumerate(chars):
            lowered = ch.lower()
            if lowered in vowels and rng.random() < 0.5:
                replacement = rng.choice(vowels.replace(lowered, ''))
                # Keep the case of the letter being replaced so a typo never
                # changes capitalisation as a side effect.
                chars[i] = replacement.upper() if ch.isupper() else replacement
        augmented = ''.join(chars)

    elif t == 'vowel_drop':
        shortened = ''.join(
            ch for ch in chars
            if not (ch.lower() in vowels and rng.random() < 0.3)
        )
        # Avoid (near-)total deletion of the word
        augmented = shortened if len(shortened) >= 2 else word

    elif t == 'truncation' and len(word) > 5:
        cut = rng.randint(1, 3)
        augmented = word[:-cut]

    elif t == 'missing_letter' and len(word) > 4:
        # Only interior positions are dropped; first and last letters survive.
        drop_positions = set(rng.sample(range(1, len(chars) - 1), k=min(2, len(chars) - 2)))
        augmented = ''.join(ch for i, ch in enumerate(chars) if i not in drop_positions)

    elif t == 'transposition' and len(word) > 3:
        j = rng.randint(0, len(chars) - 2)
        chars[j], chars[j + 1] = chars[j + 1], chars[j]
        augmented = ''.join(chars)

    elif t == 'double_letter' and len(word) > 2:
        j = rng.randint(1, len(chars) - 2)
        chars.insert(j, chars[j])  # double a mid-letter
        augmented = ''.join(chars)

    elif t == 'random_case':
        augmented = ''.join(ch.upper() if rng.random() < 0.3 else ch for ch in word)

    words[idx] = augmented
    return " ".join(words)

In [None]:
def _cell_to_text(value):
    """Return a stripped string for a DataFrame cell; missing values become ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def create_reasoned_response_pattern(df):
    """Build a templated, reasoning-style ``target_text`` column for ``df``.

    For every row the ``response``, ``Recommended food`` and ``status`` cells
    are read (absent columns and missing values are treated as empty text).
    Rows whose status is "Good" or "Avoid" (case-insensitive) get the response
    followed by randomly chosen section headers, fixed explanatory sentences
    and the cleaned recommendation. Any other status keeps the plain response.

    When the recommendation is empty after clean-up, the recommendation
    sentence and the closing sentence that refers to it are omitted so the
    target never contains fragments such as "Recommended foods: .".

    Args:
        df: DataFrame to annotate. It is modified in place: a ``target_text``
            column is added or overwritten.

    Returns:
        The same ``df`` object, for call chaining.

    Side effects:
        Draws from the global ``random`` module and prints the number of
        samples created.
    """
    intros = [
        "Let's look at this step by step.",
        "Here's how this relates to your condition:",
        "Breaking it down scientifically:",
        "From a nutrition science point of view:"
    ]

    reason_starters = [
        "Reasoning:",
        "Why this matters:",
        "Scientific insight:",
        "Underlying mechanism:"
    ]

    biomarker_starters = [
        "Key biomarkers affected:",
        "Physiological impact:",
        "What it improves in your body:"
    ]

    rec_starters = [
        "Recommended foods:",
        "Practical dietary guidance:",
        "You can focus on:"
    ]

    responses = []

    for _, row in df.iterrows():
        response = _cell_to_text(row.get("response", ""))
        rec = _cell_to_text(row.get("Recommended food", ""))
        status = _cell_to_text(row.get("status", "")).capitalize()

        # Clean up formatting artifacts
        rec = re.sub(r"(avoid:|instead try:)\s*", "", rec, flags=re.I)
        rec = rec.rstrip(" .")

        # The four draws happen for every row, whatever its status.
        intro = random.choice(intros)
        reason = random.choice(reason_starters)
        biomarker = random.choice(biomarker_starters)
        rec_intro = random.choice(rec_starters)

        if status == "Good":
            parts = [
                response,
                intro,
                f"{reason} These foods help by improving nutrient balance, reducing oxidative stress, "
                f"and supporting healthy organ function.",
                f"{biomarker} They may positively influence blood pressure, cholesterol, glucose metabolism, or thyroid hormone activity.",
            ]
            if rec:
                parts.append(f"{rec_intro} {rec}.")
                parts.append("Together, these choices strengthen long-term metabolic and cardiovascular stability.")
            # Empty pieces (e.g. a missing response) are skipped so no stray
            # spaces are left behind.
            text = " ".join(part for part in parts if part)

        elif status == "Avoid":
            parts = [
                response,
                intro,
                f"{reason} Certain foods can worsen the condition by increasing inflammation or disrupting metabolic balance.",
                f"{biomarker} They may elevate sodium, glucose, or lipid biomarkers beyond safe ranges.",
            ]
            if rec:
                parts.append(f"{rec_intro} Instead, consider: {rec}.")
                parts.append("Choosing these alternatives helps maintain steady biomarkers and reduce disease progression.")
            text = " ".join(part for part in parts if part)

        else:
            text = response

        responses.append(text)

    df["target_text"] = responses
    print(f"Created {len(df)} model-friendly target samples.")
    return df

In [None]:
# Mount Google Drive at /content/drive; the dataset, checkpoint and output
# paths configured later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [None]:
# --- Run configuration: every location lives on the mounted Google Drive ---

# Project folder that holds the training CSV and receives the fine-tuned model.
DRIVE_BASE_PATH = "/content/drive/MyDrive/food_chatbot/"

# Checkpoint the tokenizer and model are loaded from before fine-tuning.
PRETRAINED_MODEL_PATH = "/content/drive/MyDrive/HackathonProject/food-disease-chat-model-stable1-60"

# Derived paths (DRIVE_BASE_PATH already ends with a slash).
DATASET_PATH = DRIVE_BASE_PATH + "filtered_dataset_for_training.csv"
OUTPUT_DIR = DRIVE_BASE_PATH + "food-chatbot-model-finetuned2-50"

print(f"Loading the filtered training dataset from '{DATASET_PATH}'...")

Loading the filtered training dataset from '/content/drive/MyDrive/food_chatbot/filtered_dataset_for_training.csv'...


In [None]:
# Seed Python's global RNG. augment_prompt and create_reasoned_response_pattern
# both draw from `random`, so without this the augmented prompts and the chosen
# response templates change on every run even though the pandas sampling below
# is pinned with random_state.
SEED = 42
random.seed(SEED)

df = pd.read_csv(DATASET_PATH)
original_rows = len(df)
print(f"Original dataset loaded with {original_rows} rows.")
# Data Augmentation Step: a random 80% of the rows gets a noisy copy of its prompt.
augmentation_frac = 0.8
df_to_augment = df.sample(frac=augmentation_frac, random_state=SEED)
augmented_rows_count = len(df_to_augment)
print(f"Selected {augmented_rows_count} rows for augmentation ({augmentation_frac*100}% of original).")

df_augmented = df_to_augment.copy()
df_augmented['prompt'] = df_augmented['prompt'].apply(augment_prompt)
print("Augmentation applied to selected rows.")

# Combine the original clean data with the newly augmented data
# NOTE(review): an original row and its augmented copy share the same response,
# and the train/eval split happens later on the combined data, so the two can
# end up on opposite sides of the split — eval loss may look optimistic.
df_final_for_training = pd.concat([df, df_augmented]).reset_index(drop=True)
total_training_rows = len(df_final_for_training)
print(f"Total rows for training: {original_rows} (original) + {augmented_rows_count} (augmented) = {total_training_rows} rows.")
# Sample the dataset for faster training
training_sample_frac = 0.5
sampled_df = df_final_for_training.sample(frac=training_sample_frac, random_state=SEED)
sampled_rows_count = len(sampled_df)
print(f"Using {training_sample_frac*100}% of the full dataset for this training run.")
print(f"Number of rows selected for training: {sampled_rows_count} out of {total_training_rows}.")

# Adds the `target_text` column (sampled_df is modified in place and returned).
df = create_reasoned_response_pattern(sampled_df)

df['input_text'] = "Provide a detailed dietary recommendation for the following query: " + df['prompt']

# The sampled frame has a shuffled, non-contiguous index; do not carry it into
# the dataset as an extra column.
hf_dataset = Dataset.from_pandas(df, preserve_index=False)

Original dataset loaded with 2313 rows.
Selected 1850 rows for augmentation (80.0% of original).
Augmentation applied to selected rows.
Total rows for training: 2313 (original) + 1850 (augmented) = 4163 rows.
------------------------------------

Using 50.0% of the full dataset for this training run.
Number of rows selected for training: 2082 out of 4163.
Created 2082 model-friendly target samples.


In [None]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_PATH)

# Truncation limits (in tokens) for the encoder input and the decoder target.
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 512


def preprocess_function(examples):
    """Tokenize a batch of ``input_text`` / ``target_text`` pairs.

    Sequences are truncated but deliberately NOT padded here. Padding the
    labels to a fixed length with the tokenizer's pad id makes the loss count
    every padding position as a real target token. Leaving the sequences at
    their natural length lets the ``DataCollatorForSeq2Seq`` used for training
    pad each batch dynamically and fill label padding with -100, which the
    loss ignores.

    Args:
        examples: Batch mapping with ``"input_text"`` and ``"target_text"``
            lists, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under ``"labels"``.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)

# Fixed seed so the same rows land in train/eval on every run.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Dataset prepared. Training samples: {len(train_dataset)}, Evaluation samples: {len(eval_dataset)}")

Map:   0%|          | 0/2082 [00:00<?, ? examples/s]

Dataset prepared. Training samples: 1873, Evaluation samples: 209


In [None]:
# Load the seq2seq checkpoint that fine-tuning starts from (same path the
# tokenizer was loaded from).
print(f"Loading pre-trained model from: {PRETRAINED_MODEL_PATH}")
model = AutoModelForSeq2SeqLM.from_pretrained(PRETRAINED_MODEL_PATH)
# Batch collator for training/evaluation; it is given both the tokenizer and
# the model, so it must be created after the model is loaded.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Loading pre-trained model from: /content/drive/MyDrive/HackathonProject/food-disease-chat-model-stable1-60


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Training configuration for the fine-tuning run.
# With a per-device batch of 8 and 4 gradient-accumulation steps, each
# optimizer step sees 8 * 4 = 32 sequences per device.
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    weight_decay=0.01,
    learning_rate=3e-5,
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    # Evaluate and checkpoint on the same schedule, as required for
    # load_best_model_at_end.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Keeps disk usage down
    load_best_model_at_end=True,
    report_to="none"
)

In [None]:
# Wire the model, training arguments, tokenized splits and collator together.
# NOTE(review): recent transformers releases rename the `tokenizer=` argument
# to `processing_class=` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run fine-tuning; training_args is configured to evaluate and save a
# checkpoint every epoch.
print("Starting final model fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

In [None]:
# Persist the fine-tuned model and its tokenizer.
print(f"Saving the final fine-tuned model to '{OUTPUT_DIR}'")
# save_model() is called without a path.
# NOTE(review): presumably it writes to training_args.output_dir, i.e.
# OUTPUT_DIR — confirm.
trainer.save_model()
# The tokenizer is written explicitly to OUTPUT_DIR.
tokenizer.save_pretrained(OUTPUT_DIR)
print("Final model and tokenizer saved successfully to your Google Drive.")