<a href="https://colab.research.google.com/github/avikumart/LLM-GenAI-Transformers-Notebooks/blob/main/DeepLearningFiles/mental_health_convo_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Fetch the latest published version of the dataset
path = kagglehub.dataset_download(
    "thedevastator/nlp-mental-health-conversations"
)

print("Path to dataset files:", path)

In [None]:
import pandas as pd
import numpy as np
import os

# Read the training CSV from the downloaded dataset folder and preview it
train_csv = os.path.join(path, "train.csv")
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Inspect the distinct values present in the Context column
df["Context"].unique()

In [None]:
# Number of distinct non-null Context entries
df["Context"].nunique(dropna=True)

In [None]:
# Number of distinct non-null Response entries
df["Response"].nunique(dropna=True)

In [None]:
# finetuning the OS model
!pip install transformers accelerate peft bitsandbytes datasets trl -q

In [None]:
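# Optional Hugging Face Hub login: uncomment if the model download below
# requires authentication (e.g. for a gated checkpoint).
# from huggingface_hub import login
# login()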

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub checkpoint id shared by the tokenizer and the model
model_name = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

In [None]:
from datasets import Dataset
# Fixed instruction attached to every training example.
text = "You are a helpful mental health assistant answering following input to the output"

# Add the instruction-tuning columns alongside the raw ones.
df["instruction"] = text
df["input"] = df["Context"]
df["output"] = df["Response"]

# Keep only the three training columns. Rows with a missing input or output are
# dropped so that no example is formatted with an empty/None field, and the
# index is reset so the frame keeps a plain 0..n-1 index after the drop.
ndf = (
    df[["instruction", "input", "output"]]
    .dropna(subset=["input", "output"])
    .reset_index(drop=True)
)
ndf.head()

In [None]:
# Convert the pandas frame into a Hugging Face Dataset.
# preserve_index=False keeps the DataFrame index out of the dataset columns.
dataset = Dataset.from_pandas(ndf, preserve_index=False)
dataset

In [None]:
# Hold out 20% of the examples as a test split.
# A fixed seed makes the shuffle (and therefore the split) reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# 4-bit quantized loading is currently disabled; the snippet is kept for reference.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
# )
#
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
# )

# LoRA adapter settings
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # attention projections
    r=16,  # adapter rank
    lora_alpha=32,  # alpha parameter
    lora_dropout=0.05,
    bias="none",
)

# Prepare the base model for training, then wrap it with the LoRA adapters.
# NOTE(review): the quantized load above is commented out, so the k-bit
# preparation step is presumably only needed once it is re-enabled — confirm.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

model.print_trainable_parameters()

In [None]:
# train the model on custom dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Hyperparameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    output_dir = "./gemma-finetuned-mhfaq",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,  # 4 x 4 = 16 examples per optimizer step per device
    learning_rate = 2e-4,
    logging_steps = 10,
    max_steps = 100,  # a positive max_steps takes precedence over num_train_epochs
    num_train_epochs=3,  # only takes effect if max_steps is set back to -1
    weight_decay = 0.01,
    fp16 = True,
    bf16 = False,
    max_grad_norm = 0.3,
    warmup_ratio = 0.03,
    group_by_length = True,
    # The plain "constant" schedule never applies warmup, which would make
    # warmup_ratio above a silent no-op; use the warmup-aware constant schedule.
    lr_scheduler_type = "constant_with_warmup",
    report_to = "tensorboard",
    save_strategy="epoch"
)

# define the dataset formatting function
def formatting(example):
    """Render one dataset record as a single training string.

    Args:
        example: Mapping with an 'instruction' key, an 'output' key and an
            optional 'input' key.

    Returns:
        The newline-joined prompt: an "Instruction:" line, an "Input:" line
        (omitted when 'input' is missing or falsy) and an "Output:" line.

    Raises:
        KeyError: If 'instruction' or 'output' is absent from the record.
    """
    parts = [f"Instruction: {example['instruction']}"]
    # Skip the input section entirely when the record carries no context.
    if example.get('input'):
        parts.append(f"Input: {example['input']}")
    parts.append(f"Output: {example['output']}")
    return "\n".join(parts)

# define the trainer
# `model` has already been wrapped with the LoRA adapters via get_peft_model,
# so no peft_config is passed here; supplying both the wrapped model and a
# config is redundant and would configure the adapter twice.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset["train"],
    eval_dataset = dataset["test"],
    formatting_func = formatting,
    args = training_args,
)

# train the model
trainer.train()