
# Stress Status Detection — End-to-End Colab Notebook  
**Order:** Load Dataset → Baseline Models → Pre-Trained Models → Training of Data → Fine-tuning → Eval

> This notebook encodes the provided fine-tuning script into a structured, well-commented pipeline.  
> It includes at least **three hyperparameter experiments** (aiming to maximize **F1-Score**).  
> Replace the dataset path with your CSV if needed — required columns: `statement` (text) and `status` (class label, e.g. 0/1; text labels are encoded to integer IDs when the dataset is loaded).


# 0) Setup (libraries and reproducibility)

--- Import and Environment Setup ---

import os
- used for managing file paths, environment variables, and directory operations to ensure smooth file handling throughout the notebook

import math
- provides access to mathematical functions that can be useful for adjusting learning rate schedules or calculations during training

import random
- initializes and controls random number generation to keep results consistent across multiple runs, ensuring reproducibility

import numpy as np
- supports numerical computations, array operations, and helps manage data processing tasks efficiently before feeding data to the model

import pandas as pd
- enables loading, inspecting, and manipulating structured data such as CSV files, which is essential for preparing datasets for model training

from pathlib import Path
- offers an object-oriented approach to handle file and directory paths in a more readable and cross-platform manner

--- Core Framework Imports ---

import torch
- serves as the foundation for tensor operations and deep learning on both CPU and GPU, allowing efficient model training and inference

from datasets import Dataset
- converts pandas DataFrames into an optimized dataset format compatible with Hugging Face Transformers for seamless data management

from transformers import (
AutoTokenizer,
- automatically retrieves the correct tokenizer corresponding to the specified model checkpoint, ensuring token compatibility
AutoModelForSequenceClassification,
- loads a pre-trained Transformer model with an added classification layer suitable for text classification tasks
TrainingArguments,
 - defines and stores hyperparameters for the training process such as epochs, batch size, and evaluation strategy
Trainer
- manages the full training loop, including evaluation, logging, and checkpoint saving, simplifying fine-tuning workflows
)

--- Evaluation Metric Imports ---

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
 - provides standard metrics to measure model performance, including accuracy, precision, recall, and F1-score for balanced evaluation

--- Reproducibility Configuration ---

SEED = 42
- assigns a fixed seed value to maintain consistency across random operations for all libraries used
random.seed(SEED)
 - ensures Python's random processes produce the same outcomes each run
np.random.seed(SEED)
 - synchronizes NumPy‚Äôs internal random state for repeatable data shuffling and sampling
torch.manual_seed(SEED)
- locks PyTorch‚Äôs random state for deterministic model initialization
torch.cuda.manual_seed_all(SEED)
 - extends reproducibility settings to all available GPUs, ensuring consistent results even in multi-GPU environments

--- Device Detection ---

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- checks if a GPU is available; otherwise defaults to CPU for computation
print(f"Using device: {device}")
- displays which hardware is being used to verify that GPU acceleration is active when available

In [None]:

# Every import has an explanatory comment.
import os                         # file paths and environment checks
import math                       # math helpers (may be useful for schedules)
import random                     # Python's RNG for reproducibility
import numpy as np                # numerical arrays and metrics support
import pandas as pd               # data loading and manipulation
from pathlib import Path          # convenient and robust path handling

# Hugging Face / PyTorch stack (for transformer fine-tuning)
import torch                      # tensor and GPU utilities
from datasets import Dataset      # lightweight dataset wrapper around pandas
from transformers import (       # core HF components for tokenization and training
    AutoTokenizer,               # auto‚Äëloads the right tokenizer for a given model checkpoint
    AutoModelForSequenceClassification,  # classification head on top of a transformer
    TrainingArguments,           # training hyperparameters container
    Trainer                      # training loop helper (handles eval and logging)
)

# Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Fix one seed and hand it to every RNG the pipeline touches, in order:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
SEED = 42
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)

# Resolve the compute device a single time and echo it for visibility
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")  # prints 'cuda' when a GPU is available, else 'cpu'


Using device: cpu


## 1) Load Dataset

# --- Load Dataset (Upload version, auto-encodes text labels) ---

#import pandas as pd  
- loads the pandas library for handling CSV data

#from pathlib import Path  
- allows robust and convenient file path handling

#from google.colab import files  
- enables file upload directly in Google Colab

#print("üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)")  
- displays an instruction message asking the user to upload the dataset

#uploaded = files.upload()  
- opens a file upload dialog for selecting the CSV file

#filename = list(uploaded.keys())[0]  
- retrieves the name of the first uploaded file

#csv_path = Path(f"/content/{filename}")  
- constructs the full path to the uploaded file in Colab's environment

#print(f"‚úÖ File uploaded successfully: {csv_path}")  
- confirms successful upload and shows the file path

#df = pd.read_csv(csv_path)  
- reads the uploaded CSV file into a pandas DataFrame

# --- Validate columns ---

#expected_cols = {'statement', 'status'}  
- defines the expected column names required in the dataset

#assert expected_cols.issubset(df.columns), f"‚ùå Missing required columns: {expected_cols - set(df.columns)}"  
- checks if required columns exist, otherwise stops execution with an error

# --- Clean ---

#df = df.dropna(subset=['statement', 'status']).copy()  
- removes rows that have missing values in the statement or status columns

#df['statement'] = df['statement'].astype(str)  
- ensures all text entries in the statement column are strings

# --- Encode text labels into integers ---

#from sklearn.preprocessing import LabelEncoder  
- imports a utility that converts categorical text labels into numeric values

#le = LabelEncoder()  
- creates an instance of the LabelEncoder class

#df['status_encoded'] = le.fit_transform(df['status'])  
- fits the encoder on the status column and creates a new encoded column

#print("üî§ Label encoding map:")  
- prints a heading for the label mapping information

#for label, code in zip(le.classes_, range(len(le.classes_))):  
- iterates through each label and its assigned numeric code
    print(f"  {code} ‚Üí {label}")  # displays each code-label pair

#df['status'] = df['status_encoded']  
- replaces the original text labels with numeric values

#df.drop(columns=['status_encoded'], inplace=True)  
- removes the temporary encoded column since it's no longer needed

#print("\n‚úÖ Dataset loaded and label-encoded successfully!")  
- prints confirmation that data cleaning and encoding are complete

#print(df['status'].value_counts(dropna=False))  
- shows the number of samples per label for verification

#df.head(3)  
- displays the first three rows of the cleaned and processed dataset


In [None]:
# --- Load Dataset (Upload version, auto-encodes text labels) ---
# Flow: upload a CSV through the Colab dialog -> read it with pandas ->
# check the required columns -> drop incomplete rows -> label-encode `status`.
import pandas as pd
from pathlib import Path
from google.colab import files

print("üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)")
uploaded = files.upload()

# An empty mapping means nothing was selected; stop with a clear message
# instead of an IndexError further down.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Automatically pick the first uploaded file
filename = next(iter(uploaded))
csv_path = Path(f"/content/{filename}")

print(f"‚úÖ File uploaded successfully: {csv_path}")

# Load the CSV
df = pd.read_csv(csv_path)

# --- Validate columns ---
# Raise explicitly rather than `assert`: assertions are removed under `python -O`.
expected_cols = {'statement', 'status'}
missing_cols = expected_cols - set(df.columns)
if missing_cols:
    raise ValueError(f"‚ùå Missing required columns: {missing_cols}")

# --- Clean ---
# Drop rows lacking text or label, then force the text column to str
df = df.dropna(subset=['statement', 'status']).copy()
df['statement'] = df['statement'].astype(str)

# --- Encode text labels into integers ---
# This maps each unique label (like 'Anxiety', 'Stress', etc.) to a numeric ID
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Overwrite 'status' in place with the encoded ids (no temporary column needed)
df['status'] = le.fit_transform(df['status'])

# Optional: print mapping for your reference (the id is the index in le.classes_)
print("üî§ Label encoding map:")
for code, label in enumerate(le.classes_):
    print(f"  {code} ‚Üí {label}")

print("\n‚úÖ Dataset loaded and label-encoded successfully!")
print(df['status'].value_counts(dropna=False))
df.head(3)


üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)


Saving Combined Data.csv to Combined Data (2).csv
‚úÖ File uploaded successfully: /content/Combined Data (2).csv
üî§ Label encoding map:
  0 ‚Üí Anxiety
  1 ‚Üí Bipolar
  2 ‚Üí Depression
  3 ‚Üí Normal
  4 ‚Üí Personality disorder
  5 ‚Üí Stress
  6 ‚Üí Suicidal

‚úÖ Dataset loaded and label-encoded successfully!
status
3    16343
2    15404
6    10652
0     3841
1     2777
5     2587
4     1077
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,0
1,1,"trouble sleeping, confused mind, restless hear...",0
2,2,"All wrong, back off dear, forward doubt. Stay ...",0


## 2) Baseline Models (TF-IDF + Linear)

# --- Baseline Models (TF-IDF + Linear, supports multi-class) ---

#from sklearn.model_selection import train_test_split  
- splits the dataset into training and validation sets

#from sklearn.feature_extraction.text import TfidfVectorizer  
- converts text into numerical TF-IDF feature vectors

#from sklearn.linear_model import LogisticRegression  
- imports the logistic regression model for classification

#from sklearn.svm import LinearSVC  
- imports the linear support vector machine classifier

#from sklearn.metrics import accuracy_score, precision_recall_fscore_support  
- provides functions for evaluating model performance

#import numpy as np  
- used for numerical operations and array manipulation

#X_train, X_val, y_train, y_val = train_test_split(  
#    df['statement'].values,  
#    df['status'].values,  
#    test_size=0.2,  
#    random_state=42,  
#    stratify=df['status'].values  
#)  
- splits data into 80% training and 20% validation sets while keeping label proportions balanced

#tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=40000)  
- creates a TF-IDF vectorizer using unigrams and bigrams with up to 40,000 features

#Xtr = tfidf.fit_transform(X_train)  
- fits the TF-IDF model on training text and transforms it into feature vectors

#Xva = tfidf.transform(X_val)  
- applies the same transformation to the validation set without refitting

#num_classes = len(np.unique(y_train))  
- counts the number of unique labels in the training data

#avg_type = "binary" if num_classes == 2 else "weighted"  
- selects metric averaging type depending on whether it's binary or multi-class

#print(f"Detected {num_classes} classes ‚Üí using average='{avg_type}' for metrics.\n")  
- prints the detected number of classes and chosen averaging method

# --- Baseline 1: Logistic Regression ---

#logreg = LogisticRegression(max_iter=2000, class_weight="balanced")  
- initializes logistic regression with balanced class weights and higher iteration limit

#logreg.fit(Xtr, y_train)  
- trains the logistic regression model on TF-IDF features

#pred_lr = logreg.predict(Xva)  
- generates predictions on the validation data

#p, r, f, _ = precision_recall_fscore_support(y_val, pred_lr, average=avg_type)  
- computes precision, recall, and F1-score using the chosen averaging type

#acc = accuracy_score(y_val, pred_lr)  
- calculates overall accuracy of the logistic regression model

#print(f"[Baseline-LR] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")  
- displays evaluation metrics for the logistic regression baseline

# --- Baseline 2: Linear SVM ---

#svm = LinearSVC(class_weight="balanced")  
- initializes a linear SVM classifier with balanced class weights

#svm.fit(Xtr, y_train)  
- trains the SVM model on the training data

#pred_svm = svm.predict(Xva)  
- predicts labels for the validation set

#p, r, f, _ = precision_recall_fscore_support(y_val, pred_svm, average=avg_type)  
- calculates precision, recall, and F1-score for SVM predictions

#acc = accuracy_score(y_val, pred_svm)  
- computes accuracy of the SVM model

#print(f"[Baseline-SVM] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")  
- displays evaluation metrics for the SVM baseline model


In [None]:
# --- Baseline Models (TF-IDF + Linear, supports multi-class) ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Hold out 20% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both splits.
_all_texts = df['statement'].values
_all_labels = df['status'].values
X_train, X_val, y_train, y_val = train_test_split(
    _all_texts,
    _all_labels,
    test_size=0.2,
    random_state=42,
    stratify=_all_labels,
)

# Unigram + bigram TF-IDF features: the vectorizer is fitted on the training
# texts only and then applied unchanged to the validation texts.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=40000)
Xtr = tfidf.fit_transform(X_train)
Xva = tfidf.transform(X_val)

# Exactly two classes -> "binary" averaging, anything else -> "weighted"
num_classes = len(np.unique(y_train))
if num_classes == 2:
    avg_type = "binary"
else:
    avg_type = "weighted"
print(f"Detected {num_classes} classes ‚Üí using average='{avg_type}' for metrics.\n")

# --- Baseline 1: Logistic Regression ---
logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
logreg.fit(Xtr, y_train)
pred_lr = logreg.predict(Xva)
acc = accuracy_score(y_val, pred_lr)
p, r, f, _ = precision_recall_fscore_support(y_val, pred_lr, average=avg_type)
print(f"[Baseline-LR] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")

# --- Baseline 2: Linear SVM ---
svm = LinearSVC(class_weight="balanced")
svm.fit(Xtr, y_train)
pred_svm = svm.predict(Xva)
acc = accuracy_score(y_val, pred_svm)
p, r, f, _ = precision_recall_fscore_support(y_val, pred_svm, average=avg_type)
print(f"[Baseline-SVM] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")


Detected 7 classes ‚Üí using average='weighted' for metrics.

[Baseline-LR] Acc=0.778  P=0.787  R=0.778  F1=0.777
[Baseline-SVM] Acc=0.782  P=0.779  R=0.782  F1=0.780


## 3) Pre-Trained Models (Tokenization and Dataset Prep)

# --- Transformer Backbone and Tokenization Setup ---

#CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"  
- defines the pretrained ClinicalBERT model specialized for clinical or medical text

#DISTIL_BERT   = "distilbert-base-uncased"  
- defines the lightweight DistilBERT model for faster fine-tuning and baseline comparison

#BACKBONE = CLINICAL_BERT  
- sets ClinicalBERT as the default model backbone for experiments

#tokenizer = AutoTokenizer.from_pretrained(BACKBONE)  
- loads the tokenizer that matches the chosen transformer backbone

#def tokenize_texts(texts, max_length=128):  
- defines a function to tokenize a list or series of input texts
    #return tokenizer(  
    #    list(texts),                 # a Python list of strings  
    #    padding=True,                # pads shorter texts to the same length  
    #    truncation=True,             # trims texts that exceed max_length  
    #    max_length=max_length,       # sets a limit on sequence length  
    #    return_tensors="pt"          # returns tensors compatible with PyTorch  
    #)  
- applies the tokenizer to input texts and returns encoded tensors ready for model input

#train_enc = tokenize_texts(X_train)  
- tokenizes the training dataset to produce input IDs and attention masks

#val_enc   = tokenize_texts(X_val)  
- tokenizes the validation dataset using the same tokenizer settings

#train_ds = Dataset.from_dict({  
#    "input_ids": train_enc["input_ids"],  
#    "attention_mask": train_enc["attention_mask"],  
#    "labels": torch.tensor(y_train)  
#})  
- creates a Hugging Face Dataset object for the training data containing input tensors and labels

#val_ds = Dataset.from_dict({  
#    "input_ids": val_enc["input_ids"],  
#    "attention_mask": val_enc["attention_mask"],  
#    "labels": torch.tensor(y_val)  
#})  
- creates a similar Dataset object for the validation data

#len(train_ds), len(val_ds)  
- checks the number of samples in both training and validation datasets


In [None]:

# Candidate checkpoints: ClinicalBERT (for clinical text) and DistilBERT
# (fast baseline).
DISTIL_BERT   = "distilbert-base-uncased"
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"

# Backbone used by default for the tokenizer below.
BACKBONE = CLINICAL_BERT

# Load the tokenizer that matches the selected backbone
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

# Helper: batch-tokenize a collection of texts with the module-level `tokenizer`
def tokenize_texts(texts, max_length=128):
    """Tokenize ``texts`` in a single call and return the tokenizer's output.

    Args:
        texts: Iterable of strings; it is materialized into a list first.
        max_length: Cap on the sequence length; longer texts are truncated.

    Returns:
        The tokenizer output with PyTorch tensors requested
        (``return_tensors="pt"``); callers in this notebook read its
        ``input_ids`` and ``attention_mask`` entries.
    """
    text_batch = list(texts)
    tokenizer_options = {
        "padding": True,            # pad to the longest text in this call
        "truncation": True,         # cut off text exceeding max_length
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(text_batch, **tokenizer_options)

# Encode both splits with the helper's default max_length
train_enc = tokenize_texts(X_train)
val_enc   = tokenize_texts(X_val)

# Build HF Datasets holding token ids, attention masks and the label tensor
_FEATURE_KEYS = ("input_ids", "attention_mask")
train_ds = Dataset.from_dict(
    {**{key: train_enc[key] for key in _FEATURE_KEYS}, "labels": torch.tensor(y_train)}
)
val_ds = Dataset.from_dict(
    {**{key: val_enc[key] for key in _FEATURE_KEYS}, "labels": torch.tensor(y_val)}
)

# Notebook display: number of rows in each split
len(train_ds), len(val_ds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

(42144, 10537)

## 4) Training of Data (Trainer utilities and metrics)

# --- Metric Computation and Weighted Trainer ---

#def compute_metrics(eval_pred):  
- defines a function to calculate evaluation metrics during training

#logits, labels = eval_pred  
- unpacks model predictions (logits) and true labels from the evaluation results

#preds = np.argmax(logits, axis=-1)  
- selects the class with the highest prediction score for each sample

#precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")  
- computes precision, recall, and F1-score assuming binary classification

#acc = accuracy_score(labels, preds)  
- calculates overall accuracy between predictions and true labels

#return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}  
- returns a dictionary of all computed metric values to the Trainer

#pos = (y_train == 1).sum()  
- counts how many positive samples are in the training data

#neg = (y_train == 0).sum()  
- counts how many negative samples are in the training data

#w_pos = neg / max(pos, 1)  
- sets the positive class weight inversely proportional to its frequency

#w_neg = 1.0  
- assigns a baseline weight of 1 to the negative class

#class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float).to(device)  
- creates a tensor of class weights and moves it to the selected device (CPU or GPU)

#print(f"Class weights (neg, pos): {class_weights.tolist()}")  
- displays the computed class weights for reference

#from torch.nn import CrossEntropyLoss  
- imports the cross-entropy loss function used for classification tasks

#class WeightedTrainer(Trainer):  
- defines a custom Trainer subclass that supports class-weighted loss

#def compute_loss(self, model, inputs, return_outputs=False):  
- overrides the Trainer's default loss computation method

#labels = inputs.get("labels")  
- extracts the ground-truth labels from the input batch

#outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})  
- runs a forward pass of the model using the input tensors (excluding labels)

#logits = outputs.get("logits")  
- retrieves raw model output scores before activation

#loss_fct = CrossEntropyLoss(weight=class_weights)  
- initializes cross-entropy loss with the specified class weights

#loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))  
- computes the weighted loss comparing predictions to true labels

#return (loss, outputs) if return_outputs else loss  
- returns both loss and model outputs if requested, otherwise just the loss value


In [None]:

# Metric function for the Trainer: computes Accuracy, Precision, Recall, F1
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from ``(logits, labels)``.

    Args:
        eval_pred: Tuple of ``(logits, labels)``. The last axis of ``logits``
            indexes the classes.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    # eval_pred is a tuple of (logits, labels)
    logits, labels = eval_pred
    logits = np.asarray(logits)
    preds = np.argmax(logits, axis=-1)
    # scikit-learn rejects average="binary" for multiclass targets, so pick the
    # averaging mode from the number of output classes: binary for a two-way
    # head, weighted otherwise.
    average = "binary" if logits.shape[-1] == 2 else "weighted"
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Optional: class weights for imbalanced datasets
# Weights are inversely proportional to class frequency, with class 0 as the
# baseline (weight 1.0). One weight is produced per class id so the tensor
# also fits a classification head with more than two labels; for a two-class
# problem the result is the same [1.0, neg / pos] pair as before.
class_counts = np.bincount(y_train, minlength=2)  # class_counts[c] = rows labelled c
neg = class_counts[0]
pos = class_counts[1]
w_pos = neg / max(pos, 1)   # weight for positive class (max avoids division by zero)
w_neg = 1.0                 # keep negative as baseline
per_class_weights = neg / np.maximum(class_counts, 1)
per_class_weights[0] = w_neg
class_weights = torch.tensor(per_class_weights, dtype=torch.float).to(device)
# Only call the entries "neg, pos" when there really are exactly two classes
weights_label = "neg, pos" if len(class_counts) == 2 else "by class id"
print(f"Class weights ({weights_label}): {class_weights.tolist()}" )

# Custom Trainer that injects weighted loss
from torch.nn import CrossEntropyLoss
class WeightedTrainer(Trainer):
    """Trainer variant whose loss is class-weighted cross-entropy.

    The weights come from the module-level ``class_weights`` tensor, so that
    tensor must hold one weight per output class of the model.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model called with every entry of ``inputs`` except ``labels``.
            inputs: Batch mapping; must contain ``labels``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases, whose ``Trainer`` passes this
                argument to ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # Labels are withheld from the forward pass; the loss is computed below
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


Class weights (neg, pos): [1.0, 1.3836109638214111]


## 5) Fine-tuning (Three Experiments)

# --- 5) Fine-tuning (Three Experiments) [version-compatible] ---

#import numpy as np  
- provides support for array operations and numerical computations

#import torch  
- enables tensor computation and GPU acceleration

#from collections import OrderedDict  
- maintains insertion order for storing results in a dictionary

#from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer  
- imports Hugging Face components for model loading, configuration, and training

#from torch.nn import CrossEntropyLoss  
- imports the loss function used for classification tasks

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
- sets computation to GPU if available, otherwise CPU

#num_labels = len(np.unique(y_train))  
- counts how many unique classes exist in the training set

#avg_type = "binary" if num_labels == 2 else "weighted"  
- decides the metric averaging type based on whether the task is binary or multiclass

#print(f"[Fine-tune] Detected {num_labels} classes ‚Üí metrics average='{avg_type}'")  
- prints the number of detected classes and chosen averaging method

#def compute_metrics(eval_pred):  
- defines a function to compute evaluation metrics for each epoch
    #logits, labels = eval_pred  
    - extracts predicted logits and true labels
    #preds = np.argmax(logits, axis=-1)  
    - converts logits into class predictions by selecting the highest score
    #from sklearn.metrics import accuracy_score, precision_recall_fscore_support  
    - imports metrics used for evaluation
    #p, r, f, _ = precision_recall_fscore_support(labels, preds, average=avg_type)  
    - calculates precision, recall, and F1 using the chosen averaging strategy
    #acc = accuracy_score(labels, preds)  
    - computes overall prediction accuracy
    #return {"accuracy": acc, "precision": p, "recall": r, "f1": f}  
    - returns a dictionary of computed metric values

#counts = np.bincount(y_train, minlength=num_labels)  
- counts occurrences of each class label in the training data

#weights = counts.max() / np.maximum(counts, 1)  
- computes inverse-frequency weights to balance rare classes

#class_weights = torch.tensor(weights, dtype=torch.float32, device=device)  
- converts weights into a tensor and moves them to the computation device

#print(f"[Fine-tune] Class weights: {class_weights.tolist()}")  
- displays the computed class weights for reference

#class WeightedTrainer(Trainer):  
- creates a custom Trainer class to support weighted loss computation

#def compute_loss(self, model, inputs, return_outputs=False):  
- overrides the default loss calculation method
    #labels = inputs.get("labels")  
    - extracts the label tensor from the input batch
    #outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})  
    - performs a forward pass on the model excluding labels
    #logits = outputs.get("logits")  
    - retrieves raw model predictions
    #loss_fct = CrossEntropyLoss(weight=class_weights)  
    - initializes the cross-entropy loss function with class weighting
    #loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))  
    - computes weighted loss comparing predictions to ground truth
    #return (loss, outputs) if return_outputs else loss  
    - returns both loss and outputs if required

#def tokenize_texts(texts, max_length=160):  
- defines a helper function for tokenizing text data before training
    #return tokenizer(  
    #    list(texts),  
    #    padding=True,  
    #    truncation=True,  
    #    max_length=max_length,  
    #    return_tensors="pt"  
    #)  
    - applies the tokenizer with truncation, padding, and fixed max length

#import inspect  
- imports a module used to check function argument compatibility across versions

#def make_training_args(name, batch_size, lr, epochs, weight_decay, warmup_ratio):  
- defines a factory function to create version-safe training arguments
    #kwargs_modern = dict(... )  
    - defines parameters for newer versions of Hugging Face Transformers
    #try:  
    - attempts to use modern TrainingArguments configuration
    #return TrainingArguments(**kwargs_modern)  
    - returns the configured training arguments
    #except TypeError:  
    - catches errors if running an older version of Transformers
    #print("[Fine-tune] Using legacy TrainingArguments fallback.")  
    - notifies the user that the fallback configuration is being used
    #kwargs_legacy = dict(... )  
    - defines parameters compatible with legacy versions
    #return TrainingArguments(**kwargs_legacy)  
    - returns legacy-compatible training arguments

#def run_experiment(name, backbone, batch_size=16, lr=2e-5, epochs=3,  
#                   weight_decay=0.01, warmup_ratio=0.1, max_length=160):  
- defines a function to execute one full fine-tuning experiment
    #tr = tokenize_texts(X_train, max_length=max_length)  
    - tokenizes training text with the chosen sequence length
    #va = tokenize_texts(X_val, max_length=max_length)  
    - tokenizes validation text similarly
    #train_ds_local = Dataset.from_dict({...})  
    - creates a Hugging Face Dataset object for the training set
    #val_ds_local = Dataset.from_dict({...})  
    - creates a Dataset object for validation
    #model = AutoModelForSequenceClassification.from_pretrained(  
    #    backbone, num_labels=num_labels  
    #).to(device)  
    - loads a pretrained model with a classification head matching the number of classes
    #args = make_training_args(... )  
    - generates compatible training arguments using provided hyperparameters
    #trainer = WeightedTrainer(... )  
    - initializes the Trainer with model, datasets, tokenizer, metrics, and weighted loss
    #trainer.train()  
    - starts the fine-tuning process
    #metrics = trainer.evaluate()  
    - evaluates the model on the validation set after training
    #print(f"\n>>> {name} results: {metrics}\n")  
    - prints the metrics for this experiment
    #return metrics, trainer  
    - returns both results and the trained model instance

#CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"  
- sets the ClinicalBERT model checkpoint for fine-tuning

#DISTIL_BERT   = "distilbert-base-uncased"  
- sets the DistilBERT model checkpoint as a faster baseline

#results = OrderedDict()  
- initializes an ordered dictionary to store experiment outcomes

#results['expA_clinicalbert_bs16_lr2e-5_ep3'] = run_experiment(... )  
- runs Experiment A using ClinicalBERT with small batch and low learning rate

#results['expB_clinicalbert_bs16_lr5e-5_ep4'] = run_experiment(... )  
- runs Experiment B using ClinicalBERT with higher learning rate and more epochs

#results['expC_distilbert_bs32_lr3e-5_ep3'] = run_experiment(... )  
- runs Experiment C using DistilBERT with larger batch size as a fast baseline

#board = []  
- initializes a list to hold leaderboard data

#for k,(m,_t) in results.items():  
- iterates through experiment results to extract evaluation metrics
    #board.append((k, m.get('eval_f1', float('nan')), m.get('eval_accuracy', float('nan'))))  
    - collects experiment name, F1-score, and accuracy

#board = sorted(board, key=lambda x: x[1], reverse=True)  
- sorts experiments in descending order by F1-score

#print("\nLeaderboard (by F1):")  
- prints the leaderboard heading

#for name, f1, acc in board:  
- loops through the leaderboard entries
    #print(f"{name:35s}  F1={f1:.4f}  Acc={acc:.4f}")  
    - prints formatted performance results for each experiment


In [None]:
# --- 5) Fine-tuning (Three Experiments) [version-compatible] ---
import numpy as np
import torch
from collections import OrderedDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.nn import CrossEntropyLoss

# Resolve the compute device again in this cell
_has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _has_cuda else "cpu")

# 1) Metrics: exactly two labels -> "binary" averaging, otherwise "weighted"
num_labels = len(np.unique(y_train))
if num_labels == 2:
    avg_type = "binary"
else:
    avg_type = "weighted"
print(f"[Fine-tune] Detected {num_labels} classes ‚Üí metrics average='{avg_type}'")

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy, precision, recall and F1.

    Precision, recall and F1 use the module-level ``avg_type`` averaging mode.

    Args:
        eval_pred: Tuple of ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    prf = precision_recall_fscore_support(gold, predicted, average=avg_type)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

# 2) Class weights for imbalanced data (one entry per label id)
# Inverse-frequency heuristic: the most frequent class gets weight 1.0 and
# rarer classes get proportionally larger weights. The floor of 1 on the
# counts avoids dividing by zero for a label id that never occurs.
counts = np.bincount(y_train, minlength=num_labels)
_largest_count = counts.max()
_floored_counts = np.maximum(counts, 1)
weights = _largest_count / _floored_counts
class_weights = torch.tensor(weights, dtype=torch.float32, device=device)
print(f"[Fine-tune] Class weights: {class_weights.tolist()}")

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained; called with every input except ``labels``.
            inputs: batch mapping that includes a ``labels`` entry.
            return_outputs: when true, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: accepted for compatibility with transformers
                releases that pass this argument to ``compute_loss``; unused here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # Keep the weight tensor on the same device as the logits.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Use the logits' last dimension as the class count so this also works
        # when ``model`` is a wrapper that does not expose ``.config``.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 3) Helper: tokenizer already defined above. Re-tokenize per max_length
def tokenize_texts(texts, max_length=160):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    The texts are materialized into a list, padded, truncated to
    ``max_length``, and returned as PyTorch tensors.
    """
    batch = list(texts)
    encoding = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return encoding

# 4) Version-compatible TrainingArguments factory
import inspect

def make_training_args(name, batch_size, lr, epochs, weight_decay, warmup_ratio):
    """Build ``TrainingArguments`` that work across transformers releases.

    The per-epoch evaluation keyword has been spelled both ``eval_strategy``
    and ``evaluation_strategy`` depending on the installed release, so the
    accepted spelling is looked up from the ``TrainingArguments`` signature
    instead of being hard-coded. If neither spelling is accepted, or the
    full configuration is still rejected with ``TypeError``, a reduced
    legacy configuration is used.

    Args:
        name: run name; outputs are written to ``./runs/{name}``.
        batch_size: per-device batch size for both training and evaluation.
        lr: learning rate.
        epochs: number of training epochs.
        weight_decay: weight decay value.
        warmup_ratio: warmup ratio (only applied in the full configuration).

    Returns:
        A ``TrainingArguments`` instance.
    """
    # Settings shared by the full and the legacy configuration.
    shared = dict(
        output_dir=f"./runs/{name}",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        logging_steps=50,
        fp16=torch.cuda.is_available(),
    )

    # Prefer the newer spelling when both are accepted.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = next(
        (key for key in ("eval_strategy", "evaluation_strategy") if key in accepted),
        None,
    )

    if eval_key is not None:
        full_config = dict(
            shared,
            warmup_ratio=warmup_ratio,
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            report_to=[],
        )
        full_config[eval_key] = "epoch"
        try:
            return TrainingArguments(**full_config)
        except TypeError:
            # Some other keyword is unsupported; fall through to the legacy set.
            pass

    # Fallback for older transformers (no evaluation/save strategy keywords)
    print("[Fine-tune] Using legacy TrainingArguments fallback.")
    legacy_config = dict(
        shared,
        do_eval=True,          # legacy way to enable evaluation
        save_steps=500,        # periodic saving
        overwrite_output_dir=True,
    )
    return TrainingArguments(**legacy_config)

def run_experiment(name, backbone, batch_size=16, lr=2e-5, epochs=3,
                   weight_decay=0.01, warmup_ratio=0.1, max_length=160):
    """Fine-tune ``backbone`` on the train split and evaluate on the validation split.

    The texts are tokenized with the tokenizer that belongs to ``backbone``
    rather than a shared notebook-level tokenizer: the experiments use
    checkpoints with different vocabularies, so token ids produced by one
    checkpoint's tokenizer are not valid inputs for another checkpoint.

    Args:
        name: run name, forwarded to ``make_training_args``.
        backbone: checkpoint identifier to load the tokenizer and model from.
        batch_size, lr, epochs, weight_decay, warmup_ratio: forwarded to
            ``make_training_args``.
        max_length: truncation length used when tokenizing.

    Returns:
        ``(metrics, trainer)``: the dict returned by ``trainer.evaluate()``
        and the trainer itself.
    """
    import inspect
    from transformers import AutoTokenizer

    backbone_tokenizer = AutoTokenizer.from_pretrained(backbone)

    def _build_dataset(texts, labels):
        # Tokenize one split and pair the encodings with its labels.
        enc = backbone_tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        return Dataset.from_dict({
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": torch.tensor(labels, dtype=torch.long),
        })

    train_ds_local = _build_dataset(X_train, y_train)
    val_ds_local = _build_dataset(X_val, y_val)

    # Load backbone with correct num_labels
    model = AutoModelForSequenceClassification.from_pretrained(
        backbone, num_labels=num_labels
    ).to(device)

    args = make_training_args(
        name=name, batch_size=batch_size, lr=lr, epochs=epochs,
        weight_decay=weight_decay, warmup_ratio=warmup_ratio
    )

    # Use ``processing_class`` when the installed Trainer accepts it, and the
    # older ``tokenizer`` keyword otherwise.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = WeightedTrainer(
        model=model,
        args=args,
        train_dataset=train_ds_local,
        eval_dataset=val_ds_local,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: backbone_tokenizer}
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(f"\n>>> {name} results: {metrics}\n")
    return metrics, trainer

# --- Backbone checkpoints compared below ---
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
DISTIL_BERT   = "distilbert-base-uncased"

# Each entry is (run name, keyword arguments for run_experiment); the run
# name doubles as the key under which the run is stored in ``results``.
experiment_specs = [
    # Exp-A: ClinicalBERT, conservative LR, small batch
    ("expA_clinicalbert_bs16_lr2e-5_ep3", dict(
        backbone=CLINICAL_BERT,
        batch_size=16, lr=2e-5, epochs=3,
        weight_decay=0.01, warmup_ratio=0.1, max_length=160,
    )),
    # Exp-B: ClinicalBERT, slightly higher LR, more epochs
    ("expB_clinicalbert_bs16_lr5e-5_ep4", dict(
        backbone=CLINICAL_BERT,
        batch_size=16, lr=5e-5, epochs=4,
        weight_decay=0.01, warmup_ratio=0.06, max_length=160,
    )),
    # Exp-C: DistilBERT fast baseline
    ("expC_distilbert_bs32_lr3e-5_ep3", dict(
        backbone=DISTIL_BERT,
        batch_size=32, lr=3e-5, epochs=3,
        weight_decay=0.01, warmup_ratio=0.1, max_length=128,
    )),
]

# Runs execute in the listed order; each value is (metrics, trainer).
results = OrderedDict()
for exp_name, exp_kwargs in experiment_specs:
    results[exp_name] = run_experiment(name=exp_name, **exp_kwargs)

# Leaderboard: one (name, F1, accuracy) row per run, best F1 first.
# Metrics missing from a run's dict are shown as NaN.
_nan = float('nan')
board = sorted(
    (
        (run_name, run_metrics.get('eval_f1', _nan), run_metrics.get('eval_accuracy', _nan))
        for run_name, (run_metrics, _trainer) in results.items()
    ),
    key=lambda row: row[1],
    reverse=True,
)
print("\nLeaderboard (by F1):")
for name, f1, acc in board:
    print(f"{name:35s}  F1={f1:.4f}  Acc={acc:.4f}")


NameError: name 'y_train' is not defined

## 6) Eval (Pick Best and Run Inference)

# --- Best Model Selection, Saving, and Inference ---

#best_name, best_f1 = None, -1.0  
- initializes variables to store the best model's name and its highest F1 score

#best_trainer = None  
- sets a placeholder for the trainer object of the best-performing run

#for name,(metrics, trainer) in results.items():  
- iterates through all fine-tuning experiment results
    #if metrics['eval_f1'] > best_f1:  
    - checks if the current run's F1 score is higher than the previous best
    #best_f1 = metrics['eval_f1']  
    - updates the highest F1 score
    #best_name = name  
    - records the name of the best-performing experiment
    #best_trainer = trainer  
    - stores the trainer associated with that best run

#print(f"Best run: {best_name} with F1={best_f1:.4f}")  
- displays the name and F1 score of the top-performing model

#save_dir = f"./best_model_{best_name}"  
- defines the directory path where the best model will be saved

#best_trainer.save_model(save_dir)  
- saves the fine-tuned model weights and configuration to the specified directory

#tokenizer.save_pretrained(save_dir)  
- saves the tokenizer configuration to the same folder for consistent reuse

#def predict(texts, model_dir=save_dir):  
- defines a helper function to perform predictions on new text samples
    #tok = AutoTokenizer.from_pretrained(model_dir)  
    - loads the saved tokenizer from the specified model directory
    #mdl = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)  
    - loads the saved fine-tuned model and moves it to the computation device
    #enc = tok(list(texts), padding=True, truncation=True, max_length=160, return_tensors="pt").to(device)  
    - tokenizes the input texts with padding and truncation, returning PyTorch tensors
    #with torch.no_grad():  
    - disables gradient computation to speed up inference
        #logits = mdl(**enc).logits  
        - performs a forward pass to get raw prediction scores (logits)
    #pred = torch.argmax(logits, dim=-1).cpu().numpy()  
    - converts logits to final predicted class labels
    #prob = torch.softmax(logits, dim=-1).cpu().numpy()[:,1]  
    - converts logits into probabilities and selects the positive-class probability
    #return pred, prob  
    - returns both the predicted labels and their confidence scores

#samples = [  
#    "I feel calm and in control today.",  
#    "My chest is tight and I cannot focus, I think I am very stressed.",  
#    "Workload is heavy but manageable so far."  
#]  
- defines example text inputs to test the model's predictions

#pred, prob = predict(samples)  
- runs the prediction helper on the sample texts

#for s, y, p in zip(samples, pred, prob):  
- iterates over each sample, its predicted label, and probability
    #lab = "stressed(1)" if y==1 else "not-stressed(0)"  
    - assigns a readable label name depending on the prediction value
    #print(f"[{lab}  p={p:.3f}]  {s}")  
    - prints the predicted label, probability, and the original sentence


In [None]:

# Select the best run from 'results' dict above.
# The first run with the strictly highest eval F1 wins; runs whose metrics
# lack 'eval_f1' (or report NaN) never compare greater and are skipped.
best_name, best_f1 = None, -1.0
best_trainer = None
for name, (metrics, trainer) in results.items():
    run_f1 = metrics.get('eval_f1', float('nan'))
    if run_f1 > best_f1:
        best_f1 = run_f1
        best_name = name
        best_trainer = trainer

if best_trainer is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("No experiment in 'results' reported a usable 'eval_f1'.")

print(f"Best run: {best_name} with F1={best_f1:.4f}")

# Save the best model for reuse
save_dir = f"./best_model_{best_name}"
best_trainer.save_model(save_dir)

# Save the tokenizer attached to the winning trainer so the saved directory
# always pairs the model with the tokenizer it was trained with; fall back to
# the notebook-level tokenizer only if the trainer does not expose one.
best_tokenizer = getattr(best_trainer, "processing_class", None)
if best_tokenizer is None:
    best_tokenizer = getattr(best_trainer, "tokenizer", None)
if best_tokenizer is None:
    best_tokenizer = tokenizer
best_tokenizer.save_pretrained(save_dir)

# Simple inference helper
def predict(texts, model_dir=save_dir):
    """Classify ``texts`` with the tokenizer and model stored in ``model_dir``.

    Both are loaded from ``model_dir`` on every call. Inputs are padded and
    truncated to 160 tokens, and the forward pass runs without gradients.

    Returns:
        ``(pred, prob)`` as NumPy arrays: the argmax class index per text and
        the softmax value at class index 1 per text.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)

    encoded = loaded_tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=160,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        logits = classifier(**encoded).logits

    class_ids = torch.argmax(logits, dim=-1).cpu().numpy()
    class_probs = torch.softmax(logits, dim=-1).cpu().numpy()
    return class_ids, class_probs[:, 1]

# Demo predictions on a few samples
samples = [
    "I feel calm and in control today.",
    "My chest is tight and I cannot focus, I think I am very stressed.",
    "Workload is heavy but manageable so far."
]
pred, prob = predict(samples)
for s, y, p in zip(samples, pred, prob):
    # Plain ASCII hyphen in the label: the previous literal contained
    # mis-encoded bytes that were printed verbatim.
    lab = "stressed(1)" if y == 1 else "not-stressed(0)"
    print(f"[{lab}  p={p:.3f}]  {s}")
