
# Stress Status Detection — End-to-End Colab Notebook  
**Order:** Load Dataset → Baseline Models → Pre-Trained Models → Training of Data → Fine-tuning → Eval

> This notebook encodes the provided fine-tuning script into a structured, well-commented pipeline.  
> It includes at least **three hyperparameter experiments** (aiming to maximize **F1-Score**).  
> Replace the dataset path with your CSV if needed — required columns: `statement` (text) and `status` (class label; either 0/1 or text categories, which the loading cell integer-encodes automatically).


# 0) Setup (libraries and reproducibility)

--- Import and Environment Setup ---

import os

- manages system paths, folders, and environment variables to handle files and directories efficiently during the execution of the notebook.

import math

- includes mathematical tools and formulas that can assist in calculations such as learning rate adjustments or numeric transformations during model training.

import random

- controls and initializes random number generation, ensuring that every run of the model produces consistent outcomes for reproducibility.

import numpy as np

- provides extensive support for numerical data handling, offering fast and flexible operations on arrays and matrices used throughout the data preparation process.

import pandas as pd

- allows for structured data loading and manipulation, making it easier to explore, clean, and organize datasets, especially when working with CSV files.

from pathlib import Path

- gives a cleaner and more reliable way to manage file and directory paths across different operating systems.

--- Core Framework Imports ---

import torch

- provides the base framework for tensor manipulation and GPU acceleration, enabling efficient computation for training and evaluating deep learning models.

from datasets import Dataset

- transforms pandas DataFrames into optimized dataset objects that integrate smoothly with the Hugging Face Transformers library for preprocessing and training.

from transformers import (
AutoTokenizer,

- automatically selects and loads the appropriate tokenizer for a specific pre-trained model to ensure consistent tokenization.
AutoModelForSequenceClassification,

- initializes a pre-trained Transformer model with an added classification head, suitable for tasks like sentiment analysis or text categorization.
TrainingArguments,

- specifies and stores key hyperparameters such as the number of epochs, batch size, and evaluation frequency for the model training process.
Trainer

- streamlines the entire fine-tuning procedure, managing training, evaluation, logging, and checkpoint saving without requiring manual loop implementation.
)

--- Evaluation Metric Imports ---

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

- brings in performance evaluation tools that calculate key metrics such as accuracy, precision, recall, and F1-score to assess the model's prediction quality.

--- Reproducibility Configuration ---

SEED = 42

- defines a fixed seed number to guarantee that all random processes across libraries yield consistent results.
random.seed(SEED)

- ensures that Python's random number operations remain stable and predictable in every run.
np.random.seed(SEED)

- controls NumPy's internal random processes to maintain the same shuffling or sampling patterns across executions.
torch.manual_seed(SEED)

- fixes PyTorch's randomization for consistent model weight initialization and data handling.
torch.cuda.manual_seed_all(SEED)

- applies the same reproducibility rule across all available GPUs to maintain uniform outcomes even in multi-GPU training setups.

--- Device Detection ---

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- determines whether a GPU is available for acceleration and defaults to CPU if not, ensuring compatibility in any environment.
print(f"Using device: {device}")

- prints out the current hardware in use to confirm that GPU acceleration is properly detected and active.

In [None]:

# Every import has an explanatory comment.
import os                         # file paths and environment checks
import math                       # math helpers (may be useful for schedules)
import random                     # Python's RNG for reproducibility
import numpy as np                # numerical arrays and metrics support
import pandas as pd               # data loading and manipulation
from pathlib import Path          # convenient and robust path handling

# Hugging Face / PyTorch stack (for transformer fine‚Äëtuning)
import torch                      # tensor and GPU utilities
from datasets import Dataset      # lightweight dataset wrapper around pandas
from transformers import (       # core HF components for tokenization and training
    AutoTokenizer,               # auto‚Äëloads the right tokenizer for a given model checkpoint
    AutoModelForSequenceClassification,  # classification head on top of a transformer
    TrainingArguments,           # training hyperparameters container
    Trainer                      # training loop helper (handles eval and logging)
)

# Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Reproducibility: push one fixed seed into every RNG the notebook relies on
# (Python's `random`, NumPy, PyTorch CPU, and all visible CUDA devices).
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

# Resolve the compute device once; later cells reuse `device`.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")  # 'cuda' when a GPU is available, otherwise 'cpu'


Using device: cpu


## 1) Load Dataset

# --- Load Dataset (Upload version, auto-encodes text labels) ---

#import pandas as pd

- imports the pandas library, which is essential for reading, organizing, and analyzing CSV data within Python.

#from pathlib import Path

- provides a structured and cross-platform way to handle file paths, making directory navigation and file references more reliable.

#from google.colab import files

- activates Google Colab's file upload feature, allowing users to upload local datasets directly into the runtime environment.

#print("üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)")

- displays a clear message prompting the user to upload a dataset file in CSV format for processing.

#uploaded = files.upload()

- opens a file selection dialog so that the user can choose and upload the desired dataset from their computer.

#filename = list(uploaded.keys())[0]

- extracts the name of the uploaded file from the dictionary of uploaded files.

#csv_path = Path(f"/content/{filename}")

- constructs a full, system-compatible file path pointing to the uploaded dataset within the Colab working directory.

#print(f"‚úÖ File uploaded successfully: {csv_path}")

- provides feedback confirming that the file upload was successful and shows where the file was saved.

#df = pd.read_csv(csv_path)

- loads the uploaded CSV file into a pandas DataFrame, preparing it for inspection and processing.

# --- Validate columns ---

#expected_cols = {'statement', 'status'}

- defines the columns that must exist in the dataset to ensure it matches the expected structure for further steps.

#assert expected_cols.issubset(df.columns), f"‚ùå Missing required columns: {expected_cols - set(df.columns)}"

- verifies that all required columns are present in the dataset; if not, the code stops and reports which ones are missing.

# --- Clean ---

#df = df.dropna(subset=['statement', 'status']).copy()

- deletes any rows containing missing values in the 'statement' or 'status' columns to maintain data consistency.

#df['statement'] = df['statement'].astype(str)

- converts all entries in the 'statement' column into string type to prevent formatting or type errors later in processing.

# --- Encode text labels into integers ---

#from sklearn.preprocessing import LabelEncoder

- brings in a class from scikit-learn that converts categorical text labels into numerical form for model compatibility.

#le = LabelEncoder()

- initializes the LabelEncoder, preparing it to map text categories into numeric codes.

#df['status_encoded'] = le.fit_transform(df['status'])

- fits the encoder to the 'status' column and generates a new column containing the corresponding numeric label values.

#print("üî§ Label encoding map:")

- prints a section heading to indicate that the label-to-code mapping will be shown next.

#for label, code in zip(le.classes_, range(len(le.classes_))):

- loops through each label and its encoded numeric representation to display the mapping relationship.
#print(f"  {code} → {label}")

- prints each numeric code and its associated label for verification.

#df['status'] = df['status_encoded']

- replaces the original 'status' column's text labels with their corresponding numeric values.

#df.drop(columns=['status_encoded'], inplace=True)

- deletes the temporary ‚Äòstatus_encoded‚Äô column since the main ‚Äòstatus‚Äô column now contains the encoded values.

#print("\n‚úÖ Dataset loaded and label-encoded successfully!")

- outputs a confirmation message indicating that the dataset has been fully cleaned and encoded without errors.

#print(df['status'].value_counts(dropna=False))

- displays a frequency count of each encoded label, helping verify that the encoding process was applied correctly.

#df.head(3)

- shows the first three rows of the cleaned and processed dataset to confirm that all transformations were applied successfully.


In [None]:
# --- Load Dataset (Upload version, auto-encodes text labels) ---
import pandas as pd
from pathlib import Path
from google.colab import files

print("üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)")
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) the mapping is empty;
# stop with a clear message instead of failing on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Automatically pick the first uploaded file
filename = next(iter(uploaded))
csv_path = Path(f"/content/{filename}")

print(f"‚úÖ File uploaded successfully: {csv_path}")

# Load the CSV
df = pd.read_csv(csv_path)

# --- Validate columns ---
# Raise explicitly rather than `assert`: assertions are stripped under `python -O`,
# which would let a malformed CSV through to the cleaning steps.
expected_cols = {'statement', 'status'}
missing_cols = expected_cols - set(df.columns)
if missing_cols:
    raise ValueError(f"‚ùå Missing required columns: {missing_cols}")

# --- Clean ---
# Drop rows without text or label, and force the text column to `str`.
df = df.dropna(subset=['statement', 'status']).copy()
df['statement'] = df['statement'].astype(str)

# --- Encode text labels into integers ---
# This maps each unique label (like 'Anxiety', 'Stress', etc.) to a numeric ID
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['status_encoded'] = le.fit_transform(df['status'])

# Optional: print mapping for your reference
# (the position of a label in `le.classes_` is its encoded integer)
print("üî§ Label encoding map:")
for code, label in enumerate(le.classes_):
    print(f"  {code} ‚Üí {label}")

# Replace 'status' with the encoded version
df['status'] = df['status_encoded']
df.drop(columns=['status_encoded'], inplace=True)

print("\n‚úÖ Dataset loaded and label-encoded successfully!")
print(df['status'].value_counts(dropna=False))
df.head(3)


üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)


Saving Combined Data.csv to Combined Data (2).csv
‚úÖ File uploaded successfully: /content/Combined Data (2).csv
üî§ Label encoding map:
  0 ‚Üí Anxiety
  1 ‚Üí Bipolar
  2 ‚Üí Depression
  3 ‚Üí Normal
  4 ‚Üí Personality disorder
  5 ‚Üí Stress
  6 ‚Üí Suicidal

‚úÖ Dataset loaded and label-encoded successfully!
status
3    16343
2    15404
6    10652
0     3841
1     2777
5     2587
4     1077
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,0
1,1,"trouble sleeping, confused mind, restless hear...",0
2,2,"All wrong, back off dear, forward doubt. Stay ...",0


## 2) Baseline Models (TF-IDF + Linear)

# --- Baseline Models (TF-IDF + Linear, supports multi-class) ---

#from sklearn.model_selection import train_test_split
‚Äì divides the dataset into separate subsets for training and validation purposes

#from sklearn.feature_extraction.text import TfidfVectorizer
‚Äì transforms raw text into numerical representations using the TF-IDF method

#from sklearn.linear_model import LogisticRegression
‚Äì loads the logistic regression algorithm used for text classification

#from sklearn.svm import LinearSVC
‚Äì loads the linear support vector machine classifier for categorizing text

#from sklearn.metrics import accuracy_score, precision_recall_fscore_support
‚Äì provides built-in functions to measure model performance using common evaluation metrics

#import numpy as np
‚Äì supports efficient numerical calculations and operations on arrays

#X_train, X_val, y_train, y_val = train_test_split(

df['statement'].values,
df['status'].values,
test_size=0.2,
random_state=42,
stratify=df['status'].values

#)
‚Äì separates the dataset into 80% training and 20% validation samples while maintaining balanced class distribution

#tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=40000)
‚Äì builds a TF-IDF model that captures single words and two-word phrases, ignoring rare terms and limiting total features to 40,000

#Xtr = tfidf.fit_transform(X_train)
‚Äì learns vocabulary patterns from the training set and converts text into TF-IDF feature vectors

#Xva = tfidf.transform(X_val)
‚Äì applies the trained TF-IDF transformation to the validation set without retraining

#num_classes = len(np.unique(y_train))
‚Äì determines how many distinct categories or labels exist in the dataset

#avg_type = "binary" if num_classes == 2 else "weighted"
‚Äì automatically chooses whether to use binary or weighted averaging based on the number of classes

#print(f"Detected {num_classes} classes → using average='{avg_type}' for metrics.\n")
– outputs the number of identified classes and indicates which averaging method will be applied for evaluation

--- Baseline 1: Logistic Regression ---

#logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
‚Äì creates a logistic regression model configured to balance uneven class frequencies and allow more training iterations

#logreg.fit(Xtr, y_train)
‚Äì trains the logistic regression classifier using the prepared TF-IDF features and corresponding labels

#pred_lr = logreg.predict(Xva)
‚Äì produces predictions on unseen validation data using the trained logistic regression model

#p, r, f, _ = precision_recall_fscore_support(y_val, pred_lr, average=avg_type)
‚Äì calculates the precision, recall, and F1-score metrics according to the averaging method chosen

#acc = accuracy_score(y_val, pred_lr)
‚Äì evaluates how often the logistic regression model predicted the correct label

#print(f"[Baseline-LR] Acc={acc:.3f} P={p:.3f} R={r:.3f} F1={f:.3f}")
‚Äì prints the accuracy, precision, recall, and F1-score results for the logistic regression model

--- Baseline 2: Linear SVM ---

#svm = LinearSVC(class_weight="balanced")
‚Äì initializes a linear SVM model that compensates for class imbalance during training

#svm.fit(Xtr, y_train)
‚Äì fits the SVM classifier using the TF-IDF features from the training data

#pred_svm = svm.predict(Xva)
‚Äì predicts the validation set labels using the trained SVM model

#p, r, f, _ = precision_recall_fscore_support(y_val, pred_svm, average=avg_type)
‚Äì computes precision, recall, and F1-score for the SVM‚Äôs predictions based on the selected averaging mode

#acc = accuracy_score(y_val, pred_svm)
‚Äì determines the SVM model‚Äôs accuracy across all validation examples

#print(f"[Baseline-SVM] Acc={acc:.3f} P={p:.3f} R={r:.3f} F1={f:.3f}")
‚Äì displays the accuracy, precision, recall, and F1-score achieved by the SVM baseline model


In [None]:
# --- Baseline Models (TF-IDF + Linear, supports multi-class) ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Hold out 20% of the rows for validation, stratified on the label column
_texts = df['statement'].values
_targets = df['status'].values
X_train, X_val, y_train, y_val = train_test_split(
    _texts, _targets, test_size=0.2, random_state=42, stratify=_targets
)

# TF-IDF over unigrams and bigrams; the vocabulary is learned on the training split only
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=40000)
Xtr = tfidf.fit_transform(X_train)
Xva = tfidf.transform(X_val)

# Two classes -> 'binary' averaging, anything else -> 'weighted'
num_classes = len(np.unique(y_train))
if num_classes == 2:
    avg_type = "binary"
else:
    avg_type = "weighted"
print(f"Detected {num_classes} classes ‚Üí using average='{avg_type}' for metrics.\n")


def _validation_scores(y_pred):
    """Return (precision, recall, f1, accuracy) of `y_pred` against `y_val`."""
    prec, rec, f_score, _ = precision_recall_fscore_support(y_val, y_pred, average=avg_type)
    return prec, rec, f_score, accuracy_score(y_val, y_pred)


# --- Baseline 1: Logistic Regression ---
logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
logreg.fit(Xtr, y_train)
pred_lr = logreg.predict(Xva)
p, r, f, acc = _validation_scores(pred_lr)
print(f"[Baseline-LR] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")

# --- Baseline 2: Linear SVM ---
svm = LinearSVC(class_weight="balanced")
svm.fit(Xtr, y_train)
pred_svm = svm.predict(Xva)
p, r, f, acc = _validation_scores(pred_svm)
print(f"[Baseline-SVM] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")


Detected 7 classes ‚Üí using average='weighted' for metrics.

[Baseline-LR] Acc=0.778  P=0.787  R=0.778  F1=0.777
[Baseline-SVM] Acc=0.782  P=0.779  R=0.782  F1=0.780


## 3) Pre-Trained Models (Tokenization and Dataset Prep)

# --- Transformer Backbone and Tokenization Setup ---

#CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
‚Äì specifies the pretrained ClinicalBERT model, which is optimized for understanding clinical and medical language

#DISTIL_BERT = "distilbert-base-uncased"
‚Äì specifies the lightweight DistilBERT model designed for faster and more efficient fine-tuning compared to larger transformer models

#BACKBONE = CLINICAL_BERT
‚Äì assigns ClinicalBERT as the main transformer model to be used for this experiment

#tokenizer = AutoTokenizer.from_pretrained(BACKBONE)
‚Äì loads the tokenizer associated with the selected transformer model to ensure text encoding consistency

#def tokenize_texts(texts, max_length=128):
– defines a reusable function that converts a collection of raw text samples into tokenized sequences suitable for the model
    #return tokenizer(
      list(texts),              – transforms the input texts into a list format
      padding=True,             – automatically pads all sequences to the same length
      truncation=True,          – shortens sequences that exceed the specified maximum length
      max_length=max_length,    – defines the limit for each tokenized text sequence
      return_tensors="pt"       – outputs data as PyTorch-compatible tensors
    )
– applies the tokenizer configuration to the texts and produces ready-to-use numerical tensors

#train_enc = tokenize_texts(X_train)
‚Äì processes and encodes all training sentences into model-readable token IDs and attention masks

#val_enc = tokenize_texts(X_val)
‚Äì applies the same tokenization steps to the validation set to maintain consistency with the training data

#train_ds = Dataset.from_dict({

"input_ids": train_enc["input_ids"],
"attention_mask": train_enc["attention_mask"],
"labels": torch.tensor(y_train)

#})
‚Äì builds a structured Hugging Face dataset for the training portion, including encoded inputs and their respective labels

#val_ds = Dataset.from_dict({

"input_ids": val_enc["input_ids"],
"attention_mask": val_enc["attention_mask"],
"labels": torch.tensor(y_val)

#})
‚Äì constructs a matching dataset object for the validation data with identical field structure

#len(train_ds), len(val_ds)
‚Äì verifies and displays how many records are contained within the training and validation datasets

In [None]:

# Choose your checkpoints.
# We include ClinicalBERT (for clinical text) and DistilBERT (fast baseline).
# Both values are checkpoint identifiers handed to `from_pretrained`.
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
DISTIL_BERT   = "distilbert-base-uncased"

# Pick one as the default backbone for experiments below.
BACKBONE = CLINICAL_BERT

# Initialize tokenizer for the chosen backbone
# NOTE: this module-level `tokenizer` is tied to BACKBONE; `tokenize_texts`
# below reads it as a global, so changing BACKBONE requires re-running this cell.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

# Helper that batch-tokenizes an iterable of strings with the global tokenizer
def tokenize_texts(texts, max_length=128):
    """Tokenize `texts` with the module-level `tokenizer`.

    Args:
        texts: iterable of strings; it is materialised into a list first.
        max_length: upper bound on the tokenized sequence length.

    Returns:
        Whatever the tokenizer call returns for PyTorch output
        (`return_tensors="pt"`); the cells below read its `input_ids`
        and `attention_mask` entries.
    """
    batch = list(texts)
    encode_options = {
        "padding": True,            # pad every sequence to the longest one in the batch
        "truncation": True,         # cut sequences longer than max_length
        "max_length": max_length,   # cap on sequence length
        "return_tensors": "pt",     # PyTorch tensors
    }
    return tokenizer(batch, **encode_options)

def _as_hf_dataset(encoded, targets):
    """Bundle tokenizer output and integer labels into a `datasets.Dataset`."""
    columns = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": torch.tensor(targets),
    }
    return Dataset.from_dict(columns)


# Encode both splits with the same tokenizer settings (train first, then validation)
train_enc = tokenize_texts(X_train)
val_enc = tokenize_texts(X_val)

# Wrap each split, together with its labels, into an HF Dataset
train_ds = _as_hf_dataset(train_enc, y_train)
val_ds = _as_hf_dataset(val_enc, y_val)

# Last expression of the cell: shows the (train, validation) sizes
len(train_ds), len(val_ds)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

(42144, 10537)

## 4) Training of Data (Trainer utilities and metrics)

Metric function for the Trainer: computes Accuracy, Precision, Recall, F1

– defines a function used by the Trainer to evaluate model performance through key metrics such as accuracy, precision, recall, and F1-score

eval_pred is a tuple of (logits, labels)

– indicates that the function receives two components: the model's raw predictions (logits) and the actual ground-truth labels (labels)

logits, labels = eval_pred

‚Äì unpacks the tuple into separate variables representing predicted outputs and true labels

preds = np.argmax(logits, axis=-1)

‚Äì selects the class with the highest predicted probability for each input sample

precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

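– calculates precision, recall, and F1-score using binary averaging; note that binary averaging is only defined for two-class targets, so with the seven classes encoded in section 1 a multi-class average such as `weighted` (the choice made for the baselines in section 2) is required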

acc = accuracy_score(labels, preds)

‚Äì measures the overall proportion of correct predictions made by the model

return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

‚Äì returns the computed metrics in a dictionary format for reporting and monitoring during training

Optional: class weights for imbalanced datasets

‚Äì introduces a section that handles uneven class distributions by adjusting their relative training importance

Compute weights inversely proportional to class frequencies

‚Äì derives weight values where less frequent classes receive higher importance in the loss function

pos = (y_train == 1).sum()

‚Äì counts how many samples belong to the positive class in the training data

neg = (y_train == 0).sum()

‚Äì counts how many samples belong to the negative class in the training data

w_pos = neg / max(pos, 1) # weight for positive class

‚Äì assigns a weight to the positive class that is inversely proportional to its frequency to counter class imbalance

w_neg = 1.0 # keep negative as baseline

‚Äì keeps the negative class weight as the standard reference (baseline weight of 1.0)

class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float).to(device)

‚Äì converts both class weights into a PyTorch tensor and transfers them to the active computing device (CPU or GPU)

#print(f"Class weights (neg, pos): {class_weights.tolist()}")
‚Äì outputs the computed class weights for verification and transparency

Custom Trainer that injects weighted loss

‚Äì defines a subclass of the Hugging Face Trainer that incorporates class-weighted loss during backpropagation

#from torch.nn import CrossEntropyLoss
‚Äì imports the cross-entropy loss function, which is standard for classification tasks

#class WeightedTrainer(Trainer):
‚Äì creates a custom training class that inherits properties and methods from the base Trainer class

#def compute_loss(self, model, inputs, return_outputs=False):
‚Äì overrides the default loss computation method to integrate the weighted loss function

#labels = inputs.get("labels")
‚Äì extracts the true labels from the batch input dictionary

#outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
‚Äì performs a forward pass through the model while excluding the labels from the input arguments

#logits = outputs.get("logits")
‚Äì retrieves the predicted logits from the model output

#loss_fct = CrossEntropyLoss(weight=class_weights)
‚Äì initializes a cross-entropy loss function that applies the predefined class weights

#loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
‚Äì computes the final weighted loss by comparing predicted logits and true labels across all samples

#return (loss, outputs) if return_outputs else loss
‚Äì returns both loss and model outputs (if requested), otherwise only the computed loss for training

In [None]:

# Metric function for the Trainer: computes Accuracy, Precision, Recall, F1
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 from Trainer predictions.

    Args:
        eval_pred: a `(logits, labels)` pair; `logits` has one score per class
            on its last axis and `labels` holds the integer class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    # eval_pred is a tuple of (logits, labels)
    logits, labels = eval_pred
    logits = np.asarray(logits)
    preds = np.argmax(logits, axis=-1)
    # average="binary" is only valid for two-class problems; scikit-learn raises
    # on multi-class targets. Derive the mode from the width of the
    # classification head, using the same rule as the baseline cell
    # (2 classes -> binary, otherwise weighted).
    average = "binary" if logits.shape[-1] == 2 else "weighted"
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Optional: class weights for imbalanced datasets
# Compute weights inversely proportional to class frequencies.
# The weight vector needs one entry per class, otherwise CrossEntropyLoss
# rejects it as soon as the model has more than two outputs. Class 0 stays the
# baseline (weight 1.0) and class k gets count(0) / count(k), which reproduces
# the original [1.0, neg / pos] exactly for two-class data.
# Assumes y_train holds non-negative integer class ids (as produced by the
# LabelEncoder in the loading cell).
_class_counts = np.bincount(y_train, minlength=2)
neg = _class_counts[0]
pos = _class_counts[1]
w_pos = neg / max(pos, 1)   # weight for positive class
w_neg = 1.0                 # keep negative as baseline
_per_class = neg / np.maximum(_class_counts, 1)   # clamp so empty classes never divide by zero
_per_class[0] = w_neg
class_weights = torch.tensor(_per_class, dtype=torch.float).to(device)
# "(neg, pos)" only describes the two-class case; label the general case by class id.
_weights_label = "neg, pos" if class_weights.numel() == 2 else "per class id"
print(f"Class weights ({_weights_label}): {class_weights.tolist()}" )

# Custom Trainer that injects weighted loss
from torch.nn import CrossEntropyLoss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained; called without the labels.
            inputs: batch dict; "labels" is taken out, the rest is forwarded.
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted (and ignored) because newer Trainer
                releases pass it to `compute_loss`; without the parameter the
                override fails with a TypeError on those releases.
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits, wherever the
        # Trainer placed the model.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Flatten using the logits' own last dimension so this also works when
        # `model` is a wrapper that does not expose `.config`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


Class weights (neg, pos): [1.0, 1.3836109638214111]


## 5) Fine-tuning (Three Experiments)



* `# --- 5) Fine-tuning (Three Experiments) [version-compatible] ---`
  ‚Äì runs three fine-tuning trials with settings that work across different Transformers versions.

* `# 1) Metrics: binary vs multiclass handled automatically`
  ‚Äì chooses the proper metric averaging based on whether the task is binary or multi-class.

* `# 2) Class weights for imbalanced data (size == num_labels)`
  ‚Äì builds a weight vector per class to address label imbalance.

* `# Heuristic: inverse-frequency scaled to max=1.0 (safe for CE)`
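  – uses inverse class frequency: each weight is `counts.max() / count`, so the most frequent class gets 1.0 and rarer classes get proportionally larger weights (the quoted code comment says "max=1.0", but 1.0 is actually the smallest weight).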

* `# 3) Helper: tokenizer already defined above. Re-tokenize per max_length`
  ‚Äì re-encodes text using the existing tokenizer, honoring the given maximum sequence length.

* `# 4) Version-compatible TrainingArguments factory`
  ‚Äì creates TrainingArguments that adapt to both newer and older library versions.

* `# Try modern signature first`
  ‚Äì attempts to instantiate with contemporary argument names and options.

* `# Fallback for older transformers (no evaluation_strategy/save_strategy)`
  – switches to legacy parameters when the newer ones aren't supported.

* `# do_eval=True  # legacy way to enable evaluation`
  ‚Äì turns on evaluation using the older configuration style.

* `# save_steps=500  # periodic saving`
  ‚Äì saves checkpoints at fixed step intervals.

* `# Re-tokenize for this max_length`
  ‚Äì encodes the train/validation texts again for the chosen sequence length.

* `# Load backbone with correct num_labels`
  ‚Äì initializes the model with the appropriate number of output classes.

* `# --- Define backbones (already set earlier) ---`
  ‚Äì lists the model names used in the experiments.

* `# Exp-A: ClinicalBERT, conservative LR, small batch`
  ‚Äì first run: ClinicalBERT with a lower learning rate and batch size 16.

* `# Exp-B: ClinicalBERT, slightly higher LR, more epochs`
  ‚Äì second run: ClinicalBERT with a higher learning rate and an extra training epoch.

* `# Exp-C: DistilBERT fast baseline`
  ‚Äì third run: DistilBERT configured for a quicker baseline comparison.

* `# Leaderboard`
  ‚Äì prints a summary table ranking experiments by F1-score (with accuracy shown as well).


In [None]:
# --- 5) Fine-tuning (Three Experiments) [version-compatible] ---
import numpy as np
import torch
from collections import OrderedDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.nn import CrossEntropyLoss

# Resolve the compute device for this cell
_gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if _gpu_available else "cpu")

# 1) Metrics: binary vs multiclass handled automatically
# The number of distinct training labels decides both the head size and the
# averaging mode used by compute_metrics.
num_labels = len(np.unique(y_train))
if num_labels == 2:
    avg_type = "binary"
else:
    avg_type = "weighted"
print(f"[Fine-tune] Detected {num_labels} classes ‚Üí metrics average='{avg_type}'")

def compute_metrics(eval_pred):
    """Score Trainer predictions with accuracy, precision, recall and F1.

    `eval_pred` unpacks into `(logits, labels)`. Precision/recall/F1 use the
    module-level `avg_type` averaging mode.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average=avg_type
    )
    accuracy = accuracy_score(gold, predicted)
    return {"accuracy": accuracy, "precision": prec, "recall": rec, "f1": f_score}

# 2) Class weights for imbalanced data (size == num_labels)
counts = np.bincount(y_train, minlength=num_labels)
# Heuristic: inverse frequency relative to the most frequent class, i.e.
# weight_k = counts.max() / counts_k. The largest class gets 1.0 and rarer
# classes get weights above 1.0; empty classes are clamped to a count of 1
# so the division never hits zero.
weights = np.divide(counts.max(), np.maximum(counts, 1))
class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
print(f"[Fine-tune] Class weights: {class_weights.tolist()}")

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `class_weights`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained; called without the labels.
            inputs: batch dict; "labels" is taken out, the rest is forwarded.
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted (and ignored) because newer Trainer
                releases pass it to `compute_loss`; without the parameter the
                override fails with a TypeError on those releases.
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits, wherever the
        # Trainer placed the model.
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        # Flatten using the logits' own last dimension so this also works when
        # `model` is a wrapper that does not expose `.config`.
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 3) Helper: tokenizer already defined above. Re-tokenize per max_length
def tokenize_texts(texts, max_length=160):
    """Tokenize `texts` with the module-level `tokenizer`.

    Args:
        texts: iterable of strings; it is materialised into a list first.
        max_length: upper bound on the tokenized sequence length.

    Returns:
        The tokenizer's output with PyTorch tensors (`return_tensors="pt"`),
        padded to the longest sequence and truncated at `max_length`.
    """
    batch = list(texts)
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(batch, **encode_options)

# 4) Version-compatible TrainingArguments factory
import inspect

def make_training_args(name, batch_size, lr, epochs, weight_decay, warmup_ratio):
    """Build `TrainingArguments` for one experiment across transformers versions.

    The per-epoch evaluation switch has been spelled both `evaluation_strategy`
    and `eval_strategy` depending on the release. Passing only the first
    spelling makes releases that use the second one fall through to the legacy
    path, silently losing per-epoch evaluation, warmup and best-model
    selection. The constructor signature is inspected to pick whichever
    spelling is accepted.

    Args:
        name: experiment name; checkpoints go to `./runs/{name}`.
        batch_size: per-device batch size for both training and evaluation.
        lr: learning rate.
        epochs: number of training epochs.
        weight_decay: weight decay coefficient.
        warmup_ratio: warmup fraction (used on the modern path only).

    Returns:
        A `TrainingArguments` instance.
    """
    accepted = inspect.signature(TrainingArguments.__init__).parameters

    # Settings shared by the modern and the legacy configuration.
    base_kwargs = dict(
        output_dir=f"./runs/{name}",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        logging_steps=50,
        fp16=torch.cuda.is_available(),
    )

    # Prefer the newer spelling, fall back to the older one, else None.
    eval_key = next(
        (key for key in ("eval_strategy", "evaluation_strategy") if key in accepted),
        None,
    )

    if eval_key is not None:
        kwargs_modern = dict(
            base_kwargs,
            warmup_ratio=warmup_ratio,
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            report_to=[],
        )
        kwargs_modern[eval_key] = "epoch"
        try:
            # Try modern signature first
            return TrainingArguments(**kwargs_modern)
        except TypeError as err:
            # Some other keyword is unsupported; report it and use the legacy set.
            print(f"[Fine-tune] Modern TrainingArguments rejected: {err}")

    # Fallback for older transformers (no evaluation_strategy/save_strategy)
    print("[Fine-tune] Using legacy TrainingArguments fallback.")
    kwargs_legacy = dict(
        base_kwargs,
        do_eval=True,          # legacy way to enable evaluation
        save_steps=500,        # periodic saving
        overwrite_output_dir=True,
    )
    return TrainingArguments(**kwargs_legacy)

def run_experiment(name, backbone, batch_size=16, lr=2e-5, epochs=3,
                   weight_decay=0.01, warmup_ratio=0.1, max_length=160):
    """Fine-tune ``backbone`` on the training split and evaluate on validation.

    Reads the module-level ``X_train``, ``X_val``, ``y_train``, ``y_val``,
    ``num_labels``, ``device``, ``compute_metrics`` and ``WeightedTrainer``.

    Args:
        name: Run name, forwarded to ``make_training_args`` and used in the
            printed summary.
        backbone: Checkpoint identifier to load both the tokenizer and the
            sequence-classification model from.
        batch_size: Per-device batch size.
        lr: Learning rate.
        epochs: Number of training epochs.
        weight_decay: Weight-decay coefficient.
        warmup_ratio: Warmup ratio.
        max_length: Truncation length used when tokenizing.

    Returns:
        Tuple ``(metrics, trainer)``: the dict returned by
        ``trainer.evaluate()`` and the trainer itself.
    """
    # Token ids are only meaningful for the checkpoint whose vocabulary
    # produced them, so tokenize with the backbone's own tokenizer rather
    # than a tokenizer shared across different backbones.
    backbone_tokenizer = AutoTokenizer.from_pretrained(backbone)

    def encode(texts):
        # Same encoding options for train and validation splits.
        return backbone_tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

    # Re-tokenize for this backbone and max_length
    tr = encode(X_train)
    va = encode(X_val)

    train_ds_local = Dataset.from_dict({
        "input_ids": tr["input_ids"],
        "attention_mask": tr["attention_mask"],
        "labels": torch.tensor(y_train, dtype=torch.long)
    })
    val_ds_local = Dataset.from_dict({
        "input_ids": va["input_ids"],
        "attention_mask": va["attention_mask"],
        "labels": torch.tensor(y_val, dtype=torch.long)
    })

    # Load backbone with correct num_labels
    model = AutoModelForSequenceClassification.from_pretrained(
        backbone, num_labels=num_labels
    ).to(device)

    args = make_training_args(
        name=name, batch_size=batch_size, lr=lr, epochs=epochs,
        weight_decay=weight_decay, warmup_ratio=warmup_ratio
    )

    trainer = WeightedTrainer(
        model=model,
        args=args,
        train_dataset=train_ds_local,
        eval_dataset=val_ds_local,
        compute_metrics=compute_metrics,
        # Attach the matching tokenizer so it travels with this trainer.
        tokenizer=backbone_tokenizer
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(f"\n>>> {name} results: {metrics}\n")
    return metrics, trainer

# --- Define backbones (already set earlier) ---
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
DISTIL_BERT   = "distilbert-base-uncased"

results = OrderedDict()

# One entry per experiment: (run name, keyword arguments for run_experiment).
# The run name doubles as the key under which the outcome is stored.
_EXPERIMENT_SPECS = [
    # Exp-A: ClinicalBERT, conservative LR, small batch
    ("expA_clinicalbert_bs16_lr2e-5_ep3", dict(
        backbone=CLINICAL_BERT,
        batch_size=16, lr=2e-5, epochs=3,
        weight_decay=0.01, warmup_ratio=0.1, max_length=160,
    )),
    # Exp-B: ClinicalBERT, slightly higher LR, more epochs
    ("expB_clinicalbert_bs16_lr5e-5_ep4", dict(
        backbone=CLINICAL_BERT,
        batch_size=16, lr=5e-5, epochs=4,
        weight_decay=0.01, warmup_ratio=0.06, max_length=160,
    )),
    # Exp-C: DistilBERT fast baseline
    ("expC_distilbert_bs32_lr3e-5_ep3", dict(
        backbone=DISTIL_BERT,
        batch_size=32, lr=3e-5, epochs=3,
        weight_decay=0.01, warmup_ratio=0.1, max_length=128,
    )),
]

# Run the experiments in the order listed; each result is (metrics, trainer).
for _run_name, _run_kwargs in _EXPERIMENT_SPECS:
    results[_run_name] = run_experiment(name=_run_name, **_run_kwargs)

# Leaderboard
# One row per experiment: (run name, F1, accuracy). A metric that was not
# reported is shown as NaN.
board = [
    (k, m.get('eval_f1', float('nan')), m.get('eval_accuracy', float('nan')))
    for k, (m, _t) in results.items()
]
# NaN compares False against every number, which makes a plain sort on F1
# unreliable; map NaN (the only value not equal to itself) to -inf so runs
# without an F1 score are ranked last.
board = sorted(
    board,
    key=lambda row: row[1] if row[1] == row[1] else float('-inf'),
    reverse=True,
)
print("\nLeaderboard (by F1):")
for name, f1, acc in board:
    print(f"{name:35s}  F1={f1:.4f}  Acc={acc:.4f}")


NameError: name 'y_train' is not defined

## 6) Eval (Pick Best and Run Inference)

* `# Select the best run from 'results' dict above`
  Introduces the section that will pick the highest-scoring experiment.

* `best_name, best_f1 = None, -1.0`
  Initializes the current “best” run name to `None` and its F1 to -1.0, a value below any achievable F1 score.

* `best_trainer = None`
  Placeholder for the Trainer object of the best run.

* `for name,(metrics, trainer) in results.items():`
  Loops through each experiment entry, unpacking its metrics and Trainer.

* `    if metrics['eval_f1'] > best_f1:`
  Checks if this experiment’s F1 beats the current best.

* `        best_f1 = metrics['eval_f1']`
  Updates the best F1 score.

* `        best_name = name`
  Records the winning experiment’s name.

* `        best_trainer = trainer`
  Stores the Trainer tied to the winning run.

* `print(f"Best run: {best_name} with F1={best_f1:.4f}")`
  Prints which run won and its F1 rounded to four decimals.

* `# Save the best model for reuse`
  Marks the section that persists the best model and tokenizer.

* `save_dir = f"./best_model_{best_name}"`
  Builds a folder path named after the best run.

* `best_trainer.save_model(save_dir)`
  Saves model weights and config to that folder.

* `tokenizer.save_pretrained(save_dir)`
  Saves the tokenizer files to the same folder.

* `# Simple inference helper`
  Introduces a convenience function for making predictions later.

* `def predict(texts, model_dir=save_dir):`
  Starts a function that takes raw texts and an optional model path.

* `    tok = AutoTokenizer.from_pretrained(model_dir)`
  Loads the tokenizer from the saved folder.

* `    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)`
  Loads the saved classifier and moves it to CPU/GPU.

* `    enc = tok(list(texts), padding=True, truncation=True, max_length=160, return_tensors="pt").to(device)`
  Tokenizes the input texts, pads and truncates to length 160, returns PyTorch tensors, and moves them to the device.

* `    with torch.no_grad():`
  Disables gradient tracking for faster, memory-light inference.

* `        logits = mdl(**enc).logits`
  Runs the model forward pass and grabs raw class scores.

* `    pred = torch.argmax(logits, dim=-1).cpu().numpy()`
  Converts logits to predicted class IDs and moves them to NumPy.

* `    prob = torch.softmax(logits, dim=-1).cpu().numpy()[:,1]`
  Turns logits into probabilities and selects the column for class 1.

* `    return pred, prob`
  Returns predicted labels and their positive-class probabilities.

* `# Demo predictions on a few samples`
  Starts a small test to show the function in action.

* `samples = [`
  Opens a list of example texts.

* `    "I feel calm and in control today.",`
  Sample 1: likely not stressed.

* `    "My chest is tight and I cannot focus, I think I am very stressed.",`
  Sample 2: likely stressed.

* `    "Workload is heavy but manageable so far."`
  Sample 3: borderline but manageable tone.

* `]`
  Closes the list of samples.

* `pred, prob = predict(samples)`
  Runs inference on the samples, returning labels and probabilities.

* `for s, y, p in zip(samples, pred, prob):`
  Iterates over each sample with its predicted label and probability.

* `    lab = "stressed(1)" if y==1 else "not-stressed(0)"`
  Converts numeric label to a readable string.

* `    print(f"[{lab}  p={p:.3f}]  {s}")`
  Prints the label, probability (to three decimals), and the original text.


In [None]:

# Select the best run from 'results' dict above
best_name, best_f1 = None, -1.0
best_trainer = None
for name, (metrics, trainer) in results.items():
    # A run that reported no F1 yields NaN here; NaN > x is always False,
    # so such a run is skipped instead of raising KeyError.
    run_f1 = metrics.get('eval_f1', float('nan'))
    if run_f1 > best_f1:
        best_f1 = run_f1
        best_name = name
        best_trainer = trainer

# Fail with a clear message instead of an AttributeError on None below.
if best_trainer is None:
    raise RuntimeError("No experiment in 'results' reported an 'eval_f1' score; nothing to save.")

print(f"Best run: {best_name} with F1={best_f1:.4f}")

# Save the best model for reuse
save_dir = f"./best_model_{best_name}"
best_trainer.save_model(save_dir)

# Save the tokenizer attached to the winning trainer so the saved model and
# tokenizer always belong together. Newer transformers expose it as
# `processing_class`, older ones as `tokenizer`; fall back to the
# module-level tokenizer if the trainer carries none.
best_tokenizer = getattr(best_trainer, "processing_class", None)
if best_tokenizer is None:
    best_tokenizer = getattr(best_trainer, "tokenizer", None)
if best_tokenizer is None:
    best_tokenizer = tokenizer
best_tokenizer.save_pretrained(save_dir)

# Simple inference helper
def predict(texts, model_dir=save_dir):
    """Classify raw texts with the model and tokenizer saved in ``model_dir``.

    The tokenizer and model are loaded from ``model_dir`` on every call.

    Args:
        texts: A single string or an iterable of strings.
        model_dir: Directory to load the tokenizer and model from; defaults
            to the value ``save_dir`` had when this function was defined.

    Returns:
        Tuple ``(pred, prob)`` of NumPy arrays: the arg-max class id per text
        and the softmax probability of class index 1 per text.
    """
    # list() on a bare string would split it into single characters, so a
    # lone string is treated as a one-item batch.
    batch = [texts] if isinstance(texts, str) else list(texts)

    tok = AutoTokenizer.from_pretrained(model_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
    enc = tok(batch, padding=True, truncation=True, max_length=160, return_tensors="pt").to(device)
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = mdl(**enc).logits
    pred = torch.argmax(logits, dim=-1).cpu().numpy()
    # Column 1 of the softmax output is the probability of class index 1.
    prob = torch.softmax(logits, dim=-1).cpu().numpy()[:,1]
    return pred, prob

# Demo predictions on a few samples
samples = [
    "I feel calm and in control today.",
    "My chest is tight and I cannot focus, I think I am very stressed.",
    "Workload is heavy but manageable so far."
]
# Run the saved best model on the samples and print one line per text.
pred, prob = predict(samples)
for s, y, p in zip(samples, pred, prob):
    # Plain ASCII hyphen in the label: the previous literal contained
    # mis-encoded bytes that printed as garbage characters.
    lab = "stressed(1)" if y==1 else "not-stressed(0)"
    print(f"[{lab}  p={p:.3f}]  {s}")
