# 0) Setup (libraries and reproducibility)

--- Import and Environment Setup ---

import os

- manages system paths, folders, and environment variables to handle files and directories efficiently during the execution of the notebook.

import math

- includes mathematical tools and formulas that can assist in calculations such as learning rate adjustments or numeric transformations during model training.

import random

- controls and initializes random number generation, ensuring that every run of the model produces consistent outcomes for reproducibility.

import numpy as np

- provides extensive support for numerical data handling, offering fast and flexible operations on arrays and matrices used throughout the data preparation process.

import pandas as pd

- allows for structured data loading and manipulation, making it easier to explore, clean, and organize datasets, especially when working with CSV files.

from pathlib import Path

- gives a cleaner and more reliable way to manage file and directory paths across different operating systems.

--- Core Framework Imports ---

import torch

- provides the base framework for tensor manipulation and GPU acceleration, enabling efficient computation for training and evaluating deep learning models.

from datasets import Dataset

- transforms pandas DataFrames into optimized dataset objects that integrate smoothly with the Hugging Face Transformers library for preprocessing and training.

from transformers import (
AutoTokenizer,

- automatically selects and loads the appropriate tokenizer for a specific pre-trained model to ensure consistent tokenization.
AutoModelForSequenceClassification,

- initializes a pre-trained Transformer model with an added classification head, suitable for tasks like sentiment analysis or text categorization.
TrainingArguments,

- specifies and stores key hyperparameters such as the number of epochs, batch size, and evaluation frequency for the model training process.
Trainer

- streamlines the entire fine-tuning procedure, managing training, evaluation, logging, and checkpoint saving without requiring manual loop implementation.
)

--- Evaluation Metric Imports ---

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

- brings in performance evaluation tools that calculate key metrics such as accuracy, precision, recall, and F1-score to assess the model‚Äôs prediction quality.

--- Reproducibility Configuration ---

SEED = 42

- defines a fixed seed number to guarantee that all random processes across libraries yield consistent results.
random.seed(SEED)

- ensures that Python‚Äôs random number operations remain stable and predictable in every run.
np.random.seed(SEED)

- controls NumPy‚Äôs internal random processes to maintain the same shuffling or sampling patterns across executions.
torch.manual_seed(SEED)

- fixes PyTorch‚Äôs randomization for consistent model weight initialization and data handling.
torch.cuda.manual_seed_all(SEED)

- applies the same reproducibility rule across all available GPUs to maintain uniform outcomes even in multi-GPU training setups.

--- Device Detection ---

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

- determines whether a GPU is available for acceleration and defaults to CPU if not, ensuring compatibility in any environment.
print(f"Using device: {device}")

- prints out the current hardware in use to confirm that GPU acceleration is properly detected and active.

In [None]:

# Every import has an explanatory comment.
import os                         # file paths and environment checks
import math                       # math helpers (may be useful for schedules)
import random                     # Python's RNG for reproducibility
import numpy as np                # numerical arrays and metrics support
import pandas as pd               # data loading and manipulation
from pathlib import Path          # convenient and robust path handling

# Hugging Face / PyTorch stack (for transformer fine‚Äëtuning)
import torch                      # tensor and GPU utilities
from datasets import Dataset      # lightweight dataset wrapper around pandas
from transformers import (       # core HF components for tokenization and training
    AutoTokenizer,               # auto‚Äëloads the right tokenizer for a given model checkpoint
    AutoModelForSequenceClassification,  # classification head on top of a transformer
    TrainingArguments,           # training hyperparameters container
    Trainer                      # training loop helper (handles eval and logging)
)

# Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Make runs reproducible (seed Python, NumPy, and PyTorch)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Detect device once and print for visibility
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # shows 'cuda' when a GPU is available in Colab


Using device: cpu


## 1) Load Dataset

# --- Load Dataset (Upload version, auto-encodes text labels) ---

#import pandas as pd

- imports the pandas library, which is essential for reading, organizing, and analyzing CSV data within Python.

#from pathlib import Path

- provides a structured and cross-platform way to handle file paths, making directory navigation and file references more reliable.

#from google.colab import files

- activates Google Colab‚Äôs file upload feature, allowing users to upload local datasets directly into the runtime environment.

#print("üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)")

- displays a clear message prompting the user to upload a dataset file in CSV format for processing.

#uploaded = files.upload()

 -opens a file selection dialog so that the user can choose and upload the desired dataset from their computer.

#filename = list(uploaded.keys())[0]

- extracts the name of the uploaded file from the dictionary of uploaded files.

#csv_path = Path(f"/content/{filename}")

- constructs a full, system-compatible file path pointing to the uploaded dataset within the Colab working directory.

#print(f"‚úÖ File uploaded successfully: {csv_path}")

- provides feedback confirming that the file upload was successful and shows where the file was saved.

#df = pd.read_csv(csv_path)

- loads the uploaded CSV file into a pandas DataFrame, preparing it for inspection and processing.

# --- Validate columns ---

#expected_cols = {'statement', 'status'}

- defines the columns that must exist in the dataset to ensure it matches the expected structure for further steps.

#assert expected_cols.issubset(df.columns), f"‚ùå Missing required columns: {expected_cols - set(df.columns)}"

- verifies that all required columns are present in the dataset; if not, the code stops and reports which ones are missing.

# --- Clean ---

#df = df.dropna(subset=['statement', 'status']).copy()

- deletes any rows containing missing values in the ‚Äòstatement‚Äô or ‚Äòstatus‚Äô columns to maintain data consistency.

#df['statement'] = df['statement'].astype(str)

- converts all entries in the ‚Äòstatement‚Äô column into string type to prevent formatting or type errors later in processing.

# --- Encode text labels into integers ---

#from sklearn.preprocessing import LabelEncoder

- brings in a class from scikit-learn that converts categorical text labels into numerical form for model compatibility.

#le = LabelEncoder()

- initializes the LabelEncoder, preparing it to map text categories into numeric codes.

#df['status_encoded'] = le.fit_transform(df['status'])

- fits the encoder to the ‚Äòstatus‚Äô column and generates a new column containing the corresponding numeric label values.

#print("üî§ Label encoding map:")

- prints a section heading to indicate that the label-to-code mapping will be shown next.

#for label, code in zip(le.classes_, range(len(le.classes_))):

- loops through each label and its encoded numeric representation to display the mapping relationship.
  print(f"  {code} ‚Üí {label}")  - prints each numeric code and its associated label for verification.

#df['status'] = df['status_encoded']

- replaces the original ‚Äòstatus‚Äô column‚Äôs text labels with their corresponding numeric values.

#df.drop(columns=['status_encoded'], inplace=True)

- deletes the temporary ‚Äòstatus_encoded‚Äô column since the main ‚Äòstatus‚Äô column now contains the encoded values.

#print("\n‚úÖ Dataset loaded and label-encoded successfully!")

- outputs a confirmation message indicating that the dataset has been fully cleaned and encoded without errors.

#print(df['status'].value_counts(dropna=False))

- displays a frequency count of each encoded label, helping verify that the encoding process was applied correctly.

#df.head(3)

- shows the first three rows of the cleaned and processed dataset to confirm that all transformations were applied successfully.


In [None]:
# --- Load Dataset (Upload version, auto-encodes text labels) ---
import pandas as pd
from pathlib import Path
from google.colab import files

print("üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)")
uploaded = files.upload()

# Automatically pick the first uploaded file
filename = list(uploaded.keys())[0]
csv_path = Path(f"/content/{filename}")

print(f"‚úÖ File uploaded successfully: {csv_path}")

# Load the CSV
df = pd.read_csv(csv_path)

# --- Validate columns ---
expected_cols = {'statement', 'status'}
assert expected_cols.issubset(df.columns), f"‚ùå Missing required columns: {expected_cols - set(df.columns)}"

# --- Clean ---
df = df.dropna(subset=['statement', 'status']).copy()
df['statement'] = df['statement'].astype(str)

# --- Encode text labels into integers ---
# This maps each unique label (like 'Anxiety', 'Stress', etc.) to a numeric ID
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['status_encoded'] = le.fit_transform(df['status'])

# Optional: print mapping for your reference
print("üî§ Label encoding map:")
for label, code in zip(le.classes_, range(len(le.classes_))):
    print(f"  {code} ‚Üí {label}")

# Replace 'status' with the encoded version
df['status'] = df['status_encoded']
df.drop(columns=['status_encoded'], inplace=True)

print("\n‚úÖ Dataset loaded and label-encoded successfully!")
print(df['status'].value_counts(dropna=False))
df.head(3)


üìÇ Please upload your dataset CSV (e.g., Combined Data.csv)


Saving Combined Data.csv to Combined Data.csv
‚úÖ File uploaded successfully: /content/Combined Data.csv
üî§ Label encoding map:
  0 ‚Üí Anxiety
  1 ‚Üí Bipolar
  2 ‚Üí Depression
  3 ‚Üí Normal
  4 ‚Üí Personality disorder
  5 ‚Üí Stress
  6 ‚Üí Suicidal

‚úÖ Dataset loaded and label-encoded successfully!
status
3    16343
2    15404
6    10652
0     3841
1     2777
5     2587
4     1077
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,0
1,1,"trouble sleeping, confused mind, restless hear...",0
2,2,"All wrong, back off dear, forward doubt. Stay ...",0


## 2) Baseline Models (TF‚ÄëIDF + Linear)

# --- Baseline Models (TF-IDF + Linear, supports multi-class) ---

#from sklearn.model_selection import train_test_split
‚Äì divides the dataset into separate subsets for training and validation purposes

#from sklearn.feature_extraction.text import TfidfVectorizer
‚Äì transforms raw text into numerical representations using the TF-IDF method

#from sklearn.linear_model import LogisticRegression
‚Äì loads the logistic regression algorithm used for text classification

#from sklearn.svm import LinearSVC
‚Äì loads the linear support vector machine classifier for categorizing text

#from sklearn.metrics import accuracy_score, precision_recall_fscore_support
‚Äì provides built-in functions to measure model performance using common evaluation metrics

#import numpy as np
‚Äì supports efficient numerical calculations and operations on arrays

#X_train, X_val, y_train, y_val = train_test_split(

df['statement'].values,
df['status'].values,
test_size=0.2,
random_state=42,
stratify=df['status'].values

#)
‚Äì separates the dataset into 80% training and 20% validation samples while maintaining balanced class distribution

#tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=40000)
‚Äì builds a TF-IDF model that captures single words and two-word phrases, ignoring rare terms and limiting total features to 40,000

#Xtr = tfidf.fit_transform(X_train)
‚Äì learns vocabulary patterns from the training set and converts text into TF-IDF feature vectors

#Xva = tfidf.transform(X_val)
‚Äì applies the trained TF-IDF transformation to the validation set without retraining

#num_classes = len(np.unique(y_train))
‚Äì determines how many distinct categories or labels exist in the dataset

#avg_type = "binary" if num_classes == 2 else "weighted"
‚Äì automatically chooses whether to use binary or weighted averaging based on the number of classes

#print(f"Detected {num_classes} classes ‚Üí using average='{avg_type}' for metrics.\n")
‚Äì outputs the number of identified classes and indicates which averaging method will be applied for evaluation

--- Baseline 1: Logistic Regression ---

#logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
‚Äì creates a logistic regression model configured to balance uneven class frequencies and allow more training iterations

#logreg.fit(Xtr, y_train)
‚Äì trains the logistic regression classifier using the prepared TF-IDF features and corresponding labels

#pred_lr = logreg.predict(Xva)
‚Äì produces predictions on unseen validation data using the trained logistic regression model

#p, r, f, _ = precision_recall_fscore_support(y_val, pred_lr, average=avg_type)
‚Äì calculates the precision, recall, and F1-score metrics according to the averaging method chosen

#acc = accuracy_score(y_val, pred_lr)
‚Äì evaluates how often the logistic regression model predicted the correct label

#print(f"[Baseline-LR] Acc={acc:.3f} P={p:.3f} R={r:.3f} F1={f:.3f}")
‚Äì prints the accuracy, precision, recall, and F1-score results for the logistic regression model

--- Baseline 2: Linear SVM ---

#svm = LinearSVC(class_weight="balanced")
‚Äì initializes a linear SVM model that compensates for class imbalance during training

#svm.fit(Xtr, y_train)
‚Äì fits the SVM classifier using the TF-IDF features from the training data

#pred_svm = svm.predict(Xva)
‚Äì predicts the validation set labels using the trained SVM model

#p, r, f, _ = precision_recall_fscore_support(y_val, pred_svm, average=avg_type)
‚Äì computes precision, recall, and F1-score for the SVM‚Äôs predictions based on the selected averaging mode

#acc = accuracy_score(y_val, pred_svm)
‚Äì determines the SVM model‚Äôs accuracy across all validation examples

#print(f"[Baseline-SVM] Acc={acc:.3f} P={p:.3f} R={r:.3f} F1={f:.3f}")
‚Äì displays the accuracy, precision, recall, and F1-score achieved by the SVM baseline model


In [None]:
# --- Baseline Models (TF-IDF + Linear, supports multi-class) ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    df['statement'].values,
    df['status'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['status'].values
)

# Convert raw text into TF-IDF features
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=40000)
Xtr = tfidf.fit_transform(X_train)
Xva = tfidf.transform(X_val)

# Detect if this is binary or multiclass
num_classes = len(np.unique(y_train))
avg_type = "binary" if num_classes == 2 else "weighted"
print(f"Detected {num_classes} classes ‚Üí using average='{avg_type}' for metrics.\n")

# --- Baseline 1: Logistic Regression ---
logreg = LogisticRegression(max_iter=2000, class_weight="balanced")
logreg.fit(Xtr, y_train)
pred_lr = logreg.predict(Xva)
p, r, f, _ = precision_recall_fscore_support(y_val, pred_lr, average=avg_type)
acc = accuracy_score(y_val, pred_lr)
print(f"[Baseline-LR] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")

# --- Baseline 2: Linear SVM ---
svm = LinearSVC(class_weight="balanced")
svm.fit(Xtr, y_train)
pred_svm = svm.predict(Xva)
p, r, f, _ = precision_recall_fscore_support(y_val, pred_svm, average=avg_type)
acc = accuracy_score(y_val, pred_svm)
print(f"[Baseline-SVM] Acc={acc:.3f}  P={p:.3f}  R={r:.3f}  F1={f:.3f}")


Detected 7 classes ‚Üí using average='weighted' for metrics.

[Baseline-LR] Acc=0.778  P=0.787  R=0.778  F1=0.777
[Baseline-SVM] Acc=0.782  P=0.779  R=0.782  F1=0.780


## 3) Pre‚ÄëTrained Models (Tokenization and Dataset Prep)

# --- Transformer Backbone and Tokenization Setup ---

#CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
‚Äì specifies the pretrained ClinicalBERT model, which is optimized for understanding clinical and medical language

#DISTIL_BERT = "distilbert-base-uncased"
‚Äì specifies the lightweight DistilBERT model designed for faster and more efficient fine-tuning compared to larger transformer models

#BACKBONE = CLINICAL_BERT
‚Äì assigns ClinicalBERT as the main transformer model to be used for this experiment

#tokenizer = AutoTokenizer.from_pretrained(BACKBONE)
‚Äì loads the tokenizer associated with the selected transformer model to ensure text encoding consistency

#def tokenize_texts(texts, max_length=128):
‚Äì defines a reusable function that converts a collection of raw text samples into tokenized sequences suitable for the model
    #return tokenizer(
      list(texts),‚ÄÉ‚ÄÉ‚ÄÉ‚ÄÉ‚ÄÉ‚ÄÉ‚ÄÉ‚Äì transforms the input texts into a list format
      padding=True,‚ÄÉ‚ÄÉ‚ÄÉ‚ÄÉ‚ÄÉ‚Äì automatically pads all sequences to the same length
      truncation=True,‚ÄÉ‚ÄÉ‚ÄÉ‚Äì shortens sequences that exceed the specified maximum length
      max_length=max_length,‚ÄÉ‚Äì defines the limit for each tokenized text sequence
      return_tensors="pt"‚ÄÉ‚ÄÉ‚Äì outputs data as PyTorch-compatible tensors
    )
‚Äì applies the tokenizer configuration to the texts and produces ready-to-use numerical tensors

#train_enc = tokenize_texts(X_train)
‚Äì processes and encodes all training sentences into model-readable token IDs and attention masks

#val_enc = tokenize_texts(X_val)
‚Äì applies the same tokenization steps to the validation set to maintain consistency with the training data

#train_ds = Dataset.from_dict({

"input_ids": train_enc["input_ids"],
"attention_mask": train_enc["attention_mask"],
"labels": torch.tensor(y_train)

#})
‚Äì builds a structured Hugging Face dataset for the training portion, including encoded inputs and their respective labels

#val_ds = Dataset.from_dict({

"input_ids": val_enc["input_ids"],
"attention_mask": val_enc["attention_mask"],
"labels": torch.tensor(y_val)

#})
‚Äì constructs a matching dataset object for the validation data with identical field structure

#len(train_ds), len(val_ds)
‚Äì verifies and displays how many records are contained within the training and validation datasets

In [None]:

# Choose your checkpoints.
# We include ClinicalBERT (for clinical text) and DistilBERT (fast baseline).
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
DISTIL_BERT   = "distilbert-base-uncased"

# Pick one as the default backbone for experiments below.
BACKBONE = CLINICAL_BERT

# Initialize tokenizer for the chosen backbone
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)

# Helper to tokenize a pandas series with per-line comments
def tokenize_texts(texts, max_length=128):
    # Apply the tokenizer: returns dict with input_ids and attention_mask
    return tokenizer(
        list(texts),                 # a Python list of strings
        padding=True,                # pad to the longest in the batch
        truncation=True,             # cut off text exceeding max_length
        max_length=max_length,       # cap sequence length
        return_tensors="pt"          # return PyTorch tensors
    )

# Tokenize train/validation splits
train_enc = tokenize_texts(X_train)
val_enc   = tokenize_texts(X_val)

# Wrap into HF Datasets with labels
train_ds = Dataset.from_dict({
    "input_ids": train_enc["input_ids"],
    "attention_mask": train_enc["attention_mask"],
    "labels": torch.tensor(y_train)
})
val_ds = Dataset.from_dict({
    "input_ids": val_enc["input_ids"],
    "attention_mask": val_enc["attention_mask"],
    "labels": torch.tensor(y_val)
})

len(train_ds), len(val_ds)


NameError: name 'X_train' is not defined

## 4) Training of Data (Trainer utilities and metrics)

Metric function for the Trainer: computes Accuracy, Precision, Recall, F1

‚Äì defines a function used by the Trainer to evaluate model performance through key metrics such as accuracy, precision, recall, and F1-score

eval_pred is a tuple of (logits, labels)

‚Äì indicates that the function receives two components: the model‚Äôs raw predictions (logits) and the actual ground-truth labels (labels)

logits, labels = eval_pred

‚Äì unpacks the tuple into separate variables representing predicted outputs and true labels

preds = np.argmax(logits, axis=-1)

‚Äì selects the class with the highest predicted probability for each input sample

precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

‚Äì calculates precision, recall, and F1-score across all predictions using a binary averaging scheme

acc = accuracy_score(labels, preds)

‚Äì measures the overall proportion of correct predictions made by the model

return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

‚Äì returns the computed metrics in a dictionary format for reporting and monitoring during training

Optional: class weights for imbalanced datasets

‚Äì introduces a section that handles uneven class distributions by adjusting their relative training importance

Compute weights inversely proportional to class frequencies

‚Äì derives weight values where less frequent classes receive higher importance in the loss function

pos = (y_train == 1).sum()

‚Äì counts how many samples belong to the positive class in the training data

neg = (y_train == 0).sum()

‚Äì counts how many samples belong to the negative class in the training data

w_pos = neg / max(pos, 1) # weight for positive class

‚Äì assigns a weight to the positive class that is inversely proportional to its frequency to counter class imbalance

w_neg = 1.0 # keep negative as baseline

‚Äì keeps the negative class weight as the standard reference (baseline weight of 1.0)

class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float).to(device)

‚Äì converts both class weights into a PyTorch tensor and transfers them to the active computing device (CPU or GPU)

#print(f"Class weights (neg, pos): {class_weights.tolist()}")
‚Äì outputs the computed class weights for verification and transparency

Custom Trainer that injects weighted loss

‚Äì defines a subclass of the Hugging Face Trainer that incorporates class-weighted loss during backpropagation

#from torch.nn import CrossEntropyLoss
‚Äì imports the cross-entropy loss function, which is standard for classification tasks

#class WeightedTrainer(Trainer):
‚Äì creates a custom training class that inherits properties and methods from the base Trainer class

#def compute_loss(self, model, inputs, return_outputs=False):
‚Äì overrides the default loss computation method to integrate the weighted loss function

#labels = inputs.get("labels")
‚Äì extracts the true labels from the batch input dictionary

#outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
‚Äì performs a forward pass through the model while excluding the labels from the input arguments

#logits = outputs.get("logits")
‚Äì retrieves the predicted logits from the model output

#loss_fct = CrossEntropyLoss(weight=class_weights)
‚Äì initializes a cross-entropy loss function that applies the predefined class weights

#loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
‚Äì computes the final weighted loss by comparing predicted logits and true labels across all samples

#return (loss, outputs) if return_outputs else loss
‚Äì returns both loss and model outputs (if requested), otherwise only the computed loss for training

In [None]:
from sklearn.model_selection import train_test_split

# Assuming your cleaned & encoded dataframe is called df
# with columns: 'statement' (text) and 'status' (numeric label)
X = df['statement']
y = df['status']

# Split into 80% train, 20% validation (you can adjust ratio)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("‚úÖ Data split complete:")
print(f"Train size: {len(X_train)} | Validation size: {len(X_val)}")

‚úÖ Data split complete:
Train size: 42144 | Validation size: 10537


In [None]:

# Metric function for the Trainer: computes Accuracy, Precision, Recall, F1
def compute_metrics(eval_pred):
    # eval_pred is a tuple of (logits, labels)
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Optional: class weights for imbalanced datasets
# Compute weights inversely proportional to class frequencies
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
w_pos = neg / max(pos, 1)   # weight for positive class
w_neg = 1.0                 # keep negative as baseline
class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float).to(device)
print(f"Class weights (neg, pos): {class_weights.tolist()}" )

# Custom Trainer that injects weighted loss
from torch.nn import CrossEntropyLoss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


Class weights (neg, pos): [1.0, 1.3836109638214111]


## 5) Fine‚Äëtuning (Three Experiments)



* `# --- 5) Fine-tuning (Three Experiments) [version-compatible] ---`
  ‚Äì runs three fine-tuning trials with settings that work across different Transformers versions.

* `# 1) Metrics: binary vs multiclass handled automatically`
  ‚Äì chooses the proper metric averaging based on whether the task is binary or multi-class.

* `# 2) Class weights for imbalanced data (size == num_labels)`
  ‚Äì builds a weight vector per class to address label imbalance.

* `# Heuristic: inverse-frequency scaled to max=1.0 (safe for CE)`
  ‚Äì uses inverse class frequency, normalized so the largest weight equals 1.0, suitable for cross-entropy.

* `# 3) Helper: tokenizer already defined above. Re-tokenize per max_length`
  ‚Äì re-encodes text using the existing tokenizer, honoring the given maximum sequence length.

* `# 4) Version-compatible TrainingArguments factory`
  ‚Äì creates TrainingArguments that adapt to both newer and older library versions.

* `# Try modern signature first`
  ‚Äì attempts to instantiate with contemporary argument names and options.

* `# Fallback for older transformers (no evaluation_strategy/save_strategy)`
  ‚Äì switches to legacy parameters when the newer ones aren‚Äôt supported.

* `# do_eval=True  # legacy way to enable evaluation`
  ‚Äì turns on evaluation using the older configuration style.

* `# save_steps=500  # periodic saving`
  ‚Äì saves checkpoints at fixed step intervals.

* `# Re-tokenize for this max_length`
  ‚Äì encodes the train/validation texts again for the chosen sequence length.

* `# Load backbone with correct num_labels`
  ‚Äì initializes the model with the appropriate number of output classes.

* `# --- Define backbones (already set earlier) ---`
  ‚Äì lists the model names used in the experiments.

* `# Exp-A: ClinicalBERT, conservative LR, small batch`
  ‚Äì first run: ClinicalBERT with a lower learning rate and batch size 16.

* `# Exp-B: ClinicalBERT, slightly higher LR, more epochs`
  ‚Äì second run: ClinicalBERT with a higher learning rate and an extra training epoch.

* `# Exp-C: DistilBERT fast baseline`
  ‚Äì third run: DistilBERT configured for a quicker baseline comparison.

* `# Leaderboard`
  ‚Äì prints a summary table ranking experiments by F1-score (with accuracy shown as well).


In [None]:
# --- 5) Fine-tuning (Three Experiments) [version-compatible] ---
import os
os.environ["WANDB_DISABLED"] = "true"

import numpy as np
import torch
from collections import OrderedDict
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.nn import CrossEntropyLoss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) Metrics: binary vs multiclass handled automatically
num_labels = len(np.unique(y_train))
avg_type = "binary" if num_labels == 2 else "weighted"
print(f"[Fine-tune] Detected {num_labels} classes ‚Üí metrics average='{avg_type}'")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    p, r, f, _ = precision_recall_fscore_support(labels, preds, average=avg_type)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f}

# 2) Class weights for imbalanced data (size == num_labels)
counts = np.bincount(y_train, minlength=num_labels)
# Heuristic: inverse-frequency scaled to max=1.0 (safe for CE)
weights = counts.max() / np.maximum(counts, 1)
class_weights = torch.tensor(weights, dtype=torch.float32, device=device)
print(f"[Fine-tune] Class weights: {class_weights.tolist()}")

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# 3) Helper: tokenizer already defined above. Re-tokenize per max_length
def tokenize_texts(texts, max_length=160):
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# 4) Version-compatible TrainingArguments factory
import inspect

def make_training_args(name, batch_size, lr, epochs, weight_decay, warmup_ratio):
    kwargs_modern = dict(
        output_dir=f"./runs/{name}",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        report_to=[]
    )
    try:
        # Try modern signature first
        return TrainingArguments(**kwargs_modern)
    except TypeError:
        # Fallback for older transformers (no evaluation_strategy/save_strategy)
        print("[Fine-tune] Using legacy TrainingArguments fallback.")
        kwargs_legacy = dict(
            output_dir=f"./runs/{name}",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=lr,
            num_train_epochs=epochs,
            weight_decay=weight_decay,
            logging_steps=50,
            do_eval=True,          # legacy way to enable evaluation
            save_steps=500,        # periodic saving
            overwrite_output_dir=True,
            fp16=torch.cuda.is_available()
        )
        return TrainingArguments(**kwargs_legacy)

def run_experiment(name, backbone, batch_size=16, lr=2e-5, epochs=3,
                   weight_decay=0.01, warmup_ratio=0.1, max_length=160):
    # Re-tokenize for this max_length
    tr = tokenize_texts(X_train, max_length=max_length)
    va = tokenize_texts(X_val,   max_length=max_length)

    train_ds_local = Dataset.from_dict({
        "input_ids": tr["input_ids"],
        "attention_mask": tr["attention_mask"],
        "labels": torch.tensor(y_train.to_numpy(), dtype=torch.long)   # <-- use .to_numpy()
    })
    val_ds_local = Dataset.from_dict({
        "input_ids": va["input_ids"],
        "attention_mask": va["attention_mask"],
        "labels": torch.tensor(y_val.to_numpy(), dtype=torch.long)     # <-- use .to_numpy()
    })


    # Load backbone with correct num_labels
    model = AutoModelForSequenceClassification.from_pretrained(
        backbone, num_labels=num_labels
    ).to(device)

    args = make_training_args(
        name=name, batch_size=batch_size, lr=lr, epochs=epochs,
        weight_decay=weight_decay, warmup_ratio=warmup_ratio
    )

    trainer = WeightedTrainer(
        model=model,
        args=args,
        train_dataset=train_ds_local,
        eval_dataset=val_ds_local,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(f"\n>>> {name} results: {metrics}\n")
    return metrics, trainer

# --- Define backbones (already set earlier) ---
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
DISTIL_BERT   = "distilbert-base-uncased"

results = OrderedDict()

# Exp-A: ClinicalBERT, conservative LR, small batch
results['expA_clinicalbert_bs16_lr2e-5_ep3'] = run_experiment(
    name="expA_clinicalbert_bs16_lr2e-5_ep3",
    backbone=CLINICAL_BERT,
    batch_size=16, lr=2e-5, epochs=3,
    weight_decay=0.01, warmup_ratio=0.1, max_length=160
)

# Exp-B: ClinicalBERT, slightly higher LR, more epochs
results['expB_clinicalbert_bs16_lr5e-5_ep4'] = run_experiment(
    name="expB_clinicalbert_bs16_lr5e-5_ep4",
    backbone=CLINICAL_BERT,
    batch_size=16, lr=5e-5, epochs=4,
    weight_decay=0.01, warmup_ratio=0.06, max_length=160
)

# Exp-C: DistilBERT fast baseline
results['expC_distilbert_bs32_lr3e-5_ep3'] = run_experiment(
    name="expC_distilbert_bs32_lr3e-5_ep3",
    backbone=DISTIL_BERT,
    batch_size=32, lr=3e-5, epochs=3,
    weight_decay=0.01, warmup_ratio=0.1, max_length=128
)

# Leaderboard
board = []
for k,(m,_t) in results.items():
    board.append((k, m.get('eval_f1', float('nan')), m.get('eval_accuracy', float('nan'))))
board = sorted(board, key=lambda x: x[1], reverse=True)
print("\nLeaderboard (by F1):")
for name, f1, acc in board:
    print(f"{name:35s}  F1={f1:.4f}  Acc={acc:.4f}")


[Fine-tune] Detected 7 classes ‚Üí metrics average='weighted'
[Fine-tune] Class weights: [4.254474639892578, 5.886537551879883, 1.0609430074691772, 1.0, 15.16705322265625, 6.31594181060791, 1.5343269109725952]


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = WeightedTrainer(


[Fine-tune] Using legacy TrainingArguments fallback.


Step,Training Loss
50,1.9126
100,1.667
150,1.3683
200,1.2414
250,1.112
300,1.0793
350,1.0089
400,0.9846
450,0.9603
500,0.8787



>>> expA_clinicalbert_bs16_lr2e-5_ep3 results: {'eval_loss': 0.6153295040130615, 'eval_accuracy': 0.8131346683116637, 'eval_precision': 0.823821343602551, 'eval_recall': 0.8131346683116637, 'eval_f1': 0.8144655432016229, 'eval_runtime': 23.8221, 'eval_samples_per_second': 442.32, 'eval_steps_per_second': 27.663, 'epoch': 3.0}



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = WeightedTrainer(


[Fine-tune] Using legacy TrainingArguments fallback.


Step,Training Loss
50,1.841
100,1.6118
150,1.3903
200,1.1853
250,1.0794
300,1.1217
350,1.0095
400,0.9995
450,0.9185
500,0.8314



>>> expB_clinicalbert_bs16_lr5e-5_ep4 results: {'eval_loss': 0.7305412292480469, 'eval_accuracy': 0.8230046502799658, 'eval_precision': 0.8262782454669647, 'eval_recall': 0.8230046502799658, 'eval_f1': 0.8236334762335898, 'eval_runtime': 23.9388, 'eval_samples_per_second': 440.163, 'eval_steps_per_second': 27.528, 'epoch': 4.0}



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = WeightedTrainer(


[Fine-tune] Using legacy TrainingArguments fallback.


Step,Training Loss
50,1.8149
100,1.6919
150,1.6068
200,1.5496
250,1.4798
300,1.4284
350,1.3444
400,1.3274
450,1.3135
500,1.2339



>>> expC_distilbert_bs32_lr3e-5_ep3 results: {'eval_loss': 0.6835536956787109, 'eval_accuracy': 0.7560975609756098, 'eval_precision': 0.7817777008346605, 'eval_recall': 0.7560975609756098, 'eval_f1': 0.7590166377377228, 'eval_runtime': 12.8813, 'eval_samples_per_second': 818.006, 'eval_steps_per_second': 25.618, 'epoch': 3.0}


Leaderboard (by F1):
expB_clinicalbert_bs16_lr5e-5_ep4    F1=0.8236  Acc=0.8230
expA_clinicalbert_bs16_lr2e-5_ep3    F1=0.8145  Acc=0.8131
expC_distilbert_bs32_lr3e-5_ep3      F1=0.7590  Acc=0.7561


In [None]:
# Record all experiment results to Excel log file
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from datetime import datetime
import re

# Create a new workbook
wb = Workbook()
ws = wb.active
ws.title = "Experiment_Logs"

# Define header style
header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
header_font = Font(bold=True, color="FFFFFF")

# Define headers
headers = [
    "Experiment_ID",
    "Model_Backbone",
    "Batch_Size",
    "Learning_Rate",
    "Epochs",
    "Weight_Decay",
    "Warmup_Ratio",
    "Max_Length",
    "Accuracy",
    "F1_Score",
    "Precision",
    "Recall"
]

# Write headers
for col_idx, header in enumerate(headers, 1):
    cell = ws.cell(row=1, column=col_idx, value=header)
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal="center", vertical="center")

# Function to parse experiment name and extract hyperparameters
def parse_experiment_name(exp_name):
    """Extract hyperparameters from experiment name"""
    params = {
        "backbone": "Unknown",
        "batch_size": None,
        "learning_rate": None,
        "epochs": None
    }

    # Extract backbone
    if "clinicalbert" in exp_name.lower():
        params["backbone"] = "ClinicalBERT"
    elif "distilbert" in exp_name.lower():
        params["backbone"] = "DistilBERT"

    # Extract batch size (bs16, bs32, etc.)
    bs_match = re.search(r'bs(\d+)', exp_name.lower())
    if bs_match:
        params["batch_size"] = int(bs_match.group(1))

    # Extract learning rate (lr2e-5, lr5e-5, etc.)
    lr_match = re.search(r'lr([\d.e-]+)', exp_name.lower())
    if lr_match:
        lr_str = lr_match.group(1)
        # Convert scientific notation string to float
        if 'e' in lr_str:
            base, exp = lr_str.split('e')
            params["learning_rate"] = float(base) * (10 ** int(exp))
        else:
            params["learning_rate"] = float(lr_str)

    # Extract epochs (ep3, ep4, etc.)
    ep_match = re.search(r'ep(\d+)', exp_name.lower())
    if ep_match:
        params["epochs"] = int(ep_match.group(1))

    return params

# Store experiment configurations (you may need to adjust these based on your actual runs)
experiment_configs = {
    "expA_clinicalbert_bs16_lr2e-5_ep3": {
        "weight_decay": 0.01,
        "warmup_ratio": 0.1,
        "max_length": 160
    },
    "expB_clinicalbert_bs16_lr5e-5_ep4": {
        "weight_decay": 0.01,
        "warmup_ratio": 0.06,
        "max_length": 160
    },
    "expC_distilbert_bs32_lr3e-5_ep3": {
        "weight_decay": 0.01,
        "warmup_ratio": 0.1,
        "max_length": 128
    }
}

# Write experiment data
row = 2
for exp_name, (metrics, trainer) in results.items():
    # Parse experiment name
    parsed = parse_experiment_name(exp_name)
    config = experiment_configs.get(exp_name, {})

    # Write data
    ws.cell(row=row, column=1, value=exp_name)  # Experiment_ID
    ws.cell(row=row, column=2, value=parsed["backbone"])  # Model_Backbone
    ws.cell(row=row, column=3, value=parsed["batch_size"])  # Batch_Size
    ws.cell(row=row, column=4, value=parsed["learning_rate"])  # Learning_Rate
    ws.cell(row=row, column=5, value=parsed["epochs"])  # Epochs
    ws.cell(row=row, column=6, value=config.get("weight_decay", "N/A"))  # Weight_Decay
    ws.cell(row=row, column=7, value=config.get("warmup_ratio", "N/A"))  # Warmup_Ratio
    ws.cell(row=row, column=8, value=config.get("max_length", "N/A"))  # Max_Length
    ws.cell(row=row, column=9, value=metrics.get("eval_accuracy", "N/A"))  # Accuracy
    ws.cell(row=row, column=10, value=metrics.get("eval_f1", "N/A"))  # F1_Score
    ws.cell(row=row, column=11, value=metrics.get("eval_precision", "N/A"))  # Precision
    ws.cell(row=row, column=12, value=metrics.get("eval_recall", "N/A"))  # Recall

    row += 1

# Auto-adjust column widths
for col in ws.columns:
    max_length = 0
    col_letter = col[0].column_letter
    for cell in col:
        try:
            if len(str(cell.value)) > max_length:
                max_length = len(str(cell.value))
        except:
            pass
    adjusted_width = min(max_length + 2, 30)
    ws.column_dimensions[col_letter].width = adjusted_width

# Save the file
excel_filename = f"Exercise_F2_Experiment_Logs_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
wb.save(excel_filename)

print(f"‚úÖ Experiment logs saved to: {excel_filename}")
print(f"   Total experiments logged: {len(results)}")
print(f"   Columns: {', '.join(headers)}")

# Automatically download the file
try:
    from google.colab import files
    files.download(excel_filename)
    print(f"‚úÖ File automatically downloaded: {excel_filename}")
except ImportError:
    print("Note: Not running in Google Colab. File saved locally.")
except Exception as e:
    print(f"Note: Could not auto-download. File saved at: {excel_filename}")
    print(f"   Error: {e}")

‚úÖ Experiment logs saved to: Exercise_F2_Experiment_Logs_20251108_112032.xlsx
   Total experiments logged: 3
   Columns: Experiment_ID, Model_Backbone, Batch_Size, Learning_Rate, Epochs, Weight_Decay, Warmup_Ratio, Max_Length, Accuracy, F1_Score, Precision, Recall


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ File automatically downloaded: Exercise_F2_Experiment_Logs_20251108_112032.xlsx


## 6) Eval (Pick Best and Run Inference)

* `# Select the best run from 'results' dict above`
  Introduces the section that will pick the highest-scoring experiment.

* `best_name, best_f1 = None, -1.0`
  Initializes the current ‚Äúbest‚Äù run name to nothing and its F1 to a very low value.

* `best_trainer = None`
  Placeholder for the Trainer object of the best run.

* `for name,(metrics, trainer) in results.items():`
  Loops through each experiment entry, unpacking its metrics and Trainer.

* `    if metrics['eval_f1'] > best_f1:`
  Checks if this experiment‚Äôs F1 beats the current best.

* `        best_f1 = metrics['eval_f1']`
  Updates the best F1 score.

* `        best_name = name`
  Records the winning experiment‚Äôs name.

* `        best_trainer = trainer`
  Stores the Trainer tied to the winning run.

* `print(f"Best run: {best_name} with F1={best_f1:.4f}")`
  Prints which run won and its F1 rounded to four decimals.

* `# Save the best model for reuse`
  Marks the section that persists the best model and tokenizer.

* `save_dir = f"./best_model_{best_name}"`
  Builds a folder path named after the best run.

* `best_trainer.save_model(save_dir)`
  Saves model weights and config to that folder.

* `tokenizer.save_pretrained(save_dir)`
  Saves the tokenizer files to the same folder.

* `# Simple inference helper`
  Introduces a convenience function for making predictions later.

* `def predict(texts, model_dir=save_dir):`
  Starts a function that takes raw texts and an optional model path.

* `    tok = AutoTokenizer.from_pretrained(model_dir)`
  Loads the tokenizer from the saved folder.

* `    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)`
  Loads the saved classifier and moves it to CPU/GPU.

* `    enc = tok(list(texts), padding=True, truncation=True, max_length=160, return_tensors="pt").to(device)`
  Tokenizes the input texts, pads and truncates to length 160, returns PyTorch tensors, and moves them to the device.

* `    with torch.no_grad():`
  Disables gradient tracking for faster, memory-light inference.

* `        logits = mdl(**enc).logits`
  Runs the model forward pass and grabs raw class scores.

* `    pred = torch.argmax(logits, dim=-1).cpu().numpy()`
  Converts logits to predicted class IDs and moves them to NumPy.

* `    prob = torch.softmax(logits, dim=-1).cpu().numpy()[:,1]`
  Turns logits into probabilities and selects the column for class 1.

* `    return pred, prob`
  Returns predicted labels and their positive-class probabilities.

* `# Demo predictions on a few samples`
  Starts a small test to show the function in action.

* `samples = [`
  Opens a list of example texts.

* `    "I feel calm and in control today.",`
  Sample 1: likely not stressed.

* `    "My chest is tight and I cannot focus, I think I am very stressed.",`
  Sample 2: likely stressed.

* `    "Workload is heavy but manageable so far."`
  Sample 3: borderline but manageable tone.

* `]`
  Closes the list of samples.

* `pred, prob = predict(samples)`
  Runs inference on the samples, returning labels and probabilities.

* `for s, y, p in zip(samples, pred, prob):`
  Iterates over each sample with its predicted label and probability.

* `    lab = "stressed(1)" if y==1 else "not-stressed(0)"`
  Converts numeric label to a readable string.

* `    print(f"[{lab}  p={p:.3f}]  {s}")`
  Prints the label, probability (to three decimals), and the original text.


In [None]:

# Select the best run from 'results' dict above
best_name, best_f1 = None, -1.0
best_trainer = None
for name,(metrics, trainer) in results.items():
    if metrics['eval_f1'] > best_f1:
        best_f1 = metrics['eval_f1']
        best_name = name
        best_trainer = trainer

print(f"Best run: {best_name} with F1={best_f1:.4f}")

# Save the best model for reuse
save_dir = f"./best_model_{best_name}"
best_trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

# Simple inference helper
def predict(texts, model_dir=save_dir):
    tok = AutoTokenizer.from_pretrained(model_dir)
    mdl = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device)
    enc = tok(list(texts), padding=True, truncation=True, max_length=160, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = mdl(**enc).logits
    pred = torch.argmax(logits, dim=-1).cpu().numpy()
    prob = torch.softmax(logits, dim=-1).cpu().numpy()[:,1]
    return pred, prob

# Demo predictions on a few samples
samples = [
    "I feel calm and in control today.",
    "My chest is tight and I cannot focus, I think I am very stressed.",
    "Workload is heavy but manageable so far."
]
pred, prob = predict(samples)
for s, y, p in zip(samples, pred, prob):
    lab = "stressed(1)" if y==1 else "not‚Äëstressed(0)"
    print(f"[{lab}  p={p:.3f}]  {s}")


Best run: expB_clinicalbert_bs16_lr5e-5_ep4 with F1=0.8236
[not‚Äëstressed(0)  p=0.001]  I feel calm and in control today.
[not‚Äëstressed(0)  p=0.001]  My chest is tight and I cannot focus, I think I am very stressed.
[not‚Äëstressed(0)  p=0.007]  Workload is heavy but manageable so far.


# Exercise F3: Automated Hyperparameter Optimization


I install the packages needed for automated hyperparameter optimization. The command uses pip to install multiple libraries at once.

The `!pip install` syntax runs a shell command in Jupyter notebooks. I specify the package names separated by spaces. The `-U` flag updates packages to their latest versions. The `-q` flag runs the installation quietly to reduce output noise.

I install transformers for model training and tokenization, datasets for efficient data handling, accelerate for faster training, ray with the tune extension for distributed hyperparameter search, optuna as the optimization backend, and openpyxl for creating Excel files.

The command installs all packages in one line. This ensures I have everything needed before running the optimization code.

In [None]:
# Install required packages for hyperparameter optimization
!pip install transformers datasets accelerate ray[tune] optuna openpyxl -U -q

I prepare my environment for automated hyperparameter tuning. This cell imports libraries, sets up the model, and prepares data structures.

### Import Statements

I import time to track execution duration. I import json for data serialization. I import datetime to create timestamps. I import Workbook, Font, PatternFill, and Alignment from openpyxl to create formatted Excel files.

I import torch for tensor operations and device management. I import numpy as np for numerical computations. I import AutoModelForSequenceClassification and AutoTokenizer from transformers to load pre-trained models. I import TrainingArguments and Trainer for model training. I import set_seed for reproducibility.

I import Dataset from the datasets library to wrap my data efficiently. I import accuracy_score and precision_recall_fscore_support from sklearn.metrics to calculate evaluation metrics.

### Setting Random Seed

I call `set_seed(42)` to make results reproducible. This sets the random seed for Python, NumPy, PyTorch, and other libraries. The number 42 is arbitrary but consistent.

### Device Configuration

I create a device variable using `torch.device()`. The conditional checks if CUDA is available. If a GPU exists, it uses "cuda". Otherwise, it uses "cpu". I print the device so I know what hardware I am using.

### Model and Tokenizer Setup

I set CLINICAL_BERT to the model identifier string "emilyalsentzer/Bio_ClinicalBERT". This is the same model from Exercise F2. I call `AutoTokenizer.from_pretrained()` with this identifier to load the tokenizer. The tokenizer converts text into token IDs that the model understands.

### Class Detection

I calculate the number of unique labels in y_train using `np.unique()`. I store this in num_labels. I set avg_type to "binary" if num_labels equals 2, otherwise "weighted". This determines how sklearn calculates metrics for multi-class problems. I print the detected number of classes and the averaging method.

### Tokenization Function

I define a function called `tokenize_texts` that takes texts and an optional max_length parameter. The function calls the tokenizer with the text list. The padding parameter set to True adds padding tokens to make sequences the same length. The truncation parameter set to True cuts off text longer than max_length. The max_length parameter limits sequence length to 160 tokens. The return_tensors parameter set to "pt" returns PyTorch tensors instead of lists.

The function returns a dictionary with input_ids and attention_mask tensors. These represent token IDs and which tokens to attend to.

### Creating Datasets

I call `tokenize_texts()` on X_train and X_val to create train_enc and val_enc. These contain tokenized input data.

I create train_ds using `Dataset.from_dict()`. I pass a dictionary with three keys. The "input_ids" key contains the token IDs from train_enc. The "attention_mask" key contains the attention masks from train_enc. The "labels" key contains y_train converted to a PyTorch tensor with long integer type.

I create val_ds the same way using val_enc and y_val. The Dataset class wraps the data in a format the Trainer expects.

### Computing Class Weights

I use `np.bincount()` to count occurrences of each class label in y_train. The minlength parameter ensures the array has at least num_labels elements. I store counts in a variable.

I calculate weights by dividing the maximum count by each individual count. The `np.maximum()` function ensures no division by zero. This creates inverse frequency weights where minority classes get higher weights.

I convert weights to a PyTorch tensor with float32 dtype. I move it to the device so it matches where the model runs. These weights balance the loss function during training.

### Metrics Function

I define `compute_metrics()` that takes eval_pred as input. The eval_pred parameter is a tuple containing logits and labels from model evaluation.

I unpack logits and labels from the tuple. I use `np.argmax()` on logits along the last axis to get predicted class indices. This finds the class with the highest probability for each sample.

I call `precision_recall_fscore_support()` with labels and predictions. I set average to avg_type which handles binary or multi-class cases. The function returns precision, recall, F1 score, and support counts. I use underscore to ignore support.

I call `accuracy_score()` to calculate accuracy. I return a dictionary with accuracy, precision, recall, and f1 keys. The Trainer uses this function during evaluation.

### Weighted Trainer Class

I import CrossEntropyLoss from torch.nn. This is the loss function I use for classification.

I define a class called WeightedTrainer that inherits from Trainer. This extends the base Trainer with custom loss computation.

I define the `compute_loss()` method that overrides the parent method. The method takes model, inputs, return_outputs, and keyword arguments.

I extract labels from the inputs dictionary using `.get()`. I create a filtered dictionary that excludes labels from inputs. I pass this filtered dictionary to the model to get outputs. The model returns logits in the outputs dictionary.

I create a CrossEntropyLoss instance with the class_weights parameter. This applies the weights during loss calculation.

I reshape logits using `.view(-1, model.config.num_labels)`. The -1 dimension lets PyTorch infer the batch size. I reshape labels using `.view(-1)` to flatten them. I compute loss using the loss function.

I return loss and outputs if return_outputs is True, otherwise just loss. This matches the expected Trainer interface.

### Completion Message

I print a success message indicating the setup is complete. This confirms all components are ready for hyperparameter optimization.

In [None]:
# ============================================================================
# Exercise F3: Setup for Automated Hyperparameter Optimization
# ============================================================================
# IMPORTANT: This exercise uses the SAME data and model from Exercise F2:
#   - Same model: ClinicalBERT (emilyalsentzer/Bio_ClinicalBERT)
#   - Same data splits: X_train, X_val, y_train, y_val (from Exercise F2)
#   - Same class weights and metrics computation
#   - Only difference: Using automated hyperparameter optimization
# ============================================================================

import time
import json
from datetime import datetime
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
import torch
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    set_seed
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set seed for reproducibility
set_seed(42)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model and tokenizer setup (using ClinicalBERT from Exercise F2)
CLINICAL_BERT = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(CLINICAL_BERT)

# Number of classes (from Exercise F2 - using same y_train variable)
num_labels = len(np.unique(y_train))
avg_type = "binary" if num_labels == 2 else "weighted"
print(f"Detected {num_labels} classes ‚Üí using average='{avg_type}' for metrics")

# Tokenize datasets (reusing SAME X_train, X_val, y_train, y_val from Exercise F2)
def tokenize_texts(texts, max_length=160):
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

train_enc = tokenize_texts(X_train, max_length=160)
val_enc = tokenize_texts(X_val, max_length=160)

train_ds = Dataset.from_dict({
    "input_ids": train_enc["input_ids"],
    "attention_mask": train_enc["attention_mask"],
    "labels": torch.tensor(y_train.to_numpy(), dtype=torch.long)
})

val_ds = Dataset.from_dict({
    "input_ids": val_enc["input_ids"],
    "attention_mask": val_enc["attention_mask"],
    "labels": torch.tensor(y_val.to_numpy(), dtype=torch.long)
})

# Compute class weights (from Exercise F2)
counts = np.bincount(y_train, minlength=num_labels)
weights = counts.max() / np.maximum(counts, 1)
class_weights = torch.tensor(weights, dtype=torch.float32, device=device)

# Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    p, r, f, _ = precision_recall_fscore_support(labels, preds, average=avg_type)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f}

# Weighted Trainer class
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

print("‚úÖ Exercise F3 setup complete!")


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

NameError: name 'y_train' is not defined

### F3.2: Random Search Implementation


I implement Random Search to find optimal hyperparameters. This cell defines the search space and executes the optimization.

### Model Initialization Function

I define a function called `model_init()` that creates a fresh model for each trial. This function takes no parameters but returns a model instance.

I call `AutoModelForSequenceClassification.from_pretrained()` with two arguments. The first argument is CLINICAL_BERT which is the model identifier string. The second argument is num_labels which sets the number of output classes.

I call `.to(device)` on the model to move it to the appropriate device. If CUDA is available, it moves to GPU. Otherwise, it stays on CPU.

I return the model instance. The hyperparameter_search method calls this function for each trial. This ensures each trial starts with identical initial weights, making comparisons fair.

### Hyperparameter Space Definition

I define a function called `random_search_hp_space()` that takes a trial parameter. The trial object comes from Optuna and suggests hyperparameter values.

I call `trial.suggest_float()` for learning_rate. I pass the parameter name as a string "learning_rate". I set the minimum to 1e-5 and maximum to 3e-5. The log parameter set to True samples on a logarithmic scale. This is appropriate for learning rates which vary over orders of magnitude.

I call `trial.suggest_categorical()` for per_device_train_batch_size. I pass the parameter name as a string. I provide a list of discrete options [8, 16]. The function randomly picks one of these values.

I call `trial.suggest_float()` for weight_decay. I set the range from 0.0 to 0.01. I do not use log scale since weight decay is linear.

I call `trial.suggest_int()` for num_train_epochs. I set both minimum and maximum to 2. This keeps training fast while still allowing the model to learn.

I return a dictionary with all four hyperparameters. The keys match the parameter names that TrainingArguments expects.

### Training Arguments Configuration

I create a TrainingArguments object called random_training_args. I set output_dir to "./random_search_results" where checkpoints save. I set eval_strategy to "epoch" to evaluate after each epoch. I set save_strategy to "epoch" to save checkpoints after each epoch.

I set load_best_model_at_end to True to keep the best performing model. I set metric_for_best_model to "f1" to optimize for F1 score. I set greater_is_better to True since higher F1 is better.

I set fp16 to the result of `torch.cuda.is_available()` to use half precision if GPU is available. I set report_to to "none" to disable external logging services.

I set warmup_steps to 500 to gradually increase learning rate at the start. I set logging_steps to 100 to print progress every 100 steps.

### Trainer Initialization

I create a WeightedTrainer instance called random_trainer. I pass model_init as a function reference, not a model instance. This lets the trainer create a fresh model for each trial.

I pass random_training_args as the args parameter. I pass train_ds and val_ds as the datasets. I pass compute_metrics as the metrics function. I pass tokenizer for text processing.

### Execution Tracking

I print messages indicating Random Search is starting. I print that it will run 6 trials. I print that it explores continuous ranges efficiently.

I call `time.time()` to record the start time. I store this in random_start_time.

### Running Hyperparameter Search

I call `hyperparameter_search()` on the trainer. I set backend to "optuna" to use Optuna for optimization. I pass random_search_hp_space as the hp_space parameter. This function defines which hyperparameters to optimize.

I set direction to "maximize" since I want the highest F1 score. I set n_trials to 6 to run six different hyperparameter combinations.

The method returns the best trial object. I store this in random_best_trial.

### Time Calculation

I call `time.time()` again to get the end time. I store this in random_end_time. I subtract random_start_time from random_end_time to get the total duration. I store this in random_total_time.

### Results Display

I print a completion message with the total time formatted to two decimal places. I check if random_best_trial exists. If it does, I print the trial object. I extract hyperparameters using the `.hyperparameters` attribute. I store these in random_best_hps.

I loop through the hyperparameters dictionary using `.items()`. I print each key and value pair. I print the best F1 score using the `.objective` attribute.

If random_best_trial does not exist, I print an error message and set random_best_hps to an empty dictionary. This prevents errors in later cells.


In [None]:
# Random Search Implementation
# This randomly samples from the hyperparameter space

def random_search_hp_space(trial):
    """
    Define the hyperparameter space for Random Search.
    Random Search samples RANDOMLY from continuous/discrete ranges.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 3e-5, log=True)  # Lower range
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])  # Smaller batches
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.01)  # Same (already low)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 2)  # Keep 2 epochs for speed

    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": per_device_train_batch_size,
        "weight_decay": weight_decay,
        "num_train_epochs": num_train_epochs,
    }

# Training arguments template (same as grid search)
random_training_args = TrainingArguments(
    output_dir="./random_search_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
    warmup_steps=500,
    logging_steps=100,
)

# Initialize trainer for random search
random_trainer = WeightedTrainer(
    model_init=model_init,
    args=random_training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

print("--- Starting Random Search ---")
print("Random Search will sample 6 trials from the hyperparameter space")
print("This allows exploration of continuous ranges efficiently.\n")

# Track start time
random_start_time = time.time()

# Execute Random Search
# Use same number of trials as grid search for fair comparison
random_best_trial = random_trainer.hyperparameter_search(
    backend="optuna",
    hp_space=random_search_hp_space,
    direction="maximize",
    n_trials=6,  # Same number of trials as grid search for fair comparison
)

random_end_time = time.time()
random_total_time = random_end_time - random_start_time

print(f"\n--- Random Search Complete (Time: {random_total_time:.2f} seconds) ---")
print("\nBEST HYPERPARAMETERS FROM RANDOM SEARCH:")
if random_best_trial:
    print(random_best_trial)
    random_best_hps = random_best_trial.hyperparameters
    print("\nBest Hyperparameters:")
    for key, value in random_best_hps.items():
        print(f"  {key}: {value}")
    print(f"\nBest F1 Score: {random_best_trial.objective:.4f}")
else:
    print("Random search failed or no best trial found.")
    random_best_hps = {}


  random_trainer = WeightedTrainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-11-08 13:08:52,516] A new study created in memory with name: no-name-a78bbfa1-9212-468c-8194-d81885a1b331


--- Starting Random Search ---
Random Search will sample 6 trials from the hyperparameter space
This allows exploration of continuous ranges efficiently.



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.509,0.625845,0.798045,0.80131,0.798045,0.797731
2,0.4708,0.599975,0.811806,0.820123,0.811806,0.812609


[I 2025-11-08 13:31:35,225] Trial 0 finished with value: 3.256344032885604 and parameters: {'learning_rate': 1.7108216142611804e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.009677289952248508, 'num_train_epochs': 2}. Best is trial 0 with value: 3.256344032885604.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6103,0.590603,0.796147,0.799588,0.796147,0.796199
2,0.3318,0.539433,0.809718,0.822356,0.809718,0.810934


[I 2025-11-08 13:47:52,115] Trial 1 finished with value: 3.2527265840624726 and parameters: {'learning_rate': 2.853743439192109e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.006042395220084231, 'num_train_epochs': 2}. Best is trial 0 with value: 3.256344032885604.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6427,0.610374,0.788839,0.797406,0.788839,0.79119


### F3.3: Extract All Trial Results and Log to Excel




I implement Random Search to find optimal hyperparameters. This cell defines the search space and executes the optimization.

### Model Initialization Function

I define a function called `model_init()` that creates a fresh model for each trial. This function takes no parameters but returns a model instance.

I call `AutoModelForSequenceClassification.from_pretrained()` with two arguments. The first argument is CLINICAL_BERT which is the model identifier string. The second argument is num_labels which sets the number of output classes.

I call `.to(device)` on the model to move it to the appropriate device. If CUDA is available, it moves to GPU. Otherwise, it stays on CPU.

I return the model instance. The hyperparameter_search method calls this function for each trial. This ensures each trial starts with identical initial weights, making comparisons fair.

### Hyperparameter Space Definition

I define a function called `random_search_hp_space()` that takes a trial parameter. The trial object comes from Optuna and suggests hyperparameter values.

I call `trial.suggest_float()` for learning_rate. I pass the parameter name as a string "learning_rate". I set the minimum to 1e-5 and maximum to 3e-5. The log parameter set to True samples on a logarithmic scale. This is appropriate for learning rates which vary over orders of magnitude.

I call `trial.suggest_categorical()` for per_device_train_batch_size. I pass the parameter name as a string. I provide a list of discrete options [8, 16]. The function randomly picks one of these values.

I call `trial.suggest_float()` for weight_decay. I set the range from 0.0 to 0.01. I do not use log scale since weight decay is linear.

I call `trial.suggest_int()` for num_train_epochs. I set both minimum and maximum to 2. This keeps training fast while still allowing the model to learn.

I return a dictionary with all four hyperparameters. The keys match the parameter names that TrainingArguments expects.

### Training Arguments Configuration

I create a TrainingArguments object called random_training_args. I set output_dir to "./random_search_results" where checkpoints save. I set eval_strategy to "epoch" to evaluate after each epoch. I set save_strategy to "epoch" to save checkpoints after each epoch.

I set load_best_model_at_end to True to keep the best performing model. I set metric_for_best_model to "f1" to optimize for F1 score. I set greater_is_better to True since higher F1 is better.

I set fp16 to the result of `torch.cuda.is_available()` to use half precision if GPU is available. I set report_to to "none" to disable external logging services.

I set warmup_steps to 500 to gradually increase learning rate at the start. I set logging_steps to 100 to print progress every 100 steps.

### Trainer Initialization

I create a WeightedTrainer instance called random_trainer. I pass model_init as a function reference, not a model instance. This lets the trainer create a fresh model for each trial.

I pass random_training_args as the args parameter. I pass train_ds and val_ds as the datasets. I pass compute_metrics as the metrics function. I pass tokenizer for text processing.

### Execution Tracking

I print messages indicating Random Search is starting. I print that it will run 6 trials. I print that it explores continuous ranges efficiently.

I call `time.time()` to record the start time. I store this in random_start_time.

### Running Hyperparameter Search

I call `hyperparameter_search()` on the trainer. I set backend to "optuna" to use Optuna for optimization. I pass random_search_hp_space as the hp_space parameter. This function defines which hyperparameters to optimize.

I set direction to "maximize" since I want the highest F1 score. I set n_trials to 6 to run six different hyperparameter combinations.

The method returns the best trial object. I store this in random_best_trial.

### Time Calculation

I call `time.time()` again to get the end time. I store this in random_end_time. I subtract random_start_time from random_end_time to get the total duration. I store this in random_total_time.

### Results Display

I print a completion message with the total time formatted to two decimal places. I check if random_best_trial exists. If it does, I print the trial object. I extract hyperparameters using the `.hyperparameters` attribute. I store these in random_best_hps.

I loop through the hyperparameters dictionary using `.items()`. I print each key and value pair. I print the best F1 score using the `.objective` attribute.

If random_best_trial does not exist, I print an error message and set random_best_hps to an empty dictionary. This prevents errors in later cells.


In [None]:
# Prepare Random Search results for Excel logging
# Note: Individual trial extraction is limited by transformers library
# We'll log the best trial results with full metrics

print("Preparing Random Search results for Excel logging...")
print("Note: Individual trial extraction may be limited by transformers library.")
print("Best trial results will be logged to Excel with full metrics.")
print("Creating summary and Excel log sheet...")

# Create summary data for Excel logging
# Since we can't easily extract all individual trials from hyperparameter_search,
# we'll create a summary with the best Random Search results

import pandas as pd

# Create summary data for Excel
summary_data = []

# Random Search Summary
if random_best_trial:
    summary_data.append({
        "Search_Type": "Random Search (Automated)",
        "Best_F1_Score": random_best_trial.objective,
        "Best_Learning_Rate": random_best_hps.get("learning_rate", "N/A"),
        "Best_Batch_Size": random_best_hps.get("per_device_train_batch_size", "N/A"),
        "Best_Weight_Decay": random_best_hps.get("weight_decay", "N/A"),
        "Best_Epochs": random_best_hps.get("num_train_epochs", "N/A"),
        "Total_Trials": 6,  # Updated for fast config
        "Total_Time_Seconds": random_total_time,
        "Time_Per_Trial_Seconds": random_total_time / 6,
        "Strategy": "Random Sampling - Continuous ranges",
        "Member_Number": MEMBER_NUMBER
    })

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    print("\n=== RANDOM SEARCH SUMMARY ===")
    print(summary_df.to_string(index=False))
    print("\nNote: This will be compared to Exercise F2 manual experiments in the Excel file.")
else:
    print("‚ö†Ô∏è  Random Search did not complete. Please run Random Search first.")

Preparing Random Search results for Excel logging...
Note: Individual trial extraction may be limited by transformers library.
Best trial results will be logged to Excel with full metrics.
Creating summary and Excel log sheet...


NameError: name 'random_best_trial' is not defined

In this code section, I create an Excel workbook using Workbook() and set ws.title to name the sheet. I define styling with PatternFill() and Font(), then use enumerate() to loop through headers, writing each with ws.cell() and applying Alignment(). I record best trial data in row 2 using ws.cell(), extracting values with .get() and formatting timestamps with datetime.now().strftime(). I create a second sheet using wb.create_sheet(), build a comparison_data list with .append(), calculate efficiency with conditional logic, then write data using nested enumerate() loops. I add notes by incrementing notes_row and setting .font property, implement auto-width adjustment by iterating through ws.columns with try-except and setting ws.column_dimensions[col_letter].width. Finally, I generate filename with f-string, save with wb.save(), and use try-except blocks to handle files.download() with ImportError exceptions.

In [None]:
# Create Excel log sheet for Random Search only
wb = Workbook()
ws = wb.active
ws.title = "F3_Random_Search_Results"

# Header styling
header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
header_font = Font(bold=True, color="FFFFFF")

# Write headers
headers = [
    "Member", "Trial #", "Learning Rate", "Batch Size", "Weight Decay",
    "Epochs", "F1 Score", "Accuracy", "Precision", "Recall",
    "Training Time (s)", "Timestamp"
]

for col_idx, header in enumerate(headers, 1):
    cell = ws.cell(row=1, column=col_idx, value=header)
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal="center")

row = 2

# Add Random Search best result
if random_best_trial:
    ws.cell(row=row, column=1, value=f"Member {MEMBER_NUMBER}")  # Use member number from config
    ws.cell(row=row, column=2, value="Best")
    ws.cell(row=row, column=3, value=random_best_hps.get("learning_rate", "N/A"))
    ws.cell(row=row, column=4, value=random_best_hps.get("per_device_train_batch_size", "N/A"))
    ws.cell(row=row, column=5, value=random_best_hps.get("weight_decay", "N/A"))
    ws.cell(row=row, column=6, value=random_best_hps.get("num_train_epochs", "N/A"))
    ws.cell(row=row, column=7, value=random_best_trial.objective)
    ws.cell(row=row, column=11, value=random_total_time)
    ws.cell(row=row, column=12, value=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    row += 1

# Add comparison sheet (Random Search vs Exercise F2)
ws2 = wb.create_sheet("Comparison_Analysis")

comparison_headers = [
    "Metric", "Random Search (Automated)", "Exercise F2 (Manual)", "Notes"
]

for col_idx, header in enumerate(comparison_headers, 1):
    cell = ws2.cell(row=1, column=col_idx, value=header)
    cell.fill = header_fill
    cell.font = header_font
    cell.alignment = Alignment(horizontal="center")

# Comparison data (you'll need to add Exercise F2 best F1 score manually)
comparison_data = []

if random_best_trial:
    comparison_data.append([
        "Best F1 Score",
        f"{random_best_trial.objective:.4f}",
        "Add Exercise F2 best F1 here",
        "Random Search uses automated optimization"
    ])

    comparison_data.append([
        "Total Time (seconds)",
        f"{random_total_time:.2f}",
        "Add Exercise F2 total time here",
        "Time for 6 automated trials"
    ])

    # Efficiency
    random_efficiency = random_best_trial.objective / random_total_time if random_total_time > 0 else 0
    comparison_data.append([
        "Efficiency (F1/Time)",
        f"{random_efficiency:.6f}",
        "Calculate from F2",
        "Higher is better"
    ])

# Write comparison data
for row_idx, data in enumerate(comparison_data, 2):
    for col_idx, value in enumerate(data, 1):
        ws2.cell(row=row_idx, column=col_idx, value=value)

# Add analysis notes
notes_row = len(comparison_data) + 3
ws2.cell(row=notes_row, column=1, value="Analysis Notes:").font = Font(bold=True)
notes_row += 1
ws2.cell(row=notes_row, column=1, value="1. Random Search uses automated hyperparameter optimization")
notes_row += 1
ws2.cell(row=notes_row, column=1, value="2. Exercise F2 used manual hyperparameter tuning")
notes_row += 1
ws2.cell(row=notes_row, column=1, value="3. Random Search can explore continuous hyperparameter ranges")
notes_row += 1
ws2.cell(row=notes_row, column=1, value="4. Efficiency = Best F1 Score / Total Time")

# Auto-adjust column widths
for col in ws.columns:
    max_length = 0
    col_letter = col[0].column_letter
    for cell in col:
        try:
            if len(str(cell.value)) > max_length:
                max_length = len(str(cell.value))
        except:
            pass
    adjusted_width = min(max_length + 2, 30)
    ws.column_dimensions[col_letter].width = adjusted_width

# Save Excel file
excel_filename = f"Exercise_F3_Random_Search{MEMBER_NUMBER}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
wb.save(excel_filename)

print(f"\n‚úÖ Excel log saved: {excel_filename}")
print(f"   - Sheet 1: F3_Random_Search_Results")
print(f"   - Sheet 2: Comparison_Analysis (vs Exercise F2)")

# Automatically download the file
try:
    from google.colab import files
    files.download(excel_filename)
    print(f"‚úÖ File automatically downloaded: {excel_filename}")
except ImportError:
    print("Note: Not running in Google Colab. File saved locally.")
except Exception as e:
    print(f"Note: Could not auto-download. File saved at: {excel_filename}")
    print(f"   Error: {e}")