### **Cell 1 — Load processed data and initialize runtime environment**
This cell loads the preprocessed train/val/test splits, sets up the device (CUDA/MPS/CPU), and prepares global configurations such as batch sizes for training.


In [1]:
!pip install -q \
  "pandas>=2.0.0" \
  "numpy>=1.24.0" \
  "scikit-learn>=1.3.0" \
  "torch>=2.2.0" \
  "transformers>=4.36.0" \
  "xgboost>=2.0.0" \
  "matplotlib>=3.8.0" \
  "jupyter>=1.0.0" \
  "textblob>=0.17.1" \
  "empath>=0.89" \
  "sentencepiece>=0.1.99" \
  "accelerate>=0.25.0" \
  "protobuf>=4.25.0" \
  "tqdm>=4.66.0"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.8/59.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for empath (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/becoollll/Suicide-Risk-Detection.git
%cd Suicide-Risk-Detection
!ls

Cloning into 'Suicide-Risk-Detection'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 77 (delta 30), reused 60 (delta 21), pack-reused 0 (from 0)[K
Receiving objects: 100% (77/77), 71.58 KiB | 4.47 MiB/s, done.
Resolving deltas: 100% (30/30), done.
/content/Suicide-Risk-Detection
notebooks  README.md  requirements.txt	src


In [15]:
MODEL_NAME = "microsoft/deberta-v3-base"   # Backbone model
MAX_LEN = 512                                # Tokenization max length
BATCH_SIZE = 32                               # Training batch size
EPOCHS = 4                                    # Number of training epochs
LEARNING_RATE = 2e-5                          # LR
NUM_CLASSES = 4                               # (Indicator, Ideation, Behavior, Attempt)

print("MODEL_NAME    :", MODEL_NAME)
print("MAX_LEN       :", MAX_LEN)
print("BATCH_SIZE    :", BATCH_SIZE)
print("EPOCHS        :", EPOCHS)
print("LEARNING_RATE :", LEARNING_RATE)
print("NUM_CLASSES   :", NUM_CLASSES)

MODEL_NAME    : microsoft/deberta-v3-base
MAX_LEN       : 512
BATCH_SIZE    : 32
EPOCHS        : 4
LEARNING_RATE : 2e-05
NUM_CLASSES   : 4


In [16]:
# ===== Cell 1: Imports & Load Processed Data =====
import os
import sys
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

# ---- Project paths  ----
PROJECT_ROOT = "/content/Suicide-Risk-Detection"
sys.path.append(PROJECT_ROOT)

from src.utils import compute_graded_metrics
from src.loss import OrdinalLoss

PROCESSED_DATA_DIR = f"{PROJECT_ROOT}/data/processed"
print("PROCESSED_DATA_DIR:", PROCESSED_DATA_DIR)

train_df = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "train.pkl"))
val_df   = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "val.pkl"))
test_df  = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "test.pkl"))

print(f"Train, Val, Test size: {len(train_df)}, {len(val_df)}, {len(test_df)}")
display(train_df.head())


if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print("Using device: CUDA (GPU)")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
    print("Using device: MPS (Apple Silicon GPU)")
else:
    DEVICE = torch.device("cpu")
    print("Using device: CPU")


BATCH_SIZE = 32
MICRO_BATCH_SIZE = 16
ACCUM_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

print(f"Effective batch size = {MICRO_BATCH_SIZE} x {ACCUM_STEPS} = {BATCH_SIZE}")

PROCESSED_DATA_DIR: /content/Suicide-Risk-Detection/data/processed
Train, Val, Test size: 11972, 1605, 1036


Unnamed: 0,users,text,sentiment,time,timestamp_dt,label_ordinal
0,1,No one understands how much I desperately want...,Ideation,1648483701,2022-03-28 16:08:21,1
1,2,Today I never wanted to live to see 25. That m...,Behavior,1651130449,2022-04-28 07:20:49,2
2,3,Suicidal thoughts at / because of school For s...,Ideation,1662712545,2022-09-09 08:35:45,1
3,4,I feel like the pain will never end Everyday f...,Ideation,1638628371,2021-12-04 14:32:51,1
4,4,Is there even a point to living if you're not ...,Indicator,1639749228,2021-12-17 13:53:48,0


Using device: CUDA (GPU)
Effective batch size = 16 x 2 = 32


### **Cell 2 — Build tokenizer, dataset class, and dataloaders**
This cell initializes the tokenizer, defines the dataset class for RSD inputs, and constructs DataLoaders used during training and evaluation.

In [17]:
# ===== Cell 2: Tokenizer, Dataset & DataLoaders =====

TEXT_COL = "text"
LABEL_COL = "label_ordinal"

label2id = {
    "Indicator": 0,
    "Ideation": 1,
    "Behavior": 2,
    "Attempt": 3,
}
id2label = {v: k for k, v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class RSDDataset(Dataset):
    def __init__(self, df, text_col, label_col):
        self.texts = df[text_col].tolist()
        self.labels = df[label_col].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = int(self.labels[idx])

        enc = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

train_loader = DataLoader(
    RSDDataset(train_df, TEXT_COL, LABEL_COL),
    batch_size=MICRO_BATCH_SIZE,
    shuffle=True
)

val_loader = DataLoader(
    RSDDataset(val_df, TEXT_COL, LABEL_COL),
    batch_size=MICRO_BATCH_SIZE * 2,
    shuffle=False
)

test_loader = DataLoader(
    RSDDataset(test_df, TEXT_COL, LABEL_COL),
    batch_size=MICRO_BATCH_SIZE * 2,
    shuffle=False
)

batch = next(iter(train_loader))
print("Batch input_ids shape     :", batch["input_ids"].shape)
print("Batch attention_mask shape:", batch["attention_mask"].shape)
print("Batch labels shape        :", batch["label"].shape)



Batch input_ids shape     : torch.Size([16, 512])
Batch attention_mask shape: torch.Size([16, 512])
Batch labels shape        : torch.Size([16])


### **Cell 3 — Define the SISMO Ordinal Model with DeBERTa backbone**
This cell creates the full fine-tuning model architecture: a DeBERTa backbone and a BiLSTM classification head for ordinal prediction.

In [18]:
class SISMOOrdinalModel(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES):
        super().__init__()

        self.backbone = AutoModel.from_pretrained(MODEL_NAME)

        # Important for DeBERTa v3
        if hasattr(self.backbone.config, "use_cache"):
            self.backbone.config.use_cache = False

        print("Backbone UNFROZEN: full fine-tuning (no gradient checkpointing).")

        hidden_size = self.backbone.config.hidden_size

        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(256 * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        seq_output = outputs.last_hidden_state  # (B, T, H)

        lstm_out, (h_n, _) = self.lstm(seq_output)  # h_n: (2, B, 256)
        h_forward = h_n[-2]
        h_backward = h_n[-1]
        pooled = torch.cat([h_forward, h_backward], dim=-1)

        logits = self.classifier(self.dropout(pooled))
        return logits


model = SISMOOrdinalModel().to(DEVICE)

print("Model initialized on", DEVICE)

Backbone UNFROZEN: full fine-tuning (no gradient checkpointing).
Model initialized on cuda


### **Cell 4 — Configure loss function, optimizer, and scheduler**
This cell initializes the OrdinalLoss, sets class weights, and prepares AdamW and learning-rate warmup scheduling.

In [19]:
from transformers import get_linear_schedule_with_warmup
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

support_counts = torch.tensor([305, 530, 135, 66], dtype=torch.float32)
print("Support counts:", support_counts.tolist())

raw_weights = 1.0 / torch.log(support_counts + 1.0)
class_weights = raw_weights / raw_weights.sum() * len(support_counts)

print("Class weights:", class_weights.tolist())

criterion = OrdinalLoss(
    alpha=2.0,
    num_classes=NUM_CLASSES,
    device=DEVICE,
).to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

total_steps = (len(train_loader) // ACCUM_STEPS) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * WARMUP_RATIO),
    num_training_steps=total_steps,
)

print(f"EPOCHS={EPOCHS} | total_steps={total_steps}")

Support counts: [305.0, 530.0, 135.0, 66.0]
Class weights: [0.9012120962142944, 0.8220493197441101, 1.049974799156189, 1.2267636060714722]
EPOCHS=4 | total_steps=1496


### **Cell 5 — Training loop with gradient accumulation**
This cell defines the core training function that performs gradient accumulation to simulate a larger effective batch size.

In [22]:
def train_one_epoch(model, data_loader, optimizer, criterion, device, scheduler=None):
    model.train()
    total_loss = 0.0
    total_examples = 0

    optimizer.zero_grad()
    accum_counter = 0

    for step, batch in enumerate(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        logits = model(input_ids, attention_mask)
        raw_loss = criterion(logits, labels)

        loss = raw_loss / ACCUM_STEPS
        loss.backward()
        accum_counter += 1

        bs = input_ids.size(0)
        total_loss += raw_loss.item() * bs
        total_examples += bs

        if accum_counter == ACCUM_STEPS:
            optimizer.step()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
            accum_counter = 0

        if (step + 1) % (ACCUM_STEPS * 10) == 0:
            print(f"  step {step+1} | loss={raw_loss.item():.4f}")

    if accum_counter > 0:
        optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()

    avg_loss = total_loss / total_examples
    return avg_loss

### **Cell 6 — Evaluation: compute accuracy, graded precision/recall/F1**
This cell implements the evaluation function used during validation to compute model performance using graded metrics.

In [23]:
def evaluate(model, data_loader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(preds.cpu().tolist())

    metrics = compute_graded_metrics(all_labels, all_preds)
    gp = metrics["graded_precision"]
    gr = metrics["graded_recall"]
    gf1 = metrics["graded_f1"]

    acc = (torch.tensor(all_labels) == torch.tensor(all_preds)).float().mean().item()

    return acc, gp, gr, gf1

### **Cell 7 — Execute training and track best GF1 performance**
This cell runs the full training process for all epochs, reports validation metrics, and saves the best performing model checkpoint.

In [24]:
best_val_gf1 = 0.0

print("===== Start Training =====")

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_loss = train_one_epoch(
        model,
        train_loader,
        optimizer,
        criterion,
        DEVICE,
        scheduler,
    )

    val_acc, val_gp, val_gr, val_gf1 = evaluate(model, val_loader, DEVICE)

    print(
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} | "
        f"val_acc={val_acc:.4f} | "
        f"GP={val_gp:.4f} | GR={val_gr:.4f} | GF1={val_gf1:.4f}"
    )

    if val_gf1 > best_val_gf1:
        best_val_gf1 = val_gf1
        torch.save(model.state_dict(), "best_sismo_ordinal.pt")
        print("  -> Best model updated and saved.")

print("\nBest Val Graded F1:", best_val_gf1)

===== Start Training =====

Epoch 1/4
  step 20 | loss=0.9795
  step 40 | loss=0.9640
  step 60 | loss=1.0996
  step 80 | loss=0.8561
  step 100 | loss=0.8347
  step 120 | loss=1.1354
  step 140 | loss=0.8741
  step 160 | loss=1.2059
  step 180 | loss=0.8301
  step 200 | loss=0.9039
  step 220 | loss=0.8500
  step 240 | loss=0.7761
  step 260 | loss=0.8540
  step 280 | loss=1.0414
  step 300 | loss=0.8324
  step 320 | loss=0.8120
  step 340 | loss=0.9649
  step 360 | loss=0.8822
  step 380 | loss=0.9831
  step 400 | loss=0.7871
  step 420 | loss=1.0070
  step 440 | loss=1.2223
  step 460 | loss=0.8004
  step 480 | loss=0.8801
  step 500 | loss=0.9960
  step 520 | loss=0.7567
  step 540 | loss=1.0620
  step 560 | loss=1.0036
  step 580 | loss=0.8796
  step 600 | loss=0.7616
  step 620 | loss=1.0768
  step 640 | loss=0.9033
  step 660 | loss=0.9235
  step 680 | loss=0.8577
  step 700 | loss=0.8207
  step 720 | loss=0.9005
  step 740 | loss=0.9409
[Epoch 1] train_loss=0.9354 | val_acc=0.7

In [25]:
# ===== BEGIN: Gemini-generated block =====
from sklearn.metrics import accuracy_score, classification_report


model_to_eval = model

model_to_eval.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        logits = model(input_ids, attention_mask)

        logits_adj = logits.clone()

        logits_adj[:, 2] += 0.2
        logits_adj[:, 3] += 0.4

        preds = torch.argmax(logits_adj, dim=1)

        all_labels.extend(labels.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

y_test = all_labels
y_pred = all_preds

acc = accuracy_score(y_test, y_pred)
print(f"\nSimple Accuracy: {acc:.4f}")
# ===== END: Gemini-generated block =====

target_names = ['Indicator', 'Ideation', 'Behavior', 'Attempt']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

graded_metrics = compute_graded_metrics(y_test, y_pred)
print("\n=== Graded Metrics ===")
print(f"Graded Precision: {graded_metrics['graded_precision']:.4f}")
print(f"Graded Recall:    {graded_metrics['graded_recall']:.4f}")
print(f"Graded F1-Score:  {graded_metrics['graded_f1']:.4f}")


Simple Accuracy: 0.7143

Classification Report:
              precision    recall  f1-score   support

   Indicator       0.74      0.74      0.74       305
    Ideation       0.77      0.74      0.75       530
    Behavior       0.55      0.66      0.60       135
     Attempt       0.59      0.55      0.57        66

    accuracy                           0.71      1036
   macro avg       0.66      0.67      0.66      1036
weighted avg       0.72      0.71      0.72      1036


=== Graded Metrics ===
Graded Precision: 0.8542
Graded Recall:    0.8600
Graded F1-Score:  0.8571
