In [19]:
!pip install -q \
  "pandas>=2.0.0" \
  "numpy>=1.24.0" \
  "scikit-learn>=1.3.0" \
  "torch>=2.2.0" \
  "transformers>=4.36.0" \
  "xgboost>=2.0.0" \
  "matplotlib>=3.8.0" \
  "jupyter>=1.0.0" \
  "textblob>=0.17.1" \
  "empath>=0.89" \
  "sentencepiece>=0.1.99" \
  "accelerate>=0.25.0" \
  "protobuf>=4.25.0" \
  "tqdm>=4.66.0"

In [20]:
!git clone https://github.com/becoollll/Suicide-Risk-Detection.git
%cd Suicide-Risk-Detection
!ls

Cloning into 'Suicide-Risk-Detection'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 73 (delta 29), reused 57 (delta 21), pack-reused 0 (from 0)[K
Receiving objects: 100% (73/73), 57.09 KiB | 14.27 MiB/s, done.
Resolving deltas: 100% (29/29), done.
/content/Suicide-Risk-Detection/Suicide-Risk-Detection/Suicide-Risk-Detection
notebooks  README.md  requirements.txt	src


In [21]:
MODEL_NAME = "microsoft/deberta-v3-base"   # Backbone model
MAX_LEN = 512                                # Tokenization max length
BATCH_SIZE = 8                               # Training batch size
EPOCHS = 4                                    # Number of training epochs
LEARNING_RATE = 2e-5                          # LR
NUM_CLASSES = 4                               # (Indicator, Ideation, Behavior, Attempt)

print("MODEL_NAME    :", MODEL_NAME)
print("MAX_LEN       :", MAX_LEN)
print("BATCH_SIZE    :", BATCH_SIZE)
print("EPOCHS        :", EPOCHS)
print("LEARNING_RATE :", LEARNING_RATE)
print("NUM_CLASSES   :", NUM_CLASSES)

MODEL_NAME    : microsoft/deberta-v3-base
MAX_LEN       : 512
BATCH_SIZE    : 8
EPOCHS        : 4
LEARNING_RATE : 2e-05
NUM_CLASSES   : 4


In [24]:
# ===== Cell 1: Imports & Load Processed Data =====
import os
import sys
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

# ---- Project paths (Colab) ----
PROJECT_ROOT = "/content/Suicide-Risk-Detection"
sys.path.append(PROJECT_ROOT)

from src.utils import compute_graded_metrics
from src.loss import OrdinalLoss

PROCESSED_DATA_DIR = f"{PROJECT_ROOT}/data/processed"
print("PROCESSED_DATA_DIR:", PROCESSED_DATA_DIR)

train_df = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "train.pkl"))
val_df   = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "val.pkl"))
test_df  = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, "test.pkl"))

print(f"Train, Val, Test size: {len(train_df)}, {len(val_df)}, {len(test_df)}")
display(train_df.head())


if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print("Using device: CUDA (GPU)")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
    print("Using device: MPS (Apple Silicon GPU)")
else:
    DEVICE = torch.device("cpu")
    print("Using device: CPU")


BATCH_SIZE = 32
MICRO_BATCH_SIZE = 8
ACCUM_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

print(f"Effective batch size = {MICRO_BATCH_SIZE} x {ACCUM_STEPS} = {BATCH_SIZE}")

PROCESSED_DATA_DIR: /content/Suicide-Risk-Detection/data/processed
Train, Val, Test size: 11972, 1605, 1036


Unnamed: 0,users,text,sentiment,time,timestamp_dt,label_ordinal
0,1,No one understands how much I desperately want...,Ideation,1648483701,2022-03-28 16:08:21,1
1,2,Today I never wanted to live to see 25. That m...,Behavior,1651130449,2022-04-28 07:20:49,2
2,3,Suicidal thoughts at / because of school For s...,Ideation,1662712545,2022-09-09 08:35:45,1
3,4,I feel like the pain will never end Everyday f...,Ideation,1638628371,2021-12-04 14:32:51,1
4,4,Is there even a point to living if you're not ...,Indicator,1639749228,2021-12-17 13:53:48,0


Using device: CUDA (GPU)
Effective batch size = 8 x 4 = 32


In [26]:
# ===== Cell 2: Tokenizer, Dataset & DataLoaders =====

TEXT_COL = "text"
LABEL_COL = "label_ordinal"

label2id = {
    "Indicator": 0,
    "Ideation": 1,
    "Behavior": 2,
    "Attempt": 3,
}
id2label = {v: k for k, v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class RSDDataset(Dataset):
    def __init__(self, df, text_col, label_col):
        self.texts = df[text_col].tolist()
        self.labels = df[label_col].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = int(self.labels[idx])

        enc = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

train_loader = DataLoader(
    RSDDataset(train_df, TEXT_COL, LABEL_COL),
    batch_size=MICRO_BATCH_SIZE,
    shuffle=True
)

val_loader = DataLoader(
    RSDDataset(val_df, TEXT_COL, LABEL_COL),
    batch_size=MICRO_BATCH_SIZE * 2,
    shuffle=False
)

test_loader = DataLoader(
    RSDDataset(test_df, TEXT_COL, LABEL_COL),
    batch_size=MICRO_BATCH_SIZE * 2,
    shuffle=False
)

batch = next(iter(train_loader))
print("Batch input_ids shape     :", batch["input_ids"].shape)
print("Batch attention_mask shape:", batch["attention_mask"].shape)
print("Batch labels shape        :", batch["label"].shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Batch input_ids shape     : torch.Size([8, 512])
Batch attention_mask shape: torch.Size([8, 512])
Batch labels shape        : torch.Size([8])


In [27]:
MODEL_NAME = "microsoft/deberta-v3-base"
NUM_CLASSES = 4

class SISMOOrdinalModel(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES):
        super().__init__()

        self.backbone = AutoModel.from_pretrained(MODEL_NAME)

        # Important for DeBERTa v3
        if hasattr(self.backbone.config, "use_cache"):
            self.backbone.config.use_cache = False

        print("Backbone UNFROZEN: full fine-tuning (no gradient checkpointing).")

        hidden_size = self.backbone.config.hidden_size

        self.lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=256,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(256 * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        seq_output = outputs.last_hidden_state  # (B, T, H)

        lstm_out, (h_n, _) = self.lstm(seq_output)  # h_n: (2, B, 256)
        h_forward = h_n[-2]
        h_backward = h_n[-1]
        pooled = torch.cat([h_forward, h_backward], dim=-1)

        logits = self.classifier(self.dropout(pooled))
        return logits


model = SISMOOrdinalModel().to(DEVICE)

print("Model initialized on", DEVICE)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Backbone UNFROZEN: full fine-tuning (no gradient checkpointing).
Model initialized on cuda


In [28]:
from transformers import get_linear_schedule_with_warmup
EPOCHS = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1

support_counts = torch.tensor([305, 530, 135, 66], dtype=torch.float32)
print("Support counts:", support_counts.tolist())

raw_weights = 1.0 / torch.log(support_counts + 1.0)
class_weights = raw_weights / raw_weights.sum() * len(support_counts)
class_weights = class_weights.to(DEVICE)
print("Class weights:", class_weights.tolist())

criterion = OrdinalLoss(
    alpha=2.0,
    num_classes=NUM_CLASSES,
    device=DEVICE,
).to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * WARMUP_RATIO),
    num_training_steps=total_steps,
)

print(f"EPOCHS={EPOCHS} | total_steps={total_steps}")

Support counts: [305.0, 530.0, 135.0, 66.0]
Class weights: [0.9012120962142944, 0.8220493197441101, 1.049974799156189, 1.2267636060714722]
EPOCHS=4 | total_steps=5988


In [29]:
def train_one_epoch(model, data_loader, optimizer, criterion, device, scheduler=None):
    model.train()
    total_loss = 0.0
    total_examples = 0

    for step, batch in enumerate(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        logits = model(input_ids, attention_mask)
        loss = criterion(logits=logits, targets=labels)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        bs = input_ids.size(0)
        total_loss += loss.item() * bs
        total_examples += bs

        if (step + 1) % 50 == 0:
            print(f"  Step {step+1} | Loss={loss.item():.4f}")

    return total_loss / total_examples


def evaluate(model, data_loader, device):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_labels.extend(labels.cpu().tolist())
            all_preds.extend(preds.cpu().tolist())

    metrics = compute_graded_metrics(all_labels, all_preds)
    gp = metrics["graded_precision"]
    gr = metrics["graded_recall"]
    gf1 = metrics["graded_f1"]
    acc = (torch.tensor(all_labels) == torch.tensor(all_preds)).float().mean().item()

    return acc, gp, gr, gf1

In [30]:
best_val_gf1 = 0.0

print("===== Start Training =====")

for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_loss = train_one_epoch(
        model,
        train_loader,
        optimizer,
        criterion,
        DEVICE,
        scheduler,
    )

    val_acc, val_gp, val_gr, val_gf1 = evaluate(model, val_loader, DEVICE)

    print(
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} | "
        f"val_acc={val_acc:.4f} | "
        f"GP={val_gp:.4f} | GR={val_gr:.4f} | GF1={val_gf1:.4f}"
    )

    if val_gf1 > best_val_gf1:
        best_val_gf1 = val_gf1
        torch.save(model.state_dict(), "best_sismo_ordinal.pt")
        print("  -> Best model updated and saved.")

print("\nBest Val Graded F1:", best_val_gf1)

===== Start Training =====

Epoch 1/4
  Step 50 | Loss=1.2944
  Step 100 | Loss=1.2039
  Step 150 | Loss=1.3592
  Step 200 | Loss=1.1735
  Step 250 | Loss=0.9754
  Step 300 | Loss=1.1698
  Step 350 | Loss=1.0656
  Step 400 | Loss=1.1487
  Step 450 | Loss=1.3497
  Step 500 | Loss=1.2489
  Step 550 | Loss=0.9233
  Step 600 | Loss=1.1426
  Step 650 | Loss=1.0379
  Step 700 | Loss=1.2540
  Step 750 | Loss=0.8183
  Step 800 | Loss=1.2100
  Step 850 | Loss=1.0314
  Step 900 | Loss=1.0690
  Step 950 | Loss=0.9973
  Step 1000 | Loss=0.7977
  Step 1050 | Loss=0.9944
  Step 1100 | Loss=0.8631
  Step 1150 | Loss=1.1196
  Step 1200 | Loss=1.2240
  Step 1250 | Loss=0.8290
  Step 1300 | Loss=1.1019
  Step 1350 | Loss=0.8293
  Step 1400 | Loss=0.9439
  Step 1450 | Loss=0.9695
[Epoch 1] train_loss=1.0995 | val_acc=0.7421 | GP=0.8735 | GR=0.8685 | GF1=0.8710
  -> Best model updated and saved.

Epoch 2/4
  Step 50 | Loss=1.0650
  Step 100 | Loss=0.8288
  Step 150 | Loss=0.8934
  Step 200 | Loss=0.9051
 

In [31]:
# ===== BEGIN: Gemini-generated block =====

import torch
from sklearn.metrics import accuracy_score, classification_report


model_to_eval = model

model_to_eval.eval()

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        logits = model(input_ids, attention_mask)

        logits_adj = logits.clone()

        logits_adj[:, 2] += 0.2
        logits_adj[:, 3] += 0.4

        preds = torch.argmax(logits_adj, dim=1)

        all_labels.extend(labels.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

y_test = all_labels
y_pred = all_preds

acc = accuracy_score(y_test, y_pred)
print(f"\nSimple Accuracy: {acc:.4f}")
# ===== END: Gemini-generated block =====

target_names = ['Indicator', 'Ideation', 'Behavior', 'Attempt']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

graded_metrics = compute_graded_metrics(y_test, y_pred)
print("\n=== Graded Metrics ===")
print(f"Graded Precision: {graded_metrics['graded_precision']:.4f}")
print(f"Graded Recall:    {graded_metrics['graded_recall']:.4f}")
print(f"Graded F1-Score:  {graded_metrics['graded_f1']:.4f}")


Simple Accuracy: 0.7201

Classification Report:
              precision    recall  f1-score   support

   Indicator       0.75      0.75      0.75       305
    Ideation       0.77      0.76      0.76       530
    Behavior       0.54      0.59      0.56       135
     Attempt       0.59      0.53      0.56        66

    accuracy                           0.72      1036
   macro avg       0.66      0.66      0.66      1036
weighted avg       0.72      0.72      0.72      1036


=== Graded Metrics ===
Graded Precision: 0.8620
Graded Recall:    0.8581
Graded F1-Score:  0.8600
