# Suicide / Crisis Detection Training Notebook

This notebook fine-tunes a DistilBERT classifier to distinguish suicide from non-suicide messages.
It covers preprocessing, training, evaluation, and an inference demo.

In [1]:
!pip install transformers datasets torch scikit-learn pandas



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
# Mount Google Drive so the dataset CSV under /content/drive/MyDrive is readable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset

In [2]:
# Location of the labelled CSV on the mounted Google Drive; edit to point at your own copy.
DATA_PATH = "/content/drive/MyDrive/Yash/Suicide_Detection.csv"

# Class name -> integer id used as the training target (1 is the positive "suicide" class).
label_map = {"non-suicide": 0, "suicide": 1}

# Keep only the necessary columns and drop rows missing either of them.
df = pd.read_csv(DATA_PATH)[["text", "class"]].dropna()

# Map labels to integers. Series.map yields NaN for any class that is not a key of
# label_map, which would silently turn the label column into floats, so fail loudly.
df["label"] = df["class"].map(label_map)
unmapped = df.loc[df["label"].isna(), "class"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected class values not in label_map: {unmapped.tolist()}")
df["label"] = df["label"].astype(int)

# Simple preprocessing: collapse every whitespace run to one space and trim the ends.
df["text"] = df["text"].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
df.head()

Unnamed: 0,text,class,label
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,1
1,Am I weird I don't get affected by compliments...,non-suicide,0
2,Finally 2020 is almost over... So I can never ...,non-suicide,0
3,i need helpjust help me im crying so hard,suicide,1
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,1


## Split Dataset

In [3]:
# Two-stage stratified split: hold out 15% of all rows for testing, then carve
# 15% of the remainder off for validation. Both stages share the same seed.
split_kwargs = dict(test_size=0.15, random_state=42)

train_val_df, test_df = train_test_split(df, stratify=df['label'], **split_kwargs)
train_df, val_df = train_test_split(
    train_val_df, stratify=train_val_df['label'], **split_kwargs
)

print("Dataset sizes:", len(train_df), len(val_df), len(test_df))

Dataset sizes: 167672 29590 34812


## Convert to Hugging Face Datasets

In [5]:
# Build Hugging Face datasets from the pandas splits. After train_test_split the
# frames carry a shuffled, non-contiguous row index; preserve_index=False stops
# Dataset.from_pandas from storing it as an extra `__index_level_0__` column.
feature_cols = ['text', 'label']
train_ds = Dataset.from_pandas(train_df[feature_cols], preserve_index=False)
val_ds   = Dataset.from_pandas(val_df[feature_cols], preserve_index=False)
test_ds  = Dataset.from_pandas(test_df[feature_cols], preserve_index=False)

In [6]:
def _take_fraction(ds, fraction=0.25, seed=42):
    """Shuffle `ds` with a fixed seed and keep the first `fraction` of its rows."""
    keep = int(len(ds) * fraction)
    return ds.shuffle(seed=seed).select(range(keep))


# Keep a seeded random 25% of every split.
# Note: the results are assigned back to the same names, so running this cell
# a second time shrinks the splits again.
train_ds = _take_fraction(train_ds)
val_ds   = _take_fraction(val_ds)
test_ds  = _take_fraction(test_ds)

## Tokenization

In [7]:
# Checkpoint name shared by the tokenizer created here and the model loaded further down.
MODEL = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tok(batch):
    """Tokenize the 'text' entries of a batch.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    examples come out with the same length.
    """
    encoded = tokenizer(
        batch['text'],
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    return encoded

# Tokenize each split in batches.
train_ds, val_ds, test_ds = [
    split.map(tok, batched=True) for split in (train_ds, val_ds, test_ds)
]

# Expose only the model inputs and the target, returned as torch tensors.
columns = ['input_ids', 'attention_mask', 'label']
for split in (train_ds, val_ds, test_ds):
    split.set_format(type='torch', columns=columns)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/41918 [00:00<?, ? examples/s]

Map:   0%|          | 0/7397 [00:00<?, ? examples/s]

Map:   0%|          | 0/8703 [00:00<?, ? examples/s]

## Define Model

In [9]:
# Load DistilBERT with a newly initialised classification head. Passing the label
# names stores them in the model config, so the saved model reports
# 'suicide' / 'non-suicide' instead of the generic LABEL_0 / LABEL_1.
id2label = {idx: name for name, idx in label_map.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label_map,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Setup

In [10]:
def compute_metrics(pred):
    """Score a batch of predictions for the Trainer.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the arg-max along axis 1 is the predicted class).

    Returns:
        Dict with 'precision', 'recall' and 'f1' for the positive class (label 1),
        plus overall 'accuracy'.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary', pos_label=1
    )
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy_score(y_true, y_pred),
    }

training_args = TrainingArguments(
    output_dir='./out',
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    eval_strategy='epoch',
    save_strategy='epoch',  # must match eval_strategy for load_best_model_at_end
    logging_steps=50,
    fp16=False,  # mixed precision off so the cell also runs without a GPU
    # Reload the checkpoint with the best validation F1 once training finishes,
    # instead of keeping whatever the final epoch produced.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
)

# Evaluation runs on the validation split; the test split is kept for the final check.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

## Train Model

In [11]:
# Fine-tune with the settings in training_args; validation metrics are reported after each epoch.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33myashchandarana6733[0m ([33myashchandarana6733-roundpixel[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1094,0.091152,0.973476,0.958277,0.965817,0.965932
2,0.0446,0.101607,0.955068,0.984118,0.969376,0.968771
3,0.0203,0.116223,0.965215,0.978466,0.971795,0.971475


TrainOutput(global_step=1965, training_loss=0.06642094267385303, metrics={'train_runtime': 2711.155, 'train_samples_per_second': 46.384, 'train_steps_per_second': 0.725, 'total_flos': 8329152625293312.0, 'train_loss': 0.06642094267385303, 'epoch': 3.0})

In [12]:
# Save the trained model together with its tokenizer so the directory can be
# reloaded for inference on its own. The Trainer was not given the tokenizer,
# so trainer.save_model() by itself does not write the tokenizer files.
MODEL_DIR = "./suicide_model"
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR);

## Evaluate on Test Set

In [13]:
# Run the trained model on the held-out test split. `metrics` is the full prediction
# output; its .metrics dict holds the loss plus the compute_metrics scores.
metrics = trainer.predict(test_ds)
print(metrics.metrics)

{'test_loss': 0.10898315906524658, 'test_precision': 0.9724624487938097, 'test_recall': 0.9762394334018735, 'test_f1': 0.974347280811766, 'test_accuracy': 0.9741468459152016, 'test_runtime': 62.5978, 'test_samples_per_second': 139.03, 'test_steps_per_second': 2.173}


## Inference Demo

In [15]:
import torch

# Device of the model's first parameter; inputs are moved here before the forward pass.
device = next(model.parameters()).device

def predict(text):
    """Classify a single message as 'suicide' or 'non-suicide'.

    Args:
        text: The message to classify (a string).

    Returns:
        'suicide' when the model's highest-scoring class is 1, otherwise 'non-suicide'.
    """
    # Make sure dropout is disabled; otherwise the result depends on whatever
    # train/eval mode the model was last left in.
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=256
    ).to(device)  # move inputs to same device as model

    # No gradients are needed for inference; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    pred = outputs.logits.argmax(dim=1).item()
    return 'suicide' if pred == 1 else 'non-suicide'

# Two hand-written examples, one per class, as a quick sanity check.
for sample in (
    "I feel like giving up, nothing makes sense anymore.",
    "I had a great day with my family!",
):
    print(predict(sample))


suicide
non-suicide


In [16]:
import shutil
import os
from google.colab import files

# Compress the Trainer's checkpoint directory. Using training_args.output_dir keeps this
# in sync with the training cell instead of repeating the path as '/content/out'.
# NOTE(review): this archives the checkpoints, not the model saved to ./suicide_model —
# confirm that is the intended download.
output_filename = 'out_compressed'
archive_path = shutil.make_archive(output_filename, 'zip', training_args.output_dir)

# Download the archive at the path make_archive reports having written.
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>