**Setup & Imports**

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import f1_score, roc_auc_score

In [8]:


# Paths (notebook is in notebooks/, repo root is parent)
NOTEBOOK_DIR = Path().resolve()
BASE_DIR = NOTEBOOK_DIR.parent
DATA_DIR = BASE_DIR / "data"
JIGSAW_DIR = DATA_DIR / "jigsaw"
DREDDIT_DIR = DATA_DIR / "dreaddit"
EXPORT_DIR = DATA_DIR / "exports"
MODEL_DIR = BASE_DIR / "models" / "distilbert_jigsaw"

EXPORT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

LABEL_COLS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

id2label = {i: c for i, c in enumerate(LABEL_COLS)}
label2id = {c: i for i, c in enumerate(LABEL_COLS)}

print("Notebook dir:", NOTEBOOK_DIR)
print("Repo root:", BASE_DIR)
print("Jigsaw train.csv:", JIGSAW_DIR / "train.csv")
print("Device:", "cuda" if torch.cuda.is_available() else "cpu")



Notebook dir: /Users/it-by/Desktop/GitHub/textthreat-poc/notebooks
Repo root: /Users/it-by/Desktop/GitHub/textthreat-poc
Jigsaw train.csv: /Users/it-by/Desktop/GitHub/textthreat-poc/data/jigsaw/train.csv
Device: cpu


## Prepare Jigsaw Dataset for DistilBERT Training

In [9]:
# --- Jigsaw EDA ---
jigsaw_path = JIGSAW_DIR / "train.csv"
jig = pd.read_csv(jigsaw_path)

print("=== JIGSAW ===")
print("Rows:", len(jig))
print("Columns:", jig.columns.tolist())

print("\nLabel distribution (sum of 1s):")
print(jig[LABEL_COLS].sum())

jig["char_len"] = jig["comment_text"].str.len()
print("\nComment length stats (chars):")
print(jig["char_len"].describe())

jig[["id", "comment_text"]].head(3)

dreaddit_train_path = DREDDIT_DIR / "dreaddit-train.csv"

if dreaddit_train_path.exists():
    dre = pd.read_csv(dreaddit_train_path)
    print("=== DREDDIT (TRAIN) ===")
    print("Rows:", len(dre))
    print("Columns:", dre.columns.tolist())
    dre[["id", "text", "label"]].head(3)
else:
    print("Dreaddit train not found, skipping for now.")



=== JIGSAW ===
Rows: 159571
Columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Label distribution (sum of 1s):
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

Comment length stats (chars):
count    159571.000000
mean        394.073221
std         590.720282
min           6.000000
25%          96.000000
50%         205.000000
75%         435.000000
max        5000.000000
Name: char_len, dtype: float64
=== DREDDIT (TRAIN) ===
Rows: 2838
Columns: ['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe

## Prepare Jigsaw Dataset for DistilBERT Training

In [10]:
# Load via HuggingFace datasets from CSV
dataset = load_dataset(
    "csv",
    data_files={"train": str(jigsaw_path)}
)

ds = dataset["train"]

# Shuffle + subset for faster PoC
ds = ds.shuffle(seed=42).select(range(20000))  # 20k rows

# Train/validation split
split = ds.train_test_split(test_size=0.1, seed=42)
train_ds = split["train"]
val_ds = split["test"]

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["comment_text"],
        truncation=True,
        padding=False,
        max_length=256,
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

def prepare(ds_hf):
    keep = ["input_ids", "attention_mask"] + LABEL_COLS
    drop = [c for c in ds_hf.column_names if c not in keep]
    ds_hf = ds_hf.remove_columns(drop)
    ds_hf.set_format("torch")
    return ds_hf

train_ds = prepare(train_ds)
val_ds = prepare(val_ds)

train_ds, val_ds


Generating train split: 159571 examples [00:00, 171481.78 examples/s]
Map: 100%|██████████| 18000/18000 [00:02<00:00, 7841.73 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 12098.33 examples/s]


(Dataset({
     features: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'input_ids', 'attention_mask'],
     num_rows: 18000
 }),
 Dataset({
     features: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'input_ids', 'attention_mask'],
     num_rows: 2000
 }))

## Define DistilBERT Model & Evaluation Metrics

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(LABEL_COLS),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))  # sigmoid
    y_true = labels
    y_pred = (probs >= 0.5).astype(int)

    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)

    try:
        roc_macro = roc_auc_score(y_true, probs, average="macro")
    except ValueError:
        roc_macro = float("nan")

    return {"f1_micro": f1_micro, "roc_auc_macro": roc_macro}

data_collator = DataCollatorWithPadding(tokenizer)
"Model & metrics ready."

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'Model & metrics ready.'