**Setup & Imports**

In [1]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import f1_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Paths (notebook is in notebooks/, repo root is parent)
NOTEBOOK_DIR = Path().resolve()
BASE_DIR = NOTEBOOK_DIR.parent
DATA_DIR = BASE_DIR / "data"
JIGSAW_DIR = DATA_DIR / "jigsaw"
DREDDIT_DIR = DATA_DIR / "dreaddit"
EXPORT_DIR = DATA_DIR / "exports"
MODEL_DIR = BASE_DIR / "models" / "distilbert_jigsaw"

EXPORT_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

LABEL_COLS = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

id2label = {i: c for i, c in enumerate(LABEL_COLS)}
label2id = {c: i for i, c in enumerate(LABEL_COLS)}

jigsaw_path = JIGSAW_DIR / "train.csv"
dreaddit_train_path = DREDDIT_DIR / "dreaddit-train.csv"

print("Notebook dir:", NOTEBOOK_DIR)
print("Repo root:", BASE_DIR)
print("Jigsaw train.csv:", JIGSAW_DIR / "train.csv")
print("Device:", "cuda" if torch.cuda.is_available() else "cpu")



Notebook dir: C:\Users\abdul\Desktop\GitHub\textthreat-poc\notebooks
Repo root: C:\Users\abdul\Desktop\GitHub\textthreat-poc
Jigsaw train.csv: C:\Users\abdul\Desktop\GitHub\textthreat-poc\data\jigsaw\train.csv
Device: cpu


## Load Jigsaw & Dreaddit + Basic EDA

In [3]:
# --- Jigsaw EDA ---
jigsaw_path = JIGSAW_DIR / "train.csv"
jig = pd.read_csv(jigsaw_path)

print("=== JIGSAW ===")
print("Rows:", len(jig))
print("Columns:", jig.columns.tolist())

print("\nLabel distribution (sum of 1s):")
print(jig[LABEL_COLS].sum())

jig["char_len"] = jig["comment_text"].str.len()
print("\nComment length stats (chars):")
print(jig["char_len"].describe())

jig[["id", "comment_text"]].head(3)

dreaddit_train_path = DREDDIT_DIR / "dreaddit-train.csv"

if dreaddit_train_path.exists():
    dre = pd.read_csv(dreaddit_train_path)
    print("=== DREDDIT (TRAIN) ===")
    print("Rows:", len(dre))
    print("Columns:", dre.columns.tolist())
    dre[["id", "text", "label"]].head(3)
else:
    print("Dreaddit train not found, skipping for now.")



=== JIGSAW ===
Rows: 159571
Columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Label distribution (sum of 1s):
toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

Comment length stats (chars):
count    159571.000000
mean        396.593961
std         594.387869
min           6.000000
25%          97.000000
50%         207.000000
75%         438.000000
max        5000.000000
Name: char_len, dtype: float64
=== DREDDIT (TRAIN) ===
Rows: 2838
Columns: ['subreddit', 'post_id', 'sentence_range', 'text', 'id', 'label', 'confidence', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe

## BASELINE: TF-IDF + LinearSVC on Jigsaw (Tier A)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score

df_base = jig.copy()
X = df_base["comment_text"].astype(str)
Y = df_base[LABEL_COLS].values

X_train, X_val, y_train, y_val = train_test_split(
    X, Y, test_size=0.1, random_state=42
)

tfidf_toxic = TfidfVectorizer(
    max_features=100_000,
    ngram_range=(1, 2),
    min_df=5,
)

X_train_tfidf = tfidf_toxic.fit_transform(X_train)
X_val_tfidf = tfidf_toxic.transform(X_val)

svm_clf = OneVsRestClassifier(LinearSVC())
svm_clf.fit(X_train_tfidf, y_train)

y_pred = svm_clf.predict(X_val_tfidf)

f1_micro_base = f1_score(y_val, y_pred, average="micro", zero_division=0)

# ROC-AUC using decision_function
try:
    y_scores = svm_clf.decision_function(X_val_tfidf)
    roc_macro_base = roc_auc_score(y_val, y_scores, average="macro")
except Exception:
    roc_macro_base = float("nan")

baseline_metrics = {
    "f1_micro": float(f1_micro_base),
    "roc_auc_macro": float(roc_macro_base),
}
baseline_metrics

{'f1_micro': 0.7246520874751491, 'roc_auc_macro': 0.9667396930304796}

## STRESS MODEL: TF-IDF + Logistic Regression on Dreaddit

In [5]:
from sklearn.linear_model import LogisticRegression

X_stress = dre["text"].astype(str)
y_stress = dre["label"].astype(int)

X_tr_s, X_val_s, y_tr_s, y_val_s = train_test_split(
    X_stress, y_stress, test_size=0.2, random_state=42
)

tfidf_stress = TfidfVectorizer(
    max_features=50_000,
    ngram_range=(1, 2),
    min_df=3,
)

X_tr_s_tfidf = tfidf_stress.fit_transform(X_tr_s)
X_val_s_tfidf = tfidf_stress.transform(X_val_s)

logreg_stress = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",
)
logreg_stress.fit(X_tr_s_tfidf, y_tr_s)

y_pred_s = logreg_stress.predict(X_val_s_tfidf)
y_proba_s = logreg_stress.predict_proba(X_val_s_tfidf)[:, 1]

f1_stress = f1_score(y_val_s, y_pred_s)
roc_stress = roc_auc_score(y_val_s, y_proba_s)

stress_metrics = {
    "f1": float(f1_stress),
    "roc_auc": float(roc_stress),
}
stress_metrics


{'f1': 0.749185667752443, 'roc_auc': 0.8161565791934177}

In [6]:
# === Save Dreaddit stress model to disk ===
import joblib

STRESS_MODEL_DIR = BASE_DIR / "models" / "stress_dreaddit"
STRESS_MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(tfidf_stress, STRESS_MODEL_DIR / "tfidf_stress.joblib")
joblib.dump(logreg_stress, STRESS_MODEL_DIR / "logreg_stress.joblib")

STRESS_MODEL_DIR

WindowsPath('C:/Users/abdul/Desktop/GitHub/textthreat-poc/models/stress_dreaddit')

## Prepare Jigsaw dataset for DistilBERT (Tier B)

In [7]:
# === Prepare Jigsaw dataset for DistilBERT (Tier B) ===
from datasets import load_dataset

jigsaw_ds = load_dataset(
    "csv",
    data_files={"train": str(JIGSAW_DIR / "train.csv")}
)["train"]

# Subsample for PoC
jigsaw_ds = jigsaw_ds.shuffle(seed=42).select(range(20000))

# ✅ ADD: create a single 'labels' field = multi-hot vector of your 6 columns
def add_labels(example):
    # Cast each label to float so BCEWithLogits sees Float targets
    example["labels"] = [float(example[c]) for c in LABEL_COLS]
    return example

jigsaw_ds = jigsaw_ds.map(add_labels)

# Train/val split AFTER labels are added
split = jigsaw_ds.train_test_split(test_size=0.1, seed=42)
train_ds = split["train"]
val_ds = split["test"]

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["comment_text"],
        truncation=True,
        padding=False,
        max_length=256,
    )

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

def prepare(ds):
    # ✅ CHANGE: keep 'labels' instead of individual label columns
    keep = ["input_ids", "attention_mask", "labels"]
    drop = [c for c in ds.column_names if c not in keep]
    ds = ds.remove_columns(drop)
    ds.set_format("torch")
    return ds

train_ds = prepare(train_ds)
val_ds = prepare(val_ds)

train_ds, val_ds


Map: 100%|██████████| 20000/20000 [00:02<00:00, 7738.30 examples/s]
Map: 100%|██████████| 18000/18000 [00:02<00:00, 6784.95 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 8377.32 examples/s]


(Dataset({
     features: ['labels', 'input_ids', 'attention_mask'],
     num_rows: 18000
 }),
 Dataset({
     features: ['labels', 'input_ids', 'attention_mask'],
     num_rows: 2000
 }))

## Define DistilBERT Model & Evaluation Metrics

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(LABEL_COLS),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1 / (1 + np.exp(-logits))  # sigmoid
    y_true = labels
    y_pred = (probs >= 0.5).astype(int)

    f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)

    try:
        roc_macro = roc_auc_score(y_true, probs, average="macro")
    except ValueError:
        roc_macro = float("nan")

    return {"f1_micro": f1_micro, "roc_auc_macro": roc_macro}

data_collator = DataCollatorWithPadding(tokenizer)
"Model & metrics ready."

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'Model & metrics ready.'

## Train DistilBERT on Jigsaw (1 Epoch, PoC)

In [None]:
# Train DistilBERT (1 epoch PoC) and evaluate
training_args = TrainingArguments(
    output_dir=str(MODEL_DIR / "hf_outputs"),
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=1,          # PoC: 1 epoch
    logging_steps=100,
    do_eval=True,                # compatible with older transformers versions
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,         # this is fine; you don't *have* to use processing_class
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

train_result = trainer.train()
metrics = trainer.evaluate()
metrics


## Save Model & Metrics

In [None]:
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

with open(MODEL_DIR / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

print("Saved to:", MODEL_DIR)
metrics

Saved to: C:\Users\abdul\Desktop\GitHub\textthreat-poc\models\distilbert_jigsaw


{'eval_model_preparation_time': 0.001,
 'eval_runtime': 127.3948,
 'eval_samples_per_second': 15.699,
 'eval_steps_per_second': 1.962}

## Generate ECS-Style Events for Splunk (NDJSON)

In [None]:
# === EXPORT: ECS-style events with toxicity + stress ===
import uuid
import hashlib
from datetime import datetime, timezone

events_path = EXPORT_DIR / "textthreat_events_ecs.ndjson"

def ecs_severity_from_risk(risk_score: float) -> int:
    """
    Map 0–100 risk_score -> ECS-like severity 1–5.
    Adjust bins if you want.
    """
    if risk_score <= 20:
        return 1
    elif risk_score <= 40:
        return 2
    elif risk_score <= 60:
        return 3
    elif risk_score <= 80:
        return 4
    else:
        return 5

def short_snippet(text: str, max_len: int = 120) -> str:
    t = text.replace("\\n", " ").strip()
    return (t[:max_len] + "…") if len(t) > max_len else t

def sha256_text(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Sample a subset for export to keep file small
sample_df = jig.sample(n=2000, random_state=123).reset_index(drop=True)

with open(events_path, "w", encoding="utf-8") as f_out:
    for _, row in sample_df.iterrows():
        full_text = str(row["comment_text"])
        comment_id = str(row.get("id", ""))

        # --- Toxicity (DistilBERT) ---
        inputs = tokenizer(
            full_text,
            truncation=True,
            padding=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            logits = model(**inputs).logits
            probs_tox = torch.sigmoid(logits)[0].cpu().numpy()

        tox_probs = {lbl: float(p) for lbl, p in zip(LABEL_COLS, probs_tox)}
        tox_score = max(tox_probs.values())  # 0–1
        tox_labels = [lbl for lbl, p in tox_probs.items() if p >= 0.5]

        # --- Stress (Dreaddit model) ---
        X_text_stress = tfidf_stress.transform([full_text])
        stress_prob = float(logreg_stress.predict_proba(X_text_stress)[0, 1])  # 0–1

        # --- Combine into risk_score 0–100 ---
        overall_score_0_1 = max(tox_score, stress_prob)
        risk_score = float(overall_score_0_1 * 100.0)
        ecs_severity = ecs_severity_from_risk(risk_score)

        # Threat categories (toxicity / stress / none)
        threat_categories = []
        if tox_score > 0.5:
            threat_categories.append("toxicity")
        if stress_prob > 0.5:
            threat_categories.append("stress")
        if not threat_categories:
            threat_categories.append("none")

        # Correlation: if both signals are high
        correlated = []
        if "toxicity" in threat_categories and "stress" in threat_categories:
            correlated.append("stress")

        now_iso = datetime.now(timezone.utc).isoformat()
        ev_id = str(uuid.uuid4())
        text_hash = sha256_text(full_text)
        snippet = short_snippet(full_text)

        event = {
            "@timestamp": now_iso,
            "event": {
                "kind": "event",
                "severity": ecs_severity,
                "risk_score": round(risk_score, 1),
                "dataset": "textthreat.jigsaw",
                "module": "textthreat",
                "id": ev_id,
            },
            "hash": {
                "sha256": text_hash,
            },
            "message": snippet,  # short / redacted text
            "threat": {
                "category": threat_categories,
            },
            "labels": tox_labels,
            "signal": {
                "type": "toxicity",
                "score": round(tox_score, 3),
                "calibrated": False,     # set True if you calibrate later
                "correlated": correlated,
            },
            "ml": {
                "model": {
                    "name": "distilbert_improved",
                    "version": "v0.1",
                }
            },
            "tags": [
                "signal:toxicity",
                "signal:stress" if stress_prob > 0.5 else "signal:stress:low",
                "source:jigsaw",
            ],
            # Keep your own namespace
            "textthreat": {
                "comment_id": comment_id,
                "full_text": full_text,  # in real deployment you'd remove or redact this
                "toxicity": {
                    "score": round(tox_score, 3),
                    "scores_by_label": tox_probs,
                },
                "stress": {
                    "score": round(stress_prob, 3),
                },
            },
        }

        f_out.write(json.dumps(event) + "\n")

events_path

WindowsPath('C:/Users/abdul/Desktop/GitHub/textthreat-poc/data/exports/textthreat_events_ecs.ndjson')

## Preview Exported Events (Sanity Check)

In [None]:
with open(events_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        print(line.strip())

