## Logistic Regression with TF-IDF

In [None]:
# Load the labelled sample and encode its string labels as binary targets.
import pandas as pd

df_sampled2 = pd.read_csv("../data/processed/tr_speech_sample2.csv")

# 'threat/criticism' is the positive class (1); 'non-threat' is the negative class (0).
# Series.map yields NaN for any label value that is not a key of this dict.
_LABEL_TO_ID = {'threat/criticism': 1, 'non-threat': 0}
df_sampled2['label'] = df_sampled2['label'].map(_LABEL_TO_ID)

In [None]:
# LR with TF-IDF and SMOTE with preprocessing + NER

import os
# Set before the remaining imports run.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# parallelism/fork warning — confirm it is still required.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pickle
import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    cross_val_predict
)
from sklearn.metrics import classification_report
from joblib import parallel_backend
import torch

# Prepare the NLP resources that spacy_tokenizer reads as module-level globals.
# The stop-word corpus is downloaded (quietly) before it is loaded.
nltk.download('stopwords', quiet=True)
# Kept as a set so the per-token membership test in the tokenizer is cheap.
stop_words = set(stopwords.words("english"))
# Only the parser is disabled; NER stays on because the tokenizer emits ENT_* tokens.
nlp = spacy.load("en_core_web_sm", disable=["parser"])  # keep NER

def spacy_tokenizer(doc: str):
    """Tokenize *doc* into lower-cased lemmas plus entity-type markers.

    The text is run through the module-level spaCy pipeline ``nlp``.
    Alphabetic tokens contribute their lower-cased lemma unless that lemma
    is in the module-level ``stop_words`` set. Each recognised entity then
    contributes one ``ENT_<label>`` token, appended after the lemmas.

    Returns:
        A list of string tokens (lemmas first, entity markers last).
    """
    parsed = nlp(doc)

    tokens = []
    for tok in parsed:
        if not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        if lemma not in stop_words:
            tokens.append(lemma)

    # One marker per entity span, carrying only the entity type
    tokens.extend(f"ENT_{ent.label_}" for ent in parsed.ents)
    return tokens

# Alias the labelled frame and report its size and class balance.
df = df_sampled2
print(f"Dataset rows: {len(df)}")
_label_counts = df['label'].value_counts()
print("Label distribution:\n", _label_counts, "\n")

# Stratified 80/20 train / hold-out split with a fixed seed.
_split_options = dict(test_size=0.2, stratify=df['label'], random_state=42)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df['text'], df['label'], **_split_options
)

# Pipeline stages, built one at a time so each stage's settings are easy to read.
_vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    token_pattern=None,  # a custom tokenizer is supplied, so no regex pattern is used
    sublinear_tf=True,
    max_df=0.9,
)
_oversampler = SMOTE(random_state=42)
_classifier = LogisticRegression(
    class_weight="balanced", solver="liblinear", max_iter=1000
)

# imblearn's Pipeline is used so the SMOTE step can sit between vectorizer and classifier.
pipeline = Pipeline(
    steps=[
        ("tfidf", _vectorizer),
        ("smote", _oversampler),
        ("clf", _classifier),
    ]
)

# Hyperparameter grid: n-gram span, minimum document frequency, and regularisation C.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "tfidf__min_df": [3, 5, 10],
    "clf__C": [0.01, 0.1, 1, 10],
}

# Shuffled, seeded 5-fold stratified splitter; reused later for the OOF evaluation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by F1.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

# Run the grid search under joblib's threading backend.
print("Starting grid-search (threading)...")
with parallel_backend("threading"):
    search.fit(X_train, y_train)

print("\n Best hyperparameters:", search.best_params_, "\n")


def _print_report(heading, features, targets):
    """Print *heading*, then the classification report of `search` on the given data."""
    print(heading)
    print(classification_report(targets, search.predict(features), digits=4))


# Report on the data the search was fitted on, then on the untouched hold-out split.
_print_report("Training set performance:", X_train, y_train)
_print_report("Hold-out set performance:", X_holdout, y_holdout)

# 5-fold out-of-fold (OOF) predictions, also run under the threading backend.
# Uses every row of df (not only X_train) and the same `cv` splitter as the grid search.
best_pipe = search.best_estimator_
print(" 5-fold OOF CV performance:")
with parallel_backend("threading"):
    y_oof = cross_val_predict(
        best_pipe,
        df["text"],
        df["label"],
        cv=cv,
        n_jobs=-1,
        method="predict"
    )
# NOTE(review): the hyperparameters were selected on X_train, which overlaps these
# folds, so this report is likely more optimistic than the hold-out one — confirm intent.
print(classification_report(df["label"], y_oof, digits=4))

# Top coeffs & save 
def print_top_coefs(pipe, top_n=10):
    """Print the features with the largest and smallest classifier coefficients.

    Args:
        pipe: Fitted pipeline exposing ``named_steps["tfidf"]`` (with
            ``get_feature_names_out()``) and ``named_steps["clf"]`` (with a
            ``coef_`` array whose first row holds the coefficients).
        top_n: How many features to list per class. ``0`` prints only the
            two headers.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    vec = pipe.named_steps["tfidf"]
    clf = pipe.named_steps["clf"]
    feats = np.asarray(vec.get_feature_names_out())
    coefs = clf.coef_[0]

    # Sort once; the two listings read this order from opposite ends.
    ascending = np.argsort(coefs)

    print(f"\nTop {top_n} features for class=1 (threat):")
    # Reverse first, then take a prefix: `[-top_n:]` would select *every*
    # feature when top_n == 0, because -0 == 0.
    for i in ascending[::-1][:top_n]:
        print(f"  {feats[i]:20s} {coefs[i]:.3f}")

    print(f"\nTop {top_n} features for class=0 (non-threat):")
    for i in ascending[:top_n]:
        print(f"  {feats[i]:20s} {coefs[i]:.3f}")

# Show the ten strongest features per class for the best pipeline found by the search.
print_top_coefs(best_pipe, top_n=10)

# Persist the fitted pipeline to the current working directory.
# NOTE(review): the pipeline references spacy_tokenizer, and pickle stores functions
# by name, so loading presumably requires spacy_tokenizer (and the nlp / stop_words
# globals it reads) to be defined in the loading session — confirm.
with open("best_threat_model_spacy_ner.pkl", "wb") as f:
    pickle.dump(best_pipe, f)
print("\n Pipeline saved as best_threat_model_spacy_ner.pkl")


## BERT-based Model

In [None]:
# DistilBERT fine-tuning with hold-out evaluation and 5-fold CV; text is pre-processed with spaCy lemmatization and NLTK stop-word filtering (no SMOTE step is applied in the code below)

# Imports 
import numpy as np
import pandas as pd
import nltk
import spacy
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import evaluate
import torch

# Download the NLTK stop words and load spaCy with parser and NER disabled
# (preprocess below only uses lemmas).
# NOTE(review): this rebinds the module-level `stop_words` and `nlp`. If the TF-IDF
# cell's spacy_tokenizer is called later in the same session, it would presumably see
# this NER-less pipeline and emit no ENT_* tokens — confirm, or use distinct names.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Quick dataframe info: row count and class balance
print(f"Rows: {len(df_sampled2)}")
print("Label counts:\n", df_sampled2['label'].value_counts(), "\n")

# Stratified 80/20 train / hold-out split with a fixed seed
X = df_sampled2['text']
y = df_sampled2['label']
X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#  Pre‑processing function 
def preprocess(batch):
    """Lemmatize and stop-word-filter every text in a batch.

    Args:
        batch: Mapping with a ``"text"`` entry holding an iterable of strings.

    Returns:
        ``{"text": [...]}`` where each string is the space-joined, lower-cased
        lemmas of the alphabetic tokens whose lemma is not in ``stop_words``.
    """

    def _clean(parsed):
        # Keep lower-cased lemmas of alphabetic, non-stop-word tokens
        kept = []
        for token in parsed:
            if not token.is_alpha:
                continue
            lemma = token.lemma_.lower()
            if lemma not in stop_words:
                kept.append(lemma)
        return " ".join(kept)

    # nlp.pipe streams the texts through spaCy in chunks of 32
    return {"text": [_clean(parsed) for parsed in nlp.pipe(batch["text"], batch_size=32)]}

# to_hf: spaCy → HF Dataset → Tokenize 
def to_hf(texts, labels):
    """Build a tokenized, torch-formatted HF Dataset from texts and labels.

    Args:
        texts: Object with ``.tolist()`` (e.g. a pandas Series) holding raw strings.
        labels: Object with ``.tolist()`` holding the matching labels.

    Returns:
        A Dataset exposing ``input_ids``, ``attention_mask`` and ``labels``
        in torch format; the raw ``text`` column is dropped.
    """
    frame = pd.DataFrame({"text": texts.tolist(), "label": labels.tolist()})
    raw = Dataset.from_pandas(frame, preserve_index=False)

    # Stage 1: spaCy lemmatization + stop-word removal
    cleaned = raw.map(preprocess, batched=True)
    # Stage 2: BERT tokenization
    encoded = cleaned.map(tokenize_fn, batched=True)

    # Rename the target column to "labels" and drop the raw text
    encoded = encoded.rename_column("label", "labels")
    encoded = encoded.remove_columns("text")
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded

# Tokenizer for the DistilBERT checkpoint, plus the batched encode function used by to_hf.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_fn(batch):
    """Encode ``batch["text"]``, padding/truncating every sequence to 256 tokens."""
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(batch["text"], **encode_options)

# Convert the train and hold-out splits into tokenized HF datasets.
hf_train, hf_hold = to_hf(X_train, y_train), to_hf(X_hold, y_hold)

# Load the evaluation metrics that compute_metrics reads as module-level globals.
accuracy, precision, recall, f1_score, roc_auc = [
    evaluate.load(_metric_name)
    for _metric_name in ("accuracy", "precision", "recall", "f1", "roc_auc")
]

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, F1 and ROC-AUC from raw logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may be a NumPy array
            (what ``Trainer`` and ``Trainer.predict`` supply) or a torch tensor,
            with one column per class.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``; precision/recall/F1 use binary averaging and ROC-AUC is
        computed from the class-1 probability.
    """
    logits, labels = eval_pred

    # torch.softmax only accepts tensors, and the logits arrive as NumPy arrays,
    # so convert first; go back to NumPy afterwards for np.argmax and the metrics.
    logit_tensor = torch.as_tensor(logits, dtype=torch.float32)
    probs = torch.softmax(logit_tensor, dim=-1).detach().cpu().numpy()
    preds = np.argmax(probs, axis=-1)

    return {
        "accuracy":  accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels, average="binary")["precision"],
        "recall":    recall.compute(predictions=preds, references=labels, average="binary")["recall"],
        "f1":        f1_score.compute(predictions=preds, references=labels, average="binary")["f1"],
        "roc_auc":   roc_auc.compute(prediction_scores=probs[:, 1], references=labels)["roc_auc"],
    }

# Fine-tune on the 80% training split and evaluate on the 20% hold-out split.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Hyperparameters for the hold-out run, gathered in one place.
_holdout_hparams = {
    "output_dir": "./holdout_bert",
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 32,
    "num_train_epochs": 3,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "logging_steps": 50,
    "save_steps": 500,
    "save_total_limit": 1,
}
training_args = TrainingArguments(**_holdout_hparams)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_hold,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# evaluate() scores the eval_dataset given above, i.e. the hold-out split.
holdout_metrics = trainer.evaluate()
print("\nHold-out metrics:", holdout_metrics)

# 5-fold out-of-fold (OOF) cross-validation on the full dataset.
# Each fold fine-tunes a fresh model from MODEL_NAME and predicts its validation
# rows; the predictions are stitched into oof_preds by original row position.
texts  = df_sampled2['text'].tolist()
labels = df_sampled2['label'].tolist()
skf    = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds    = np.zeros(len(labels), dtype=int)
fold_metrics = []

# Folds are numbered from 1 for display and for the per-fold output directory.
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels), 1):
    print(f"\n=== Fold {fold} ===")
    # Select this fold's train / validation texts and labels by index.
    tr_text = [texts[i] for i in train_idx]; tr_lab = [labels[i] for i in train_idx]
    vl_text = [texts[i] for i in val_idx];   vl_lab = [labels[i] for i in val_idx]
    # to_hf calls .tolist() on its arguments, hence the pd.Series wrappers.
    ds_tr = to_hf(pd.Series(tr_text), pd.Series(tr_lab))
    ds_vl = to_hf(pd.Series(vl_text), pd.Series(vl_lab))
    # Fresh model per fold so no weights carry over between folds.
    m = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    # Same settings as the hold-out run except 2 epochs (not 3) and no progress bars.
    args = TrainingArguments(
        output_dir=f"./cv_fold{fold}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=50,
        save_steps=500,
        save_total_limit=1,
        disable_tqdm=True,
    )
    tr = Trainer(
        model=m,
        args=args,
        train_dataset=ds_tr,
        eval_dataset=ds_vl,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    tr.train()
    # Predict the validation rows and store the argmax class at their original positions.
    out = tr.predict(ds_vl)
    preds = np.argmax(out.predictions, axis=-1)
    oof_preds[val_idx] = preds
    # Per-fold metrics, computed from the raw logits and true label ids.
    fm = compute_metrics((out.predictions, out.label_ids))
    print({k: f"{v:.4f}" for k,v in fm.items()})
    fold_metrics.append(fm)
    # NOTE(review): nothing releases m / tr between folds; if accelerator memory
    # grows across folds, consider freeing them here — confirm whether needed.

# Report over all rows from the stitched OOF predictions, then the mean of the fold metrics.
print("\n=== 5-Fold OOF Classification Report ===")
print(classification_report(labels, oof_preds, digits=4))
print("\nCV fold averages:", pd.DataFrame(fold_metrics).mean())
