In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

In [4]:
dataset = load_dataset("deepset/prompt-injections")

In [6]:
model_path = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)

id2label = {0: "Safe", 1: "Injection"}
label2id = {"Safe": 0, "Injection": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=2, 
                                                           id2label=id2label, 
                                                           label2id=label2id,)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Keep most model base parameters fixed, and only train the pooling layers and classification head on top
# Keep computational cost low while having some flexibility
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True
    

In [8]:
def preprocess(batch):
    return tokenizer(
      batch["text"],
      truncation=True,
      padding="max_length",
      max_length=256,
    )

tokenized_data = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/546 [00:00<?, ? examples/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

In [9]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [16]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 3

training_args = TrainingArguments(
    output_dir="injection-classifier",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.6252,0.708326,0.483,0.846
2,0.5937,0.627008,0.716,0.858
3,0.5793,0.645599,0.491,0.86




TrainOutput(global_step=105, training_loss=0.5994227091471355, metrics={'train_runtime': 421.0709, 'train_samples_per_second': 3.89, 'train_steps_per_second': 0.249, 'total_flos': 215491818866688.0, 'train_loss': 0.5994227091471355, 'epoch': 3.0})

In [18]:
pred_out = trainer.predict(tokenized_data["test"])
logits = pred_out.predictions       # shape (N, 2)
labels = pred_out.label_ids         # shape (N,)

# 2. Manually compute softmax → positive-class probs
#    subtract max for numerical stability
logits_max = np.max(logits, axis=1, keepdims=True)
exp_logits = np.exp(logits - logits_max)
probs = exp_logits[:, 1] / exp_logits.sum(axis=1)

# 3. Sweep thresholds 0→1 in 0.01 steps for best ACCURACY
best_acc, best_thr = 0.0, 0.5
for t in np.linspace(0, 1, 101):
    preds = (probs >= t).astype(int)
    acc = np.mean(preds == labels)
    if acc > best_acc:
        best_acc, best_thr = acc, t

print(f"Best accuracy {best_acc:.4f} at threshold {best_thr:.2f}")

# 4. (Optional) Sweep for best F1 too
best_f1, best_f1_thr = 0.0, 0.5
for t in np.linspace(0, 1, 101):
    preds = (probs >= t).astype(int)
    tp = np.sum((preds == 1) & (labels == 1))
    fp = np.sum((preds == 1) & (labels == 0))
    fn = np.sum((preds == 0) & (labels == 1))
    # skip if no positive predictions or no true positives
    if tp + fp == 0 or tp + fn == 0:
        continue
    prec = tp / (tp + fp)
    rec  = tp / (tp + fn)
    f1   = 2 * prec * rec / (prec + rec)
    if f1 > best_f1:
        best_f1, best_f1_thr = f1, t

print(f"Best F1      {best_f1:.4f} at threshold {best_f1_thr:.2f}")



Best accuracy 0.7931 at threshold 0.47
Best F1      0.8092 at threshold 0.43
