In [5]:
import pandas as pd
import torch
from datasets     import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay


In [6]:
import pandas as pd
# Read the raw CSV, encode the text labels as integers ('good' -> 0, 'bad' -> 1),
# and keep only the two columns used downstream.
df = (
    pd.read_csv("../data/phishing_site_urls.csv")
    .assign(label=lambda raw: raw['Label'].map({'good': 0, 'bad': 1}))
    .loc[:, ['URL', 'label']]
)
df.head()




Unnamed: 0,URL,label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
3,mail.printakid.com/www.online.americanexpress....,1
4,thewhiskeydregs.com/wp-content/themes/widescre...,1


In [7]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label column, with a fixed random_state
# so the split is repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


In [8]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to encode the URLs below.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def tokenize(batch):
    """Encode the "URL" entries of `batch` with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a "URL" key holding the text(s) to encode.

    Returns:
        Whatever `tokenizer(...)` returns for those inputs.
    """
    urls = batch["URL"]
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(urls, **encoding_options)

# Wrap the pandas splits as Hugging Face datasets.
# preserve_index=False: train_df/test_df carry the shuffled index left over from
# train_test_split; without this flag it would be stored as an extra
# "__index_level_0__" column in each dataset.
ds = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test":  Dataset.from_pandas(test_df, preserve_index=False),
})

# Rename the target column to "labels", the name selected in set_format below.
ds = ds.rename_column("label", "labels")

# Tokenize in batches, then expose only the listed columns, as torch tensors.
ds = ds.map(tokenize, batched=True)
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 439476/439476 [00:09<00:00, 45896.54 examples/s]
Map: 100%|██████████| 109870/109870 [00:02<00:00, 44021.96 examples/s]


In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

def make_trainer(model_name, ds):
    """Build a `Trainer` for a 2-label sequence classifier loaded from `model_name`.

    Args:
        model_name: Checkpoint identifier passed to
            `AutoModelForSequenceClassification.from_pretrained`. Its last
            "/"-separated component names the checkpoint output directory.
        ds: Mapping with "train" and "test" entries; "train" is the training
            set and "test" is the evaluation set.

    Returns:
        A `Trainer` configured for 3 epochs with evaluation every epoch. It is
        given the module-level `tokenizer`, not one loaded from `model_name`.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    checkpoint_dir = f"./checkpoints/{model_name.split('/')[-1]}"
    training_args = TrainingArguments(
        output_dir=checkpoint_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        evaluation_strategy="epoch",
        logging_steps=50,
    )

    return Trainer(
        model=classifier,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        tokenizer=tokenizer,
    )

# Build one trainer per checkpoint; both share the same tokenized dataset.
distil_trainer = make_trainer(model_name="distilbert-base-uncased", ds=ds)
bert_trainer = make_trainer(model_name="bert-base-uncased", ds=ds)

# Train sequentially: DistilBERT first, then BERT.
distil_trainer.train()
bert_trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  if isinstance(self.fsdp_config.get("transformer_layer_cls_to_wrap", None), str):


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [14]:
!pip install accelerate --upgrade

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def eval_trainer(trainer):
    """Predict on `ds["test"]` with `trainer` and print a classification report.

    Args:
        trainer: Object whose `predict(dataset)` result exposes `predictions`
            and `label_ids`.

    Returns:
        Tuple `(true, preds)`: the reference label ids and the argmax of the
        predictions along axis 1.
    """
    prediction_output = trainer.predict(ds["test"])
    y_true = prediction_output.label_ids
    y_pred = np.argmax(prediction_output.predictions, axis=1)
    print(classification_report(y_true, y_pred))
    return y_true, y_pred

# Score both trainers on the same test split; the *_d / *_b pairs feed the plots below.
true_d, pred_d = eval_trainer(trainer=distil_trainer)
true_b, pred_b = eval_trainer(trainer=bert_trainer)


SyntaxError: invalid syntax (563591931.py, line 3)

In [None]:
# Visualise one attention head of the BERT model on the first test example.
model = bert_trainer.model.eval()

# get a batch: first test example, with a leading batch dimension of 1.
# The tensors are moved to the model's device because, after training, the model
# may live on an accelerator while the dataset tensors are on the CPU.
batch = ds["test"][0]
inputs = {
    k: batch[k].unsqueeze(0).to(model.device)
    for k in ["input_ids", "attention_mask"]
}

# forward pass with outputs; no_grad because this is inference only
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
attns = outputs.attentions  # tuple: one tensor per layer

# pick layer 0, head 0 (batch item 0); .cpu() is required before .numpy()
# whenever the tensor is not already on the CPU.
import matplotlib.pyplot as plt
plt.imshow(attns[0][0, 0].detach().cpu().numpy())
plt.title("Layer 0, Head 0")
plt.xlabel("Token position")
plt.ylabel("Token position")
plt.show()


In [None]:

# Confusion matrices, one figure per model, normalized over the true labels.
model_names = ("DistilBERT", "BERT")
label_pairs = ((true_d, pred_d), (true_b, pred_b))
for name, (t, p) in zip(model_names, label_pairs):
    disp = ConfusionMatrixDisplay.from_predictions(y_true=t, y_pred=p, normalize="true")
    disp.figure_.suptitle(f"{name} Confusion Matrix")

# Accuracy bar plot: one bar per model on a fixed 0-1 axis.
accs = [accuracy_score(true_d, pred_d), accuracy_score(true_b, pred_b)]
fig, ax = plt.subplots()
ax.bar(["DistilBERT", "BERT"], accs)
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparison")
ax.set_ylim(0, 1)
plt.show()
