# Phishing URL Detection by Fine-Tuning DistilBERT
---

In [None]:
# display resources
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Dec  1 20:11:33 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Install Libraries

In [None]:
%pip install transformers datasets evaluate accelerate pipeline bitsandbytes
%pip install torch torchdata
%pip install peft
%pip install loralib
%pip install huggingface_hub

In [None]:
import pandas as pd
import numpy as np
import random
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding
)
import torch
import evaluate
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel,
    PeftConfig,
)
from huggingface_hub import login

In [None]:
# Authenticate with the Hugging Face Hub (`login` comes from huggingface_hub, imported above)
login()

In [None]:
# Directory where the trained adapter weights are saved
DIR_MODEL = "/content/drive/MyDrive/Colab Notebooks/fine-tuning-llm/malware_detection/peft/models/"

# Run on the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

### Fine Tuning Dataset: kmack/Phishing_urls
+ [Phishing_urls](https://huggingface.co/datasets/kmack/Phishing_urls) => available from HuggingFace

In [None]:
# Seed and subset sizes used to build the fine-tuning splits
SEED = 1985
N_TRAIN, N_TEST, N_VAL = 5000, 1000, 100

data_urls = load_dataset("kmack/Phishing_urls")

# Load training data
data_train = data_urls['train']

# Split the published test split 70/30: the larger part is used for testing,
# the smaller part for validation during training
data_test = data_urls['test'].train_test_split(test_size=0.3, seed=SEED)

def _subset(split, n_rows):
  """ Shuffle a split with the fixed seed and keep at most n_rows examples """
  # clamp so a split with fewer rows than requested does not raise
  return split.shuffle(seed=SEED).select(range(min(n_rows, split.num_rows)))

train = _subset(data_train, N_TRAIN)
test = _subset(data_test['train'], N_TEST)
validate = _subset(data_test['test'], N_VAL)


### Base Model
+ The DistilBERT base model (uncased, i.e. case-insensitive) was fine-tuned on the Phishing_urls dataset to improve URL classification.



In [None]:
# DistilBERT base checkpoint and its tokenizer
base_model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# classification mappings (class id <-> class name)
id2label = {0:"Negative",1:"Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# keyword arguments shared by both copies of the classifier
classifier_kwargs = dict(
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.bfloat16,
)

# base model for training
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, **classifier_kwargs)
base_model = base_model.to(DEVICE)

# original model for evaluation (kept untouched as the comparison baseline)
original_model = AutoModelForSequenceClassification.from_pretrained(base_model_name, **classifier_kwargs)
original_model = original_model.to(DEVICE)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preprocessing
+ Preprocessing is required to tokenize the inputs and standardize the length of each URL.
+ Steps:
  + Tokenize each URL
  + Standardize URL length: a combination of truncation and padding is used so that every tokenized URL in a batch has the same length.
  + The DataCollatorWithPadding class from HuggingFace is used to pad each batch automatically during training.

In [None]:
# Example raw url string
tokens = tokenizer('thecanadianencyclopedia.com/index.cfm?PgNm=TCE&Params=A1ARTA0006086', truncation=True)

print(tokens)

{'input_ids': [101, 1996, 28621, 11692, 11916, 20464, 24174, 2401, 1012, 4012, 1013, 5950, 1012, 12935, 2213, 1029, 18720, 2078, 2213, 1027, 22975, 2063, 1004, 11498, 5244, 1027, 17350, 8445, 2050, 8889, 2692, 16086, 20842, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
def preprocess(examples):
  """ Run the tokenizer over examples['text'] with truncation enabled and return its output """
  return tokenizer(examples['text'], truncation=True)

# tokenize every example in the train, test and validate datasets (in batches)
tokenized_train, tokenized_test, tokenized_val = (
    split.map(preprocess, batched=True) for split in (train, test, validate)
)

### Generate Responses
+ Steps
	+ Tokenize the URL
	+ Run the model on the tokenized input
	+ Extract the logits
	+ Infer the classification from the maximum logit value
	+ If verbose=True, print the URL, the decoded classification, and the human label

In [None]:
def get_response(example, model, tokenizer, verbose=False):
  """ Classify a single example and return the predicted class id.

  Args:
    example: mapping with a 'text' entry (and a 'label' entry when verbose is set)
    model: torch module whose forward output exposes `.logits`
    tokenizer: callable that encodes the text into the model's keyword inputs
    verbose: when true, also print the input text, the decoded prediction and the label

  Returns:
    The index of the largest logit for the example, as a Python int.
  """
  # tokenize the input text and move it to the device the model lives on
  encoded_input = tokenizer(example['text'], return_tensors="pt", truncation=True, padding=True)
  encoded_input = encoded_input.to(next(model.parameters()).device)

  # run inference in eval mode without autograd, then restore the previous mode
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      logits = model(**encoded_input).logits
  finally:
    model.train(was_training)

  # classify: arg-max over the class dimension of the (single) example
  prediction = int(torch.argmax(logits, dim=-1)[0])

  # print a summary
  if verbose:
    # decode the prediction
    decoded_output = id2label[prediction]
    print("Input Text")
    print("="*100)
    print(example['text'])
    print("="*100)
    print(f"Prediction: {decoded_output} | Label: {id2label[example['label']]}")

  # always return the class id so callers get a value in both modes
  return prediction

In [None]:
def calc_training_metrics(pred):
  """ Compute the F1 score for one evaluation pass during training

  `pred` unpacks into (logits, labels); the result is a dict with the single key 'f1'.
  """
  f1_metric = evaluate.load('f1')

  # the prediction object unpacks into the raw logits and the reference labels
  logits, labels = pred

  # the predicted class is the position of the largest logit
  predicted_classes = np.argmax(logits, axis=-1)

  result = f1_metric.compute(predictions=predicted_classes, references=labels)
  return {'f1': result['f1']}

### Training with Parameter Efficient Fine Tuning (PEFT) -> LoRA
+ Steps
	+ Define the LoRA parameters in the LoraConfig object
	+ Prepare the PEFT model from the base model + LoRA config object
	+ View the number of trainable parameters in the PEFT model


In [None]:
# LoRA hyper-parameters
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, # text classification
    target_modules=["q_lin"], # modules that receive adapters
    r=8, # dimension of adaptors, rank
    lora_alpha=16, # alpha scaling
    lora_dropout=0.05,
)

# Wrap the base model with the LoRA config and report the trainable parameter count
peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 665,858 || all params: 67,620,868 || trainable%: 0.9847


#### Training
This project aimed to demonstrate how to fine-tune LLMs for specific tasks using public datasets. As the focus was not on performance, no attempt at hyperparameter tuning was undertaken. In most instances, the default hyperparameter values were used
+ **Key Parameters**
+ output_dir - location to save trained adaptor weights
+ learning_rate - set to 1e-3
+ auto_find_batch_size - set to True so that a workable batch size is found automatically
+ Evaluation and checkpoint saving occur after each epoch; the training loss is logged at every step
+ load_best_model_at_end - set to true to capture the best model from the epoch training
+ The data collator is used to automatically pad the text to the longest sequence in each batch

In [None]:
# Data collator: pads each batch dynamically so every sequence in a batch has the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# output directory passed to the Trainer
DIR_TRAIN = "./training_output"

# training config
# NOTE(review): load_best_model_at_end is set without metric_for_best_model, so the
# "best" checkpoint is presumably chosen by validation loss rather than F1 — confirm this is intended.
config_training = TrainingArguments(
    output_dir=DIR_TRAIN,
    num_train_epochs=10,
    learning_rate=1e-3,
    auto_find_batch_size=True,
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer: trains on the tokenized training set, evaluates on the tokenized validation set
trainer = Trainer(
    model=peft_model,
    args=config_training,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=calc_training_metrics,
)

# train
trainer.train()

# save adaptor weights
trainer.save_model(DIR_MODEL)
# peft_model.push_to_hub(hub_name)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mashrafamit9227[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,F1
1,0.7188,0.441094,0.807692
2,0.3086,0.517344,0.808081
3,0.4844,0.530117,0.804124
4,0.4902,0.516699,0.823529
5,0.0327,0.517559,0.811881
6,0.0334,0.644609,0.783505
7,0.0032,0.646895,0.791667
8,0.0417,0.72521,0.795918
9,0.013,0.745674,0.8
10,0.1191,0.760913,0.795918


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

#### Merge Base Model & Adapters
+ The trained LoRA adapters must be loaded on top of the base model
+ The resulting model consists of the base model plus the trained adapters


In [None]:
# attach the trained LoRA adapters saved in DIR_MODEL to the base model
# NOTE(review): base_model was already passed through get_peft_model before training;
# consider loading a fresh copy of the base checkpoint here — confirm.
# NOTE(review): confirm that the installed PEFT version honours torch_dtype in this call.
tuned_model = PeftModel.from_pretrained(
    base_model,
    DIR_MODEL, # LoRA adapters
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    is_trainable=False
  )

### Evaluate Model Performance
+ This is a supervised binary classification task (we have the ground truth labels). Therefore, a classification accuracy measure can be used. The F1 score was selected to balance precision and recall




In [None]:
def evaluate_model(test_indexes, data, model, tokenizer):
  """ Classify every selected example and pair each prediction with its label

  Returns a DataFrame with one row per index, holding idx, label and pred.
  """
  def classify(idx):
    # look up the example, read its human label, then generate the classification
    example = data[idx]
    return {
        'idx': idx,
        'label': example['label'],
        'pred': get_response(example, model, tokenizer, verbose=False),
    }

  return pd.DataFrame([classify(idx) for idx in test_indexes])

In [None]:
# `random` and `evaluate` are already imported at the top of the notebook
f1 = evaluate.load("f1")

# Select a reproducible random sample of the test dataset
EVAL_SEED = 1985
num_samples = test.num_rows  # every row is eligible, including the last one
num_to_test = min(500, num_samples)
# a dedicated seeded generator keeps the sample stable across runs
test_indexes = random.Random(EVAL_SEED).sample(range(num_samples), num_to_test)

def report_f1(name, model):
  """ Evaluate a model on the sampled test rows, print its F1 score and return (results, score) """
  df = evaluate_model(test_indexes, test, model, tokenizer)
  score = f1.compute(predictions=df['pred'], references=df['label'])['f1']
  print(f"{name} F1 Score: {score*100:,.2f}%")
  return df, score

# Evaluate the Base Model
df_base, f1_base = report_f1("Base Model", original_model)

# Evaluate the Tuned Model
df_tuned, f1_tuned = report_f1("Tuned Model", tuned_model)

Base Model F1 Score: 57.88%
Tuned Model F1 Score: 84.23%


### PEFT/LoRA fine-tuning increased the F1 score from about 58% to 84%

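> Note: the base model's classification head is newly initialised (see the warning printed when the model was loaded), so its score reflects an untrained classifier.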



In [None]:
# Write the installed package versions to requirements.txt
!pip freeze > requirements.txt