# PhishHook - A Phishing URL Detector
#### By: Aryaan Khan and Bradley Lewis

## Import Libraries

In [4]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

from url_parser import extract_url_features
from phishhooknet import PhishHookNet

## Load the Data

In [5]:
# Load the dataset
dataset = pd.read_csv("phishing_data.csv")

# Assuming extract_url_features is already defined and properly imported
features = [extract_url_features(url) for url in dataset['url']]
features_df = pd.DataFrame(features)

# Assuming the label column is named 'Label' and needs conversion from text to binary
labels = (dataset['status'] == 'phishing').astype(int)

## Create the Dataloader

In [6]:
# Check if MPS is available
if torch.backends.mps.is_available():
    print("MPS is available!")
    # Set the device to MPS
    device = torch.device("mps")
elif torch.cuda.is_available():
    print("CUDA is available!")
    # Set the device to CUDA
    device = torch.device("cuda")
else:
    print("CUDA not available. Using CPU instead.")
    # Set the device to CPU
    device = torch.device("cpu")

# Initially split the data into temporary training data and final test data
X_temp, X_test, y_temp, y_test = train_test_split(features_df, labels, test_size=0.2)

# Split the temporary training data into final training data and validation data
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25)  # 0.25 x 0.8 = 0.2

# Convert data to tensors and transfer to the specified device
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)

# Create Tensor datasets for all sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders for all datasets
train_loader = DataLoader(train_dataset, batch_size=30000, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=10000, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=10000, shuffle=False)

# Confirm the DataLoader details
print(train_loader.dataset.tensors[0].shape, train_loader.dataset.tensors[1].shape)
print(val_loader.dataset.tensors[0].shape, val_loader.dataset.tensors[1].shape)
print(test_loader.dataset.tensors[0].shape, test_loader.dataset.tensors[1].shape)

MPS is available!
torch.Size([6888, 28]) torch.Size([6888, 1])
torch.Size([2296, 28]) torch.Size([2296, 1])
torch.Size([2297, 28]) torch.Size([2297, 1])


## Define the Training and Validation Loops

In [7]:
# Function to train the model
def train(model, train_loader, optimizer, criterion):
    model.train()  # Set the model to training mode
    total_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # Clear gradients
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * inputs.size(0)
    return total_loss / len(train_loader.dataset)

# Function to validate the model
def validate(model, val_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    total = 0
    correct = 0
    with torch.no_grad():  # No need to track gradients
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            predicted = outputs.round()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    avg_loss = total_loss / len(val_loader.dataset)
    accuracy = correct / total * 100
    return avg_loss, accuracy

## Define the Early Stopping Condition Class

In [8]:
class EarlyStopping:
    def __init__(self, patience=5, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'phishing_url_model.pth')
        self.val_loss_min = val_loss


## Train the Model

In [9]:
# Assuming the model and dataset are already defined
model = PhishHookNet(input_size=X_train.shape[1]).to(device)  # Adjust input size based on actual features

# Loss function, optimizer, and early stopping
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
early_stopping = EarlyStopping(patience=500, verbose=True, delta=0.001)

# Training loop with model saving based on validation loss improvement
num_epochs = 500
best_val_loss = float('inf')
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss, accuracy = validate(model, val_loader, criterion)
    
    # Early stopping
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    print(f'Epoch {epoch+1}: train_loss = {train_loss:.4f}, val_loss = {val_loss:.4f}')

Validation loss decreased (inf --> 6.025760).  Saving model ...
Epoch 1: train_loss = 0.8758, val_loss = 6.0258
Validation loss decreased (6.025760 --> 1.221867).  Saving model ...
Epoch 2: train_loss = 5.9426, val_loss = 1.2219
Validation loss decreased (1.221867 --> 0.680582).  Saving model ...
Epoch 3: train_loss = 1.1111, val_loss = 0.6806
EarlyStopping counter: 1 out of 500
Epoch 4: train_loss = 0.6793, val_loss = 0.7067
EarlyStopping counter: 2 out of 500
Epoch 5: train_loss = 0.7062, val_loss = 0.7010
EarlyStopping counter: 3 out of 500
Epoch 6: train_loss = 0.7017, val_loss = 0.6944
EarlyStopping counter: 4 out of 500
Epoch 7: train_loss = 0.6952, val_loss = 0.6930
EarlyStopping counter: 5 out of 500
Epoch 8: train_loss = 0.6935, val_loss = 0.6930
EarlyStopping counter: 6 out of 500
Epoch 9: train_loss = 0.6935, val_loss = 0.6930
EarlyStopping counter: 7 out of 500
Epoch 10: train_loss = 0.6936, val_loss = 0.6930
EarlyStopping counter: 8 out of 500
Epoch 11: train_loss = 0.6936

## Evaluate the Model

In [10]:
# Load the best model back
model.load_state_dict(torch.load('phishing_url_model.pth'))

# Evaluate the model on the test dataset
model.eval()

# Calculate accuracy
train_accuracy = validate(model, train_loader, criterion)[1]
val_accuracy = validate(model, val_loader, criterion)[1]
test_accuracy = validate(model, test_loader, criterion)[1]

# Print the results
print(f'Training accuracy: {train_accuracy:.2f}%')
print(f'Validation accuracy: {val_accuracy:.2f}%')
print(f'Test accuracy: {test_accuracy:.2f}%')

Training accuracy: 89.00%
Validation accuracy: 85.93%
Test accuracy: 85.50%
