# PhishHook - A Phishing URL Detector
#### By: Aryaan Khan and Bradley Lewis

## Import Libraries

In [37]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

from urllib.parse import urlparse, parse_qs

## Load the Data

In [38]:
# Define the features to keep, based on the extract_url_features function output
features_to_keep = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 
    'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 
    'nb_slash', 'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 
    'nb_space', 'nb_www', 'nb_com', 'nb_dslash', 'status'  # 'status' is the target variable
]

print(len(features_to_keep))

# Read the data from the CSV file
phishing_data = pd.read_csv("phishing_data.csv")

# Drop columns not in the features_to_keep list
phishing_data = phishing_data[features_to_keep]

# Replace string values with corresponding integer for columns known to require it (if any)
# This step may need to be adjusted based on the actual data values in these columns
phishing_data['nb_hyphens'] = phishing_data['nb_hyphens'].replace({'zero': 0, 'one': 1}).astype(int)

# Convert the target values to binary
phishing_data['status'] = (phishing_data['status'] == 'phishing').astype(int)

# Check for any other columns that might have inconsistent types and convert them
for column in phishing_data.columns:
    if phishing_data[column].dtype  == 'object':
        phishing_data[column] = pd.to_numeric(phishing_data[column], errors='coerce')

# Fill NA values with the mean of each column
phishing_data.fillna(phishing_data.mean(), inplace=True)

# Convert all columns to float64
phishing_data = phishing_data.astype(float)

# Display the info to confirm all types are now float64 and only desired columns are retained
phishing_data.info()

24
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11481 entries, 0 to 11480
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   length_url       11481 non-null  float64
 1   length_hostname  11481 non-null  float64
 2   ip               11481 non-null  float64
 3   nb_dots          11481 non-null  float64
 4   nb_hyphens       11481 non-null  float64
 5   nb_at            11481 non-null  float64
 6   nb_qm            11481 non-null  float64
 7   nb_and           11481 non-null  float64
 8   nb_or            11481 non-null  float64
 9   nb_eq            11481 non-null  float64
 10  nb_underscore    11481 non-null  float64
 11  nb_tilde         11481 non-null  float64
 12  nb_percent       11481 non-null  float64
 13  nb_slash         11481 non-null  float64
 14  nb_star          11481 non-null  float64
 15  nb_colon         11481 non-null  float64
 16  nb_comma         11481 non-null  float64
 17  nb_semico

## Create the Dataloader

In [39]:
# Split the data into features and target variable
X = phishing_data.drop('status', axis=1).values
y = phishing_data['status'].values

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the datasets into PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

# Create Tensor datasets
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Confirm the DataLoader details
train_loader.dataset.tensors[0].shape, train_loader.dataset.tensors[1].shape

(torch.Size([9184, 23]), torch.Size([9184, 1]))

## Define the Neural Network

In [40]:
class PhishHookNet(nn.Module):
    def __init__(self, input_size):
        super(PhishHookNet, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        # self.dropout1 = nn.Dropout(0.5)
        self.layer2 = nn.Linear(128, 64)
        # self.dropout2 = nn.Dropout(0.5)
        self.output = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        # x = self.dropout1(x)
        x = torch.relu(self.layer2(x))
        # x = self.dropout2(x)
        x = torch.sigmoid(self.output(x))
        return x

## Define the Training and Validation Loops

In [41]:
# Function to train the model
def train(model, train_loader, optimizer, criterion):
    model.train()  # Set the model to training mode
    total_loss = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # Clear gradients
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * inputs.size(0)
    return total_loss / len(train_loader.dataset)

# Function to validate the model
def validate(model, val_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No need to track gradients
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
    avg_loss = total_loss / len(val_loader.dataset)
    return avg_loss

## Define the Early Stopping Condition Class

In [42]:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'phishing_url_model.pth')
        self.val_loss_min = val_loss


## Train the Model

In [43]:
# Assuming the model and dataset are already defined
model = PhishHookNet(input_size=X_train.shape[1])  # Adjust input size based on actual features

# Loss function, optimizer, and early stopping
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01) 
early_stopping = EarlyStopping(patience=15, verbose=True)

# Training loop with model saving based on validation loss improvement
num_epochs = 100
best_val_loss = float('inf')
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = validate(model, val_loader, criterion)
    
    # Early stopping
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    print(f'Epoch {epoch+1}: train_loss = {train_loss:.4f}, val_loss = {val_loss:.4f}')

# Load the best model back
model.load_state_dict(torch.load('phishing_url_model.pth'))

Validation loss decreased (inf --> 0.544209).  Saving model ...
Epoch 1: train_loss = 0.6174, val_loss = 0.5442
Validation loss decreased (0.544209 --> 0.482140).  Saving model ...
Epoch 2: train_loss = 0.5059, val_loss = 0.4821
Validation loss decreased (0.482140 --> 0.423185).  Saving model ...
Epoch 3: train_loss = 0.4652, val_loss = 0.4232
EarlyStopping counter: 1 out of 15
Epoch 4: train_loss = 0.4489, val_loss = 0.4281
EarlyStopping counter: 2 out of 15
Epoch 5: train_loss = 0.4496, val_loss = 0.4353
EarlyStopping counter: 3 out of 15
Epoch 6: train_loss = 0.4376, val_loss = 0.4360
Validation loss decreased (0.423185 --> 0.411403).  Saving model ...
Epoch 7: train_loss = 0.4345, val_loss = 0.4114
Validation loss decreased (0.411403 --> 0.407022).  Saving model ...
Epoch 8: train_loss = 0.4396, val_loss = 0.4070
EarlyStopping counter: 1 out of 15
Epoch 9: train_loss = 0.4346, val_loss = 0.4639
EarlyStopping counter: 2 out of 15
Epoch 10: train_loss = 0.4379, val_loss = 0.4341
Earl

<All keys matched successfully>

## Extract URL Features

In [44]:
def extract_url_features(url):
    features = {}
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)

    # Basic features from URL components
    features['length_url'] = len(url)
    features['length_hostname'] = len(parsed_url.netloc)
    features['ip'] = 1 if parsed_url.hostname and parsed_url.hostname.replace('.', '').isdigit() else 0
    features['nb_dots'] = url.count('.')
    features['nb_hyphens'] = url.count('-')
    features['nb_at'] = url.count('@')
    features['nb_qm'] = url.count('?')
    features['nb_and'] = url.count('&')
    features['nb_or'] = url.count('|')
    features['nb_eq'] = len(query_params)
    features['nb_underscore'] = url.count('_')
    features['nb_tilde'] = url.count('~')
    features['nb_percent'] = url.count('%')
    features['nb_slash'] = url.count('/')
    features['nb_star'] = url.count('*')
    features['nb_colon'] = url.count(':')
    features['nb_comma'] = url.count(',')
    features['nb_semicolumn'] = url.count(';')
    features['nb_dollar'] = url.count('$')
    features['nb_space'] = url.count(' ')
    features['nb_www'] = parsed_url.netloc.count('www')
    features['nb_com'] = parsed_url.netloc.count('.com')
    features['nb_dslash'] = url.count('//')

    return features

## Test the Model with A Different Dataset

In [45]:
# Load the dataset
test_dataset = pd.read_csv("test_phishing_data.csv")

def preprocess_data(dataset):
    # Convert labels from 'good'/'bad' to 0/1
    dataset['Label'] = (dataset['Label'] == 'bad').astype(int)

    # Extract features for each URL
    features = [extract_url_features(url) for url in dataset['URL']]
    feature_df = pd.DataFrame(features)

    return feature_df, dataset['Label']

features, labels = preprocess_data(test_dataset)

# Convert to tensors
features_tensor = torch.tensor(features.values, dtype=torch.float32)
labels_tensor = torch.tensor(labels.values, dtype=torch.float32).view(-1, 1)

# Create dataset and dataloader
test_dataset = TensorDataset(features_tensor, labels_tensor)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predicted = outputs.round()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

# Evaluate the model on the test dataset
model.eval()

# Calculate accuracy
accuracy = evaluate_model(model, test_loader)
print(f'Accuracy of the model on the new dataset: {accuracy * 100:.2f}%')

Accuracy of the model on the new dataset: 64.35%
