**Mount Google Drive**

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive. Every dataset, plot and checkpoint
# path used later in this notebook is located under '/content/drive/MyDrive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import Necessary Libraries**

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from tqdm import tqdm
import time
import seaborn as sns
import shutil

1. Load Preprocessed Tensors

In [None]:
# The three preprocessed tensors are stored side by side in one Drive folder;
# only the file name differs between them.
_PREPROCESSING_DIR = '/content/drive/MyDrive/Thesis - Undergraduate Ch./Preprocessing'

X_train_tensor = torch.load(f'{_PREPROCESSING_DIR}/X_train_7000_tensor_most.pt')  # training inputs
X_test_tensor = torch.load(f'{_PREPROCESSING_DIR}/X_test_1500_tensor_most.pt')    # test inputs
y_test_tensor = torch.load(f'{_PREPROCESSING_DIR}/y_test_1500_tensor_most.pt')    # test labels

print("Preprocessed tensors loaded successfully!")

In [None]:
# Report the shape of every loaded tensor, one line per tensor
for tensor_name, tensor in (
    ("X_train", X_train_tensor),
    ("X_test", X_test_tensor),
    ("y_test", y_test_tensor),
):
    print(f"{tensor_name} shape: {tensor.shape}")

2. Define VAE-CNN Architecture

In [None]:
#@title Vocabulary = 87 (Dataset 10000: 7000 Legitimate & 1500 Legitimate + 1500 Phishing)
class VAE_CNN(nn.Module):
    """Convolutional variational autoencoder for single-channel inputs of size 100 x 87.

    The encoder applies three stride-2 convolutions (each followed by batch norm and ReLU),
    shrinking a (N, 1, 100, 87) batch to (N, 128, 12, 10). Two linear heads map the flattened
    features to the mean and log-variance of the latent Gaussian. The decoder mirrors the
    encoder with transposed convolutions and returns a (N, 1, 100, 87) reconstruction.

    Args:
        latent_dim: Size of the latent vector (default 4).
    """

    @staticmethod
    def _down_block(in_channels, out_channels):
        """Layers of one encoder stage: stride-2 conv (halves H and W), batch norm, ReLU."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]

    @staticmethod
    def _up_block(in_channels, out_channels):
        """Layers of one decoder stage: stride-2 transposed conv (doubles H and W), batch norm, ReLU."""
        return [
            nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]

    def __init__(self, latent_dim=4):
        super().__init__()

        # Encoder. The layers are sized for a (1, 100, 87) input:
        # 100x87 -> 50x43 -> 25x21 -> 12x10 spatially, 1 -> 32 -> 64 -> 128 channels.
        self.encoder = nn.Sequential(
            *self._down_block(1, 32),
            *self._down_block(32, 64),
            *self._down_block(64, 128),
        )

        # Number of features produced by the encoder for one sample: 128 channels of 12 x 10
        self.flattened_size = 128 * 12 * 10

        # Latent heads (mean and log-variance) and the projection back to the feature map
        self.fc_mu = nn.Linear(self.flattened_size, latent_dim)
        self.fc_logvar = nn.Linear(self.flattened_size, latent_dim)
        self.fc_decode = nn.Linear(latent_dim, self.flattened_size)

        # Decoder. The last layer uses a (5, 8) kernel with output padding so that the
        # 48x40 feature map lands exactly on the expected (1, 100, 87) output.
        self.decoder = nn.Sequential(
            *self._up_block(128, 64),  # [64,24,20]
            *self._up_block(64, 32),   # [32,48,40]
            nn.ConvTranspose2d(32, 1, kernel_size=(5, 8), stride=2, padding=0, output_padding=(1, 1)),  # [1,100,87]
        )

    def encode(self, x):
        """Return the latent mean and log-variance, each of shape (N, latent_dim), for batch `x`."""
        features = self.encoder(x)
        flat = features.view(features.size(0), -1)
        return self.fc_mu(flat), self.fc_logvar(flat)

    def reparameterize(self, mu, logvar):
        """Draw z = mu + sigma * eps with eps ~ N(0, I), keeping the sample differentiable."""
        sigma = torch.exp(0.5 * logvar)
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def decode(self, z):
        """Map latent vectors `z` of shape (N, latent_dim) back to (N, 1, 100, 87) reconstructions."""
        projected = self.fc_decode(z)
        feature_map = projected.view(-1, 128, 12, 10)
        return self.decoder(feature_map)

    def forward(self, x):
        """Encode, sample and decode; returns (reconstruction, mu, logvar)."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

# Display a summary of the model architecture.
# torchsummary's `device` argument defaults to "cuda", so on a GPU runtime it builds its dummy
# input on the GPU. The model must therefore live on that same device and the device must be
# passed explicitly; otherwise a CPU-resident model receives CUDA tensors and the forward pass fails.
summary_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VAE_CNN().to(summary_device)
# input_size excludes the batch dimension: (channels, height, width) = (C, H, W)
summary(model, input_size=[(1, 100, 87)], batch_size=1, device=summary_device.type)

4. Train Model

In [None]:
#@title With Hyperparameter Tuning (10,000) - Retrained
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reproducibility: weight initialisation, DataLoader shuffling and the VAE's latent sampling
# all draw from torch's random number generator, so seed it before any model is created.
# (Some GPU kernels may still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

# Hyperparameter candidates (every combination is trained by the loop below)
latent_dims = [4, 8, 16]
learning_rates = [0.001]
batch_sizes = [16, 32]
num_epochs = 20
optimizers = ['Adam']

# Loss function: squared error summed over every element of the batch
mse_loss = nn.MSELoss(reduction='sum')

# VAE loss function (Reconstruction Loss + KL Divergence Loss)
def vae_loss(recon_x, x, mu, logvar):
    """Compute the VAE objective for one batch.

    Args:
        recon_x: Reconstruction produced by the model, same shape as `x`.
        x: Original input batch.
        mu: Latent means.
        logvar: Latent log-variances, same shape as `mu`.

    Returns:
        Tuple `(total_loss, reconstruction_loss, kl_loss)` of scalar tensors, where
        `total_loss = reconstruction_loss + kl_loss`. Both terms are summed over the batch
        (not averaged), so callers divide by the number of samples to get per-sample values.
    """
    # Summed squared error, computed with the functional API so the function does not depend
    # on a loss object defined elsewhere in the notebook.
    reconstruction_loss = F.mse_loss(recon_x, x, reduction='sum')
    # Closed-form KL divergence between N(mu, exp(logvar)) and the standard normal prior
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    total_loss = reconstruction_loss + kl_loss
    return total_loss, reconstruction_loss, kl_loss

# Store all training results
results = []


def _build_optimizer(opt_name, parameters, lr):
    """Create the torch.optim optimizer whose class is named `opt_name` (e.g. 'Adam').

    Raises:
        ValueError: if `opt_name` is not the name of an optimizer class in torch.optim.
    """
    optimizer_cls = getattr(optim, opt_name, None)
    if not (isinstance(optimizer_cls, type) and issubclass(optimizer_cls, optim.Optimizer)):
        raise ValueError(f"Unknown optimizer '{opt_name}': expected the name of a torch.optim optimizer class")
    return optimizer_cls(parameters, lr=lr)


# Loop through each combination of hyperparameters
for latent_dim in latent_dims:
    for lr in learning_rates:
        for batch_size in batch_sizes:
            for opt_name in optimizers:
                print(f"\n[Training] latent_dim: {latent_dim}, lr: {lr}, batch_size: {batch_size}, optimizer: {opt_name}")

                # Create DataLoader for each batch size
                train_loader = DataLoader(TensorDataset(X_train_tensor), batch_size=batch_size, shuffle=True)
                num_samples = len(train_loader.dataset)

                # Initialize model and optimizer.
                # The optimizer is built from `opt_name` so that the label, plot and checkpoint
                # names written below always describe the optimizer that was really used.
                model = VAE_CNN(latent_dim=latent_dim).to(device)
                optimizer = _build_optimizer(opt_name, model.parameters(), lr)

                # Per-epoch, per-sample averages of each loss term
                train_loss_history = []
                recon_loss_history = []
                kl_loss_history = []

                # Start training timer
                start_time = time.time()

                # Training loop
                for epoch in tqdm(range(num_epochs), desc=f"Training {latent_dim}, lr {lr}, bs {batch_size}, opt {opt_name}"):
                    model.train()
                    total_loss, total_recon_loss, total_kl_loss = 0, 0, 0

                    for batch in train_loader:
                        # TensorDataset yields 1-tuples; the inputs are the only element
                        x_batch = batch[0].to(device)

                        optimizer.zero_grad()
                        recon_batch, mu, logvar = model(x_batch)
                        loss, recon_loss, kl = vae_loss(recon_batch, x_batch, mu, logvar)
                        loss.backward()
                        optimizer.step()

                        total_loss += loss.item()
                        total_recon_loss += recon_loss.item()
                        total_kl_loss += kl.item()

                    # The losses are summed over each batch, so dividing by the dataset size
                    # gives the average loss per sample for this epoch.
                    avg_loss = total_loss / num_samples
                    avg_recon_loss = total_recon_loss / num_samples
                    avg_kl_loss = total_kl_loss / num_samples

                    train_loss_history.append(avg_loss)
                    recon_loss_history.append(avg_recon_loss)
                    kl_loss_history.append(avg_kl_loss)

                    tqdm.write(f"Epoch [{epoch+1}/{num_epochs}] - kl: {avg_kl_loss:.4f} - loss: {avg_loss:.4f} - recon: {avg_recon_loss:.4f}")

                # End training timer
                end_time = time.time()
                duration = end_time - start_time
                print(f"Training duration: {duration/60:.2f} minutes")

                # Save results
                results.append({
                    'latent_dim': latent_dim,
                    'batch_size': batch_size,
                    'optimizer': opt_name,
                    'learning_rate': lr,
                    'label': f'latent{latent_dim}_lr{lr}_bs{batch_size}_{opt_name}',
                    'train_loss_history': train_loss_history,
                    'duration': duration
                })

                # Plot individual loss curves
                plt.figure(figsize=(12,6))
                plt.plot(train_loss_history, label='Total Loss')
                plt.plot(recon_loss_history, label='Reconstruction Loss')
                plt.plot(kl_loss_history, label='KL Divergence Loss')
                plt.xlabel('Epoch')
                plt.ylabel('Loss')
                plt.title(f'Loss Curves\nlatent_dim={latent_dim}, lr={lr}, batch_size={batch_size}, optimizer={opt_name}')
                plt.legend()
                plt.grid()

                # Save plot
                plot_path = f"/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Plots/Fixed/loss_latent{latent_dim}_lr{lr}_bs{batch_size}_opt{opt_name}.png"
                plt.savefig(plot_path)
                plt.show()
                plt.close()

                print(f"Plot saved: {plot_path}")

                # Save the model
                model_path = f"/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Training Results/Fixed/vae_cnn_latent{latent_dim}_lr{lr}_bs{batch_size}_opt{opt_name}.pth"
                torch.save(model.state_dict(), model_path)
                print(f"Model trained and saved successfully at: {model_path}")

# Global comparison: overlay the total-loss curve of every trained hyperparameter combination.
# Each legend entry shows the run label followed by its training time in minutes.
comparison_fig, comparison_ax = plt.subplots(figsize=(14, 8))
for result in results:
    minutes = result['duration'] / 60
    comparison_ax.plot(result['train_loss_history'], label=f"{result['label']} ({minutes:.1f}m)")
comparison_ax.set_xlabel('Epoch')
comparison_ax.set_ylabel('Total Loss')
comparison_ax.set_title('Comparison of Total Loss between Hyperparameters')
comparison_ax.legend()
comparison_ax.grid()

# Save global comparison plot
comparison_plot_path = "/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Plots/Fixed/comparison_total_loss_updated.png"
comparison_fig.savefig(comparison_plot_path)
plt.show()
plt.close(comparison_fig)

print(f"Comparison plot saved: {comparison_plot_path}")

# One comparison figure per batch size, each containing only the runs trained with that batch size
batch_sizes_unique = sorted({entry['batch_size'] for entry in results})

for batch_size in batch_sizes_unique:
    batch_results = [entry for entry in results if entry['batch_size'] == batch_size]

    batch_fig, batch_ax = plt.subplots(figsize=(14, 8))
    for result in batch_results:
        minutes = result['duration'] / 60
        batch_ax.plot(result['train_loss_history'], label=f"{result['label']} ({minutes:.1f}m)")

    batch_ax.set_xlabel('Epoch')
    batch_ax.set_ylabel('Total Loss')
    batch_ax.set_title(f'Comparison of Total Loss for Batch Size {batch_size}')
    batch_ax.legend()
    batch_ax.grid()

    # Save batch-specific plot
    batch_plot_path = f"/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Plots/Fixed/comparison_total_loss_batch{batch_size}_updated.png"
    batch_fig.savefig(batch_plot_path)
    plt.show()
    plt.close(batch_fig)

    print(f"Batch-specific comparison plot saved: {batch_plot_path}")


6. Model Evaluation

In [None]:
#@title Load Trained Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VAE_CNN(latent_dim=8).to(device)

# map_location remaps the stored tensors onto the current device, so a checkpoint written
# during a GPU session can still be loaded on a CPU-only runtime (and vice versa).
checkpoint_path = '/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Training Results/Fixed/vae_cnn_latent8_lr0.001_bs16_optAdam.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Evaluation mode: batch norm uses its running statistics instead of batch statistics
model.eval()
print("Model loaded successfully!")

In [None]:
#@title Threshold Selection with Percentile

# --------------------------
# Dataset & DataLoader
# --------------------------
# The test set is iterated in its stored order (no shuffling) so losses and labels stay aligned.
batch_size = 128
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# --------------------------
# Model Evaluation per Batch
# --------------------------
model.eval()
losses, labels = [], []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        recon = model(inputs)[0]
        # Mean squared reconstruction error of each sample (averaged over C, H, W)
        per_sample_error = (recon - inputs).pow(2).mean(dim=(1, 2, 3))
        losses.append(per_sample_error.cpu())
        labels.append(targets.cpu())

# Flatten the per-batch pieces into one loss value and one integer label per test sample
loss_per_sample = torch.cat(losses)
labels = torch.cat(labels).int()

# --------------------------
# Thresholds from Various Percentiles (1%-99%)
# --------------------------
percentiles = list(range(1, 100))

# Reconstruction losses of the legitimate samples (label 0); every candidate threshold is a
# percentile of this distribution. The selection is done once, outside the sweep.
legit_losses = loss_per_sample[labels == 0]
if legit_losses.numel() == 0:
    raise ValueError("No legitimate (label 0) samples found: cannot derive percentile thresholds.")

# All 99 quantiles are computed in a single call; the result is a 1-D tensor with one
# threshold per entry of `percentiles`.
quantile_levels = torch.tensor(
    [p / 100 for p in percentiles],
    dtype=legit_losses.dtype,
    device=legit_losses.device,
)
thresholds = torch.quantile(legit_losses, quantile_levels)

# Kept under its own name so the `results` list built by the training cell is not overwritten
# (the loss-comparison plots of that cell read it).
percentile_results = []
labels_np = labels.numpy()

print("\n=== Percentile-based Evaluation ===")
for p, thresh in zip(percentiles, thresholds):
    # A sample is predicted as phishing (1) when its loss exceeds the threshold
    preds = (loss_per_sample > thresh).int().numpy()
    cm = confusion_matrix(labels_np, preds)
    precision = precision_score(labels_np, preds, zero_division=0)
    recall = recall_score(labels_np, preds, zero_division=0)
    f1 = f1_score(labels_np, preds, zero_division=0)
    percentile_results.append((p, thresh.item(), precision, recall, f1))

    print(f"\n--- Percentile {p}% ---")
    print(f"Threshold : {thresh.item():.6f}")
    print(f"Precision : {precision:.4f}")
    print(f"Recall    : {recall:.4f}")
    print(f"F1 Score  : {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}")

# --------------------------
# Metric vs Percentile Visualization
# --------------------------
# Transpose the per-percentile rows into one series per column for plotting
percentile_axis, threshold_values, precisions, recalls, f1s = zip(*percentile_results)

plt.figure(figsize=(12, 6))
plt.plot(percentile_axis, precisions, label="Precision", marker='.')
plt.plot(percentile_axis, recalls, label="Recall", marker='.')
plt.plot(percentile_axis, f1s, label="F1 Score", marker='.')
plt.xlabel("Percentile Threshold")
plt.ylabel("Score")
plt.title("Precision, Recall, F1 Score vs Threshold Percentile")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# --------------------------
# Threshold Value vs Percentile Visualization
# --------------------------
plt.figure(figsize=(10, 4))
plt.plot(percentile_axis, threshold_values, label="Threshold Value", color='black')
plt.xlabel("Percentile")
plt.ylabel("Threshold")
plt.title("Threshold Value vs Percentile")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
#@title Model Evaluation per Batch

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define models and their corresponding latent dimensions.
# All checkpoints share one folder and one naming scheme, so the list is generated from the
# latent sizes and batch sizes, ordered by latent size first and batch size second.
_TRAINING_RESULTS_DIR = '/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Training Results/Fixed'
models_info = [
    {
        'path': f'{_TRAINING_RESULTS_DIR}/vae_cnn_latent{latent}_lr0.001_bs{batch}_optAdam.pth',
        'latent_dim': latent,
    }
    for latent in (4, 8, 16)
    for batch in (16, 32)
]

# Quantile of the legitimate-sample losses that is used as the decision threshold
THRESHOLD_QUANTILE = 0.82

# Initialize tracking variables.
# best_f1 starts below every achievable F1 so the first evaluated model is always recorded;
# starting at 0 would leave best_model_info empty whenever every model scores F1 = 0.
best_f1 = float('-inf')
best_model_info = {}
best_threshold = None

# Loop through all models (uses `test_loader` created in the threshold-selection cell)
for info in models_info:
    # Load model; map_location keeps GPU-saved checkpoints loadable on a CPU-only runtime
    model = VAE_CNN(latent_dim=info['latent_dim']).to(device)
    model.load_state_dict(torch.load(info['path'], map_location=device))
    model.eval()
    print(f"\nModel loaded: {info['path']} with latent_dim={info['latent_dim']}")

    # Evaluate model: mean squared reconstruction error per test sample
    losses, labels = [], []
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(device)
            recon, _, _ = model(x)
            batch_loss = torch.mean((recon - x) ** 2, dim=(1, 2, 3))
            losses.append(batch_loss)
            labels.append(y)

    losses = torch.cat(losses)
    labels = torch.cat(labels).int().to(device)

    # Threshold based on quantile of the legitimate (label 0) losses;
    # samples whose loss exceeds it are predicted as phishing (1)
    threshold = torch.quantile(losses[labels == 0], THRESHOLD_QUANTILE)
    predictions = (losses > threshold).int()

    # Calculate metrics (scikit-learn works on CPU arrays)
    labels_np = labels.cpu().numpy()
    predictions_np = predictions.cpu().numpy()
    cm = confusion_matrix(labels_np, predictions_np)
    precision_val = precision_score(labels_np, predictions_np, zero_division=0)
    recall_val = recall_score(labels_np, predictions_np, zero_division=0)
    f1_val = f1_score(labels_np, predictions_np, zero_division=0)

    print(f"Precision: {precision_val:.4f}")
    print(f"Recall   : {recall_val:.4f}")
    print(f"F1-score : {f1_val:.4f}")

    # Plot Confusion Matrix
    plt.figure(figsize=(5.5, 4.5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Legitimate', 'Phishing'],
                yticklabels=['Legitimate', 'Phishing'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.show()

    # Plot Reconstruction Loss Distribution
    plt.figure(figsize=(7, 5))
    sns.histplot(losses[labels == 0].cpu().numpy(), bins=50, label='Legitimate', color='green', kde=True, stat='density')
    sns.histplot(losses[labels == 1].cpu().numpy(), bins=50, label='Phishing', color='red', kde=True, stat='density')
    plt.axvline(threshold.item(), color='black', linestyle='--', label=f'Threshold = {threshold.item():.4f}')
    plt.title("Reconstruction Loss Distribution")
    plt.xlabel("Loss")
    plt.ylabel("Density")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Track best model (ties keep the earlier model), together with its own threshold
    if f1_val > best_f1:
        best_f1 = f1_val
        best_threshold = threshold
        best_model_info = {
            'path': info['path'],
            'latent_dim': info['latent_dim'],
            'precision': precision_val,
            'recall': recall_val,
            'f1_score': f1_val,
            'threshold': threshold.item()
        }

# The single-URL test at the end of the notebook compares its loss against the global
# `threshold`. Leave that name bound to the best model's threshold rather than to the
# threshold of whichever model happened to be evaluated last.
if best_threshold is not None:
    threshold = best_threshold

# Print the Best Model Summary: checkpoint file name, latent size and the three metrics
print("\n=== Best Model Summary ===")
best_file_name = best_model_info['path'].rsplit('/', 1)[-1]
print(f"Best model: {best_file_name}, Latent Dim: {best_model_info['latent_dim']}")
for metric_label, metric_key in (
    ("Precision :", 'precision'),
    ("Recall    :", 'recall'),
    ("F1-Score  :", 'f1_score'),
):
    print(f"{metric_label} {best_model_info[metric_key]:.4f}")


7. Save the Best Model

In [None]:
# Copy the best checkpoint to a fixed file name, so later cells can load it without knowing
# which hyperparameter combination won. (`shutil` is imported in the notebook's import cell.)
best_model_source_path = best_model_info['path']
best_model_save_path = '/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Training Results/Fixed/Best_Model.pth'

shutil.copy(best_model_source_path, best_model_save_path)

print(f"Best model has been saved to: {best_model_save_path}")

Best model has been saved to: /content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Training Results/Fixed/Best_Model.pth


8. Model Testing with New Data

In [None]:
# Load char_to_idx (character -> column index of the one-hot encoding)
with open('/content/drive/MyDrive/Thesis - Undergraduate Ch./Preprocessing/vocabulary_tested.json', encoding='utf-8') as f:
    char_to_idx = json.load(f)
char_to_idx = {k: int(v) for k, v in char_to_idx.items()}

# One extra column on top of the vocabulary entries
vocab_size = len(char_to_idx) + 1
max_len = 100
print("Vocab size:", vocab_size)

# Load Model
best_state_dict = torch.load(
    '/content/drive/MyDrive/Thesis - Undergraduate Ch./Models/Dataset 10000/Training Results/Fixed/Best_Model.pth',
    map_location=device)
# fc_mu maps the flattened encoder features to the latent vector, so its weight has one row
# per latent dimension. Reading the size from the checkpoint keeps the architecture in sync
# with whichever model was copied to Best_Model.pth, instead of assuming latent_dim=8.
best_latent_dim = best_state_dict['fc_mu.weight'].shape[0]
model = VAE_CNN(latent_dim=best_latent_dim)
model.load_state_dict(best_state_dict)
model.to(device)
model.eval()

# Preprocess
def preprocess_url(url, char_to_idx, max_len=100, vocab_size=100, target_device=None):
    """One-hot encode a URL into a float32 tensor of shape (1, 1, max_len, vocab_size).

    Row i of the encoding describes character i of the URL: the column given by
    `char_to_idx` is set to 1.0. Characters missing from `char_to_idx` leave their row all
    zeros, URLs longer than `max_len` are truncated, and shorter ones are zero-padded.

    Args:
        url: The URL string to encode.
        char_to_idx: Mapping from character to column index.
        max_len: Number of rows (characters) in the encoding.
        vocab_size: Number of columns in the encoding.
        target_device: Device the tensor is moved to. When omitted, the notebook-level
            `device` variable is used.

    Returns:
        Tensor of shape (1, 1, max_len, vocab_size) on the requested device.
    """
    if target_device is None:
        # Backward-compatible default: fall back to the notebook's global device
        target_device = device

    encoded = np.zeros((max_len, vocab_size), dtype=np.float32)
    for position, ch in enumerate(url[:max_len]):
        if ch in char_to_idx:
            encoded[position, char_to_idx[ch]] = 1.0

    # Two leading singleton dimensions: batch and channel
    return torch.tensor(encoded).unsqueeze(0).unsqueeze(0).to(target_device)

# Test URL
new_url = " "
url_tensor = preprocess_url(new_url, char_to_idx, max_len=max_len, vocab_size=vocab_size)

# Predict + Timing: the timed region covers the forward pass and the loss computation
start_time = time.time()

with torch.no_grad():
    reconstructed = model(url_tensor)[0]
    # Mean squared reconstruction error of the single encoded URL
    loss = (reconstructed - url_tensor).pow(2).mean().item()

end_time = time.time()

# Decision: a loss above `threshold` (the value left behind by the model-evaluation cell)
# is reported as phishing
verdict = "PHISHING" if loss > threshold else "LEGITIMATE"
print(f"{verdict} (Loss: {loss:.4f})")

# Display Inference Time (seconds converted to milliseconds)
duration = (end_time - start_time) * 1000
print(f"Detection Time: {duration:.2f} ms")
