In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [2]:
# Define column names based on the dataset documentation
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]

network_data = pd.read_csv("kddcup.data.corrected", names=column_names, header=None)

In [5]:
print(network_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 42 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   duration                     int64  
 1   protocol_type                object 
 2   service                      object 
 3   flag                         object 
 4   src_bytes                    int64  
 5   dst_bytes                    int64  
 6   land                         int64  
 7   wrong_fragment               int64  
 8   urgent                       int64  
 9   hot                          int64  
 10  num_failed_logins            int64  
 11  logged_in                    int64  
 12  num_compromised              int64  
 13  root_shell                   int64  
 14  su_attempted                 int64  
 15  num_root                     int64  
 16  num_file_creations           int64  
 17  num_shells                   int64  
 18  num_access_files             int64  
 19  

In [6]:
# Count missing values in each column
missing_values = network_data.isnull().sum()

print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [7]:
# Show all labels
network_data["label"].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [11]:
# Count number of data points with a "normal" label
network_data[network_data["label"] == "normal."].count()

duration                       972781
protocol_type                  972781
service                        972781
flag                           972781
src_bytes                      972781
dst_bytes                      972781
land                           972781
wrong_fragment                 972781
urgent                         972781
hot                            972781
num_failed_logins              972781
logged_in                      972781
num_compromised                972781
root_shell                     972781
su_attempted                   972781
num_root                       972781
num_file_creations             972781
num_shells                     972781
num_access_files               972781
num_outbound_cmds              972781
is_host_login                  972781
is_guest_login                 972781
count                          972781
srv_count                      972781
serror_rate                    972781
srv_serror_rate                972781
rerror_rate 

In [17]:
# Get the data labeled normal for training the VAE
normal_data = network_data[network_data["label"] == "normal."].drop(columns=["label"])

# One hot encode the categorical features
categorical_cols = ["protocol_type", "service", "flag"]
encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
encoded_categories = encoder.fit_transform(normal_data[categorical_cols])

In [18]:
# Standardize numerical features
numerical_cols = normal_data.drop(columns=categorical_cols).columns
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(normal_data[numerical_cols])

In [21]:
# Combine the encoded and standardized data
combined_data = np.hstack([scaled_numerical_data, encoded_categories])

In [22]:
# Create the train/validation/test data (80/10/10)
train_data, temp_data = train_test_split(combined_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [56]:
# VAE class
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim=16, dropout_prob=0.3):
        super(VAE, self).__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        # Encoder section
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128), # batch norm
            nn.ReLU(),
            nn.Dropout(dropout_prob), # dropout
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
        )
        
        # Mean of latent distribution
        self.mu_layer = nn.Linear(64, latent_dim)
        # Log variance of latent distribution
        self.logvar_layer = nn.Linear(64, latent_dim)

        # Decoder section
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.BatchNorm1d(64),  # Batch Normalization
            nn.ReLU(),
            nn.Dropout(dropout_prob),  # Dropout
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),  # Batch Normalization
            nn.ReLU(),
            nn.Dropout(dropout_prob),  # Dropout
            nn.Linear(128, input_dim),
            nn.Sigmoid()  # Assumes inputs normalized between 0 and 1
        )

    # Keep the latent space stochastic, preventing the VAE from collapsing into a deterministic autoencoder
    def reparameterize(self, mu, logvar):
        # Convert log variance to standard deviation
        std = torch.exp(0.5 * logvar)
        # Random sample from standard normal
        eps = torch.randn_like(std)

        # Shift and scale eps to match the desired distribution
        return mu + eps * std

    def forward(self, x):
        # Encode the input data
        hidden = self.encoder(x)
        mu, logvar = self.mu_layer(hidden), self.logvar_layer(hidden)

        assert not torch.isnan(mu).any(), 'NaN detected in mu'
        assert not torch.isnan(logvar).any(), 'NaN detected in logvar'
        
        # Sample latent vector
        z = self.reparameterize(mu, logvar)

        # Decode the latent representation
        reconstructed = self.decoder(z)
        return reconstructed, mu, logvar

    def loss_function(self, x, reconstructed_x, mu, logvar):
        # Calculate the reconstruction loss (MSE)
        reconstruction_loss = F.mse_loss(reconstructed_x, x, reduction='sum')

        # KL divergence loss
        kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return reconstruction_loss + kl_loss

In [30]:
# Convert numpy arrays to pytorch tensors
train_tensor = torch.tensor(train_data, dtype=torch.float32)
val_tensor = torch.tensor(val_data, dtype=torch.float32)

In [60]:
# Define hyperparameters
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 0.001
decay = 0.001
num_epochs = 50

input_dimensions = train_data.shape[1]
latent_dimensions = 32
vae = VAE(input_dimensions, latent_dimensions).to(device)
optimizer = optim.AdamW(vae.parameters(), lr=learning_rate, weight_decay=decay)
# Reduce LR in half if val loss doesn't decrease for 4 epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=4, factor=0.5)

In [58]:
# Create DataLoaders
train_loader = DataLoader(TensorDataset(train_tensor), batch_size=batch_size)
val_loader = DataLoader(TensorDataset(val_tensor), batch_size=batch_size)

In [None]:
best_model_path = "best_vae_model.pth"
best_val_loss = float("inf")
criterion = vae.loss_function

# Training loop
for epoch in range(num_epochs):
    vae.train()
    train_loss = 0.0

    for batch in train_loader:
        x = batch[0].to(device)

        optimizer.zero_grad()
        # Get the decoded latent representation, the mean and log variance
        reconstructed_x, mu, logvar = vae(x)

        loss = criterion(x, reconstructed_x, mu, logvar)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    vae.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            x = batch[0].to(device)
            reconstructed_x, mu, logvar = vae(x)

            loss = criterion(x, reconstructed_x, mu, logvar)
            val_loss += loss.item()

    train_loss /= len(train_loader)
    val_loss /= len(val_loader)

    scheduler.step(val_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        print("New best model saved.")
        torch.save(vae.state_dict(), best_model_path)

Epoch [1/50] - Train Loss: 1116.4185 | Val Loss: 1420.8759
New best model saved.
Epoch [2/50] - Train Loss: 1043.4382 | Val Loss: 1390.8957
New best model saved.
Epoch [3/50] - Train Loss: 1026.8123 | Val Loss: 1518.1093
Epoch [4/50] - Train Loss: 1023.6392 | Val Loss: 1390.9208
Epoch [5/50] - Train Loss: 1022.4049 | Val Loss: 1412.2557
Epoch [6/50] - Train Loss: 1021.4028 | Val Loss: 1394.6940
Epoch [7/50] - Train Loss: 1019.9279 | Val Loss: 1385.5111
New best model saved.
Epoch [8/50] - Train Loss: 1019.3714 | Val Loss: 1387.3758
Epoch [9/50] - Train Loss: 1018.8950 | Val Loss: 1388.8670
