<a href="https://colab.research.google.com/github/ashwanth-07/Network-Anomaly-Detection-using-Autoencoders/blob/main/Network_anomaly_autoencoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**<h1>Network Anomaly Detection using Autoencoders</h1>**

**Dataset: NSL-KDD : http://205.174.165.80/CICDataset/NSL-KDD/**

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plot
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Mount Google Drive so the dataset CSVs below can be read from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

train_path = '/content/gdrive/My Drive/Data/KDDTrain.csv' # DO NOT MODIFY THIS. Please make sure your data has this exact path
val_path = '/content/gdrive/My Drive/Data/KDDTest.csv' # DO NOT MODIFY THIS. Please make sure your data has this exact path
# Load both splits; note that `val` is read from the KDDTest file.
train = pd.read_csv(train_path)
val = pd.read_csv(val_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Seed PyTorch's global RNG up front so repeated runs start from the same random state.
torch.manual_seed(111)

<torch._C.Generator at 0x798228576530>

In [None]:
# Display the raw training frame exactly as loaded from the CSV.
train

Unnamed: 0,'duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',...,'dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','class'
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,tcp,exec,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25188,0,tcp,ftp_data,SF,334,0,0,0,0,0,...,39,1.00,0.00,1.00,0.18,0.00,0.00,0.00,0.00,anomaly
25189,0,tcp,private,REJ,0,0,0,0,0,0,...,13,0.05,0.07,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25190,0,tcp,nnsp,S0,0,0,0,0,0,0,...,20,0.08,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


**<h1>1. Preprocessing</h1>**

In [None]:
# Column labels assigned to both loaded frames: 41 feature columns
# followed by the 'class' label column (42 names in total).
column_names = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'class',
]

**Encoding the class labels as integers: 0 represents normal traffic, 1 represents an anomaly**

In [None]:
def _encode_class_label(value):
    """Map a raw class value to 0 (normal) or 1 (anomaly).

    The string 'normal' becomes 0 and any other string becomes 1. Values that
    are already numeric are kept as 0 when equal to 0 and set to 1 otherwise,
    so re-running this cell on already-encoded labels leaves them unchanged
    instead of turning every label into 1.
    """
    if isinstance(value, str):
        return 0 if value == 'normal' else 1
    return 0 if value == 0 else 1


train.columns = column_names
train['class'] = train['class'].apply(_encode_class_label)  # 0 represents normal, 1 is anomaly

val.columns = column_names
val['class'] = val['class'].apply(_encode_class_label)

**Normalising data by scaling numerical variables and using one-hot encoding on categorical variables**

In [None]:
def preprocess_dataframe(train, valid, fit_on_train_only=False):
    """Scale numeric features and one-hot encode categorical features.

    Args:
        train: Training DataFrame containing the feature columns and a
            'class' label column.
        valid: Validation DataFrame with the same columns as ``train``.
        fit_on_train_only: When False (default, the original behaviour) the
            scaler and encoder are fitted on train and valid combined, which
            lets validation statistics influence the preprocessing. Pass True
            to fit on ``train`` alone; categories seen only in ``valid`` are
            then encoded as all zeros, and the number of output columns may
            be smaller than with the default.

    Returns:
        Tuple ``(train_df, valid_df)`` of DataFrames holding the transformed
        features plus the original 'class' column, each with a fresh
        0..n-1 index.
    """
    # Column types are always inferred from both frames together so the two
    # outputs share one schema regardless of which data the fit uses.
    combined = pd.concat([train, valid])

    numerical_features = combined.select_dtypes(include=['float64', 'int64']).columns
    categorical_features = combined.select_dtypes(exclude=['float64', 'int64']).columns
    numerical_features = numerical_features.drop('class')  # the label is not a feature

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ],
        # Always return a dense array; a sparse result cannot be wrapped in a
        # DataFrame with named columns below.
        sparse_threshold=0,
    )
    preprocessor.fit(train if fit_on_train_only else combined)

    categorical_features_encoded = (
        preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
    )
    all_features = list(numerical_features) + list(categorical_features_encoded)

    def _to_frame(source):
        """Transform one split and re-attach its labels row by row."""
        frame = pd.DataFrame(preprocessor.transform(source), columns=all_features)
        # Assign by position: the new frame has a fresh RangeIndex, so
        # index-aligned assignment would scramble or drop labels whenever
        # `source` carries a non-default index.
        frame['class'] = source['class'].to_numpy()
        return frame

    return _to_frame(train), _to_frame(valid)


In [None]:
# Preprocess both splits, then display the processed training frame.
train_df, val_df = preprocess_dataframe(train, val)
train_df

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,class
0,-0.121345,-0.009689,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,-0.121345,-0.009883,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,-0.121345,-0.009965,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,-0.121345,-0.009835,0.080701,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,1.181993,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,-0.121345,-0.009853,-0.036181,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,1.181993,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,-0.121345,-0.009965,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
25188,-0.121345,-0.009777,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,1.181993,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
25189,-0.121345,-0.009965,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
25190,-0.121345,-0.009965,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Keep only the rows labelled normal (class == 0), then separate the label
# column from the feature columns.
is_normal = train_df["class"] == 0
df = train_df.loc[is_normal]
y = df['class']
df = df.drop(columns='class')

In [None]:
# Sanity check: (rows, feature columns) of the normal-only training frame.
df.shape

(13449, 119)

In [None]:
# Train on whole mini-batches only: trim the normal-only frame to the largest
# multiple of the batch size rather than to a hard-coded row count.
BATCH_SIZE = 40
usable_rows = (len(df) // BATCH_SIZE) * BATCH_SIZE
df = df.iloc[:usable_rows]
tensor_data = torch.tensor(df.values, dtype=torch.float32)  # explicit float32 feature tensor
dataset = TensorDataset(tensor_data)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)  # shuffled mini-batches for training

**<h1>2. Modelling (Auto-encoder)</h1>**

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder: compress a feature vector, then rebuild it.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``, with a ReLU after the first layer of
    each half.

    Args:
        input_dim: Number of input (and reconstructed output) features.
            Defaults to 119.
        hidden_dim: Width of the intermediate layer in both halves.
            Defaults to 64.
        latent_dim: Size of the bottleneck representation. Defaults to 32.
    """

    def __init__(self, input_dim=119, hidden_dim=64, latent_dim=32):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(in_features=latent_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=input_dim),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as the input)."""
        return self.decoder(self.encoder(x))


In [None]:
# Training setup: run on the GPU when one is available, reconstruct the input
# under a mean-squared-error objective, and optimise with Adam.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.001)
epochs = 100

losses = []  # one averaged training loss per epoch
num_batches = len(dataloader)

for epoch in range(epochs):
    epoch_loss = 0.0
    for batch in dataloader:
        inputs = batch[0].to(device)  # each batch wraps a single feature tensor

        # Clear gradients from the previous step before computing new ones.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)  # the input is its own target
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses seen in this epoch.
    avg_epoch_loss = epoch_loss / num_batches
    losses.append(avg_epoch_loss)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_epoch_loss}")


Epoch 1/100, Loss: 0.14769103305020725
Epoch 2/100, Loss: 0.06162510644589063
Epoch 3/100, Loss: 0.04136726633684399
Epoch 4/100, Loss: 0.03746187869719939
Epoch 5/100, Loss: 0.022950465324573174
Epoch 6/100, Loss: 0.020192124095739842
Epoch 7/100, Loss: 0.020803867238144102
Epoch 8/100, Loss: 0.017469475834375425
Epoch 9/100, Loss: 0.01672449692601471
Epoch 10/100, Loss: 0.01739569185537264
Epoch 11/100, Loss: 0.01586017673114492
Epoch 12/100, Loss: 0.011305138315254312
Epoch 13/100, Loss: 0.011844168449377841
Epoch 14/100, Loss: 0.00920086995410245
Epoch 15/100, Loss: 0.013512505127263943
Epoch 16/100, Loss: 0.014102445119490204
Epoch 17/100, Loss: 0.008835506042703941
Epoch 18/100, Loss: 0.00788819766083699
Epoch 19/100, Loss: 0.009878442499722865
Epoch 20/100, Loss: 0.0066470639967952765
Epoch 21/100, Loss: 0.006987839266436086
Epoch 22/100, Loss: 0.007563657428753296
Epoch 23/100, Loss: 0.007107764632001775
Epoch 24/100, Loss: 0.012761603435224277
Epoch 25/100, Loss: 0.00729968615

In [None]:
def compute_avg_reconstruction_error(data_loader):
    """Return the mean reconstruction loss of ``model`` over a data loader.

    Uses the notebook-level ``model``, ``criterion`` and ``device``. Each
    batch loss is weighted by the number of samples in that batch, so a
    smaller final batch does not skew the average. This assumes ``criterion``
    returns a per-batch mean, as ``nn.MSELoss()`` does with its default
    reduction.

    Args:
        data_loader: Iterable of batches whose first element is the input
            tensor, with samples along dimension 0.

    Returns:
        The sample-weighted average loss as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    weighted_loss = 0.0
    total_samples = 0

    with torch.no_grad():  # No need to track gradients
        for batch in data_loader:
            inputs = batch[0].to(device)
            outputs = model(inputs)
            samples_in_batch = inputs.shape[0]
            weighted_loss += criterion(outputs, inputs).item() * samples_in_batch
            total_samples += samples_in_batch

    if total_samples == 0:
        raise ValueError("data_loader yielded no samples; cannot compute an average reconstruction error")

    return weighted_loss / total_samples

# Baseline error on the normal-only training loader; the anomaly threshold
# used during evaluation is derived from this value.
avg_reconstruction_error = compute_avg_reconstruction_error(dataloader)
print(f"Average reconstruction error: {avg_reconstruction_error}")


Average reconstruction error: 0.018365622439540902


In [None]:
# Separate labels from features for the full training set. The guard makes
# the cell safe to re-run: once 'class' has been dropped it is a no-op
# instead of raising KeyError.
if 'class' in train_df.columns:
    y = train_df['class']
    train_df = train_df.drop(columns='class')

In [None]:
# Separate labels from features for the validation set. The guard makes the
# cell safe to re-run: once 'class' has been dropped it is a no-op instead of
# raising KeyError.
if 'class' in val_df.columns:
    val_y = val_df['class']
    val_df = val_df.drop(columns='class')

In [None]:
# Display the processed validation features (label column already removed).
val_df

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,-0.121345,-0.009965,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.121345,-0.009965,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.120427,-0.002676,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.121345,-0.009954,-0.042529,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.120886,-0.009965,-0.042302,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,-0.121345,-0.009519,-0.037496,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,1.181993,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22540,-0.121345,-0.009787,-0.028351,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,1.181993,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22541,-0.121345,0.020655,0.083134,-0.013732,-0.077477,-0.013976,1.091755,-0.099622,1.181993,0.090774,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22542,-0.121345,-0.009941,-0.041894,-0.013732,-0.077477,-0.013976,-0.091261,-0.099622,-0.846029,-0.019505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


**<h1>3. Evaluation</h1>**

In [None]:
def detect_anomaly_and_evaluate(new_data, labels, threshold_factor=0.25, batch_size=4096):
    """Flag rows whose reconstruction error is above a threshold and score them.

    A row is predicted anomalous when its mean squared reconstruction error
    exceeds ``avg_reconstruction_error * threshold_factor``. Uses the
    notebook-level ``model``, ``device`` and ``avg_reconstruction_error``.
    Rows are pushed through the model in chunks rather than one forward pass
    per row.

    Args:
        new_data: 2-D array-like of shape (n_samples, n_features) holding
            numeric features.
        labels: Sequence of n_samples ground-truth labels (0 normal,
            1 anomaly).
        threshold_factor: Multiplier applied to the baseline reconstruction
            error to obtain the decision threshold. Defaults to 0.25.
        batch_size: Number of rows evaluated per forward pass.

    Returns:
        Tuple ``(anomaly_predictions, accuracy)``: a list of bools (True means
        anomaly) and the fraction of predictions equal to ``labels``.

    Raises:
        ValueError: If ``new_data`` is empty or its length differs from
            ``labels``.
    """
    label_values = np.asarray(labels)
    num_samples = len(new_data)
    if num_samples == 0:
        raise ValueError("new_data is empty; nothing to evaluate")
    if len(label_values) != num_samples:
        raise ValueError(
            f"new_data has {num_samples} rows but labels has {len(label_values)} entries"
        )

    threshold = avg_reconstruction_error * threshold_factor
    features = torch.as_tensor(np.asarray(new_data), dtype=torch.float32)

    model.eval()
    flagged_chunks = []
    with torch.no_grad():
        for start in range(0, num_samples, batch_size):
            inputs = features[start:start + batch_size].to(device)
            outputs = model(inputs)
            # Mean squared error per row (averaged over the feature axis),
            # i.e. what an MSE criterion yields for a single-row batch.
            per_sample_error = (outputs - inputs).pow(2).mean(dim=1)
            flagged_chunks.append((per_sample_error > threshold).cpu())

    flagged = torch.cat(flagged_chunks).numpy()
    anomaly_predictions = flagged.tolist()

    # A True prediction counts as correct when the label equals 1, False when it equals 0.
    accuracy = float((flagged == label_values).mean())

    return anomaly_predictions, accuracy


In [None]:
# Score every training row (normal and anomalous) against its label.
predictions, accuracy = detect_anomaly_and_evaluate(train_df.values, y)

In [None]:
# Report the training accuracy as a percentage.
print(f"Accuracy on training data: {accuracy*100}")

Accuracy on training data: 93.65274690377899


In [None]:
# Score the validation rows with the same threshold rule used for the training rows.
val_predictions, val_accuracy = detect_anomaly_and_evaluate(val_df.values, val_y)

In [None]:
# Report the validation accuracy as a percentage.
print(f"Accuracy on validation data: {val_accuracy*100}")

Accuracy on validation data: 88.44925479063164
