In [1]:
import os
import pandas as pd
import kagglehub
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.functional as F
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt
from torchmetrics import AUROC, ROC

from data_loading.tools import reduce_mem_usage

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Path to the cached dataset
cache_path = os.path.expanduser("~/.cache/kagglehub/datasets")
data_path = os.path.join(cache_path, "aryashah2k/nfuqnidsv2-network-intrusion-detection-dataset/versions/1")

if not os.path.exists(data_path):
    # Download latest version
    data_path = kagglehub.dataset_download("aryashah2k/nfuqnidsv2-network-intrusion-detection-dataset")

data_path

'/home/riley/.cache/kagglehub/datasets/aryashah2k/nfuqnidsv2-network-intrusion-detection-dataset/versions/1'

In [12]:
data = pd.read_csv(os.path.join(data_path, "NF-UQ-NIDS-v2.csv"), nrows=75_000)
data = reduce_mem_usage(data)
data.head()

Memory usage after optimization is: 11.52 MB
Decreased by 56.2%


  has_large_values = (abs_vals > 1e6).any()
  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack,Dataset
0,192.168.100.148,65389,192.168.100.7,80,6,7.0,420,3,0,0,...,0,35840,140,0,0,0,0.0,1,DoS,NF-BoT-IoT-v2
1,192.168.100.148,11154,192.168.100.5,80,6,7.0,280,2,40,1,...,0,0,0,0,0,0,0.0,1,DoS,NF-BoT-IoT-v2
2,192.168.1.31,42062,192.168.1.79,1041,6,0.0,44,1,40,1,...,0,0,0,0,0,0,0.0,0,Benign,NF-ToN-IoT-v2
3,192.168.1.34,46849,192.168.1.79,9110,6,0.0,44,1,40,1,...,0,0,0,0,0,0,0.0,0,Benign,NF-ToN-IoT-v2
4,192.168.1.30,50360,192.168.1.152,1084,6,0.0,44,1,40,1,...,0,0,0,0,0,0,0.0,0,Benign,NF-ToN-IoT-v2


In [7]:
data.select_dtypes('object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   IPV4_SRC_ADDR  10000 non-null  object
 1   IPV4_DST_ADDR  10000 non-null  object
 2   Attack         10000 non-null  object
 3   Dataset        10000 non-null  object
dtypes: object(4)
memory usage: 312.6+ KB


In [13]:
attacks = list(data["Attack"].unique())
attacks_map = {f"{attack}" : index for attack,index in zip(attacks,range(len(attacks)))}
data["Attack"] = data["Attack"].map(attacks_map)
data.drop(["IPV4_DST_ADDR","IPV4_SRC_ADDR","Dataset"],axis=1,inplace=True)

scaler = MinMaxScaler()

scaled_data = scaler.fit_transform(data)

scaled_data = pd.DataFrame(scaled_data,columns=data.columns)
scaled_data["Label"] = data["Label"]
scaled_data["Attack"] = data["Attack"]
scaled_data.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,0.997772,0.001221,0.019841,0.028689,6.4e-05,1.8e-05,0.0,0.0,0.008969,0.008969,...,0.007813,0.0,0.54902,0.54902,0.0,0.0,0.0,0.0,1,0
1,0.170199,0.001221,0.019841,0.028689,4.3e-05,9e-06,1e-06,4.2e-05,0.098655,0.008969,...,0.007813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
2,0.641825,0.015885,0.019841,0.0,6e-06,0.0,1e-06,4.2e-05,0.098655,0.008969,...,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,0.71487,0.139012,0.019841,0.0,6e-06,0.0,1e-06,4.2e-05,0.098655,0.008969,...,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
4,0.768444,0.016541,0.019841,0.0,6e-06,0.0,1e-06,4.2e-05,0.098655,0.008969,...,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


In [14]:
class IntrusionDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        x = row.drop(["Attack","Label"]).copy().values
        y = row["Attack"]
        
        return torch.tensor(x).type(torch.double), torch.tensor(y).type(torch.double)

In [15]:
dataTrain, dataTest = train_test_split(scaled_data, test_size=0.2, random_state=42)
trainDataset = IntrusionDataset(dataTrain)
testDataset = IntrusionDataset(dataTest)
train_dataloader = DataLoader(trainDataset, batch_size=32, shuffle=True, drop_last=True)
test_dataloader = DataLoader(testDataset,batch_size=32,shuffle=True, drop_last=True)

In [16]:
for x,y in train_dataloader:
    print(x.shape)
    break

torch.Size([32, 41])


In [17]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc1 = nn.Linear(512, 128)  # Adjust according to your input dimensions after conv layers
        self.fc2 = nn.Linear(128, 64)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(32,512)  # Flatten
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.lstm = nn.LSTM(input_size=41, hidden_size=128, batch_first=True)
        self.fc1 = nn.Linear(128, 64)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence dimension
        h0 = torch.zeros(1, x.size(0), 128).to(x.device).to(torch.double)  # Initial hidden state
        c0 = torch.zeros(1, x.size(0), 128).to(x.device).to(torch.double)  # Initial cell state
        out, _ = self.lstm(x, (h0, c0))  # Pass input through LSTM
        x = self.relu(self.fc1(out[:, -1, :]))  # Use the output from the last time step
        return x
    
class CNN_RNN_Ensemble(nn.Module):
    def __init__(self):
        super(CNN_RNN_Ensemble, self).__init__()
        self.cnn = CNN()
        self.rnn = RNN()
        self.fc = nn.Linear(64 + 64, 21)  # 64 from CNN + 64 from RNN, and 21 output classes

    def forward(self, x):
        cnn_out = self.cnn(x)
        rnn_out = self.rnn(x)
        combined = torch.cat((cnn_out, rnn_out), dim=1)
        x = self.fc(combined)
        return x

In [18]:
model = CNN_RNN_Ensemble().to(torch.double)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [22]:
for epoch in range(10):
    for X, Y in tqdm(train_dataloader):
        x, y = X.to(device), Y.to(device)
        optimizer.zero_grad()
        outputs = model(x).to(device)
        loss = criterion(outputs, y.to(torch.long))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

model.eval()
outputss = []
all_preds = []
all_labels = []
with torch.inference_mode():
    for inputs, labels in tqdm(test_dataloader):
        outputs = model(inputs).to(device)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())
        outputss.extend(outputs.numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f'Test Accuracy: {accuracy}')
print(classification_report(all_labels, all_preds))

100%|██████████| 1875/1875 [00:22<00:00, 82.02it/s]


Epoch 1, Loss: 0.035633513611926626


100%|██████████| 1875/1875 [00:22<00:00, 82.38it/s]


Epoch 2, Loss: 0.21818183707285807


100%|██████████| 1875/1875 [00:23<00:00, 80.60it/s]


Epoch 3, Loss: 0.18545120138041502


100%|██████████| 1875/1875 [00:22<00:00, 82.09it/s]


Epoch 4, Loss: 0.24003974459401126


100%|██████████| 1875/1875 [00:22<00:00, 82.09it/s]


Epoch 5, Loss: 0.23292927621308956


100%|██████████| 1875/1875 [00:22<00:00, 83.43it/s]


Epoch 6, Loss: 0.09557066034177085


100%|██████████| 1875/1875 [00:23<00:00, 81.16it/s]


Epoch 7, Loss: 0.2579162803100391


100%|██████████| 1875/1875 [00:22<00:00, 83.05it/s]


Epoch 8, Loss: 0.02687005979111626


100%|██████████| 1875/1875 [00:22<00:00, 83.05it/s]


Epoch 9, Loss: 0.2762449365772612


100%|██████████| 1875/1875 [00:22<00:00, 82.90it/s]


Epoch 10, Loss: 0.16822422094445236


  0%|          | 0/468 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument weight in method wrapper_CUDA___slow_conv2d_forward)

In [None]:
cm = confusion_matrix(all_labels,all_preds)
recall = recall_score(all_labels,all_preds,average="weighted")
precision = precision_score(all_labels,all_preds,average="weighted")
f1 = f1_score(all_labels,all_preds,average="weighted")

print(f"recall score : {recall}")
print(f"precision score:{precision}")
print(f"f1 score :{f1}")

In [None]:
fig, ax = plt.subplots(figsize=(10,6))
sns.heatmap(cm,annot=True,ax=ax)
plt.show()

In [None]:
# Example predictions and labels
preds = torch.tensor(outputss).to(torch.double)
labels = torch.tensor(all_labels).to(torch.long)

# Initialize AUROC and ROC metrics
auroc_metric = AUROC(task="multiclass",num_classes=21)
roc_metric = ROC(task="multiclass",num_classes=21)

# Compute AUROC
auroc = auroc_metric(preds, labels)

# Compute ROC curve
fpr, tpr, thresholds = roc_metric(preds, labels)

# Print the AUROC value
print(f"AUROC: {auroc.item()}")

# Plot ROC curve
plt.figure()
for fp, tp in zip(fpr,tpr):
    plt.plot(fp, tp, marker='.')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()