In [1]:
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, multilabel_confusion_matrix, ConfusionMatrixDisplay
#for dirname, _, filenames in os.walk('LUFlow/LUFlow'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

In [2]:
# read a selection of the data
df1 = pd.read_csv('LUFlow/LUFlow/2022/06/2022.06.12/2022.06.12.csv')
df2 = pd.read_csv('LUFlow/LUFlow/2022/06/2022.06.13/2022.06.13.csv')
df3 = pd.read_csv('LUFlow/LUFlow/2022/06/2022.06.14/2022.06.14.csv')

# merge into a single dataframe
df_dataset = pd.concat([df1, df2, df3])
df_dataset.reset_index(drop=True, inplace=True)
len(df_dataset.columns)

16

In [3]:
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1068376 entries, 0 to 1068375
Data columns (total 16 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   avg_ipt        1068376 non-null  float64
 1   bytes_in       1068376 non-null  int64  
 2   bytes_out      1068376 non-null  int64  
 3   dest_ip        1068376 non-null  int64  
 4   dest_port      964168 non-null   float64
 5   entropy        1068376 non-null  float64
 6   num_pkts_out   1068376 non-null  int64  
 7   num_pkts_in    1068376 non-null  int64  
 8   proto          1068376 non-null  int64  
 9   src_ip         1068376 non-null  int64  
 10  src_port       964168 non-null   float64
 11  time_end       1068376 non-null  int64  
 12  time_start     1068376 non-null  int64  
 13  total_entropy  1068376 non-null  float64
 14  label          1068376 non-null  object 
 15  duration       1068376 non-null  float64
dtypes: float64(6), int64(9), object(1)
memory usage: 130.4

In [4]:
df_dataset.describe()

Unnamed: 0,avg_ipt,bytes_in,bytes_out,dest_ip,dest_port,entropy,num_pkts_out,num_pkts_in,proto,src_ip,src_port,time_end,time_start,total_entropy,duration
count,1068376.0,1068376.0,1068376.0,1068376.0,964168.0,1068376.0,1068376.0,1068376.0,1068376.0,1068376.0,964168.0,1068376.0,1068376.0,1068376.0,1068376.0
mean,4964986.0,699.9835,3620.235,786.0,14856.036961,3.024693,7.804426,4.931449,5.60987,786.0,36898.051885,1504349000000000.0,1505360000000000.0,13035.08,1.518667
std,84310510.0,3288.582,8257.19,0.0,16039.005055,2.342624,21.69485,14.91951,1.870952,0.0,17895.153177,452106800000000.0,450745200000000.0,65941.2,5.734541
min,0.0,0.0,0.0,786.0,1.0,0.0,0.0,0.0,1.0,786.0,11.0,16550760000.0,16550890000.0,0.0,0.0
25%,0.0,0.0,0.0,786.0,5900.0,1.020244,1.0,0.0,6.0,786.0,19780.0,1655092000000000.0,1655092000000000.0,35.01955,0.0
50%,0.0,0.0,43.0,786.0,9200.0,3.0,3.0,1.0,6.0,786.0,45332.0,1655163000000000.0,1655163000000000.0,323.8136,0.000196
75%,35.71429,34.0,2904.0,786.0,9300.0,5.021325,7.0,5.0,6.0,786.0,47613.0,1655182000000000.0,1655182000000000.0,20671.65,0.227897
max,4294967000.0,65483.0,65535.0,786.0,65535.0,134.2394,255.0,255.0,47.0,786.0,65535.0,1655251000000000.0,1655251000000000.0,3979174.0,41.0236


In [5]:
# replace +ve and -ve infinity with NaN
df_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

In [6]:
# drop missing values
df_dataset.dropna(inplace=True)

In [7]:
print(df_dataset.duplicated().sum())

4284


In [8]:
df_dataset.drop_duplicates(inplace = True)

In [9]:
print(df_dataset.duplicated().sum())

0


In [10]:
X = df_dataset.drop(columns = ['label'], axis = 1)
Y = df_dataset['label']

le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.astype('int64')

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 50)

# Convert all columns to numeric types
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

In [11]:
Y

array([2, 2, 2, ..., 0, 2, 2], dtype=int64)

In [23]:
# Define the neural network architecture

class TrafficNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TrafficNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.fc4 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(p=0.3)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc3(out)
        out = self.bn3(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc4(out)
        return out


In [25]:
# Hyperparameters
input_size = 15  # Adjust based on your dataset
hidden_size = 64
num_classes = 3 
num_epochs = 5
batch_size = 32
learning_rate = 0.001

In [27]:
# Convert to PyTorch tensors
train_dataset = TensorDataset(torch.tensor(X_train.values, dtype=torch.float32), torch.tensor(y_train, dtype=torch.int64))
test_dataset = TensorDataset(torch.tensor(X_test.values, dtype=torch.float32), torch.tensor(y_test, dtype=torch.int64))

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [29]:
# Initialize the model, loss function, and optimizer
model = TrafficNet(input_size, hidden_size, num_classes)
#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop
model.to("cuda")
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        features = features.to("cuda")
        labels = labels.to("cuda")
        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: -50139.0898
Epoch [2/5], Loss: -172441.9062
Epoch [3/5], Loss: -421437.7188


In [None]:
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print(f'{method.__name__}: {(te - ts) * 1000} ms')
        return result
    return timed

@timeit
def get_predictions(model, dataloader):
    model = model.to("cuda")
    # Set the model to evaluation mode
    model.eval()
    all_preds = []
    all_labels = []
    # Disable gradient calculation
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to("cuda"), labels.to("cuda")
            # Forward pass: compute predicted outputs by passing inputs to the model
            outputs = model(inputs)
            # Get the index of the maximum value
            _, preds = torch.max(outputs, 1)
            # Append the predictions and labels to the respective lists
            all_preds.append(preds)
            all_labels.append(labels)
    # Return the lists of predictions and labels
    all_preds = torch.cat(all_preds).cpu().numpy()
    all_labels = torch.cat(all_labels).cpu().numpy()
    return all_preds, all_labels

In [None]:
y_pred_list, y_true_list = get_predictions(model, train_loader)
y_pred_list

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import matplotlib.pyplot as plt

cm = confusion_matrix(y_true_list, y_pred_list)

fig, ax = plt.subplots(figsize=(12,8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

disp.plot(ax=ax)
plt.xticks(rotation=90)
plt.show()