In [1]:
from collections import OrderedDict
from typing import List, Tuple

from flowmeter.flowmeter import Flowmeter

import urllib.request
import gzip
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, Subset

import flwr as fl
from flwr.common import Metrics

In [2]:
aefawef = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00516/mirai/Mirai_pcap.pcap.gz'
csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00516/mirai/Mirai_dataset.csv.gz'
label_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00516/mirai/Mirai_labels.csv.gz'

pcap_path = "C:/Users/Panda 1.0/OneDrive - Loughborough University/Masters/Thesis/LTSM-Test/Mirai_pcap.pcap"

In [3]:
df = pd.read_csv(csv_path, header=None)
df = df.drop(columns=[df.columns[0]])

# dataset info:
# df.info()
# df.head()
# print(df.head())

# check for missing data (not working atm -check why tmr)
# df.nunique() # return number of unique elements per column (excludes NaNs)
# df.isnull().sum()
# df.describe()


labels = pd.read_csv(label_path, dtype={"": int, "x": 'float64'}, header=None)
# labels.info()
# labels.head()
# labels.describe()
# print(labels.head())

# append label to the end of dataset:
column_wanted = labels.iloc[:, 0]
df_dataset = pd.concat([df, column_wanted], axis=1, ignore_index=False)
df_dataset = df_dataset.reset_index(drop=True)

df_dataset.info()
df_dataset.head()
df_dataset.describe()
print("dataframe dataset: ")
print(df_dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 764137 entries, 0 to 764136
Columns: 116 entries, 1 to 0
dtypes: float64(115), int64(1)
memory usage: 676.3 MB
dataframe dataset: 
              1           2             3          4           5    \
0        1.000000   60.000000  0.000000e+00   1.000000   60.000000   
1        1.999983   60.000000  0.000000e+00   1.999990   60.000000   
2        1.000000   86.000000  0.000000e+00   1.000000   86.000000   
3        1.999272   86.000000  9.094947e-13   1.999563   86.000000   
4        1.000000   60.000000  0.000000e+00   1.000000   60.000000   
...           ...         ...           ...        ...         ...   
764132  18.897961   60.185845  3.594147e+00  51.700968   60.182642   
764133  19.775173   60.176447  3.414055e+00  52.499151   60.179163   
764134  20.692078   60.167920  3.250494e+00  53.366680   60.175805   
764135  21.566383   60.160134  3.101020e+00  54.171935   60.172560   
764136   1.089072  117.452795  4.912762e+01   1.3

In [4]:
np_dataset = df_dataset.values
print(np_dataset)

[[  1.          60.           0.         ...   0.           0.
    0.        ]
 [  1.99998265  60.           0.         ...   0.           0.
    0.        ]
 [  1.          86.           0.         ...   0.           0.
    0.        ]
 ...
 [ 20.69207825  60.16791977   3.25049354 ...   0.           0.
    1.        ]
 [ 21.56638256  60.16013358   3.10101998 ...   0.           0.
    1.        ]
 [  1.08907225 117.4527954   49.12761856 ...   0.           0.
    1.        ]]


In [5]:
# Assumes cross-silo FL: dataset is split into 10 clients (organisations) to simulate this
NUM_CLIENTS = 10
BATCH_SIZE = 32

In [6]:
# split into training set and testing set with the labels seperate
split_point = int(len(np_dataset)*0.9)

train_np = np_dataset[:split_point, :]
test_np = np_dataset[split_point:, :]
# train_labels = labelled_dataset[:split_point, -1]
# test_labels = labelled_dataset[split_point:, -1]

In [7]:
class KitsuneDataset(Dataset):
    def __init__(self, numpy_array):
        self.features = numpy_array[:, :-1]
        self.labels = numpy_array[:, -1:].ravel()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        features = torch.tensor(self.features[index], dtype=torch.float32)
        label = torch.tensor(self.labels[index], dtype=torch.float32)
        return features, label

In [8]:
def load_datasets():#num_clients: int):
    train_set = KitsuneDataset(train_np)
    test_set = KitsuneDataset(test_np)
    # # Split training set into partitions(i.e., the no. of clients) to simulate the individual dataset
    # partition_size = len(train_tensor) // NUM_CLIENTS
    # lengths = [partition_size] * NUM_CLIENTS
    # datasets = random_split(trainset, lengths, torch.Generator().manual_seed(42))

    # # Split each partition into train/val and create DataLoader
    # trainloaders = []
    # valloaders = []

    # for ds in datasets:

    #     trainloaders.append(DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=False))
    #     valloaders.append(DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False))

    # w/o partitioning for clients:

    # create validation subset from training set:
    train_size = int(len(train_set) * 0.8)
    train_split = train_set[:train_size], 
    val_split = train_set[train_size:]

    # create dataloaders
    trainloader = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=False)
    valloader = DataLoader(val_split, batch_size=BATCH_SIZE, shuffle=False)
    testloader = DataLoader(test_np, batch_size=BATCH_SIZE, shuffle=False)
    return trainloader, valloader, testloader

# load datasets
trainloader, valloader, testloader = load_datasets()
print("datasets loaded")
print("trainloader: ", trainloader)

datasets loaded
trainloader:  <torch.utils.data.dataloader.DataLoader object at 0x0000024C07B853D0>


In [18]:
for inputs, labels in trainloader:
    print(inputs.shape)
    print(labels.shape)

torch.Size([1, 550178, 115])
torch.Size([1, 550178])


In [9]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=0.0, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        # x: input tensor of shape (batch_size, seq_len, input_size)
        print(x.size())
        batch_size, seq_len, _ = x.size()

        # hidden: tuple of hidden state and cell state tensors from previous time step
        # intially set to all zeros
        hidden = (torch.zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size),
                  torch.zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size))
        
        # output: output tensor of shape (batch_size, seq_len, hidden_size)
        output, hidden = self.lstm(x, hidden)
        

        output = self.fc(output[:, -1, :])
        # Get the last output in the sequence and pass it through a fully connected layer
        
        return output

In [10]:
def train(model, trainloader, epochs: int, verbose=False):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    model.train() # switch to train mode
    for epoch in range(epochs):
        correct, total, epoch_loss = 0, 0, 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            labels = labels.unsqueeze(1)
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad() # zero parameter gradients

            # reset hidden state
            hidden = (torch.zeros(model.lstm.num_layers, inputs.size(0), model.lstm.hidden_size),
                      torch.zeros(model.lstm.num_layers, inputs.size(0), model.lstm.hidden_size))

            # forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # backward pass and optimize
            loss.backward()
            optimizer.step()

            epoch_loss += loss # update loss

            # compute accuracy
            total += labels.size(0)
            correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
        
        # log results of training
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total
        if verbose:
            print(f"Epoch [{epoch+1}/{epochs}]: train loss {epoch_loss}, accuracy {epoch_acc}")




def test(model, testloader):
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    model.eval()
    with torch.no_grad():
        for inputs, labels in testloader:
            labels.unsqueeze(1)
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            # reset hidden state
            hidden = (torch.zeros(model.lstm.num_layers, inputs.size(0), model.lstm.hidden_size).to(device),
                      torch.zeros(model.lstm.num_layers, inputs.size(0), model.lstm.hidden_size).to(device))

            # forward pass + loss update
            outputs = model(inputs)
            loss += criterion(outputs, labels).item()

            # compute accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
    # print test results
    loss /= len(testloader.dataset)
    accuracy = correct / total
    return loss, accuracy

In [11]:
DEVICE = torch.device("cpu")  # Try "cuda" to train on GPU
print(
    f"Training on {DEVICE} using PyTorch {torch.__version__} and Flower {fl.__version__}"
)


# centralised training:
# trainloader = trainloaders[0]
# valloader = valloaders[0]
model = LSTM(115, 10, 2, 1).to(DEVICE)
print("model initialised")

# for epoch in range(5):
#     train(model, trainloader, 1)
#     print("training occured")
#     loss, accuracy = test(model, valloader)
#     print("testing occured")
#     print(f"Epoch {epoch+1}: validation loss {loss}, accuracy {accuracy}")

# loss, accuracy = test(model, testloader)
# print(f"Final test set performance:\n\tloss {loss}\n\taccuracy {accuracy}")

Training on cpu using PyTorch 1.13.1+cu116 and Flower 1.3.0
model initialised
