In [1]:
import dgl.nn as dglnn
from dgl import from_networkx
import torch.nn as nn
import torch as th
import torch.nn.functional as F
import dgl.function as fn
import networkx as nx
import pandas as pd
import socket
import struct
import random
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data = pd.read_csv('NF-BoT-IoT.csv')

In [3]:
data

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,192.168.100.6,52670,192.168.100.1,53,17,5.212,71,126,1,1,0,4294966,0,Benign
1,192.168.100.6,49160,192.168.100.149,4444,6,0.000,217753000,199100,4521,4049,24,4176249,1,Theft
2,192.168.100.46,3456,192.168.100.5,80,17,0.000,8508021,8918372,9086,9086,0,4175916,0,Benign
3,192.168.100.3,80,192.168.100.55,8080,6,7.000,8442138,9013406,9086,9086,0,4175916,0,Benign
4,192.168.100.46,80,192.168.100.5,80,6,7.000,8374706,0,9086,0,0,4175916,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600095,192.168.100.46,80,192.168.100.5,80,6,7.000,2330065,0,2523,0,0,4263037,0,Benign
600096,192.168.100.5,0,192.168.100.3,0,6,0.000,1054423,0,1513,0,0,4263062,0,Benign
600097,192.168.100.7,365,192.168.100.3,565,17,0.000,62422,0,1357,0,0,4263062,0,Benign
600098,192.168.100.3,50850,13.54.166.67,8883,6,222.178,11300,1664,32,32,24,4264935,0,Benign


In [4]:
data = data[~data['Attack'].isin(['Theft'])]

In [5]:
from sklearn.utils import resample

df = pd.DataFrame(data)

# Group the DataFrame by 'Attack' column
grouped = df.groupby('Attack')

# Get the size of the smallest group
min_group_size = grouped.size().min()

# List to store the balanced DataFrames
balanced_dfs = []

# Undersample the majority class and append all groups to the list
for attack_type, group_df in grouped:
    if len(group_df) > min_group_size:
        undersampled_df = resample(group_df, replace=False, n_samples=min_group_size, random_state=42)
        balanced_dfs.append(undersampled_df)
    else:
        balanced_dfs.append(group_df)

# Concatenate all balanced DataFrames
balanced_df = pd.concat(balanced_dfs)

data = balanced_df

In [6]:
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))

In [7]:
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)

In [8]:
data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']

In [9]:
data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'],inplace=True)

In [10]:
data

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,Label,Attack
0,172.30.223.149:52670,192.168.100.1:53,17,5.212,71,126,1,1,0,4294966,0,Benign
2,172.29.197.62:3456,192.168.100.5:80,17,0.000,8508021,8918372,9086,9086,0,4175916,0,Benign
3,172.23.181.112:80,192.168.100.55:8080,6,7.000,8442138,9013406,9086,9086,0,4175916,0,Benign
4,172.16.179.233:80,192.168.100.5:80,6,7.000,8374706,0,9086,0,0,4175916,0,Benign
5,172.17.96.55:0,192.168.100.3:0,6,0.000,3799285,0,5452,0,0,4175932,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...
541847,172.29.166.218:53380,192.168.100.3:80,6,7.000,986,770,7,5,31,0,1,Reconnaissance
465501,172.19.213.54:36752,192.168.100.3:30,6,0.000,60,40,1,1,22,4294967,1,Reconnaissance
529706,172.18.185.27:40850,192.168.100.3:24058,6,0.000,44,40,1,1,22,4294967,1,Reconnaissance
531198,172.25.128.94:40850,192.168.100.3:40515,6,0.000,44,40,1,1,22,4294967,1,Reconnaissance


In [11]:
data.drop(columns=['Label'],inplace = True)

In [12]:
data.rename(columns={"Attack": "label"},inplace = True)

In [13]:
le = LabelEncoder()
le.fit_transform(data.label.values)
data['label'] = le.transform(data['label'])

In [14]:
label = data.label

In [15]:
data.drop(columns=['label'],inplace = True)

In [16]:
scaler = StandardScaler()

In [17]:
data =  pd.concat([data, label], axis=1)

In [18]:
data

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,label
0,172.30.223.149:52670,192.168.100.1:53,17,5.212,71,126,1,1,0,4294966,0
2,172.29.197.62:3456,192.168.100.5:80,17,0.000,8508021,8918372,9086,9086,0,4175916,0
3,172.23.181.112:80,192.168.100.55:8080,6,7.000,8442138,9013406,9086,9086,0,4175916,0
4,172.16.179.233:80,192.168.100.5:80,6,7.000,8374706,0,9086,0,0,4175916,0
5,172.17.96.55:0,192.168.100.3:0,6,0.000,3799285,0,5452,0,0,4175932,0
...,...,...,...,...,...,...,...,...,...,...,...
541847,172.29.166.218:53380,192.168.100.3:80,6,7.000,986,770,7,5,31,0,3
465501,172.19.213.54:36752,192.168.100.3:30,6,0.000,60,40,1,1,22,4294967,3
529706,172.18.185.27:40850,192.168.100.3:24058,6,0.000,44,40,1,1,22,4294967,3
531198,172.25.128.94:40850,192.168.100.3:40515,6,0.000,44,40,1,1,22,4294967,3


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
     data, label, test_size=0.3, random_state=123,stratify= label)

In [None]:
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL'])
encoder.fit(X_train, y_train)
X_train = encoder.transform(X_train)

In [21]:
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns ))  - set(list(['label'])) )
X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])

In [22]:
X_train['h'] = X_train[ cols_to_norm ].values.tolist()

In [23]:
X_train

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,OUT_BYTES,IN_PKTS,OUT_PKTS,TCP_FLAGS,FLOW_DURATION_MILLISECONDS,label,h
453781,172.29.171.219:80,192.168.100.147:53830,0.442672,-0.005504,-0.122551,-0.077738,-0.134237,-0.072817,-0.181481,0.722797,0,"[-0.12255061529292183, -0.07773824023860855, 0..."
363335,172.20.122.167:45947,192.168.100.7:1163,0.442672,0.615577,-0.123559,-0.078675,-0.139344,-0.080523,1.417177,0.722875,3,"[-0.12355910058513517, -0.07867477699742757, 0..."
552082,172.28.20.48:60544,192.168.100.3:80,0.442672,-0.005504,-0.122182,-0.077220,-0.131683,-0.072817,0.069461,-1.450567,2,"[-0.12218183023529107, -0.07722015607415547, 0..."
559515,172.27.212.199:37684,192.168.100.3:80,0.442672,-0.005504,-0.122423,-0.077220,-0.131683,-0.072817,0.069461,-1.450567,3,"[-0.12242295892681887, -0.07722015607415547, 0..."
133288,172.28.44.177:43563,192.168.100.5:2393,0.442672,0.615577,-0.123559,-0.078675,-0.139344,-0.080523,1.417177,0.722874,3,"[-0.12355910058513517, -0.07867477699742757, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...
589799,172.19.38.213:56330,192.168.100.3:80,0.442672,-0.005504,-0.122397,-0.077220,-0.131683,-0.072817,0.069461,-1.450567,1,"[-0.12239742765359828, -0.07722015607415547, 0..."
373936,172.25.103.43:64649,192.168.100.5:1073,0.442672,0.615577,-0.123559,-0.078675,-0.139344,-0.080523,1.417177,0.722874,3,"[-0.12355910058513517, -0.07867477699742757, 0..."
231622,172.23.122.18:3306,192.168.100.150:510,0.442672,-3.189052,-0.123204,-0.078515,-0.135514,-0.076670,-1.703808,0.722875,0,"[-0.12320449956818252, -0.07851536648528816, 0..."
346749,172.26.165.61:53687,192.168.159.152:53,-2.285161,0.615577,-0.123412,-0.078754,-0.138067,-0.082449,-1.425403,0.720321,0,"[-0.12341158656208287, -0.07875448225349727, -..."


In [24]:
G = nx.from_pandas_edgelist(X_train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())

In [25]:
G = G.to_directed()


In [26]:
G = from_networkx(G,edge_attrs=['h','label'] )


In [27]:
# Eq1
G.ndata['h'] = th.ones(G.num_nodes(), G.edata['h'].shape[1])

In [28]:
G.edata['train_mask'] = th.ones(len(G.edata['h']), dtype=th.bool)


In [29]:
G.edata['train_mask'] 

tensor([True, True, True,  ..., True, True, True])

In [30]:
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

In [31]:
class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        self.activation = activation

    def message_func(self, edges):
        return {'m': self.W_msg(th.cat([edges.src['h'], edges.data['h']], 2))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5          
            g.ndata['h'] = F.relu(self.W_apply(th.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))
            return g.ndata['h']


class SAGE(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.layers.append(SAGELayer(ndim_in, edim, 128, activation))
        self.layers.append(SAGELayer(128, edim, ndim_out, activation))
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, g, nfeats, efeats):
        for i, layer in enumerate(self.layers):
            if i != 0:
                nfeats = self.dropout(nfeats)
            nfeats = layer(g, nfeats, efeats)
        return nfeats.sum(1)

In [32]:
class MLPPredictor(nn.Module):
    def __init__(self, in_features, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        score = self.W(th.cat([h_u, h_v], 1))
        return {'score': score}

    def forward(self, graph, h):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

In [33]:
G.ndata['h'] = th.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
G.edata['h'] = th.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))

In [34]:
class Model(nn.Module):
    def __init__(self, ndim_in, ndim_out, edim, activation, dropout):
        super().__init__()
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, 4)
    def forward(self, g, nfeats, efeats):
        h = self.gnn(g, nfeats, efeats)
        return self.pred(g, h)

In [35]:
G = G.to('cuda:0')
G.device

device(type='cuda', index=0)

In [36]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(G.edata['label'].cpu().numpy()),
                                                 y=G.edata['label'].cpu().numpy())

In [37]:
class_weights = th.FloatTensor(class_weights).cuda()
criterion = nn.CrossEntropyLoss(weight = class_weights)

In [38]:
G.edata['h'].device


device(type='cuda', index=0)

In [39]:
G.ndata['h'].device


device(type='cuda', index=0)

In [None]:
node_features = G.ndata['h']
edge_features = G.edata['h']

edge_label = G.edata['label']
train_mask = G.edata['train_mask']

model = Model(G.ndata['h'].shape[2], 128, G.ndata['h'].shape[2], F.relu, 0.2).cuda()
opt = th.optim.Adam(model.parameters())

for epoch in range(1,5000):
    pred = model(G, node_features,edge_features).cuda()
    loss = criterion(pred[train_mask] ,edge_label[train_mask])
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 100 == 0:
      print('Training acc:', compute_accuracy(pred[train_mask], edge_label[train_mask]))


Training acc: 0.7064424753189087
Training acc: 0.7081690430641174
Training acc: 0.7081690430641174
Training acc: 0.7063780426979065
Training acc: 0.7080144286155701
Training acc: 0.7105012536048889
Training acc: 0.7074861526489258
Training acc: 0.7073573470115662
Training acc: 0.7109522223472595
Training acc: 0.7084138989448547
Training acc: 0.7092771530151367
Training acc: 0.7126530408859253
Training acc: 0.7120474576950073
Training acc: 0.7116351127624512
Training acc: 0.7106558680534363
Training acc: 0.7131813168525696
Training acc: 0.7126014828681946
Training acc: 0.7131040096282959
Training acc: 0.7115964889526367
Training acc: 0.7116479873657227
Training acc: 0.7125628590583801
Training acc: 0.7118412852287292
Training acc: 0.7135162949562073
Training acc: 0.7130267024040222
Training acc: 0.7135549783706665
Training acc: 0.7118284106254578
Training acc: 0.7113258838653564
Training acc: 0.7132586240768433
Training acc: 0.7103466391563416
Training acc: 0.7120732069015503
Training a

In [None]:
X_test = encoder.transform(X_test)

In [None]:
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])

In [None]:
X_test

In [None]:
X_test['h'] = X_test[ cols_to_norm ].values.tolist()

In [None]:
G_test = nx.from_pandas_edgelist(X_test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
G_test = G_test.to_directed()
G_test = from_networkx(G_test,edge_attrs=['h','label'] )
actual = G_test.edata.pop('label')
G_test.ndata['feature'] = th.ones(G_test.num_nodes(), G.ndata['h'].shape[2])

In [None]:
G_test.ndata['feature'] = th.reshape(G_test.ndata['feature'], (G_test.ndata['feature'].shape[0], 1, G_test.ndata['feature'].shape[1]))

In [None]:
G_test.edata['h'] = th.reshape(G_test.edata['h'], (G_test.edata['h'].shape[0], 1, G_test.edata['h'].shape[1]))


In [None]:
G_test = G_test.to('cuda:0')


In [None]:
import timeit
start_time = timeit.default_timer()
node_features_test = G_test.ndata['feature']
edge_features_test = G_test.edata['h']
test_pred = model(G_test, node_features_test, edge_features_test).cuda()
elapsed = timeit.default_timer() - start_time

In [None]:
print(str(elapsed) + ' seconds')


In [None]:
test_pred = test_pred.argmax(1)


In [None]:
test_pred = th.Tensor.cpu(test_pred).detach().numpy()


In [None]:
actual = le.inverse_transform(actual)
test_pred = le.inverse_transform(test_pred)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
import numpy as np


def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(12, 12))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]


    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")


    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

plot_confusion_matrix(cm = confusion_matrix(actual, test_pred), 
                      normalize    = False,
                      target_names = np.unique(actual),
                      title        = "Confusion Matrix")

In [None]:
# Create DataFrame
df = pd.DataFrame(data)

# Group by attack type
grouped = df.groupby('Attack')

# Plot for each feature
features = ['FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'OUT_BYTES', 'IN_BYTES', 'IN_PKTS', 'OUT_PKTS']
for feature in features:
    plt.figure()
    for attack, group in grouped:
        plt.plot(group[feature], label=attack)
    plt.xlabel('Index')
    plt.ylabel(feature)
    plt.title(f'{feature} for Each Attack Type')
    plt.legend()
    plt.show()