In [1]:
from dgl import from_networkx
import dgl
import torch.nn as nn
import torch as th
import torch.nn.functional as F
import dgl.function as fn
import networkx as nx
import pandas as pd
import socket
import struct
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import category_encoders as ce
import numpy as np
from sklearn.utils import class_weight

In [2]:
data = pd.read_csv('NF-BoT-IoT.csv')
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
data['IPV4_SRC_ADDR'] = data.IPV4_SRC_ADDR.apply(str)
data['L4_SRC_PORT'] = data.L4_SRC_PORT.apply(str)
data['IPV4_DST_ADDR'] = data.IPV4_DST_ADDR.apply(str)
data['L4_DST_PORT'] = data.L4_DST_PORT.apply(str)
data['IPV4_SRC_ADDR'] = data['IPV4_SRC_ADDR'] + ':' + data['L4_SRC_PORT']
data['IPV4_DST_ADDR'] = data['IPV4_DST_ADDR'] + ':' + data['L4_DST_PORT']
data.drop(columns=['L4_SRC_PORT','L4_DST_PORT'],inplace=True)
data.drop(columns=['Attack'],inplace = True)
data.rename(columns={"Label": "label"},inplace = True)
label = data.label
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.3, random_state=123,stratify= label)
cols_to_norm = list(set(list(X_train.iloc[:, 2:].columns ))  - set(list(['label'])) )

X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
X_train['h'] = X_train[ cols_to_norm ].values.tolist()
G = nx.from_pandas_edgelist(X_train, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
G = G.to_directed()
G = from_networkx(G,edge_attrs=['h','label'] )
G.ndata['h'] = th.ones(G.num_nodes(), G.edata['h'].shape[1])

X_test[cols_to_norm] = scaler.fit_transform(X_test[cols_to_norm])
X_test['h'] = X_test[ cols_to_norm ].values.tolist()
G_test = nx.from_pandas_edgelist(X_test, "IPV4_SRC_ADDR", "IPV4_DST_ADDR", ['h','label'],create_using=nx.MultiGraph())
G_test = G_test.to_directed()
G_test = from_networkx(G_test,edge_attrs=['h','label'] )
G_test.ndata['h'] = th.ones(G_test.num_nodes(), G_test.edata['h'].shape[1])

In [14]:
dgl.save_graphs('train2_b.bin', [G])

In [4]:
device = th.device('cuda' if th.cuda.is_available() else 'cpu')

In [15]:
device = th.device('cuda' if th.cuda.is_available() else 'cpu')
graphs, _ = dgl.load_graphs('train2_b.bin')
G = graphs[0]

In [16]:
G = G.to(device)
node_features = G.ndata['h']
edge_features = G.edata['h']
edge_label = G.edata['label']


In [17]:
print(node_features.shape[0])
print(edge_features.shape[0])

433094
840140


In [5]:
def compute_accuracy(pred, labels):
    return (pred.argmax(1) == labels).float().mean().item()

class SAGELayer(nn.Module):
    def __init__(self, ndim_in, edims, ndim_out ):
        super(SAGELayer, self).__init__()
        ### force to outut fix dimensions
        self.W_msg = nn.Linear(ndim_in + edims, ndim_out)
        ### apply weight
        self.W_apply = nn.Linear(ndim_in + ndim_out, ndim_out)
        

    def message_func(self, edges):
        return {'m': self.W_msg(th.cat([edges.data['h'], edges.dst['h']], 1))}

    def forward(self, g_dgl, nfeats, efeats):
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Eq4
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            # Eq5          
            g.ndata['h'] = F.relu(self.W_apply(th.cat([g.ndata['h'], g.ndata['h_neigh']], 1)))
            return g.ndata['h']

class MLPPredictor(nn.Module):
    def __init__(self, in_features, edim, out_classes):
        super().__init__()
        self.W = nn.Linear(in_features * 2 + edim, out_classes)

    def apply_edges(self, edges):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        h_e = edges.data['h']
        score = self.W(th.cat([h_u, h_e, h_v], 1))
        return {'score': score}

    def forward(self, graph, h, efeats):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.edata['h'] = efeats
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

class Model(nn.Module):
    def __init__(self,  ndim_in, ndim_out, edim):
        super().__init__()
        self.atten = nn.Parameter(th.randn(1, edim))
        self.cov1 = SAGELayer(ndim_in, edim, ndim_out)
        self.cov2 = SAGELayer(ndim_out, edim, ndim_out)
        self.pred = MLPPredictor(ndim_out, edim, 2)
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, g, nfeats, efeats):
        efeats = efeats * self.atten
        nfeats = F.relu(self.cov1(g, nfeats, efeats))
        nfeats = self.dropout(nfeats)
        nfeats = F.relu(self.cov2(g, nfeats, efeats))
        nfeats = self.dropout(nfeats)
        return self.pred(g, nfeats, efeats)


In [7]:
for epoch in range(1500):
    pred = model( G, node_features, edge_features)
    loss = criterion(pred ,edge_label)
    opt.zero_grad()
    loss.backward()
    opt.step()
    if epoch % 100 == 0:
      print('Training acc:', compute_accuracy(pred, edge_label))

Training acc: 0.579767644405365
Training acc: 0.986433207988739
Training acc: 0.9868664145469666
Training acc: 0.98711758852005
Training acc: 0.9870997071266174
Training acc: 0.9871389865875244
Training acc: 0.987159252166748
Training acc: 0.987230658531189
Training acc: 0.9873104095458984
Training acc: 0.9869937896728516
Training acc: 0.9872509241104126
Training acc: 0.98727947473526
Training acc: 0.987318754196167
Training acc: 0.9872972965240479
Training acc: 0.987328290939331


In [18]:
th.save(model.state_dict(),'A-SAGE-R_b.pth')

In [8]:
model.load_state_dict(th.load('SAGE_b.pth'))

<All keys matched successfully>

In [11]:
graphs, _ = dgl.load_graphs('test2_b.bin')
G_test = graphs[0]

In [12]:
G_test = G_test.to(device)
node_features_test = G_test.ndata['h']
edge_features_test = G_test.edata['h']
edge_label_test = G_test.edata['label']


In [13]:
print(node_features_test.shape[0])
print(edge_features_test.shape[0])

188857
360060


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
out = model( G_test, node_features_test, edge_features_test)
out = F.softmax(out, dim=1)
_, pred = out.max(dim=1)
acc = accuracy_score(edge_label_test.cpu().numpy(), pred.cpu().numpy())
precision = precision_score(edge_label_test.cpu().numpy(), pred.cpu().numpy())
recall = recall_score(edge_label_test.cpu().numpy(), pred.cpu().numpy())
f1 = f1_score(edge_label_test.cpu().numpy(), pred.cpu().numpy())
print(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}')

Accuracy: 0.9872, Precision: 0.9876, Recall: 0.9994, F1-score: 0.9935
