In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn import preprocessing
from dgl.data import DGLDataset
import dgl
import time
import networkx as nx
import category_encoders as ce
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import torch
import tqdm
import math

from typing import *
from sklearn.preprocessing import StandardScaler, Normalizer
import socket
import struct
import random
from sklearn.model_selection import train_test_split

In [2]:
# Dataset CSV, resolved relative to the notebook's working directory.
file_name = "NF-CSE-CIC-IDS2018-v2.csv"
# Load the complete file into memory as a single DataFrame.
data = pd.read_csv(file_name)

In [3]:
# Class balance of the binary `Label` column.
data.Label.value_counts()

Label
0    16635567
1     2258141
Name: count, dtype: int64

In [4]:
# Column headers may carry stray whitespace; strip it so later lookups by name work.
data.rename(columns=str.strip, inplace=True)
# IP addresses become graph node identifiers later on, so make sure they are strings.
# (The L4 port columns are dropped in the next cell, so they are not converted here.)
data["IPV4_SRC_ADDR"] = data["IPV4_SRC_ADDR"].astype(str)
data["IPV4_DST_ADDR"] = data["IPV4_DST_ADDR"].astype(str)

In [5]:
# Drop the port columns in place; only the two IP address columns are used as graph endpoints later.
data.drop(columns=["L4_SRC_PORT", "L4_DST_PORT"], inplace=True)

In [6]:
# Distinct values of the multi-class `Attack` column.
data.Attack.unique()

array(['SSH-Bruteforce', 'Benign', 'DDoS attacks-LOIC-HTTP',
       'DDOS attack-HOIC', 'DoS attacks-Slowloris', 'DoS attacks-Hulk',
       'FTP-BruteForce', 'Infilteration', 'Bot', 'DoS attacks-GoldenEye',
       'Brute Force -Web', 'DoS attacks-SlowHTTPTest', 'SQL Injection',
       'DDOS attack-LOIC-UDP', 'Brute Force -XSS'], dtype=object)

In [7]:
# Down-sample: keep 10% of the rows of every Attack class, with a fixed seed.
# NOTE: this rebinds `data`, so re-running the cell samples the already-sampled frame again.
data = data.groupby(by='Attack').sample(frac=0.1, random_state=13)

In [8]:
# Per-class non-null counts for every column after down-sampling.
data.groupby(by="Attack").count()

Unnamed: 0_level_0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Benign,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,...,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557,1663557
Bot,14310,14310,14310,14310,14310,14310,14310,14310,14310,14310,...,14310,14310,14310,14310,14310,14310,14310,14310,14310,14310
Brute Force -Web,214,214,214,214,214,214,214,214,214,214,...,214,214,214,214,214,214,214,214,214,214
Brute Force -XSS,93,93,93,93,93,93,93,93,93,93,...,93,93,93,93,93,93,93,93,93,93
DDOS attack-HOIC,108086,108086,108086,108086,108086,108086,108086,108086,108086,108086,...,108086,108086,108086,108086,108086,108086,108086,108086,108086,108086
DDOS attack-LOIC-UDP,211,211,211,211,211,211,211,211,211,211,...,211,211,211,211,211,211,211,211,211,211
DDoS attacks-LOIC-HTTP,30730,30730,30730,30730,30730,30730,30730,30730,30730,30730,...,30730,30730,30730,30730,30730,30730,30730,30730,30730,30730
DoS attacks-GoldenEye,2772,2772,2772,2772,2772,2772,2772,2772,2772,2772,...,2772,2772,2772,2772,2772,2772,2772,2772,2772,2772
DoS attacks-Hulk,43265,43265,43265,43265,43265,43265,43265,43265,43265,43265,...,43265,43265,43265,43265,43265,43265,43265,43265,43265,43265
DoS attacks-SlowHTTPTest,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412,...,1412,1412,1412,1412,1412,1412,1412,1412,1412,1412


In [9]:
# Separate the features from the two target columns (multi-class name and binary flag).
X = data.drop(columns=["Attack", "Label"])
y = data[["Attack", "Label"]]

# 70/30 split with a fixed seed, stratified on the two target columns of `y`.
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=13, stratify=y)

In [10]:
# Target-encode the listed columns against the binary Label.
encoder = ce.TargetEncoder(cols=['TCP_FLAGS','L7_PROTO','PROTOCOL',
                                  'CLIENT_TCP_FLAGS','SERVER_TCP_FLAGS','ICMP_TYPE',
                                  'ICMP_IPV4_TYPE','DNS_QUERY_ID','DNS_QUERY_TYPE',
                                  'FTP_COMMAND_RET_CODE'])
# Fitted on the training split only; the test split is transformed with the same fitted encoder.
encoder.fit(X_train, y_train.Label)

# Transform on training set
X_train = encoder.transform(X_train)

# Transform on testing set
X_test = encoder.transform(X_test)

In [11]:
# Scrub non-finite values in both splits: +/-inf becomes NaN, then every NaN becomes 0.
for _split in (X_train, X_test):
    _split.replace([np.inf, -np.inf], np.nan, inplace=True)
    _split.fillna(0, inplace=True)

In [12]:
# NOTE: sklearn's Normalizer rescales each *row* to unit norm; it is stateless, so
# fit() learns nothing from the training data and is kept only for API symmetry.
scaler = Normalizer()
# Skip the first two columns, which hold the source / destination IP addresses.
# The order is taken straight from the frame so it is identical on every kernel
# restart (round-tripping through set() made it depend on string hashing).
cols_to_norm = list(X_train.columns[2:])
scaler.fit(X_train[cols_to_norm])

# Transform on training set
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])
# Pack every feature column of a row into one list: the per-edge feature vector 'h'.
X_train['h'] = X_train.iloc[:, 2:].values.tolist()

# Transform on testing set
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test.iloc[:, 2:].values.tolist()

# Re-attach the targets (joined on the row index) so each flow carries its labels.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [13]:
# Preview the first rows of the encoded and normalised training features, including the packed 'h' column.
X_train.head()

Unnamed: 0,IPV4_SRC_ADDR,IPV4_DST_ADDR,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,h
10660988,172.31.66.109,172.31.0.2,1.401126e-08,1.108397e-08,7.8e-05,1.237078e-06,9.8e-05,1.237078e-06,1.40505e-08,1.40505e-08,...,0.0,0.0,0.0,1.458031e-07,1.458029e-07,1.136291e-07,1.10963e-08,7.4e-05,1.478535e-07,"[1.4011259652278659e-08, 1.1083974799736558e-0..."
15147714,172.31.66.68,172.31.0.2,1.245158e-08,1.008341e-08,7.9e-05,1.099371e-06,9.7e-05,1.099371e-06,1.248645e-08,1.248645e-08,...,0.0,0.0,0.0,1.295728e-07,1.295727e-07,9.85804e-08,9.861101e-09,6.6e-05,1.31395e-07,"[1.245157764089544e-08, 1.0083409899455166e-08..."
9587672,172.31.64.28,172.31.0.2,1.552131e-08,1.227854e-08,7.7e-05,1.370403e-06,9.9e-05,1.370403e-06,1.556478e-08,1.556478e-08,...,0.0,0.0,0.0,1.615169e-07,1.615167e-07,1.164452e-07,1.229219e-08,8.2e-05,1.637882e-07,"[1.5521310572476723e-08, 1.2278540225056638e-0..."
7746586,172.31.67.18,172.31.0.2,7.552795e-09,5.974837e-09,4.5e-05,6.668492e-07,8.6e-05,6.668492e-07,7.573948e-09,7.573948e-09,...,0.0,0.0,0.0,7.859541e-08,7.859535e-08,5.499144e-08,5.883835e-09,4e-05,7.970069e-08,"[7.55279530717906e-09, 5.974837018934741e-09, ..."
693958,172.31.67.31,172.31.0.2,3.657431e-09,2.893307e-09,4.9e-05,6.458418e-07,0.000115,6.458418e-07,3.667675e-09,3.667675e-09,...,0.0,0.0,0.0,3.805972e-08,3.805969e-08,2.821517e-08,2.896524e-09,1e-05,3.859495e-08,"[3.6574312874776476e-09, 2.8933070422099717e-0..."


In [14]:
# Integer-encode the Attack names. The encoder is fitted on the full (sampled) frame,
# so both splits share one name -> integer mapping.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data["Attack"])

# Transform on training set
train["Attack"] = lab_enc.transform(train["Attack"])

# Transform on testing set
test["Attack"] = lab_enc.transform(test["Attack"])

In [15]:
def _flows_to_dgl_graph(flows):
    """Build a DGL graph whose nodes are IP addresses and whose edges are flows.

    Args:
        flows: DataFrame with columns ``IPV4_SRC_ADDR``, ``IPV4_DST_ADDR``,
            ``h`` (per-flow feature list), ``Label`` and ``Attack``.

    Returns:
        A DGL graph with edge data ``h``, ``Attack`` and ``Label`` and an
        all-ones node feature matrix ``ndata['h']`` whose width matches the
        edge feature width.
    """
    nx_graph = nx.from_pandas_edgelist(
        flows, "IPV4_SRC_ADDR", "IPV4_DST_ADDR",
        ["h", "Label", "Attack"], create_using=nx.MultiGraph())
    # The undirected multigraph is converted to a directed one before the DGL conversion.
    nx_graph = nx_graph.to_directed()
    graph = dgl.from_networkx(nx_graph, edge_attrs=['h', 'Attack', 'Label'])
    # Nodes have no features of their own, so they start as constant ones.
    graph.ndata['h'] = torch.ones([graph.number_of_nodes(),
                                   graph.edata['h'].shape[1]])
    return graph


# Training graph
train_g = _flows_to_dgl_graph(train)

# Testing graph
test_g = _flows_to_dgl_graph(test)

In [16]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn
import tqdm
import gc

class SAGELayer(nn.Module):
    """Single message-passing layer that aggregates edge features into nodes.

    Every node receives the mean of its incoming edges' features, concatenates
    it with its own features and applies a linear map plus activation. An edge
    embedding is then produced from the concatenated embeddings of the edge's
    two endpoints.

    Args:
        ndim_in: width of the incoming node features.
        edims: width of the edge features.
        ndim_out: width of the node embedding produced by this layer.
        activation: callable applied to the node update; ``None`` falls back
            to ``F.relu``.
    """

    def __init__(self, ndim_in, edims, ndim_out, activation):
        super(SAGELayer, self).__init__()
        # The two Linear layers are created in the original order so that a
        # seeded initialisation yields the same weights as before.
        self.W_apply = nn.Linear(ndim_in + edims, ndim_out)
        # Honour the caller's activation instead of silently forcing ReLU.
        self.activation = activation if activation is not None else F.relu
        # The input is the concatenation of two node embeddings, so it must
        # follow ndim_out rather than a hard-coded 128. Output stays 256-wide.
        self.W_edge = nn.Linear(ndim_out * 2, 256)
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise ``W_apply`` with the ReLU gain (``W_edge`` keeps its default init)."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(self.W_apply.weight, gain=gain)

    def message_func(self, edges):
        """Message sent along every edge: the edge's own feature tensor."""
        return {'m': edges.data['h']}

    def forward(self, g_dgl, nfeats, efeats):
        """Compute node and edge embeddings.

        Tensors are concatenated along dim 2, so ``nfeats`` and ``efeats``
        must be 3-D (the notebook reshapes them to ``(N, 1, D)``).

        Returns:
            ``(node_embeddings, edge_embeddings)``.
        """
        # local_scope keeps the feature writes below from leaking into the caller's graph.
        with g_dgl.local_scope():
            g = g_dgl
            g.ndata['h'] = nfeats
            g.edata['h'] = efeats
            # Mean of incoming edge features per destination node -> 'h_neigh'.
            g.update_all(self.message_func, fn.mean('m', 'h_neigh'))
            g.ndata['h'] = self.activation(
                self.W_apply(torch.cat([g.ndata['h'], g.ndata['h_neigh']], 2)))

            # Compute edge embeddings from the two endpoint embeddings.
            u, v = g.edges()
            edge = self.W_edge(torch.cat((g.srcdata['h'][u], g.dstdata['h'][v]), 2))
            return g.ndata['h'], edge

In [17]:
class SAGE(nn.Module):
    """Encoder made of a stack (currently one) of ``SAGELayer`` modules.

    NOTE: ``ndim_out`` and ``activation`` are accepted but not used; the single
    layer is always built with 128 output units and ``F.relu``.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(SAGE, self).__init__()
        self.layers = nn.ModuleList([SAGELayer(ndim_in, edim, 128, F.relu)])

    def forward(self, g, nfeats, efeats, corrupt=False):
        """Run the layer stack and return ``(node_embeddings, edge_embeddings)``.

        With ``corrupt=True`` the rows of ``efeats`` are randomly permuted
        across edges first. Both outputs are summed over dim 1 before being
        returned.
        """
        if corrupt:
            shuffled_order = torch.randperm(g.number_of_edges())
            efeats = efeats[shuffled_order]

        node_repr = nfeats
        for sage_layer in self.layers:
            node_repr, edge_repr = sage_layer(g, node_repr, efeats)

        return node_repr.sum(1), edge_repr.sum(1)

In [18]:
class Discriminator(nn.Module):
    """Bilinear scorer: rates each feature row against a summary vector.

    Holds one learnable ``n_hidden x n_hidden`` matrix ``weight``; the score of
    a row ``f`` against summary ``s`` is ``f @ (weight @ s)``.
    """

    def __init__(self, n_hidden):
        super(Discriminator, self).__init__()
        # Allocated uninitialised; reset_parameters() fills it right away.
        self.weight = nn.Parameter(torch.Tensor(n_hidden, n_hidden))
        self.reset_parameters()

    def uniform(self, size, tensor):
        """Fill ``tensor`` in place from U(-1/sqrt(size), 1/sqrt(size)); do nothing if it is None."""
        limit = 1.0 / math.sqrt(size)
        if tensor is None:
            return
        with torch.no_grad():
            tensor.uniform_(-limit, limit)

    def reset_parameters(self):
        """Re-draw ``weight`` with a bound derived from its first dimension."""
        self.uniform(self.weight.size(0), self.weight)

    def forward(self, features, summary):
        """Return the bilinear score of every row of ``features`` against ``summary``."""
        projected_summary = self.weight @ summary
        return features @ projected_summary

In [19]:
class DGI(nn.Module):
    """DGI-style contrastive objective computed on the encoder's edge embeddings.

    The encoder is run twice, once on the real edge features and once with the
    edge features shuffled across edges. A bilinear discriminator is trained to
    score real edge embeddings high and corrupted ones low against a
    graph-level summary vector.

    NOTE: ``ndim_out`` and ``activation`` are accepted for interface
    compatibility but not used; the encoder is always built with ``F.relu``.
    The discriminator is fixed at 256 to match the 256-wide edge embedding
    emitted by ``SAGELayer.W_edge``.
    """

    def __init__(self, ndim_in, ndim_out, edim, activation):
        super(DGI, self).__init__()
        self.encoder = SAGE(ndim_in, ndim_out, edim, F.relu)
        self.discriminator = Discriminator(256)
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, g, n_features, e_features):
        """Return the scalar contrastive loss for graph ``g``.

        Args:
            g: the DGL graph.
            n_features: node features passed to the encoder.
            e_features: edge features passed to the encoder.

        Returns:
            Sum of the BCE-with-logits losses of the real edge scores against
            ones and the corrupted edge scores against zeros.
        """
        positive = self.encoder(g, n_features, e_features, corrupt=False)
        negative = self.encoder(g, n_features, e_features, corrupt=True)

        # The encoder returns (node_embeddings, edge_embeddings); only the edge part is contrasted.
        positive = positive[1]
        negative = negative[1]

        # Summary: squashed mean over dim 0 of the real edge embeddings.
        summary = torch.sigmoid(positive.mean(dim=0))

        positive = self.discriminator(positive, summary)
        negative = self.discriminator(negative, summary)

        l1 = self.loss(positive, torch.ones_like(positive))
        l2 = self.loss(negative, torch.zeros_like(negative))

        return l1 + l2

In [20]:
# Model / optimisation settings.
# Node and edge feature widths are read from the training graph.
ndim_in = train_g.ndata['h'].shape[1]
# NOTE(review): hidden_features, num_layers and epochs are not referenced by any cell visible in this notebook export.
hidden_features = 128
ndim_out = 128
num_layers = 1
edim = train_g.edata['h'].shape[1]
learning_rate = 1e-3
epochs = 4000

In [21]:
# Build the DGI model from the hyperparameters defined in the config cell.
dgi = DGI(ndim_in,
    ndim_out,
    edim,
    F.relu)

# Use the configured `learning_rate` rather than repeating the literal, so
# changing the config cell actually changes the optimiser.
dgi_optimizer = torch.optim.Adam(dgi.parameters(),
                lr=learning_rate,
                weight_decay=0.)

In [22]:
# Format node and edge features for E-GraphSAGE: insert a singleton middle
# dimension, (N, D) -> (N, 1, D).
train_g.ndata['h'] = train_g.ndata['h'].reshape(
    train_g.ndata['h'].shape[0], 1, train_g.ndata['h'].shape[1])

train_g.edata['h'] = train_g.edata['h'].reshape(
    train_g.edata['h'].shape[0], 1, train_g.edata['h'].shape[1])

In [23]:
# No-op placeholder: this is a self-assignment, so the graph is NOT moved to a GPU here.
train_g = train_g

In [24]:
# Edge embeddings of the training graph (second element of the encoder output).
# NOTE(review): no training loop or checkpoint load is visible before this cell;
# make sure `dgi` holds trained weights when it runs.
# Inference only: skip autograd bookkeeping so no graph is retained for every edge.
with torch.no_grad():
    training_emb = dgi.encoder(train_g, train_g.ndata['h'], train_g.edata['h'])[1]
training_emb = training_emb.cpu().numpy()

In [25]:
# Same layout change as for the training graph: (N, D) -> (N, 1, D).
test_g.ndata['h'] = test_g.ndata['h'].reshape(
    test_g.ndata['h'].shape[0], 1, test_g.ndata['h'].shape[1])

test_g.edata['h'] = test_g.edata['h'].reshape(
    test_g.edata['h'].shape[0], 1, test_g.edata['h'].shape[1])

In [26]:
# No-op placeholder: this is a self-assignment, so the graph is NOT moved to a GPU here.
test_g = test_g

In [27]:
# Edge embeddings of the test graph (second element of the encoder output).
# Inference only: skip autograd bookkeeping so no graph is retained for every edge.
with torch.no_grad():
    testing_emb = dgi.encoder(test_g, test_g.ndata['h'], test_g.edata['h'])[1]
testing_emb = testing_emb.cpu().numpy()

In [28]:
# One row per graph edge: the embedding columns plus the edge's decoded attack
# name and binary label, both read back from the graph's edge data.
df_train = pd.DataFrame(training_emb).assign(
    Attack=lab_enc.inverse_transform(train_g.edata['Attack'].detach().cpu().numpy()),
    Label=train_g.edata['Label'].detach().cpu().numpy(),
)

df_test = pd.DataFrame(testing_emb).assign(
    Attack=lab_enc.inverse_transform(test_g.edata['Attack'].detach().cpu().numpy()),
    Label=test_g.edata['Label'].detach().cpu().numpy(),
)

In [29]:
# Display the training embedding frame (pandas truncates the rendering of large frames).
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,Attack,Label
0,-0.186130,0.089536,0.234762,0.499413,0.610749,-0.108356,-0.442557,-0.095262,0.068685,0.096509,...,-0.100570,0.261413,0.077442,-0.006520,0.272711,-0.072337,-0.090303,-0.199628,Benign,0
1,-0.186130,0.089536,0.234762,0.499413,0.610749,-0.108356,-0.442557,-0.095262,0.068685,0.096509,...,-0.100570,0.261413,0.077442,-0.006520,0.272711,-0.072337,-0.090303,-0.199628,Benign,0
2,-0.186130,0.089536,0.234762,0.499413,0.610749,-0.108356,-0.442557,-0.095262,0.068685,0.096509,...,-0.100570,0.261413,0.077442,-0.006520,0.272711,-0.072337,-0.090303,-0.199628,Benign,0
3,-0.186130,0.089536,0.234762,0.499413,0.610749,-0.108356,-0.442557,-0.095262,0.068685,0.096509,...,-0.100570,0.261413,0.077442,-0.006520,0.272711,-0.072337,-0.090303,-0.199628,Benign,0
4,-0.186130,0.089536,0.234762,0.499413,0.610749,-0.108356,-0.442557,-0.095262,0.068685,0.096509,...,-0.100570,0.261413,0.077442,-0.006520,0.272711,-0.072337,-0.090303,-0.199628,Benign,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2641549,-0.184909,0.099202,0.243210,0.494740,0.596927,-0.109210,-0.437399,-0.103650,0.072978,0.098303,...,-0.100357,0.261067,0.081300,-0.022048,0.282673,-0.056804,-0.085026,-0.186575,Benign,0
2641550,-0.193974,0.120719,0.243462,0.516168,0.583101,-0.084616,-0.444571,-0.132814,0.085291,0.093678,...,-0.084378,0.268192,0.070342,-0.044484,0.320765,-0.016463,-0.070495,-0.216916,Benign,0
2641551,-0.191958,0.123398,0.250685,0.501447,0.574978,-0.084962,-0.455707,-0.125891,0.064002,0.079514,...,-0.083362,0.268188,0.070823,-0.048042,0.308147,-0.025993,-0.064052,-0.212066,Benign,0
2641552,-0.193751,0.122689,0.249479,0.508969,0.582178,-0.082512,-0.451045,-0.130019,0.073107,0.084857,...,-0.086211,0.268132,0.068637,-0.048151,0.312106,-0.021339,-0.071330,-0.216649,Benign,0


# CBLOF on the edge embeddings

In [30]:
import torch
import dgl
import numpy as np
import pandas as pd
import torch.optim as optim
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
import gc

from tqdm import tqdm
import itertools

In [31]:
# Embeddings of training edges with Label == 0 only; the detector is fitted on these.
benign_train_samples = df_train[df_train.Label == 0].drop(columns=["Label", "Attack"])
# Embeddings of ALL training edges (despite the name); not used by the cells visible here.
normal_train_samples = df_train.drop(columns=["Label", "Attack"])

# Binary ground truth for each split.
train_labels = df_train["Label"]
test_labels = df_test["Label"]

# Test embeddings with the target columns removed.
test_samples = df_test.drop(columns=["Label", "Attack"])

In [32]:
# Candidate contamination values tried by the detector grid search.
contamination = [0.001, 0.01, 0.04, 0.05, 0.1, 0.2]

In [33]:
from pyod.models.cblof import CBLOF

# Grid search over (number of clusters, contamination) for CBLOF fitted on
# benign-only training embeddings.
# NOTE(review): the best configuration is chosen by macro-F1 on the TEST labels,
# so the reported score is optimistically biased; a held-out validation split
# would give an unbiased estimate.
n_est = [2,3,5,7,9,10]
params = list(itertools.product(n_est, contamination))
score = -1
bs = None
best_params = None
# Distinct loop names so the `n_est` grid list is not overwritten by its own elements.
for n_clusters, con in tqdm(params):

    # Fixed seed so the internal clustering, and therefore the search, is reproducible.
    clf_if = CBLOF(n_clusters=n_clusters, contamination=con, random_state=13)
    clf_if.fit(benign_train_samples)
    test_pred = clf_if.predict(test_samples)

    f1 = f1_score(test_labels, test_pred, average='macro')

    if f1 > score:
        score = f1
        # The 'n_estimators' key is kept for compatibility; it holds the CBLOF cluster count.
        best_params = {'n_estimators': n_clusters,
                       "con": con
                }
        bs = test_pred
    # Release the fitted model before the next configuration to limit peak memory.
    del clf_if
    gc.collect()


print(best_params)
print(score)
print(classification_report(test_labels, bs, digits=4))

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

{'n_estimators': 3, 'con': 0.001}
0.9446504908193768
              precision    recall  f1-score   support

           0     0.9772    0.9987    0.9878    996643
           1     0.9882    0.8287    0.9015    135488

    accuracy                         0.9783   1132131
   macro avg     0.9827    0.9137    0.9447   1132131
weighted avg     0.9785    0.9783    0.9775   1132131



In [41]:
# NOTE(review): this cell carries execution count 41 but sits after cell 33, and the
# embeddings above were computed before it ran; on a fresh Restart & Run All the
# checkpoint has to be loaded (or the model trained) BEFORE the embedding cells.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location lets a checkpoint saved on a GPU load on a CPU-only kernel.
dgi.load_state_dict(torch.load('best_dgi.pkl', map_location='cpu'))

<All keys matched successfully>