In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime, timedelta
import pickle
import bisect

In [2]:
# Column names for the flow CSV files, in file order (23 entries).
# Grouping below follows the name prefixes only; the exact meaning of each
# counter is defined by the upstream feature-extraction step.
columns = [
    # record identity and time
    'flow_key', 'lst_timestamp',
    # packet counters (overall and per protocol, going by the names)
    'qt_pkt', 'qt_pkt_tcp', 'qt_pkt_udp', 'qt_pkt_icmp', 'qt_pkt_ip',
    # protocol / port counters
    'qt_prtcl', 'qt_src_prt', 'qt_dst_prt',
    # flag counters
    'qt_fin_fl', 'qt_syn_fl', 'qt_res_fl', 'qt_psh_fl',
    'qt_ack_fl', 'qt_urg_fl', 'qt_ecn_fl', 'qt_cwr_fl',
    # averages, rate and duration
    'avg_hdr_len', 'avg_pkt_len', 'frq_pkt', 'avg_ttl', 'tm_dur_s',
]

## Train data

In [3]:
# Read the ';'-separated training flow records; column names are supplied
# explicitly through `names=columns`.
data = pd.read_csv('../data/interim/part_c/train_flows.csv', sep=';', names=columns)
# Parse the 'YYYY-MM-DD HH:MM:SS' strings in one vectorised call instead of
# invoking datetime.strptime once per row through Series.apply.
data['lst_timestamp'] = pd.to_datetime(data['lst_timestamp'], format='%Y-%m-%d %H:%M:%S')
print(data.shape)
data.head()

(2506580, 23)


Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_psh_fl,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s
0,"('192.168.50.7', '74.208.236.171')",2018-12-01 13:17:32,100,100,0,0,0,1,12,1,...,22,76,0,0,0,20.0,143.62,50.0,128.0,2.0
1,"('74.208.236.171', '192.168.50.7')",2018-12-01 13:17:32,100,100,0,0,0,1,1,12,...,8,100,0,0,0,20.0,917.72,50.0,54.0,2.0
2,"('74.208.236.171', '192.168.50.7')",2018-12-01 13:17:32,200,200,0,0,0,1,1,12,...,16,200,0,0,0,20.0,1390.71,100.0,54.0,2.0
3,"('192.168.50.7', '74.208.236.171')",2018-12-01 13:17:32,200,200,0,0,0,1,15,1,...,32,170,0,0,0,20.0,119.38,100.0,128.0,2.0
4,"('74.208.236.171', '192.168.50.7')",2018-12-01 13:17:32,300,300,0,0,0,1,1,15,...,24,300,0,0,0,20.0,1533.166667,150.0,54.0,2.0


In [4]:
# Show the final five records of the frame (five is DataFrame.tail's default).
data.tail(n=5)

Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_psh_fl,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s
2506575,"('4.2.2.4', '192.168.50.8')",2018-12-01 21:15:40,11000,0,11000,0,0,1,1,4679,...,0,0,0,0,0,20.0,133.376364,0.384105,56.0,28638.0
2506576,"('192.168.50.1', '172.16.0.5')",2018-12-01 21:16:08,13000,8848,0,4152,0,2,87,573,...,2936,8848,0,0,0,20.0,191.372462,4.465819,64.0,2911.0
2506577,"('172.16.0.5', '192.168.50.1')",2018-12-01 21:16:12,6900,5044,334,1522,0,3,428,129,...,1977,4222,0,0,0,20.0,155.854493,4.154124,59.495072,1661.0
2506578,"('192.168.50.1', '172.16.0.5')",2018-12-01 21:16:24,13100,8942,0,4158,0,2,87,576,...,2974,8942,0,0,0,20.0,191.294962,4.475572,64.0,2927.0
2506579,"('172.16.0.5', '192.168.50.1')",2018-12-01 21:16:29,7000,5138,334,1528,0,3,433,129,...,2025,4304,0,0,0,20.0,155.469429,4.171633,59.396,1678.0


In [5]:
# Load the pickled label dictionary for the training capture. The lookups in
# the following cells index it with an (ip, ip) tuple and get back a mapping
# of attack label -> timestamp.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# file must only ever come from a trusted source — confirm it is produced by
# our own preprocessing step.
with open('../data/interim/part_c/train_label_dict.pkl', 'rb') as file:
    label_dict = pickle.load(file)

In [6]:
# (label, timestamp) pairs stored under this flow key, in chronological order.
sorted(
    label_dict[('172.16.0.5', '192.168.50.1')].items(),
    key=lambda entry: entry[1],
)

[('DrDoS_NTP', Timestamp('2018-12-01 14:51:39.778786')),
 ('DrDoS_DNS', Timestamp('2018-12-01 15:22:40.254719')),
 ('DrDoS_LDAP', Timestamp('2018-12-01 15:32:32.915361')),
 ('DrDoS_MSSQL', Timestamp('2018-12-01 15:47:08.463107')),
 ('DrDoS_NetBIOS', Timestamp('2018-12-01 16:00:13.902732')),
 ('DrDoS_SNMP', Timestamp('2018-12-01 16:23:13.663368')),
 ('DrDoS_SSDP', Timestamp('2018-12-01 16:36:57.627789')),
 ('DrDoS_UDP', Timestamp('2018-12-01 17:04:45.928382')),
 ('WebDDoS', Timestamp('2018-12-01 17:30:30.664221')),
 ('UDP-lag', Timestamp('2018-12-01 17:30:30.740425')),
 ('Syn', Timestamp('2018-12-01 17:34:27.403143')),
 ('TFTP', Timestamp('2018-12-01 21:16:38.374479'))]

In [7]:
# Same listing for the opposite direction of the conversation, so the two
# timelines can be compared side by side.
sorted(
    label_dict[('192.168.50.1', '172.16.0.5')].items(),
    key=lambda entry: entry[1],
)

[('DrDoS_NTP', Timestamp('2018-12-01 14:51:38.825113')),
 ('DrDoS_DNS', Timestamp('2018-12-01 15:22:39.740027')),
 ('DrDoS_LDAP', Timestamp('2018-12-01 15:32:32.723493')),
 ('DrDoS_MSSQL', Timestamp('2018-12-01 15:47:07.499031')),
 ('DrDoS_NetBIOS', Timestamp('2018-12-01 16:00:13.662425')),
 ('DrDoS_SNMP', Timestamp('2018-12-01 16:23:12.850532')),
 ('DrDoS_SSDP', Timestamp('2018-12-01 16:36:56.858758')),
 ('DrDoS_UDP', Timestamp('2018-12-01 17:04:45.170823')),
 ('UDP-lag', Timestamp('2018-12-01 17:30:11.908587')),
 ('WebDDoS', Timestamp('2018-12-01 17:30:30.661269')),
 ('Syn', Timestamp('2018-12-01 17:34:26.857157')),
 ('TFTP', Timestamp('2018-12-01 21:16:38.374659'))]

In [8]:
# Build the reference timeline from the ('192.168.50.1', '172.16.0.5') entry:
# `labels[i]` is the attack name whose stored timestamp is `times[i]`, both
# ordered chronologically so that `times` can be searched with bisect.
# NOTE(review): only this direction's timestamps are used, while put_label
# applies them to both directions; the reverse key has slightly different
# timestamps and swaps WebDDoS / UDP-lag — confirm this is intended.
attacks = sorted(
    label_dict[('192.168.50.1', '172.16.0.5')].items(),
    key=lambda pair: pair[1],
)
labels = [name for name, _stamp in attacks]
times = [stamp for _name, stamp in attacks]

In [9]:
# Both directions of the conversation whose records receive attack labels.
# The entries are strings because the `flow_key` column holds the textual
# form of the IP pair.
attack_flows = ["('192.168.50.1', '172.16.0.5')", "('172.16.0.5', '192.168.50.1')"]


def put_label(row, boundaries=None, names=None, fallback='BENIGN'):
    """Return the label for a single flow record.

    A record whose ``flow_key`` is not listed in ``attack_flows`` is
    'BENIGN'. Otherwise it receives the label paired with the first boundary
    timestamp that is greater than or equal to its ``lst_timestamp``.

    Parameters
    ----------
    row : mapping
        Must provide 'flow_key' and 'lst_timestamp' (e.g. the row handed over
        by ``DataFrame.apply(..., axis=1)``).
    boundaries : sorted sequence of timestamps, optional
        Search keys for the lookup; defaults to the notebook-level ``times``.
    names : sequence of str, optional
        Labels aligned with ``boundaries``; defaults to the notebook-level
        ``labels``.
    fallback : str, optional
        Returned when ``lst_timestamp`` is later than every boundary (or the
        timeline is empty). Without this guard the lookup raised IndexError.
        NOTE(review): 'BENIGN' is used because no boundary covers such a
        record — confirm this is the desired label for late records.

    Returns
    -------
    str
        The label for the record.
    """
    if row['flow_key'] not in attack_flows:
        return 'BENIGN'

    # Resolve the defaults at call time so the function keeps following the
    # notebook variables when they are rebuilt.
    if boundaries is None:
        boundaries = times
    if names is None:
        names = labels

    idx = bisect.bisect_left(boundaries, row['lst_timestamp'])
    # bisect_left returns len(boundaries) for a value beyond the last entry.
    if idx >= len(names):
        return fallback
    return names[idx]

In [10]:
# Label every record: axis=1 hands each row to put_label, and the returned
# strings become the new 'label' column. The preview below confirms the column
# was appended.
# NOTE(review): this is one Python call per row; with the ~2.5M rows reported
# by data.shape above it is slow — consider a vectorised lookup if it becomes
# a bottleneck.
data['label'] = data.apply(put_label, axis=1)
data.head()

Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s,label
0,"('192.168.50.7', '74.208.236.171')",2018-12-01 13:17:32,100,100,0,0,0,1,12,1,...,76,0,0,0,20.0,143.62,50.0,128.0,2.0,BENIGN
1,"('74.208.236.171', '192.168.50.7')",2018-12-01 13:17:32,100,100,0,0,0,1,1,12,...,100,0,0,0,20.0,917.72,50.0,54.0,2.0,BENIGN
2,"('74.208.236.171', '192.168.50.7')",2018-12-01 13:17:32,200,200,0,0,0,1,1,12,...,200,0,0,0,20.0,1390.71,100.0,54.0,2.0,BENIGN
3,"('192.168.50.7', '74.208.236.171')",2018-12-01 13:17:32,200,200,0,0,0,1,15,1,...,170,0,0,0,20.0,119.38,100.0,128.0,2.0,BENIGN
4,"('74.208.236.171', '192.168.50.7')",2018-12-01 13:17:32,300,300,0,0,0,1,1,15,...,300,0,0,0,20.0,1533.166667,150.0,54.0,2.0,BENIGN


In [11]:
# Number of records per assigned label.
data['label'].value_counts()

DrDoS_NTP        767803
TFTP             692239
DrDoS_DNS        304914
DrDoS_SNMP       180220
DrDoS_UDP        118379
DrDoS_LDAP        98718
DrDoS_SSDP        97525
DrDoS_MSSQL       94521
DrDoS_NetBIOS     82265
Syn               51854
UDP-lag            7793
BENIGN             6379
WebDDoS            3970
Name: label, dtype: int64

## Test data

In [12]:
# Read the ';'-separated test flow records; column names are supplied
# explicitly through `names=columns`. This rebinds `data`, replacing the
# training frame.
data = pd.read_csv('../data/interim/part_c/test_flows.csv', sep=';', names=columns)
# Parse the 'YYYY-MM-DD HH:MM:SS' strings in one vectorised call instead of
# invoking datetime.strptime once per row through Series.apply.
data['lst_timestamp'] = pd.to_datetime(data['lst_timestamp'], format='%Y-%m-%d %H:%M:%S')
print(data.shape)
data.head()

(612267, 23)


Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_psh_fl,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s
0,"('172.16.0.5', '192.168.50.4')",2018-11-03 12:20:21,100,58,0,42,0,2,6,8,...,22,34,0,0,0,20.0,96.06,0.819672,99.82,122.0
1,"('192.168.50.4', '172.16.0.5')",2018-11-03 12:20:32,100,54,0,46,0,2,8,6,...,18,54,0,0,0,20.0,113.1,0.75188,64.0,133.0
2,"('192.168.50.8', '4.2.2.4')",2018-11-03 12:20:41,100,0,100,0,0,1,50,1,...,0,0,0,0,0,20.0,81.06,9.090909,128.0,11.0
3,"('4.2.2.4', '192.168.50.8')",2018-11-03 12:20:41,100,0,100,0,0,1,1,50,...,0,0,0,0,0,20.0,130.72,9.090909,56.0,11.0
4,"('172.217.9.226', '192.168.50.8')",2018-11-03 12:20:41,100,100,0,0,0,1,2,5,...,52,100,0,0,0,20.0,389.46,100.0,53.0,1.0


In [13]:
# Show the final five records of the frame (five is DataFrame.tail's default).
data.tail(n=5)

Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_psh_fl,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s
612262,"('192.168.50.4', '172.16.0.5')",2018-11-03 20:33:57,16300,13460,0,2840,0,2,412,524,...,5312,13458,0,0,0,20.0,156.95681,1.979116,64.0,8236.0
612263,"('172.16.0.5', '192.168.50.4')",2018-11-03 20:34:13,19000,15561,20,3419,0,3,638,479,...,7447,12493,0,2,2,20.0,121.095263,1.912046,71.683737,9937.0
612264,"('173.194.206.109', '192.168.50.8')",2018-11-03 20:34:37,200,200,0,0,0,1,1,5,...,98,192,0,0,0,20.0,206.93,0.008541,106.0,23416.0
612265,"('192.168.50.4', '172.16.0.5')",2018-11-03 20:35:27,16400,13530,0,2870,0,2,414,528,...,5334,13528,0,0,0,20.0,156.700976,1.969733,64.0,8326.0
612266,"('172.16.0.5', '192.168.50.4')",2018-11-03 20:35:47,19100,15626,20,3454,0,3,642,483,...,7469,12525,0,2,2,20.0,120.968586,1.904097,71.783874,10031.0


In [14]:
# Load the pickled label dictionary for the test capture; this rebinds
# `label_dict`. The lookups in the following cells index it with an (ip, ip)
# tuple and get back a mapping of attack label -> timestamp.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# file must only ever come from a trusted source — confirm it is produced by
# our own preprocessing step.
with open('../data/interim/part_c/test_label_dict.pkl', 'rb') as file:
    label_dict = pickle.load(file)

In [15]:
# (label, timestamp) pairs stored under this flow key, in chronological order.
sorted(
    label_dict[('172.16.0.5', '192.168.50.4')].items(),
    key=lambda entry: entry[1],
)

[('Portmap', Timestamp('2018-11-03 13:01:48.920515')),
 ('NetBIOS', Timestamp('2018-11-03 13:18:39.121734')),
 ('LDAP', Timestamp('2018-11-03 13:31:59.708498')),
 ('MSSQL', Timestamp('2018-11-03 13:51:59.988466')),
 ('UDP', Timestamp('2018-11-03 14:12:58.054429')),
 ('UDPLag', Timestamp('2018-11-03 14:27:53.999014')),
 ('Syn', Timestamp('2018-11-03 20:36:41.528982'))]

In [16]:
# Same listing for the opposite direction of the conversation, so the two
# timelines can be compared side by side.
sorted(
    label_dict[('192.168.50.4', '172.16.0.5')].items(),
    key=lambda entry: entry[1],
)

[('Portmap', Timestamp('2018-11-03 13:01:48.767479')),
 ('NetBIOS', Timestamp('2018-11-03 13:18:28.139718')),
 ('LDAP', Timestamp('2018-11-03 13:31:59.708671')),
 ('MSSQL', Timestamp('2018-11-03 13:51:59.908375')),
 ('UDP', Timestamp('2018-11-03 14:12:58.054529')),
 ('UDPLag', Timestamp('2018-11-03 14:27:53.999196')),
 ('Syn', Timestamp('2018-11-03 20:36:22.391171'))]

In [17]:
# Build the reference timeline from the ('192.168.50.4', '172.16.0.5') entry:
# `labels[i]` is the attack name whose stored timestamp is `times[i]`, both
# ordered chronologically so that `times` can be searched with bisect.
# NOTE(review): only this direction's timestamps are used, while put_label
# applies them to both directions; the reverse key's timestamps differ
# slightly — confirm this is intended.
attacks = sorted(
    label_dict[('192.168.50.4', '172.16.0.5')].items(),
    key=lambda pair: pair[1],
)
labels = [name for name, _stamp in attacks]
times = [stamp for _name, stamp in attacks]

In [18]:
# Both directions of the conversation whose records receive attack labels in
# the test capture. The entries are strings because the `flow_key` column
# holds the textual form of the IP pair.
attack_flows = ["('192.168.50.4', '172.16.0.5')", "('172.16.0.5', '192.168.50.4')"]


def put_label(row, boundaries=None, names=None, fallback='BENIGN'):
    """Return the label for a single flow record.

    A record whose ``flow_key`` is not listed in ``attack_flows`` is
    'BENIGN'. Otherwise it receives the label paired with the first boundary
    timestamp that is greater than or equal to its ``lst_timestamp``.

    Parameters
    ----------
    row : mapping
        Must provide 'flow_key' and 'lst_timestamp' (e.g. the row handed over
        by ``DataFrame.apply(..., axis=1)``).
    boundaries : sorted sequence of timestamps, optional
        Search keys for the lookup; defaults to the notebook-level ``times``.
    names : sequence of str, optional
        Labels aligned with ``boundaries``; defaults to the notebook-level
        ``labels``.
    fallback : str, optional
        Returned when ``lst_timestamp`` is later than every boundary (or the
        timeline is empty). Without this guard the lookup raised IndexError.
        NOTE(review): 'BENIGN' is used because no boundary covers such a
        record — confirm this is the desired label for late records.

    Returns
    -------
    str
        The label for the record.
    """
    if row['flow_key'] not in attack_flows:
        return 'BENIGN'

    # Resolve the defaults at call time so the function keeps following the
    # notebook variables when they are rebuilt.
    if boundaries is None:
        boundaries = times
    if names is None:
        names = labels

    idx = bisect.bisect_left(boundaries, row['lst_timestamp'])
    # bisect_left returns len(boundaries) for a value beyond the last entry.
    if idx >= len(names):
        return fallback
    return names[idx]

In [19]:
# Label every record: axis=1 hands each row to put_label, and the returned
# strings become the new 'label' column. The preview below confirms the column
# was appended.
# NOTE(review): this is one Python call per row; with the ~612k rows reported
# by data.shape above it is slow — consider a vectorised lookup if it becomes
# a bottleneck.
data['label'] = data.apply(put_label, axis=1)
data.head()

Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s,label
0,"('172.16.0.5', '192.168.50.4')",2018-11-03 12:20:21,100,58,0,42,0,2,6,8,...,34,0,0,0,20.0,96.06,0.819672,99.82,122.0,Portmap
1,"('192.168.50.4', '172.16.0.5')",2018-11-03 12:20:32,100,54,0,46,0,2,8,6,...,54,0,0,0,20.0,113.1,0.75188,64.0,133.0,Portmap
2,"('192.168.50.8', '4.2.2.4')",2018-11-03 12:20:41,100,0,100,0,0,1,50,1,...,0,0,0,0,20.0,81.06,9.090909,128.0,11.0,BENIGN
3,"('4.2.2.4', '192.168.50.8')",2018-11-03 12:20:41,100,0,100,0,0,1,1,50,...,0,0,0,0,20.0,130.72,9.090909,56.0,11.0,BENIGN
4,"('172.217.9.226', '192.168.50.8')",2018-11-03 12:20:41,100,100,0,0,0,1,2,5,...,100,0,0,0,20.0,389.46,100.0,53.0,1.0,BENIGN


In [20]:
# Number of records per assigned label.
data['label'].value_counts()

Syn        173872
UDP        145404
MSSQL      119479
LDAP        87344
NetBIOS     73370
BENIGN       8912
Portmap      3820
UDPLag         66
Name: label, dtype: int64