In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import datetime, timedelta
import pickle
import bisect

## Test data

In [2]:
# Field names for the headerless flow CSV, in file order.
# Grouping comments below are inferred from the names only — confirm against
# the feature-extraction code that produced the file.
columns = [
    # flow identity and time of the last observation
    'flow_key', 'lst_timestamp',
    # packet counters (total and per protocol)
    'qt_pkt', 'qt_pkt_tcp', 'qt_pkt_udp', 'qt_pkt_icmp', 'qt_pkt_ip',
    # protocol / port counters
    'qt_prtcl', 'qt_src_prt', 'qt_dst_prt',
    # per-flag counters (names end in `_fl`)
    'qt_fin_fl', 'qt_syn_fl', 'qt_res_fl', 'qt_psh_fl',
    'qt_ack_fl', 'qt_urg_fl', 'qt_ecn_fl', 'qt_cwr_fl',
    # averages, packet frequency and duration
    'avg_hdr_len', 'avg_pkt_len', 'frq_pkt', 'avg_ttl', 'tm_dur_s',
]

In [3]:
# Read the ';'-separated flow export; the file has no header row, so the field
# names are supplied from `columns`.
data = pd.read_csv('../data/interim/part_c/test_flows.csv', sep=';', names=columns)
# Parse every timestamp in one vectorized call instead of a per-row
# datetime.strptime via .apply; same format string, same datetime64 result.
data['lst_timestamp'] = pd.to_datetime(data['lst_timestamp'], format='%Y-%m-%d %H:%M:%S')
print(data.shape)
data.head()

(612267, 23)


Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_psh_fl,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s
0,"('172.16.0.5', '192.168.50.4')",2018-11-03 12:20:21,100,58,0,42,0,2,6,8,...,22,34,0,0,0,20.0,96.06,0.819672,99.82,122.0
1,"('192.168.50.4', '172.16.0.5')",2018-11-03 12:20:32,100,54,0,46,0,2,8,6,...,18,54,0,0,0,20.0,113.1,0.75188,64.0,133.0
2,"('192.168.50.8', '4.2.2.4')",2018-11-03 12:20:41,100,0,100,0,0,1,50,1,...,0,0,0,0,0,20.0,81.06,9.090909,128.0,11.0
3,"('4.2.2.4', '192.168.50.8')",2018-11-03 12:20:41,100,0,100,0,0,1,1,50,...,0,0,0,0,0,20.0,130.72,9.090909,56.0,11.0
4,"('172.217.9.226', '192.168.50.8')",2018-11-03 12:20:41,100,100,0,0,0,1,2,5,...,52,100,0,0,0,20.0,389.46,100.0,53.0,1.0


In [4]:
# Show the last rows of the loaded frame (latest rows in file order).
data.tail()

Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_psh_fl,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s
612262,"('192.168.50.4', '172.16.0.5')",2018-11-03 20:33:57,16300,13460,0,2840,0,2,412,524,...,5312,13458,0,0,0,20.0,156.95681,1.979116,64.0,8236.0
612263,"('172.16.0.5', '192.168.50.4')",2018-11-03 20:34:13,19000,15561,20,3419,0,3,638,479,...,7447,12493,0,2,2,20.0,121.095263,1.912046,71.683737,9937.0
612264,"('173.194.206.109', '192.168.50.8')",2018-11-03 20:34:37,200,200,0,0,0,1,1,5,...,98,192,0,0,0,20.0,206.93,0.008541,106.0,23416.0
612265,"('192.168.50.4', '172.16.0.5')",2018-11-03 20:35:27,16400,13530,0,2870,0,2,414,528,...,5334,13528,0,0,0,20.0,156.700976,1.969733,64.0,8326.0
612266,"('172.16.0.5', '192.168.50.4')",2018-11-03 20:35:47,19100,15626,20,3454,0,3,642,483,...,7469,12525,0,2,2,20.0,120.968586,1.904097,71.783874,10031.0


In [5]:
# Load the pickled label dictionary for the test flows.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# use this on files produced by this project's own pipeline.
with open('../data/interim/part_c/test_label_dict.pkl', 'rb') as file:
    label_dict = pickle.load(file)

In [6]:
# Inspect the (label, timestamp) pairs stored for the 172.16.0.5 -> 192.168.50.4 key.
list(label_dict[('172.16.0.5', '192.168.50.4')].items())

[('Portmap', Timestamp('2018-11-03 13:01:48.920515')),
 ('NetBIOS', Timestamp('2018-11-03 13:18:39.121734')),
 ('LDAP', Timestamp('2018-11-03 13:31:59.708498')),
 ('MSSQL', Timestamp('2018-11-03 13:51:59.988466')),
 ('UDP', Timestamp('2018-11-03 14:12:58.054429')),
 ('Syn', Timestamp('2018-11-03 20:36:41.528982')),
 ('UDPLag', Timestamp('2018-11-03 14:27:53.999014'))]

In [7]:
# Inspect the (label, timestamp) pairs stored for the reverse key, 192.168.50.4 -> 172.16.0.5.
list(label_dict[('192.168.50.4', '172.16.0.5')].items())

[('Portmap', Timestamp('2018-11-03 13:01:48.767479')),
 ('NetBIOS', Timestamp('2018-11-03 13:18:28.139718')),
 ('LDAP', Timestamp('2018-11-03 13:31:59.708671')),
 ('MSSQL', Timestamp('2018-11-03 13:51:59.908375')),
 ('UDP', Timestamp('2018-11-03 14:12:58.054529')),
 ('Syn', Timestamp('2018-11-03 20:36:22.391171')),
 ('UDPLag', Timestamp('2018-11-03 14:27:53.999196'))]

In [8]:
# Build two parallel lists (attack labels and their timestamps) ordered by
# timestamp, so a timestamp can later be located with bisect.
# NOTE(review): only the 192.168.50.4 -> 172.16.0.5 entry is used here, while
# put_label applies these times to both directions — confirm that is intended.
attacks = list(label_dict[('192.168.50.4', '172.16.0.5')].items())
attacks.sort(key=lambda pair: pair[1])
labels = [name for name, _when in attacks]
times = [when for _name, when in attacks]

In [9]:
# String forms of the two flow keys (one per direction) treated as attack traffic.
attack_flows = ["('192.168.50.4', '172.16.0.5')", "('172.16.0.5', '192.168.50.4')"]
def put_label(row, attack_times=None, attack_labels=None):
    """Return the label for one flow row.

    Rows whose ``flow_key`` is not in ``attack_flows`` are ``'BENIGN'``.
    Otherwise the row gets the label of the first entry in ``attack_times``
    that is greater than or equal to ``row['lst_timestamp']``.

    Parameters
    ----------
    row : mapping
        Must provide ``'flow_key'`` and ``'lst_timestamp'``.
    attack_times : sorted sequence, optional
        Timestamps in ascending order. Defaults to the notebook-level ``times``.
    attack_labels : sequence, optional
        Labels aligned with ``attack_times``. Defaults to the notebook-level
        ``labels``.

    Returns
    -------
    str
        The attack label, or ``'BENIGN'``.
    """
    if row['flow_key'] not in attack_flows:
        return 'BENIGN'

    # Fall back to the notebook globals so `data.apply(put_label, axis=1)` keeps working.
    if attack_times is None:
        attack_times = times
    if attack_labels is None:
        attack_labels = labels

    idx = bisect.bisect_left(attack_times, row['lst_timestamp'])
    # A timestamp later than every entry yields idx == len(attack_labels);
    # indexing with it would raise IndexError.
    # NOTE(review): such rows are labelled 'BENIGN' on the assumption that the
    # stored timestamps mark the end of each attack — confirm.
    if idx >= len(attack_labels):
        return 'BENIGN'
    return attack_labels[idx]

In [10]:
# Label every row by calling put_label once per row (axis=1); this adds a
# 'label' column to `data` in place.
data['label'] = data.apply(put_label, axis=1)
data.head()

Unnamed: 0,flow_key,lst_timestamp,qt_pkt,qt_pkt_tcp,qt_pkt_udp,qt_pkt_icmp,qt_pkt_ip,qt_prtcl,qt_src_prt,qt_dst_prt,...,qt_ack_fl,qt_urg_fl,qt_ecn_fl,qt_cwr_fl,avg_hdr_len,avg_pkt_len,frq_pkt,avg_ttl,tm_dur_s,label
0,"('172.16.0.5', '192.168.50.4')",2018-11-03 12:20:21,100,58,0,42,0,2,6,8,...,34,0,0,0,20.0,96.06,0.819672,99.82,122.0,Portmap
1,"('192.168.50.4', '172.16.0.5')",2018-11-03 12:20:32,100,54,0,46,0,2,8,6,...,54,0,0,0,20.0,113.1,0.75188,64.0,133.0,Portmap
2,"('192.168.50.8', '4.2.2.4')",2018-11-03 12:20:41,100,0,100,0,0,1,50,1,...,0,0,0,0,20.0,81.06,9.090909,128.0,11.0,BENIGN
3,"('4.2.2.4', '192.168.50.8')",2018-11-03 12:20:41,100,0,100,0,0,1,1,50,...,0,0,0,0,20.0,130.72,9.090909,56.0,11.0,BENIGN
4,"('172.217.9.226', '192.168.50.8')",2018-11-03 12:20:41,100,100,0,0,0,1,2,5,...,100,0,0,0,20.0,389.46,100.0,53.0,1.0,BENIGN


In [11]:
# Count how many rows received each label.
data['label'].value_counts()

Syn        173872
UDP        145404
MSSQL      119479
LDAP        87344
NetBIOS     73370
BENIGN       8912
Portmap      3820
UDPLag         66
Name: label, dtype: int64