In [None]:
from scapy.all import *
import numpy as np
import pandas as pd

filter by:
- tcp packets
- egressing packets
- source ip

In [None]:
# Path of the capture file that sniff() reads offline further below.
pcapfile = 'dataset/raw/attack_capture.pcap'

In [None]:
# Packets grouped by source IP: {src_ip: [pkt, ...]}.
# Filled by store_pkt() while the capture is being read.
packets = dict()

In [None]:
def store_pkt(pkt):
    """Append ``pkt`` to the module-level ``packets`` dict under its source IP.

    Passed to ``sniff`` as the per-packet callback. The list for a source
    address is created the first time that address is seen.
    """
    global packets
    src_ip = pkt[IP].src
    packets.setdefault(src_ip, []).append(pkt)

In [None]:
# Keep only packets egressing from this network: the CIDR below is used as
# the `src net` operand of the capture filter passed to sniff().
network = '10.0.0.64/26'

In [None]:
# BPF filter syntax reference: https://biot.com/capstats/bpf.html
# Only TCP packets whose source address lies inside `network` are kept.
bpf_filter = "tcp and src net {net}".format(net=network)
sniff(
    offline=pcapfile,
    filter=bpf_filter,
    prn=store_pkt,  # every matching packet is handed to store_pkt
    store=False,    # packets are collected by the callback, not by sniff
    #count=1000,
)

In [None]:
# Source IPs found in the capture (keys view of the `packets` dict).
all_hosts = packets.keys()
all_hosts

In [None]:
# First stored packet of one host, used for inspection below.
# Raises KeyError if 10.0.0.67 sent no packet matching the filter.
pkt = packets['10.0.0.67'][0]

In [None]:
# Inspect the layers and fields of the selected packet.
pkt.show()

In [None]:
# Capture timestamp of the selected packet, converted to a plain float.
float(pkt.time)

sort packets by timestamp

In [None]:
# Order every host's packet list by ascending capture timestamp.
for host, host_pkts in packets.items():
    packets[host] = sorted(host_pkts, key=lambda p: p.time)

For each source IP (host):
- build observation windows (sequential and sliding variants are both implemented below; the final feature extraction uses sliding windows)

In [None]:
# Observation window in seconds (10-30). Samples are 1 second long, so this is
# also the number of samples per window passed to slidingObsWindow below.
T = 15 # observation window in seconds (10-30)

In [None]:
# Timestamp of the first packet of host 10.0.0.67, rounded down to a whole second.
print(packets['10.0.0.67'][0].time)
start_time = np.floor(packets['10.0.0.67'][0].time)
start_time

In [None]:
# Timestamp of the last packet of host 10.0.0.67, rounded up to a whole second.
print(packets['10.0.0.67'][-1].time)
end_time = np.ceil(packets['10.0.0.67'][-1].time)
end_time

In [None]:
# Start times of the 1-second sampling intervals in [start_time, end_time).
np.arange(start_time, end_time, 1)

In [None]:
# arrange packets in fixed-length sampling intervals (1 second by default)
def samples(packets, sample_interval=1):
    """Group time-sorted packets into consecutive sampling intervals.

    Parameters
    ----------
    packets : sequence
        Objects exposing a ``.time`` attribute, sorted by ascending ``.time``.
    sample_interval : int or float, optional
        Length of one interval, in the same unit as ``.time``.

    Returns
    -------
    list of list
        One list per interval, starting at floor(first timestamp) and running
        up to ceil(last timestamp). Intervals without packets are empty lists.
        Empty input returns an empty list.
    """
    num_packets = len(packets)
    if num_packets == 0:
        return []

    start_time = np.floor(packets[0].time)
    end_time = np.ceil(packets[-1].time)

    packet_samples = []
    packet_idx = 0
    for interval_start in np.arange(start_time, end_time, sample_interval):
        # float() rather than int(): int() truncated the bound, which mis-binned
        # packets for fractional intervals. A plain Python float still compares
        # cleanly against the packet timestamps.
        interval_end = float(interval_start + sample_interval)
        sample = []
        while packet_idx < num_packets and packets[packet_idx].time < interval_end:
            sample.append(packets[packet_idx])
            packet_idx += 1
        packet_samples.append(sample)

    # Packets stamped exactly on the final boundary (e.g. a last timestamp that
    # is a whole second) are not covered by any interval above; keep them in a
    # trailing sample instead of silently dropping them.
    if packet_idx < num_packets:
        packet_samples.append(list(packets[packet_idx:]))

    return packet_samples

In [None]:
# 1-second packet samples of host 10.0.0.67; every window below is built from
# this single host.
tmp = samples(packets['10.0.0.67'], sample_interval=1)

In [None]:
# Number of packets in each 1-second sample.
for k in tmp:
    print(len(k))

In [None]:
# Example: window start indices for 9 samples with a step of 4 -> [0, 4, 8].
np.arange(0, 9, 4)

In [None]:
# Example: length of the slice holding the first (up to) 4 samples.
len(tmp[0:0+4])

In [None]:
def seqObsWindow(data, lengthObsWindow):
    """Split ``data`` into consecutive, non-overlapping observation windows.

    Each window is a slice of ``lengthObsWindow`` items. A trailing slice
    shorter than ``lengthObsWindow`` is discarded.
    """
    starts = np.arange(0, len(data), lengthObsWindow)
    candidates = (data[begin:begin + lengthObsWindow] for begin in starts)
    return [chunk for chunk in candidates if len(chunk) == lengthObsWindow]

In [None]:
# Sanity check: non-overlapping windows of 4 samples each.
# Prints: total samples, samples in the first window, number of windows.
j = seqObsWindow(tmp, 4)
print(len(tmp))
print(len(j[0]))
print(len(j))

In [None]:
# Packet count of the first sample in the second sequential window.
print(len(j[1][0]))

In [None]:
def slidingObsWindow(data, lengthObsWindow, slidingValue):
    """Build overlapping observation windows over ``data``.

    A window of ``lengthObsWindow`` items starts every ``slidingValue`` items.
    Windows that would run past the end of ``data`` are discarded.
    """
    offsets = np.arange(0, len(data), slidingValue)
    return [
        data[offset:offset + lengthObsWindow]
        for offset in offsets
        if len(data[offset:offset + lengthObsWindow]) == lengthObsWindow
    ]

In [None]:
# Sanity check: windows of 3 samples that advance 2 samples per step.
# Prints: total samples, samples in the third window, number of windows.
k = slidingObsWindow(tmp, 3, 2)
print(len(tmp))
print(len(k[2]))
print(len(k))

In [None]:
# Packet count of the third sample in the fourth sliding window.
print(len(k[3][2]))

In [None]:
# Observation windows used for feature extraction: T samples long,
# advancing 5 samples per step.
windows = slidingObsWindow(tmp, T, 5)
len(windows)

For each observation window, compute:
- number of TCP packets
- mean TCP packet length
- standard deviation of TCP packet length
- normalized Shannon entropy of TCP destination IP
- normalized Shannon entropy of TCP destination port
- silence time ratio (fraction of samples without packets)
- mean length of the silence periods
- standard deviation of the length of the silence periods

In [None]:
# First window, used as a worked example for the feature functions below.
obs_window = windows[0]
print(len(obs_window))

In [None]:
def num_tcp_pkts(obs_window):
    """Count the packets contained in all samples of ``obs_window``."""
    return sum(len(sample) for sample in obs_window)

In [None]:
def packet_length(obs_window):
    """Mean and standard deviation of the IP length of the window's packets.

    Reads ``pkt[IP].len`` for every packet of every sample in ``obs_window``.

    Returns
    -------
    tuple
        ``(mean, std)`` where ``std`` is ``np.std`` with its default settings.
        A window without any packet returns ``(0.0, 0.0)``; np.mean/np.std on
        an empty list would emit a RuntimeWarning and return NaN.
    """
    len_list = [pkt[IP].len for sample in obs_window for pkt in sample]
    if not len_list:
        return 0.0, 0.0
    return np.mean(len_list), np.std(len_list)

In [None]:
# TCP destination port of the first packet in the first sample
# (raises IndexError if that sample is empty).
obs_window[0][0][TCP].dport

In [None]:
# Destination IP of the first packet in the first sample
# (raises IndexError if that sample is empty).
obs_window[0][0][IP].dst

In [None]:
def ip_port_hist(obs_window):
    """Count destination IPs and TCP destination ports inside a window.

    Returns ``(ip_hist, port_hist)``: two dicts mapping each destination IP
    (``pkt[IP].dst``) and each destination port (``pkt[TCP].dport``) to the
    number of packets of ``obs_window`` that carry it.
    """
    ip_hist, port_hist = {}, {}
    for sample in obs_window:
        for pkt in sample:
            dst_ip, dst_port = pkt[IP].dst, pkt[TCP].dport
            ip_hist[dst_ip] = ip_hist.get(dst_ip, 0) + 1
            port_hist[dst_port] = port_hist.get(dst_port, 0) + 1
    return ip_hist, port_hist

In [None]:
# Destination IP and port histograms of the example window.
ip_hist, port_hist = ip_port_hist(obs_window)

In [None]:
# Show the destination-port histogram of the example window.
port_hist

In [None]:
def norm_entropy(hist, total_occurrences):
    """Normalized Shannon entropy of a histogram.

    Parameters
    ----------
    hist : dict
        Maps each observed value to its number of occurrences.
    total_occurrences : int
        Denominator used to turn counts into probabilities; expected to be
        the sum of the counts in ``hist``.

    Returns
    -------
    float
        Entropy in bits divided by ``log2(len(hist))``: 1 for a uniform
        distribution, 0 when ``hist`` is empty or holds a single value.
    """
    num_vals = len(hist)
    # log2(1) == 0 would divide by zero; with a single value the entropy is 0
    # anyway, so any non-zero factor works.
    norm_factor = np.log2(num_vals) if num_vals > 1 else 1
    h = 0.0
    for count in hist.values():
        # By convention 0 * log2(0) == 0. Evaluating it numerically would
        # give 0 * -inf == NaN and poison the whole sum.
        if count == 0:
            continue
        p_val = float(count) / total_occurrences
        h += p_val * np.log2(p_val)
    # h is a sum of non-positive terms; abs() flips the sign.
    return abs(h) / norm_factor

In [None]:
# Total number of occurrences in the two toy histograms below
# (430 + 20 + 30 == 3 * 160 == 480).
num_tcp_packets = 480

In [None]:
# skewed distribution (mostly port 80) --> low normalized entropy, closer to 0
# note: overwrites the port_hist computed from the example window above
port_hist = {80: 430, 8080: 20, 55: 30}
norm_entropy(port_hist,num_tcp_packets)

In [None]:
# uniform distribution over the three ports --> normalized entropy of 1
port_hist = {80: 160, 8080: 160, 55: 160}
norm_entropy(port_hist,num_tcp_packets)

In [None]:
# ratio of silence time
def silence_ratio(obs_window):
    """Fraction of the samples in ``obs_window`` that contain no packets."""
    empty_samples = sum(1 for sample in obs_window if len(sample) == 0)
    return float(empty_samples) / len(obs_window)

In [None]:
# Silence ratio of the example window.
silence_ratio(obs_window)

In [None]:
def extratctSilenceActivity(data, threshold=0):
    """Run-length encode ``data`` into silence and activity periods.

    A value is "silent" when it is ``<= threshold`` and "active" otherwise.

    Returns ``(s, a)``: ``s`` lists the lengths of the consecutive silent
    runs and ``a`` the lengths of the consecutive active runs, each in order
    of appearance. ``data`` must contain at least one element.
    """
    silence_runs = []
    activity_runs = []

    # The first element opens the first run.
    was_silent = data[0] <= threshold
    (silence_runs if was_silent else activity_runs).append(1)

    for idx in range(1, len(data)):
        is_silent = data[idx] <= threshold
        current_runs = silence_runs if is_silent else activity_runs
        if is_silent == was_silent:
            # same state as the previous element: extend the open run
            current_runs[-1] += 1
        else:
            # state changed: open a new run of the other kind
            current_runs.append(1)
        was_silent = is_silent

    return (silence_runs, activity_runs)

In [None]:
# Toy example: expected result is ([3, 4, 2, 1], [3, 3, 2]).
extratctSilenceActivity([0,0,0,1,2,3,0,0,0,0,42,3,1,0,0,23,3,0])

In [None]:
# Per-sample packet counts of the example window, then its silence and
# activity run lengths.
act_data = [len(sample) for sample in obs_window]
print(act_data)
extratctSilenceActivity(act_data)

In [None]:
# Pre-allocate the dataset: one row per observation window;
# column 0 holds the window id, columns 1-8 hold the features.
m = len(windows) # examples
n = 1 + 8        # id + features
data = np.zeros((m,n))
data.shape

In [None]:
# Fill one row of `data` per observation window: [id, 8 features].
for obs_id, obs_window in enumerate(windows):
    # number of TCP packets
    num_tcp_packets = num_tcp_pkts(obs_window)

    # mean and standard deviation TCP packet length
    mu_size, std_size = packet_length(obs_window)

    # normalized shannon entropy of TCP destination IP and port
    ip_hist, port_hist = ip_port_hist(obs_window)
    ip_ent = norm_entropy(ip_hist, num_tcp_packets)
    port_ent = norm_entropy(port_hist, num_tcp_packets)

    # silence time ratio
    silence_r = silence_ratio(obs_window)

    # mean and std of the lengths of the silence periods
    act_data = [len(sample) for sample in obs_window]
    s, a = extratctSilenceActivity(act_data)
    # A window in which every sample has traffic has no silence period (s is
    # empty). np.mean/np.std of an empty list is NaN, so report 0 instead.
    mean_silence_seq = np.mean(s) if s else 0.0
    # np.std here: the original computed np.mean a second time, so the "std"
    # column was just a copy of the mean column.
    std_silence_seq = np.std(s) if s else 0.0

    # feature vector
    x = np.array([
        obs_id,
        num_tcp_packets,
        mu_size,
        std_size,
        ip_ent,
        port_ent,
        silence_r,
        mean_silence_seq,
        std_silence_seq
    ])

    # add to dataset
    data[obs_id,:] = x

Build the feature DataFrame (feature normalization is not implemented yet)

In [None]:
# Wrap the feature matrix in a DataFrame: column 0 of `data` (the window id)
# becomes the index, the remaining 8 columns get feature names in the same
# order in which the feature vector was filled.
df = pd.DataFrame(
    data=data[:,1:],
    index=data[:,0], 
    columns=["pack_num", "mu_len", "std_len", "ip_ent", "port_ent", "sile_r", "mu_sile_seq", "std_sile_seq"], 
)

In [None]:
# Preview the first five observation windows.
df[:5]

In [None]:
# Summary statistics of every feature column.
df.describe()

save dataset

In [None]:
# Write the feature table (window id as index) to data.csv, relative to the
# current working directory.
df.to_csv('data.csv')