In [54]:
from scapy.all import PcapReader, Raw, TCP, UDP, IP
import json
import numpy as np
import datetime
from tqdm import tqdm
import pandas as pd
from flowprintOptimal.sekigo.core.flowRepresentation import PacketFlowRepressentation
from flowprintOptimal.sekigo.flowUtils.commons import saveFlows,loadFlows,normalizePacketRep

In [60]:
with PcapReader("data/unibs/testPcap.pcapng") as pcap:
    for packet in pcap:
        print(len(packet), len(packet.payload), type(packet.payload.payload.payload))

108 94 <class 'scapy.layers.dns.DNS'>
111 97 <class 'scapy.layers.dns.DNS'>
198 184 <class 'scapy.layers.dns.DNS'>
91 77 <class 'scapy.packet.Raw'>
87 73 <class 'scapy.packet.Raw'>
126 112 <class 'scapy.layers.dns.DNS'>
102 88 <class 'scapy.layers.dns.DNS'>
133 119 <class 'scapy.layers.dns.DNS'>
116 102 <class 'scapy.layers.dns.DNS'>
120 106 <class 'scapy.layers.dns.DNS'>
194 180 <class 'scapy.layers.dns.DNS'>
145 131 <class 'scapy.layers.dns.DNS'>
173 159 <class 'scapy.layers.dns.DNS'>
104 90 <class 'scapy.layers.dns.DNS'>
106 92 <class 'scapy.layers.dns.DNS'>
109 95 <class 'scapy.layers.dns.DNS'>
133 119 <class 'scapy.layers.dns.DNS'>
66 52 <class 'scapy.packet.NoPayload'>
66 52 <class 'scapy.packet.NoPayload'>
118 104 <class 'scapy.layers.dns.DNS'>
122 108 <class 'scapy.layers.dns.DNS'>
119 105 <class 'scapy.layers.dns.DNS'>
86 72 <class 'scapy.layers.inet6.ICMPv6NDOptSrcLLAddr'>
78 64 <class 'scapy.packet.NoPayload'>
88 74 <class 'scapy.layers.dns.DNS'>
141 127 <class 'scapy.packet

In [2]:
lines = []
with open("data/unibs/groundtruth.log") as f:
    for i,line in enumerate(f):
        #line = json.loads(line)
        if i != 0:
            lines.append(line.replace("\n", "").strip().split(":"))

In [3]:
lines[12]

['1254304160.293501',
 '245.234.7.50',
 '54.134.187.248',
 '57858',
 '110',
 'pop3;',
 'Mail',
 'TCP']

In [4]:
conns = dict()
rev_conns = []
for line in lines:
    conn = (line[1], line[2], line[3], line[4],line[7])
    rev_conn = (line[2], line[1], line[4], line[3])
    rev_conns.append(rev_conn)
    if conn not in conns:
        conns[conn] = dict(label = line[6], timestamp = line[0])


for rev_conn in rev_conns:
    assert rev_conn not in conns



In [5]:
conns[list(conns.keys())[np.random.randint(len(conns))]]

{'label': 'amule', 'timestamp': '1254494103.818089'}

In [52]:
flows = dict()
proto_map = {17 : "UDP", 6 : "TCP"}


def addPacketToFlows(key,length,direction,timestamp,label):
    if key not in flows:
        flows[key] = dict(lengths = [],directions = [], timestamps = [])
    flows[key]["lengths"].append(length)
    flows[key]["directions"].append(direction)
    flows[key]["timestamps"].append(timestamp)
    flows[key]["label"] = label



pcap_paths = ["data/unibs/unibs20090930.anon.pcap", "data/unibs/unibs20091001.anon.pcap","data/unibs/unibs20091002.anon.pcap"]
for pcap_path in pcap_paths:
    i = 0
    with PcapReader(pcap_path) as pcap:
        for packet in tqdm(pcap):
            payload = packet.payload
            print(type(packet),len(packet), type(packet.payload),len(packet.payload),type(packet.payload.payload.payload),len(packet.payload.payload.payload))
            src,dst,sport,dport,proto = payload.src,payload.dst,str(payload.sport),str(payload.dport),proto_map[payload.proto]
           
            i += 1
            if i == 50:
                assert False
            conn = (src,dst,sport,dport,proto)
            rev_conn = (dst,src,dport,sport,proto)
            length = payload.len
            timestamp = datetime.datetime.fromtimestamp(float(packet.time))
            

            if conn in conns:
                label = conns[conn]
                addPacketToFlows(key= conn,length= length,direction= 0,timestamp= timestamp,label= label)
            elif rev_conn in conns:
                label = conns[rev_conn]
                addPacketToFlows(key= rev_conn, length= length,direction= 1, timestamp= timestamp, label= label)

for key in list(flows.keys()):
    t1 = flows[key]["timestamps"][0]
    t2 = float(flows[key]["label"]["timestamp"])
    t2 = datetime.datetime.fromtimestamp(t2)

    if t1 != t2:
        flows.pop(key)
            

49it [00:00, 6112.87it/s]

<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'> 28 <class 'scapy.packet.NoPayload'> 0
<class 'scapy.layers.l2.Ether'> 42 <class 'scapy.layers.inet.IP'




AssertionError: 

In [7]:
packet_flow_reps = []
for value in flows.values():
    lengths = value["lengths"]
    directions = value["directions"]
    timestamps = value["timestamps"]
    label = value["label"]["label"]
    lengths = [x for _, x in sorted(zip(timestamps, lengths), key=lambda pair: pair[0])]
    directions = [x for _, x in sorted(zip(timestamps, directions), key=lambda pair: pair[0])]
    timestamps.sort()
    lengths,inter_arrival_times,directions = normalizePacketRep(lengths= lengths,timestamps= timestamps,directions= directions)
    packet_flow_reps.append(PacketFlowRepressentation(lengths= lengths,directions= directions, inter_arrival_times= inter_arrival_times,class_type= label))

In [8]:
len(flows)

75388

In [9]:
saveFlows(path= "data/unibs/unibs.json",flows= packet_flow_reps)

In [10]:
all_inter_arrival_times = []
for p in packet_flow_reps:
    all_inter_arrival_times.extend(p.inter_arrival_times)

1.0076848550404414

In [12]:
packet_flow_reps = list(filter(lambda x : len(x) >= 30,packet_flow_reps))

In [13]:
len(packet_flow_reps)

23205

In [14]:
packet_flow_reps[0].lengths

[0.042666666666666665,
 0.04,
 0.034666666666666665,
 0.10266666666666667,
 0.034666666666666665,
 0.98,
 0.196,
 0.034666666666666665,
 0.12733333333333333,
 0.034666666666666665,
 0.06333333333333334,
 0.034666666666666665,
 0.06333333333333334,
 0.092,
 0.034666666666666665,
 0.034666666666666665,
 0.066,
 0.034666666666666665,
 0.058666666666666666,
 0.034666666666666665,
 0.059333333333333335,
 0.034666666666666665,
 0.058,
 0.034666666666666665,
 0.05266666666666667,
 0.034666666666666665,
 0.05466666666666667,
 0.034666666666666665,
 0.05266666666666667,
 0.058666666666666666,
 0.034666666666666665,
 0.034666666666666665,
 0.034666666666666665,
 0.05,
 0.034666666666666665,
 0.02666666666666667,
 0.02666666666666667]

In [15]:
pd.Series(map(lambda x : x.class_type,packet_flow_reps)).value_counts()

firefox-bin         7316
Safari              5010
Mail                3813
Transmission        1958
amule               1763
Skype                550
Safari Webpage P     492
firefox.exe          481
firefox              407
PubSubAgent          268
privoxy              236
bittorrent.exe       235
thunderbird-bin      164
Safari Webpage       143
ashWebSv.exe          69
opera                 64
adeona-client.ex      50
svn                   28
ssh                   27
kdeinit4              26
Microsoft Messen      21
Skype.exe             19
iTunes                13
SubmitDiagInfo         8
DashboardClient        7
iCal                   6
svchost.exe            5
skype                  5
Adobe Updater          3
Skim                   2
GoogleSoftwareUp       2
Software Update        2
msmsgs.exe             2
ntpd                   2
thunderbird.exe        2
Microsoft AutoUp       1
Microsoft AU Dae       1
SubmitReport           1
SoftwareUpdateCh       1
freshclam              1
