# This file extracts feature sets from pcap files.

### Input & Output

`Input Files`: All files with the pcap extension in the “./pcaps/” folder are read.

`Output Files`: Fingerprint result file named *FP_MAIN_PCAP.csv*, written to the “./dataset/” folder.

###  importing relevant libraries

In [5]:
from scapy.all import*
import math
import pandas as pd
import os
import numpy as np


### Discovering pcap extension files under "pcaps" folder.

In [37]:
def find_the_way(path,file_format):
    """Recursively collect the files under *path* whose name ends with *file_format*.

    Args:
        path: Root directory to walk; sub-directories are searched as well.
        file_format: File-name suffix to keep, e.g. ``'.pcap'``.

    Returns:
        A list of matching file paths, each formed by joining the directory
        reported by ``os.walk`` with the file name, in walk order. The list is
        empty when nothing matches or *path* does not exist.
    """
    files_add = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            # Match on the suffix rather than with a substring test, so a name
            # such as "capture.pcap.txt" is not mistaken for a capture file.
            if name.endswith(file_format):
                files_add.append(os.path.join(root, name))
    return files_add


files_add = find_the_way('./pcaps/', '.pcap')

### List of pcap files to be processed

In [38]:
files_add

['./pcaps/blink-cam-02.pcap', './pcaps/schlage-lock-01.pcap']

### Port numbers are classified in this part as:

| Port Numbers | Equivalents |
| :------ | :------ |
|No port| 0|
|Well known ports (between 0 and 1023) |1|
|Registered ports (between 1024 and 49151)  |2|
|Dynamic ports (between  49152 and 65535) |3|
# ↓ 

In [39]:
def port_class(port):
    """Map a port number to its class code.

    Returns 1 for well-known ports (0-1023), 2 for registered ports
    (1024-49151), 3 for dynamic ports (49152-65535), and 0 for any value
    that falls in none of those ranges.
    """
    # (lowest port, highest port, class code), bounds inclusive.
    ranges = ((0, 1023, 1), (1024, 49151, 2), (49152, 65535, 3))
    for low, high, code in ranges:
        if low <= port <= high:
            return code
    return 0

### The dictionary to be used for MAC address and device matching.
#### Datasets, their MAC addresses and Devices are given separately.

In [40]:
# Device MAC address -> device name; the name is used as the Label of the
# packets sent from that MAC.
MAC_list = {
    'b8:b7:f1:2a:10:fd': 'august-hub-01',
    'c4:6e:7b:41:5f:28': 'geeni-awarecam-1',
    'c4:6e:7b:0e:62:5c': 'geeni-awarecam-2',
    '78:db:2f:db:43:48': 'schlage-lock-01',
    'f4:cf:a2:eb:59:c4': 'sifely-hub-01',
    '8c:f7:10:a1:a5:9f': 'simplisafe-d1',
    '6c:21:a2:90:19:b0': 'simplisafe-d2',
    '24:7d:4d:9c:f2:81': 'ring-doorbell-02',
    '90:e2:02:30:80:a8': 'ring-doorbell-03',
    '64:16:66:73:e6:e0': 'nest-doorbell-01',
    'f4:b8:5e:cd:fe:2f': 'blink-cam-03',
    '30:4a:26:12:14:f1': 'geeni-doorbell-01',
    'd4:d2:d6:3b:27:51': 'geeni-doorbell-02',
    'f4:b8:5e:ff:2b:1b': 'blink-cam-01',
    '0c:8c:24:7f:34:84': 'geeni-cam-03',
    'a4:cf:12:32:5b:88': 'ultraloq-hub-01',
    '78:b2:13:e4:a6:ec': 'smartthings-cam-01',
    '54:2b:57:29:b4:6c': 'nightowl-doorbell-02',
    'f4:b8:5e:35:67:b0': 'blink-cam-02',
    '58:b3:fc:68:a6:e2': 'geeni-cam-01',
    '7c:a7:b0:dc:a0:1a': 'merkury-cam-01',
    '7c:25:da:2d:a4:70': 'merkury-doorbell-01',
    '2c:aa:8e:a1:27:65': 'wyze-cam-01',
    '54:2b:57:29:92:a9': 'nightowl-doorbell-01',
}

In [41]:
len(MAC_list)

24

### Calculating the payload entropy value.


# ↓ 

In [42]:
def pre_entropy(payload):
    """Return the Shannon entropy of the elements of *payload*.

    The payload is expanded element by element into a list, which is then
    passed to shannon().
    """
    return shannon(list(payload))


def shannon(data):
    """Compute the Shannon entropy, in bits, of the symbols in *data*.

    Args:
        data: A sized iterable of hashable symbols.

    Returns:
        The negated sum of p * log2(p) over the distinct symbols, where p is
        each symbol's relative frequency. Empty input yields zero.
    """
    counts = {}
    for symbol in data:
        counts[symbol] = counts.get(symbol, 0) + 1

    total = len(data)
    weighted_log_sum = 0.0
    # Accumulate in first-seen symbol order with a plain loop so the floating
    # point result does not depend on how a summation helper rounds.
    for occurrences in counts.values():
        probability = float(occurrences) / total
        if probability > 0:
            weighted_log_sum = weighted_log_sum + probability * math.log(probability, 2)
    return -weighted_log_sum

### This section is the main backbone of our program. Briefly, the following operations are performed in this section.


#### - The `files_add` variable contains the paths of the pcap files. The outer `for` loop iterates over the values of this variable, so that all files are processed.


#### - The second for loop examines individual packets in the processed pcap file. All features belonging to a packet are extracted and processed as a new line in the fingerprint file at the end of the second for loop.


#### - All properties are initially assigned a value of 0. These properties are then queried in the packet. If the properties have corresponding data, the data is processed in the variable, otherwise, the value of the variable remains as 0.

In [43]:
# Build the fingerprint CSV: one row of features per packet, for every pcap
# file listed in files_add.
count = 0  # NOTE(review): never read in this cell; kept so the cell still defines it.
import time  # NOTE(review): unused in this cell; kept in case other cells rely on it.

header="ARP,LLC,EAPOL,IP,ICMP,ICMP6,TCP,UDP,TCP_w_size,HTTP,HTTPS,DHCP,BOOTP,SSDP,DNS,MDNS,NTP,IP_padding,IP_add_count,IP_ralert,Portcl_src,Portcl_dst,Pck_size,Pck_rawdata,payload_l,Entropy,Label,MAC\n"

# Destination IPs already seen for each known device MAC; the length of a
# device's list is its IP_add_count feature.
dst_ip_list = {}
for i in MAC_list:
    dst_ip_list[i] = []

# Create the output folder if it is missing so that open() cannot fail on it.
os.makedirs("./dataset", exist_ok=True)

# `with` closes the CSV (and each capture) even if a packet raises midway.
with open("./dataset/FP_MAIN_PCAP.csv", "w") as ths:
    ths.write(header)

    for i in files_add:
        print(i)
        # Start every capture without a label: packets seen before the first
        # known device MAC are skipped instead of raising NameError or
        # inheriting the label of the previous capture.
        label = "unknown"

        with PcapReader(i) as pkt:
            print("\n\n"+"========"+ i[8:]+"========"+"\n" )
            print(pkt)

            for j in pkt:
                # Every feature starts at 0 and is only overwritten when the
                # packet carries the corresponding data.
                ip_add_count = 0
                layer_2_arp = 0
                layer_2_llc = 0

                layer_3_eapol = 0
                layer_3_ip = 0
                layer_3_icmp = 0
                layer_3_icmp6 = 0

                layer_4_tcp = 0
                layer_4_udp = 0
                layer_4_tcp_ws = 0

                layer_7_http = 0
                layer_7_https = 0
                layer_7_dhcp = 0
                layer_7_bootp = 0
                layer_7_ssdp = 0
                layer_7_dns = 0
                layer_7_mdns = 0
                layer_7_ntp = 0

                ip_padding = 0
                ip_ralert = 0

                port_class_src = 0
                port_class_dst = 0

                pck_size = 0
                pck_rawdata = 0
                entropy = 0

                layer_4_payload_l = 0

                # Best effort: packets for which `j.len` cannot be read keep 0.
                # `except Exception` (not a bare except) lets Ctrl-C through.
                try:
                    pck_size = j.len
                except Exception:
                    pass

                # The statements below are order-sensitive: the first one that
                # raises leaves every later feature at 0. In particular a
                # source MAC that is not a key of dst_ip_list raises KeyError
                # before the port classes are computed.
                try:
                    if j[IP]:
                        layer_3_ip = 1
                    temp = str(j[IP].dst)
                    if temp not in dst_ip_list[j.src]:
                        dst_ip_list[j.src].append(temp)
                    ip_add_count = len(dst_ip_list[j.src])

                    # NOTE(review): ports are read through the IP layer;
                    # presumably scapy resolves them from the transport
                    # payload - confirm. Packets without ports keep class 0.
                    port_class_src = port_class(j[IP].sport)
                    port_class_dst = port_class(j[IP].dport)
                except Exception:
                    pass

                # ICMPv6 is detected by searching the text form of the packet.
                temp = str(j.show)
                if "ICMPv6" in temp:
                    layer_3_icmp6 = 1

                # An IP header longer than 5 words carries options.
                # NOTE(review): IPOption_Router_Alert(j) builds an option
                # object from the whole packet rather than reading the
                # packet's own options - confirm this is the intended check.
                try:
                    if j[IP].ihl > 5:
                        if IPOption_Router_Alert(j):
                            pad = str(IPOption_Router_Alert(j).show)
                            if "Padding" in pad:
                                ip_padding = 1
                            ip_ralert = 1
                except Exception:
                    pass

                if j.haslayer(ICMP):
                    layer_3_icmp = 1

                if j.haslayer(Raw):
                    pck_rawdata = 1

                if j.haslayer(UDP):
                    layer_4_udp = 1
                    # Application protocols are inferred from the port numbers.
                    if j[UDP].sport == 68 or j[UDP].sport == 67:
                        layer_7_dhcp = 1
                        layer_7_bootp = 1
                    if j[UDP].sport == 53 or j[UDP].dport == 53:
                        layer_7_dns = 1
                    if j[UDP].sport == 5353 or j[UDP].dport == 5353:
                        layer_7_mdns = 1
                    if j[UDP].sport == 1900 or j[UDP].dport == 1900:
                        layer_7_ssdp = 1
                    if j[UDP].sport == 123 or j[UDP].dport == 123:
                        layer_7_ntp = 1
                    try:
                        if j[UDP].payload:
                            layer_4_payload_l = len(j[UDP].payload)
                    except Exception:
                        pass

                # TCP is handled after UDP, so when both layers are present
                # the TCP payload length is the one that is kept.
                if j.haslayer(TCP):
                    layer_4_tcp = 1
                    layer_4_tcp_ws = j[TCP].window
                    if j[TCP].sport == 80 or j[TCP].dport == 80:
                        layer_7_http = 1
                    if j[TCP].sport == 443 or j[TCP].dport == 443:
                        layer_7_https = 1
                    try:
                        if j[TCP].payload:
                            layer_4_payload_l = len(j[TCP].payload)
                    except Exception:
                        pass

                if j.haslayer(ARP):
                    layer_2_arp = 1

                if j.haslayer(LLC):
                    layer_2_llc = 1

                if j.haslayer(EAPOL):
                    layer_3_eapol = 1

                # Packets without a Raw layer keep entropy 0.
                try:
                    entropy = pre_entropy(j[Raw].original)
                except Exception:
                    pass

                # A packet whose source MAC is not in MAC_list reuses the label
                # of the most recent known device in the same capture.
                # NOTE(review): confirm this carry-over is intended.
                if j.src in MAC_list:
                    label = MAC_list[j.src]

                line = [layer_2_arp, layer_2_llc, layer_3_eapol, layer_3_ip, layer_3_icmp, layer_3_icmp6, layer_4_tcp, layer_4_udp, layer_4_tcp_ws, layer_7_http, layer_7_https, layer_7_dhcp, layer_7_bootp, layer_7_ssdp, layer_7_dns, layer_7_mdns, layer_7_ntp, ip_padding, ip_add_count, ip_ralert, port_class_src, port_class_dst, pck_size, pck_rawdata, layer_4_payload_l, entropy, label, j.src]
                # Same column order as `header`.
                line = ",".join(str(value) for value in line)
                if label != "unknown":
                    ths.write(line + "\n")


./pcaps/blink-cam-02.pcap



<scapy.utils.PcapReader object at 0x11ceb3150>
./pcaps/schlage-lock-01.pcap



<scapy.utils.PcapReader object at 0x11cf5a390>


### Input & Output

`Input Files`: FP_MAIN_PCAP.csv (in the “dataset/” folder)

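`Output Files`: Main_reduced.csv, IPAssess_unique.csv, Threshold_0.05.csv, Threshold_0.04.csv and Threshold_0.035.csv (all written to the “dataset/” folder)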

In [6]:
# Load the per-packet fingerprint table written by the extraction step above.
dataset="dataset/FP_MAIN_PCAP.csv"
df=pd.read_csv(dataset)

In [7]:
df

Unnamed: 0,ARP,LLC,EAPOL,IP,ICMP,ICMP6,TCP,UDP,TCP_w_size,HTTP,...,IP_add_count,IP_ralert,Portcl_src,Portcl_dst,Pck_size,Pck_rawdata,payload_l,Entropy,Label,MAC
0,0,0,0,1,0,0,1,0,13638,0,...,1,0,3,1,40,0,6,0.0,august-hub-01,b8:b7:f1:2a:10:fd
1,0,0,0,1,0,0,1,0,65535,0,...,0,0,0,0,40,0,2,0.0,august-hub-01,00:1c:7f:53:d0:28
2,0,0,0,1,0,0,1,0,14000,0,...,1,0,3,1,40,0,6,0.0,august-hub-01,b8:b7:f1:2a:10:fd
3,0,0,0,1,0,0,1,0,65535,0,...,0,0,0,0,40,0,2,0.0,august-hub-01,00:1c:7f:53:d0:28
4,0,0,0,1,0,0,1,0,13638,0,...,1,0,3,1,40,0,6,0.0,august-hub-01,b8:b7:f1:2a:10:fd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19603264,0,0,0,1,0,0,1,0,33580,0,...,6,0,3,1,40,0,6,0.0,blink-cam-03,f4:b8:5e:cd:fe:2f
19603265,0,0,0,1,0,0,1,0,65535,0,...,0,0,0,0,40,0,2,0.0,blink-cam-03,00:1c:7f:53:d0:28
19603266,0,0,0,1,0,0,1,0,65535,0,...,0,0,0,0,40,0,2,0.0,blink-cam-03,00:1c:7f:53:d0:28
19603267,0,0,0,1,0,0,1,0,65535,0,...,0,0,0,0,109,0,0,0.0,blink-cam-03,00:1c:7f:53:d0:28


## Correlation matrix on Main Feature Set i.e FP_Main

In [8]:
# Correlate the feature columns only; MAC and Label are identifiers.
# drop() returns a new frame, so df itself keeps its MAC and Label columns.
dataframe1 = df.drop(columns=["MAC", "Label"])
# `matrix` stays the full correlation matrix because the next cell filters its
# rows by matrix['IP']; only the IP column is displayed here.
matrix = dataframe1.corr()
matrix['IP']

ARP            -0.956375
LLC            -0.063046
EAPOL          -0.281429
IP              1.000000
ICMP            0.031792
ICMP6          -0.019154
TCP             0.123056
UDP             0.078302
TCP_w_size      0.039503
HTTP            0.005134
HTTPS           0.101594
DHCP            0.002145
BOOTP           0.002145
SSDP                 NaN
DNS             0.018426
MDNS                 NaN
NTP             0.004622
IP_padding           NaN
IP_add_count    0.049423
IP_ralert            NaN
Portcl_src      0.121828
Portcl_dst      0.111548
Pck_size        0.065758
Pck_rawdata     0.031207
payload_l       0.026836
Entropy              NaN
Name: IP, dtype: float64

In [5]:
# Keep the rows of the correlation matrix whose correlation with IP is at
# least 0.03 in magnitude; NaN correlations compare False and are dropped.
# NOTE(review): expects `matrix` to be the full correlation DataFrame.
df_IOT_unique = matrix[matrix['IP'].abs() >= 0.03]
df_IOT_unique

Unnamed: 0,ARP,LLC,EAPOL,IP,ICMP,ICMP6,TCP,UDP,TCP_w_size,HTTP,...,NTP,IP_padding,IP_add_count,IP_ralert,Portcl_src,Portcl_dst,Pck_size,Pck_rawdata,payload_l,Entropy
ARP,1.0,-0.000704,-0.003144,-0.956375,-0.030405,-0.000214,-0.117688,-0.074886,-0.03778,-0.00491,...,-0.00442,,-0.047267,,-0.116514,-0.106681,-0.064467,-0.031342,-0.025666,
LLC,-0.000704,1.0,-0.000207,-0.063046,-0.002004,-1.4e-05,-0.007758,-0.004937,-0.002491,-0.000324,...,-0.000291,,-0.003116,,-0.007681,-0.007033,-0.004169,0.022469,-0.001692,
EAPOL,-0.003144,-0.000207,1.0,-0.281429,-0.008947,-6.3e-05,-0.034631,-0.022036,-0.011117,-0.001445,...,-0.001301,,-0.013909,,-0.034286,-0.031393,-0.013188,-0.009223,-0.007553,
IP,-0.956375,-0.063046,-0.281429,1.0,0.031792,-0.019154,0.123056,0.078302,0.039503,0.005134,...,0.004622,,0.049423,,0.121828,0.111548,0.065758,0.031207,0.026836,
ICMP,-0.030405,-0.002004,-0.008947,0.031792,1.0,-0.000609,-0.334963,-0.213141,-0.107529,-0.013976,...,-0.012582,,0.015838,,-0.331522,-0.303368,-0.134208,-0.089206,-0.073049,
TCP,-0.117688,-0.007758,-0.034631,0.123056,-0.334963,-0.002357,1.0,-0.825,0.321019,0.041723,...,-0.048699,,-0.224039,,0.020858,-0.255795,0.133093,-0.345252,-0.24017,
UDP,-0.074886,-0.004937,-0.022036,0.078302,-0.213141,-0.0015,-0.825,1.0,-0.26484,-0.034422,...,0.05903,,0.235886,,0.194551,0.464897,-0.047623,0.418111,0.298293,
TCP_w_size,-0.03778,-0.002491,-0.011117,0.039503,-0.107529,-0.000757,0.321019,-0.26484,1.0,0.06515,...,-0.015633,,-0.110526,,-0.00688,-0.105858,-0.102756,-0.11084,-0.072438,
HTTPS,-0.097161,-0.006405,-0.028591,0.101594,-0.276541,-0.001946,0.825588,-0.68111,0.257901,-0.044661,...,-0.040206,,-0.195725,,-0.001381,-0.336563,0.076414,-0.285061,-0.196994,
IP_add_count,-0.047267,-0.003116,-0.013909,0.049423,0.015838,-0.000947,-0.224039,0.235886,-0.110526,0.061458,...,-0.004509,,1.0,,0.254222,0.357575,0.097085,0.338413,0.036594,


## Create Main with reduced features 

In [6]:
# Main feature set without the five columns whose correlation with IP is
# shown as NaN in the correlation output above.
deleted = ["SSDP", "IP_padding", "MDNS", "IP_ralert", "Entropy"]
name = "Main_reduced.csv"
df = pd.read_csv("dataset/FP_MAIN_PCAP.csv").drop(columns=deleted)
df.to_csv('dataset/' + name, index=False)
df.columns

Index(['ARP', 'LLC', 'EAPOL', 'IP', 'ICMP', 'ICMP6', 'TCP', 'UDP',
       'TCP_w_size', 'HTTP', 'HTTPS', 'DHCP', 'BOOTP', 'DNS', 'NTP',
       'IP_add_count', 'Portcl_src', 'Portcl_dst', 'Pck_size', 'Pck_rawdata',
       'payload_l', 'Label', 'MAC'],
      dtype='object')

In [7]:
len(df.columns)

23

## Create IPAssess


In [8]:
# IPAssess feature set: the main table without the columns listed in `deleted`.
deleted = [
    "SSDP", "DNS", "MDNS", "Pck_rawdata", "NTP", "DHCP", "BOOTP", "ICMP",
    "IP_padding", "IP_ralert", "Entropy",
]
name = "IPAssess_unique.csv"
df = pd.read_csv("dataset/FP_MAIN_PCAP.csv").drop(columns=deleted)
df.to_csv('dataset/' + name, index=False)
df.columns

Index(['ARP', 'LLC', 'EAPOL', 'IP', 'ICMP6', 'TCP', 'UDP', 'TCP_w_size',
       'HTTP', 'HTTPS', 'IP_add_count', 'Portcl_src', 'Portcl_dst', 'Pck_size',
       'payload_l', 'Label', 'MAC'],
      dtype='object')

In [9]:
len(df.columns)

17

## Create feature with threshold 0.05


In [10]:
# Threshold 0.05 feature set: the main table without the columns listed in
# `deleted`.
deleted = [
    "payload_l", "IP_add_count", "HTTP", "TCP_w_size", "SSDP", "DNS", "MDNS",
    "Pck_rawdata", "NTP", "DHCP", "BOOTP", "ICMP", "IP_padding", "IP_ralert",
    "Entropy", "ICMP6",
]
name = "Threshold_0.05.csv"
df = pd.read_csv("dataset/FP_MAIN_PCAP.csv").drop(columns=deleted)
df.to_csv('dataset/' + name, index=False)
df.columns

Index(['ARP', 'LLC', 'EAPOL', 'IP', 'TCP', 'UDP', 'HTTPS', 'Portcl_src',
       'Portcl_dst', 'Pck_size', 'Label', 'MAC'],
      dtype='object')

In [11]:
len(df.columns)

12

## Create feature with threshold 0.04


In [12]:
# Threshold 0.04 feature set: the main table without the columns listed in
# `deleted`.
deleted = [
    "payload_l", "HTTP", "TCP_w_size", "SSDP", "DNS", "MDNS", "Pck_rawdata",
    "NTP", "DHCP", "BOOTP", "ICMP", "IP_padding", "IP_ralert", "Entropy",
    "ICMP6",
]
name = "Threshold_0.04.csv"
df = pd.read_csv("dataset/FP_MAIN_PCAP.csv").drop(columns=deleted)
df.to_csv('dataset/' + name, index=False)
df.columns

Index(['ARP', 'LLC', 'EAPOL', 'IP', 'TCP', 'UDP', 'HTTPS', 'IP_add_count',
       'Portcl_src', 'Portcl_dst', 'Pck_size', 'Label', 'MAC'],
      dtype='object')

In [13]:
len(df.columns)

13

## Create feature with threshold 0.035


In [14]:
# Threshold 0.035 feature set: the main table without the columns listed in
# `deleted`.
deleted = [
    "payload_l", "HTTP", "SSDP", "DNS", "MDNS", "Pck_rawdata", "NTP", "DHCP",
    "BOOTP", "ICMP", "IP_padding", "IP_ralert", "Entropy", "ICMP6",
]
name = "Threshold_0.035.csv"
df = pd.read_csv("dataset/FP_MAIN_PCAP.csv").drop(columns=deleted)
df.to_csv('dataset/' + name, index=False)
df.columns

Index(['ARP', 'LLC', 'EAPOL', 'IP', 'TCP', 'UDP', 'TCP_w_size', 'HTTPS',
       'IP_add_count', 'Portcl_src', 'Portcl_dst', 'Pck_size', 'Label', 'MAC'],
      dtype='object')

In [15]:
len(df.columns)

14

In [18]:
# Re-load the saved IPAssess feature set to check what was written to disk.
df1=pd.read_csv("dataset/IPAssess_unique.csv")

In [19]:
# Count the columns of the re-loaded IPAssess file (df1), not of `df`, which
# holds whichever feature subset was created last.
len(df1.columns)

17

In [24]:
# Re-load the saved Threshold_0.04 feature set and count its columns.
df1=pd.read_csv("dataset/Threshold_0.04.csv")
len(df1.columns)

13