# Importing the useful packages

In [1]:
import glob
from os import system as cmd
import os
import shutil
import sys
import time
from multiprocessing import Process, Manager
import multiprocessing as mp

import pyshark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import copy

import nest_asyncio
nest_asyncio.apply()
#simplefilter(action='ignore')

# Reading the .pcap file

In our current directory, we have only one .pcap file and we want to access that file. 

In [2]:
# It will produce a list of all of the files with .pcap extension
list_pcap_file = glob.glob("./*.pcap")

# As we have only one .pcap file we want to access that file
file = list_pcap_file[0]

# Here we will show the general info about this file 
cmd("capinfos "+file)

File name:           ./Final_project_file.pcap
File type:           Wireshark/... - pcapng
File encapsulation:  Ethernet
File timestamp precision:  microseconds (6)
Packet size limit:   file hdr: (not set)
Packet size limit:   inferred: 34 bytes - 96 bytes (range)
Number of packets:   5,000 k
File size:           490 MB
Data size:           5,991 MB
Capture duration:    17.557630 seconds
First packet time:   2019-04-10 07:00:00.056001
Last packet time:    2019-04-10 07:00:17.613631
Data byte rate:      341 MBps
Data bit rate:       2,729 Mbps
Average packet size: 1198.22 bytes
Average packet rate: 284 kpackets/s
SHA256:              17c90cad2fbbde982d4285e4698e2ed6c88999955861894385a1613a7b7997a3
RIPEMD160:           fc1047598207dfb1e98c2b720f6a30ae31287066
SHA1:                968263edd2013c098fa12f906233edb7d3aa1705
Strict time order:   False
Capture application: Editcap (Wireshark) 3.4.2 (Git v3.4.2 packaged as 3.4.2-1~ubuntu16.04.0+wiresharkdevstable1)
Number of interfaces in file:

0

# Extracting general info of the .pcap file

Here we are asked to only extract 1 million of the packets and give some general info about that 1 million records. So first we will extract these packets and then we will give some general information about those extracted packets. 

## Filtering 1 million records

As we are asked to work on only 1 million of the packets, we will __create a directory name 'Splitted_1m'__ and split the original .pcap file to 5 different .pcap file __each one consisting 1 million__ of the packets. 

In [6]:
# The floder that will contain the splitted files
One_million_path = "Splitted_1m"

# Number of the packets in each file 
Limit_of_splitting = 1000000

# Check if the directory is already there
if os.path.isdir(One_million_path): 

    # If the directory is there, remove the whole directory
    shutil.rmtree(One_million_path)

# Create the directory to save the .pickle files 
os.mkdir(One_million_path)

# Creating multiple files from the original file each one consisting 1m packets. 
cmd(f'editcap -c {Limit_of_splitting} {file} {One_million_path}/{Limit_of_splitting}.pcap')
print('The splitting process has been completed!!!')

The splitting process has been completed!!!


Then we will pick the first file that is containing 1m of the packets and give the general info related to the packet in that file. 

In [2]:
list_pcap_file = glob.glob("Splitted_1m/*.pcap")

# As we have only one .pcap file we want to access that file
file = list_pcap_file[0]
file

'Splitted_1m/1000000_00000_20190410070000.pcap'

In [5]:
print("The number of packets in the capture: \n")
# Number of the packets in the capture --> -c
cmd("capinfos -c "+file)
print('-' * 50, end = '\n\n')

print('The average data rate (bit/sec):\n')
# Average data rate, in bit/sec --> -i
cmd("capinfos -i "+file)
print('-' * 50, end = '\n\n')

print("The average packet size: \n")
# Average packet size --> -z
cmd("capinfos -z "+file)
print('-' * 50, end = '\n\n')

print('The start time of the capture: \n')
# Start time of the capture --> -a
cmd("capinfos -a "+file)
print('-' * 50, end = '\n\n')

print('Total length of all of the packets in the file (in bytes): \n')
# Total length of all of the packets in the file, in bytes --> d
cmd("capinfos -d "+file)
print('-' * 50, end = '\n\n')

print("The end time of the capture: \n")
# The end time of the capture --> -e
cmd("capinfos -e "+file)
print('-' * 50, end = '\n\n')

print('The size of the file (in bytes): \n')
# The size of the file, in bytes --> -s
cmd("capinfos -s "+file)
print('-' * 50, end = '\n\n')

print('The average packet rate (packets/sec): \n')
# The average packet rate, in packets/sec --> x
cmd("capinfos -x "+file)
print('-' * 50, end = '\n\n')

print('The average data rate (bytes/sec): \n')
# The average data rate, in bytes/sec ---> y
cmd("capinfos -y "+file)
print('-' * 50, end = '\n\n')

print('General info about the capture: \n')
# Capture duration --> -S
cmd("capinfos "+file)
print('-' * 50, end = '\n\n')

The number of packets in the capture: 

File name:           Splitted_1m/One_million_00000_20190410070000.pcap
Packet size limit:   inferred: 34 bytes - 96 bytes (range)
Number of packets:   1,000 k
--------------------------------------------------

The average data rate (bit/sec):

File name:           Splitted_1m/One_million_00000_20190410070000.pcap
Packet size limit:   inferred: 34 bytes - 96 bytes (range)
Data bit rate:       2,419 Mbps
--------------------------------------------------

The average packet size: 

File name:           Splitted_1m/One_million_00000_20190410070000.pcap
Packet size limit:   inferred: 34 bytes - 96 bytes (range)
Average packet size: 1099.84 bytes
--------------------------------------------------

The start time of the capture: 

File name:           Splitted_1m/One_million_00000_20190410070000.pcap
Packet size limit:   inferred: 34 bytes - 96 bytes (range)
First packet time:   2019-04-10 07:00:00.056001
----------------------------------------------

# Extracting the packets info

Here we want to extract the information of the packets in the .pcap file and analyze the time taken to extract these information in __Sequential and Parallel reading__. 

## Extract packet info

Here we will write a function that will extract the desired info from the packets that are in .pcap file that we have passed to it and writes the produced dataframe in a pickle file to the passed directory. 

In [3]:
def Extract_packet_info(file_name, path_to_save):
    
    # The information that we want to get from the packets
    Columns = ["Label DSCP", "header len", "ds_field","ds_field_ecn", "length", 
          "Protocol" ,"flag_df", "flag_mf", "flag_rb", "fragment_offset", "ttl", 
          "IP_SRC", "IP_DST","src_port", "dst_port","time"] 
    
    # This dataframe will contain the features of all of the packets in the given file
    Packet_info_df = pd.DataFrame(columns = Columns)
    
    # We will read the .pcap file 
    pcap = pyshark.FileCapture(file_name)

    # We will go through the packets in the .pcap file 
    for packet in pcap:
        
        # For each packet we will create a dictionary to store the features of the packet 
        Packet_info = dict()
        
        # We will only extract the features of the IP packets
        if 'IP' in packet :
            
            # Getting the value of the dsfield
            Packet_info['Label DSCP'] = packet.ip.dsfield_dscp
            
            # The length of the IP header
            Packet_info['header len'] = packet.ip.hdr_len
    
            # Differentiated Service
            Packet_info['ds_field'] = int(packet.ip.dsfield,16)
            
            #Explicit Congestion Notification
            Packet_info['ds_field_ecn'] = packet.ip.dsfield_ecn
            
            #Length of the Packet including the header
            Packet_info['length'] = packet.ip.len
            
            #Number of Protocol (e.g. 6 = TCP, 17 = UDP, 1 = ICMP)
            Packet_info['Protocol'] = packet.ip.proto
            
            #Flag Do not Fragment 
            Packet_info['flag_df'] = packet.ip.flags_df
            
            #Flag More Fragment
            Packet_info['flag_mf'] = packet.ip.flags_mf
            
            #Flag Reserved - Must be 0
            Packet_info['flag_rb'] = packet.ip.flags_rb 
            
            #Fragment Offset
            Packet_info['fragment_offset'] = packet.ip.frag_offset
        
            #Time To Live
            Packet_info['ttl'] = packet.ip.ttl
            
            #### Extraction of the Ip Source and Ip Destination###
            Packet_info['IP_SRC'] = packet.ip.src
            Packet_info['IP_DST'] = packet.ip.dst
            
            #### Extraction of the Port ####
            if "UDP" in packet:
                Packet_info['src_port'] = packet.udp.srcport
                Packet_info['dst_port'] = packet.udp.dstport
                
            elif "TCP" in packet :
                Packet_info['src_port'] = packet.tcp.srcport
                Packet_info['dst_port'] = packet.tcp.dstport
                
            else:
                #Protocol as IP and ICMP e Ws.Short Port in src and dst will be set to -1
                Packet_info['src_port'] = -1
                Packet_info['dst_port'] = -1

            # The time that this packet has been sniffed
            Packet_info['time'] = float(packet.sniff_timestamp)
            
            # Adding this new packet to the passed dataframe
            Packet_info_df = Packet_info_df.append(Packet_info, ignore_index = True)
    
    # We just want to keep the name of the file and not the path 
    file_name = file_name.split('/')[-1]
    
    # Write the dataframe to a .pickle file for the faster reading later
    Packet_info_df.to_pickle(path_to_save + '/' + file_name.split('.')[0] + '.pickle')

## Sequential Reading

In the sequential reading we want to go through file that is containing 1 million records and extract the features of the packets in the .pcacp file. 

In the sequential reading, we will send the whole file to the function in order the extract the features for each of the packets in the capture. We will save the result in a pickle file in __'Sequential_reading_pickle'__ directory. 

In [None]:
# This is the directory that we will store the .pickle file for sequential reading
Sequential_reading_path = 'Sequential_reading_pickle'

# Check if the directory is already there
if os.path.isdir(Sequential_reading_path): 

    # If the directory is there, remove the whole directory
    shutil.rmtree(Sequential_reading_path)

# Create the directory to save the .pickle files 
os.mkdir(Sequential_reading_path)

# Start time of calling the function 
start_time = time.time()

# Calling the function to extract the features of the packets
Extract_packet_info(file, Sequential_reading_path)

# End time of processing the packets
Sequential_time = (time.time() - start_time)

print("-" * 50)
print(f'Time taken to extract the features of the packets: {"{:.2f}".format(Sequential_time)} seconds')
print("-" * 50)

Reading the dataframe after sequential reading

In [4]:
Pickle_files_seq = glob.glob(Sequential_reading_path+ '/*.pickle')
All_packet_info_df = pd.read_pickle(Pickle_files_seq[0])
All_packet_info_df

Unnamed: 0,Label DSCP,header len,ds_field,ds_field_ecn,length,Protocol,flag_df,flag_mf,flag_rb,fragment_offset,ttl,IP_SRC,IP_DST,src_port,dst_port,time
0,0,20,0,0,793,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
1,0,20,0,0,60,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
2,0,20,0,0,61,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
3,0,20,0,0,116,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
4,0,20,0,0,54,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
5,0,20,0,0,67,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
6,0,20,0,0,61,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0


## Parallel Reading

In the parallel reading we will split the file into more than one file and retrieve the feature of the packets in parallel. In our case as we should work with 1 million packets we will split this file into 5 files each one consisting of 200,000 packets and extract the packets features in parallel. 

In [5]:
file = 'pcap_trial.pcap'

# The number of the packets that should be in one file 
Limit_of_splitting = 1500

# The directory that will contain the splitted .pcap files
Parallel_splitting_path = "Parallel_splitting"
    
# Check if the directory is already there
if os.path.isdir(Parallel_splitting_path): 

    # If the directory is there, remove the whole directory
    shutil.rmtree(Parallel_splitting_path)

# Create the directory to save the .pickle files 
os.mkdir(Parallel_splitting_path)

# Creating multiple files from the original file each one consisting 1m packets. 
cmd(f'editcap -c {Limit_of_splitting} {file} {Parallel_splitting_path}/{Limit_of_splitting}.pcap')

print(f'The files have been created at: \'{Parallel_splitting_path}\\\'')

The files have been created at: 'Parallel_splitting\'


After splitting the files we will call the function to extract the packets' information for each file and save the result in the .pickle files to be aggregated in the next step. 

In [6]:
# The path to save the pickle file for each of the splitted files 
Parallel_reading_path = 'Parallel_reading_pickle'

# Check if the directory is already there
if os.path.isdir(Parallel_reading_path): 

    # If the directory is there, remove the whole directory
    shutil.rmtree(Parallel_reading_path)

# Create the directory to save the .pickle files 
os.mkdir(Parallel_reading_path)

#Instantiate the manager
manager = Manager()

#Start measuring time
start_time = time.time()

#-----------------------Files-----------------------
# It will produce a list of all of the files with .pcap extension
list_pcap_file = glob.glob(f'{Parallel_splitting_path}/*.pcap')

#Vect to store results
lista_process = []

for file_index, file in enumerate(list_pcap_file):
    print(f'Working on file {file_index}')

    # Send the file f
    p1 = Process(target = Extract_packet_info, args = (file, Parallel_reading_path))

    lista_process.append(p1)

    p1.start()

print('Please wait for the end of the processes !!!')
for process in lista_process:
    print(process)
    process.join()

    
### Finish ####

print(f'All .pickle files have been created at \'{Parallel_reading_path}\\\'')
time_parallel = (time.time() - start_time)
print("--- %s seconds ---" % (time_parallel)) 

Working on file 0
Working on file 1
Working on file 2
Working on file 3
Working on file 4
Working on file 5
Working on file 6
Working on file 7
Please wait for the end of the processes !!!
<Process name='Process-2' pid=16369 parent=16290 started>
<Process name='Process-3' pid=16371 parent=16290 stopped exitcode=0>
<Process name='Process-4' pid=16373 parent=16290 stopped exitcode=0>
<Process name='Process-5' pid=16374 parent=16290 stopped exitcode=0>
<Process name='Process-6' pid=16375 parent=16290 stopped exitcode=0>
<Process name='Process-7' pid=16376 parent=16290 started>
<Process name='Process-8' pid=16378 parent=16290 stopped exitcode=0>
<Process name='Process-9' pid=16384 parent=16290 stopped exitcode=0>


NameError: name 'Parallel_reading_pickle' is not defined

In [11]:
# Reading the pickle files 
Pickle_merged = pd.concat([pd.read_pickle(pickle_file) for pickle_file in glob.glob(f'{Parallel_reading_path}/*.pickle')])

In [18]:
Pickle_merged

Unnamed: 0,Label DSCP,header len,ds_field,ds_field_ecn,length,Protocol,flag_df,flag_mf,flag_rb,fragment_offset,ttl,IP_SRC,IP_DST,src_port,dst_port,time
0,0,20,0,0,793,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
1,0,20,0,0,60,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
0,0,20,0,0,61,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
0,0,20,0,0,54,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
1,0,20,0,0,67,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
0,0,20,0,0,61,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
1,0,20,0,0,116,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0


# 3) Extract the IP which generates the highest amount of sender traffic, evaluate the bit rate (0.1 sec) for the 6 IP addresses mostly used as endpoint

# 4) Top 5 Destination IP (received bytes) and Top 5 Source IP (sent bytes);

In [25]:
Columns = ["Label DSCP", "header len", "ds_field","ds_field_ecn", "length", 
          "Protocol" ,"flag_df", "flag_mf", "flag_rb", "fragment_offset", "ttl", 
          "IP_SRC", "IP_DST","src_port", "dst_port","time"] 
Packets_info = pd.DataFrame(columns = Columns)
Extract_packet_info('small_pcap_trial.pcap', Packets_info)

Unnamed: 0,Label DSCP,header len,ds_field,ds_field_ecn,length,Protocol,flag_df,flag_mf,flag_rb,fragment_offset,ttl,IP_SRC,IP_DST,src_port,dst_port,time
0,0,20,0,0,793,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
1,0,20,0,0,60,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
2,0,20,0,0,61,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
3,0,20,0,0,116,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
4,0,20,0,0,54,17,1,0,0,0,53,142.250.180.78,192.168.43.28,443,50688,1619450000.0
5,0,20,0,0,67,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0
6,0,20,0,0,61,17,1,0,0,0,128,192.168.43.28,142.250.180.78,50688,443,1619450000.0


In [19]:
Packets_info

Unnamed: 0,Label DSCP,header len,ds_field,ds_field_ecn,length,Protocol,flag_df,flag_mf,flag_rb,fragment_offset,ttl,IP_SRC,IP_DST,src_port,dst_port,time


In [22]:
dataFrame = extract_Info_pckt(file_name)
print("Finish the reading part")

Now I'm working on: ./data_00000_20190410070000.pcap



RuntimeError: This event loop is already running

In [15]:
#Save the dataframe in a pickle format, in this way once we saved it we can just reload without losing time
dataFrame.to_pickle("PacketDataframe.pkl")

sys.exit("Error message - Data created")

#Comment here once created dataset

#Reload dataframe
dataFrame = pd.read_pickle("PacketDataframe.pkl")

print("Dataframe overview: \n")
print(dataFrame.head())

NameError: name 'dataFrame' is not defined

# 5) Evaluate bitRate considering all the trace with 3 different sampling rate;

# 6) GeoLocal Referenciation of the 5 sessions with the highest amount of traffic generated;

# 7) 10 Protocol mostly used;

# 8) Port Scanner evaluation (10 Ports mostly used);

# 9) InterArrival Time boxplot between TCP and UDP Sessions;

# 10) Develop your own analysis (e.g. Topology of the network using networkx or evaluation about a variable such as TTL) (BONUS)