Set up imports.

In [101]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

Load in network flow data from file.

In [102]:
filename = "data/IDS2017/TrafficLabelling/Monday-WorkingHours.pcap_ISCX.csv"
new_filename="data/IDS2017/FormattedData/Monday_1.npz"
data = pd.read_csv(filename)

# Label by internal and external IPs
known_ips = ['205.174.165.80', '172.16.0.1', '192.168.10.3',\
             '192.168.10.50', '205.174.165.68', '192.168.10.51',\
             '205.174.165.66', '192.168.10.19','192.168.10.17',\
             '192.168.10.16', '192.168.10.12', '192.168.10.9',\
             '192.168.10.5','192.168.10.8','192.168.10.14',\
             '192.168.10.15','192.168.10.25']


Standardize IPs into expected "node" names of our GNN. This means external IPs will be labeled as "external", and data flows from external IPs will be read as such. 

We also order dataframe by timestamp, and set up our timestamps to register flows by the minute they occur.

In [103]:
data.loc[~data[' Source IP'].isin(known_ips), ' Source IP'] = 'external'
data.loc[~data[' Destination IP'].isin(known_ips), ' Destination IP'] = 'external'

# Order by time
data[' Timestamp'] = data[' Timestamp'].str.replace(' 01:',' 13:')
data[' Timestamp'] = data[' Timestamp'].str.replace(' 02:',' 14:')
data[' Timestamp'] = data[' Timestamp'].str.replace(' 03:',' 15:')
data[' Timestamp'] = data[' Timestamp'].str.replace(' 04:',' 16:')
data[' Timestamp'] = data[' Timestamp'].str.replace(' 05:',' 17:')

data[' Timestamp'] = pd.to_datetime(data[' Timestamp'], format='%m/%d/%Y %H:%M:%S')
data[' Timestamp'] = pd.to_datetime(data[' Timestamp'].dt.strftime('%m/%d/%Y %H:%M'))

data = data.sort_values(' Timestamp')


Modify the data in our dataframe so that it is ready to be compiled from flow data to traffic data at regular time intervals. This involves dropping unwanted columns, removing self directed flows (machines pinging themselves), and creating aggregate columns that are representative of network traffic. 

In [104]:
# Drop unwanted columns
data = data.drop(['Flow ID'], axis=1)
data = data.drop([' Source Port', ' Destination Port'], axis=1)

data = data.drop([' Fwd Packet Length Std'], axis=1)
data = data.drop([' Bwd Packet Length Std'], axis=1)

data = data.drop([' Flow IAT Mean', ' Flow IAT Max',' Flow IAT Min', ' Flow IAT Std'], axis=1)
data = data.drop(['Fwd IAT Total', ' Fwd IAT Mean',' Fwd IAT Std',' Fwd IAT Max',' Fwd IAT Min'], axis=1)
data = data.drop(['Bwd IAT Total', ' Bwd IAT Mean',' Bwd IAT Std',' Bwd IAT Max',' Bwd IAT Min'], axis=1)

data = data.drop([' Min Packet Length', ' Max Packet Length'], axis=1)
data = data.drop([' Fwd Packet Length Min', ' Fwd Packet Length Max'], axis=1)
data = data.drop([' Bwd Packet Length Min', 'Bwd Packet Length Max'], axis=1)
data = data.drop([' Packet Length Std', ' Packet Length Variance'], axis=1)

data = data.drop([' Fwd Header Length.1'], axis=1)
data = data.drop([' Down/Up Ratio'], axis=1)

data = data.drop([' Average Packet Size'], axis=1)
data = data.drop([' Avg Fwd Segment Size', ' Avg Bwd Segment Size'], axis=1)

data = data.drop([' Fwd Avg Packets/Bulk', 'Fwd Avg Bytes/Bulk',' Fwd Avg Bulk Rate'], axis=1)
data = data.drop([' Bwd Avg Packets/Bulk',' Bwd Avg Bytes/Bulk','Bwd Avg Bulk Rate'], axis=1)

data = data.drop(['Subflow Fwd Packets',' Subflow Fwd Bytes'], axis=1)
data = data.drop([' Subflow Bwd Packets',' Subflow Bwd Bytes'], axis=1)
data = data.drop(['Init_Win_bytes_forward',' Init_Win_bytes_backward'], axis=1)

data = data.drop([' min_seg_size_forward'], axis=1)
data = data.drop([' Active Std',' Active Max',' Active Min'], axis=1)
data = data.drop([' Idle Std',' Idle Max',' Idle Min'], axis=1)

labels = data[' Label']
data = data.drop([' Label'], axis=1)

# Drop self directed data
data = data[data[' Source IP'] != data[' Destination IP']]
data = data.drop([' Destination IP'], axis=1)

# Creating an aggregate column for total number of packets transferred
data['Total Packets'] = data[' Total Fwd Packets'] + data[' Total Backward Packets']

# Standardize data types to numerical
data = data.astype({' Flow Duration': 'float64'})
data = data.astype({'Flow Bytes/s': 'float64'})
data = data.astype({' Flow Packets/s': 'float64'})

Verify the remaining features are as expected.

In [105]:
# Remaining features
print(data.keys())
print(data.dtypes)

Index([' Source IP', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Mean', ' Bwd Packet Length Mean', 'Flow Bytes/s',
       ' Flow Packets/s', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags',
       ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length',
       'Fwd Packets/s', ' Bwd Packets/s', ' Packet Length Mean',
       'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count',
       ' PSH Flag Count', ' ACK Flag Count', ' URG Flag Count',
       ' CWE Flag Count', ' ECE Flag Count', ' act_data_pkt_fwd',
       'Active Mean', 'Idle Mean', 'Total Packets'],
      dtype='object')
 Source IP                              object
 Protocol                                int64
 Timestamp                      datetime64[ns]
 Flow Duration                         float64
 Total Fwd Packets                       int64
 Total Backward

Organize into timesteps by aggregating by source IP and minute. In any minute, all flows from a single IP during that minute will be compiled into one line of traffic information.

In [106]:
# Define a lambda function to compute the weighted mean
def try_weighted(x, data, column, default):
    try:
        return np.average(x, weights=data.loc[x.index, (column)])
    except (ZeroDivisionError):
        return default

wm_duration = lambda x: try_weighted(x, data, ' Flow Duration', 0)
wm_packets = lambda x: try_weighted(x, data, ('Total Packets'), 0)
wm_fwd_packets = lambda x: try_weighted(x, data, ' Total Fwd Packets', 0)
wm_bwd_packets = lambda x: try_weighted(x, data, ' Total Backward Packets', 0)

# Define a dictionary with the functions to apply for a given column
agg_func = {\
     ' Flow Duration':'sum',\
     ' Total Fwd Packets':'sum',\
     ' Total Backward Packets':'sum',\
     'Total Length of Fwd Packets':'sum',\
     ' Total Length of Bwd Packets':'sum',\
     ' Fwd Packet Length Mean':wm_fwd_packets,\
     ' Bwd Packet Length Mean':wm_bwd_packets,\
     'Flow Bytes/s':wm_duration,\
     ' Flow Packets/s':wm_duration,\
     'Fwd PSH Flags':'sum',\
     ' Bwd PSH Flags':'sum',\
     ' Fwd URG Flags':'sum',\
     ' Bwd URG Flags':'sum',\
     ' Fwd Header Length':wm_fwd_packets,\
     ' Bwd Header Length':wm_bwd_packets,\
     'Fwd Packets/s':wm_duration,\
     ' Bwd Packets/s':wm_duration,\
     ' Packet Length Mean':wm_packets,\
     'FIN Flag Count':'sum',\
     ' SYN Flag Count':'sum',\
     ' RST Flag Count':'sum',\
     ' PSH Flag Count':'sum',\
     ' ACK Flag Count':'sum',\
     ' URG Flag Count':'sum',\
     ' CWE Flag Count':'sum',\
     ' ECE Flag Count':'sum',\
     ' act_data_pkt_fwd':'sum',\
     'Total Packets':'sum',\
     'Active Mean':wm_duration,\
     'Idle Mean':wm_duration\
      }

# Group By
data = data.groupby([' Timestamp', ' Source IP']).agg(agg_func).reset_index()

print(data.head)


  avg = np.multiply(a, wgt, dtype=result_dtype).sum(axis)/scl


<bound method NDFrame.head of                Timestamp      Source IP   Flow Duration   Total Fwd Packets  \
0    2017-03-07 08:55:00       external    7.000000e+00                   8   
1    2017-03-07 08:56:00   192.168.10.3    6.097584e+06                 144   
2    2017-03-07 08:56:00   192.168.10.9    5.021933e+08                 853   
3    2017-03-07 08:56:00       external    6.000000e+00                   8   
4    2017-03-07 08:57:00  192.168.10.12    1.544303e+09                1314   
...                  ...            ...             ...                 ...   
5980 2017-03-07 17:01:00  192.168.10.19    1.390070e+06                  69   
5981 2017-03-07 17:01:00   192.168.10.3    1.437702e+07                  32   
5982 2017-03-07 17:01:00  192.168.10.50    1.100000e+02                   3   
5983 2017-03-07 17:01:00  192.168.10.51    1.973956e+06                  24   
5984 2017-03-07 17:01:00       external    5.668500e+04                  51   

       Total Backward

Ensure that we have information for every IP at every minute recorded. For IPs that do not have flows in a certain minute, a line indicating no traffic is added.

In [107]:
print(data[' Source IP'].unique())

for time in tqdm(data[' Timestamp'].unique()):
    missing_ips = []
    for ip in known_ips:
        if not ((data[' Timestamp'] == time) & (data[' Source IP'] == ip)).any():
            data = data.append({' Source IP': ip,\
                                ' Timestamp': time,\
                                ' Flow Duration': 0,\
                                'Total Packets': 0,\
                                ' Total Fwd Packets': 0,\
                                ' Total Backward Packets': 0,\
                                'Total Length of Fwd Packets': 0,\
                                ' Total Length of Bwd Packets': 0,\
                                ' Fwd Packet Length Mean': 0,\
                                ' Bwd Packet Length Mean': 0,\
                                'Flow Bytes/s': 0,\
                                ' Flow Packets/s': 0,\
                                'Fwd PSH Flags': 0,\
                                ' Bwd PSH Flags': 0,\
                                ' Fwd URG Flags': 0,\
                                ' Bwd URG Flags': 0,\
                                ' Fwd Header Length': 0,\
                                ' Bwd Header Length': 0,\
                                'Fwd Packets/s': 0,\
                                ' Bwd Packets/s': 0,\
                                ' Packet Length Mean': 0,\
                                'FIN Flag Count': 0,\
                                ' SYN Flag Count': 0,\
                                ' RST Flag Count': 0,\
                                ' PSH Flag Count': 0,\
                                ' ACK Flag Count': 0,\
                                ' URG Flag Count': 0,\
                                ' CWE Flag Count': 0,\
                                ' ECE Flag Count': 0,\
                                ' act_data_pkt_fwd': 0,\
                                'Active Mean': 0,\
                                'Idle Mean': 0},\
                                ignore_index=True)
            #print("Adding entry: ", ip, "\t at \t", time)
            
# Sort dataframe by timestep, and order nodes within timestep
data = data.sort_values(by=[' Timestamp', ' Source IP'])
data = data.reset_index()

['external' '192.168.10.3' '192.168.10.9' '192.168.10.12' '192.168.10.17'
 '192.168.10.50' '192.168.10.25' '192.168.10.19' '192.168.10.14'
 '192.168.10.16' '192.168.10.5' '192.168.10.51' '192.168.10.8'
 '192.168.10.15' '172.16.0.1']


HBox(children=(IntProgress(value=0, max=487), HTML(value='')))




Data is saved in a format that a GNN can interpret. An adjacency matrix between nodes is created.

We remove columns that were necessary for aggregate traffic calculations and ordering but are not necessary for GNN performance. 

In [108]:
# Review dataframe
print("Number of Nodes: ", len(data[' Source IP'].unique()))
print("Number of Features: ", len(data.keys()))
print(data.head())

# Format dataframe in (timesteps x nodes x features)
num_timesteps = len(data[' Timestamp'].unique())
num_nodes = len(data[' Source IP'].unique())
                
timesteps = data[' Timestamp'].unique()
nodes = data[' Source IP'].unique()
idx = 0

# Drop columns used earlier to order data
#data = data.drop([' Timestamp', ' Source IP'], axis=1)

num_features = len(data.keys())-2

data_np = np.zeros((num_timesteps, num_nodes, num_features))
print(data_np.shape)
for time in range(num_timesteps):
    for ip in range(num_nodes):
        data_np[time, ip, :] = data.loc[(data['index'] == idx)]\
                            .drop([' Timestamp', ' Source IP'], axis=1)\
                            .to_numpy()
        idx = idx+1
        
print(data_np.shape)


# Save edited as .csv and .npz
#data.to_csv("data/IDS2017/FormattedData/Monday_1.csv", index = False, header=False)
np.savez_compressed(new_filename, data=data_np)

# Save source and destination nodes in .npz
#node_ids = data[' Source IP'].unique()
#print(node_ids)
#np.savetxt("data/IDS2017/FormattedData/Monday_1_ids.txt",node_ids)

# Save adjacency matrix for network connections (unweighted)
adj_matrix = np.ones((len(node_ids), len(node_ids)))
np.savetxt("data/IDS2017/FormattedData/Monday_1_adj.csv",adj_matrix,delimiter=',')

Number of Nodes:  18
Number of Features:  33
   index           Timestamp      Source IP   Flow Duration  \
0   5986 2017-03-07 08:55:00     172.16.0.1             0.0   
1   5995 2017-03-07 08:55:00  192.168.10.12             0.0   
2   5999 2017-03-07 08:55:00  192.168.10.14             0.0   
3   6000 2017-03-07 08:55:00  192.168.10.15             0.0   
4   5994 2017-03-07 08:55:00  192.168.10.16             0.0   

    Total Fwd Packets   Total Backward Packets  Total Length of Fwd Packets  \
0                   0                        0                          0.0   
1                   0                        0                          0.0   
2                   0                        0                          0.0   
3                   0                        0                          0.0   
4                   0                        0                          0.0   

    Total Length of Bwd Packets   Fwd Packet Length Mean  \
0                           0.0          

ValueError: could not broadcast input array from shape (0,31) into shape (31)