# Labelling CSV Data resulting from FlowMeter (PCAP -> CSV)
Here you will find Python code that shows you how you can assign labels to the ISCX data:
http://www.unb.ca/cic/datasets/botnet.html


The labelling follows the guidelines published at the link above.

## Correlation between variables
The code also demonstrates how you can check the data for highly correlated variables.

In [None]:
import pandas as pd

In [None]:
# Load the flow-level CSV (the FlowMeter output for the botnet training capture)
# into a DataFrame; every later cell works on this `data` object.
data = pd.read_csv('ISCX_ISCX_Botnet-Training.pcap.csv')

In [None]:
# Here you can look at the first few lines of the data
# (a quick check that the CSV was parsed as expected)
data.head()

In [None]:
# load ip addresses and the labels (this is prepared manually following the instructions from ISCX team )
# ip1: looked up by its 'IP' column, label taken from its 'Bot' column (see find_class1)
# ip2: looked up by its 'SrcIP' + 'DestIP' columns, label taken from its 'Bot' column (see find_class2)
ip1 = pd.read_csv('bots1.csv')
ip2 = pd.read_csv('bots2.csv')

In [None]:
# Preview the single-IP lookup table
ip1.head()

In [None]:
# Preview the (source IP, destination IP) pair lookup table
ip2.head()

In [None]:
# this function is applied to the data one row at a time: it takes the row's [Source IP] and checks whether it exists in ip1
# if so, the row is given the Botnet label stored in ip1 for that IP; otherwise it is given 'Other'
def find_class1(row):
    """Return the bot label for a data row based on its source IP alone.

    Input:
        row: one row of the flow data; must provide a 'Source IP' entry.

    Returns:
        The 'Bot' value of the first row of the global ``ip1`` table whose
        'IP' equals the row's source IP, or 'Other' when no row matches.
    """
    source_ip = str(row['Source IP'])
    # Vectorised comparison against the whole 'IP' column instead of a
    # Python-level iterrows() loop: this function runs once per data row,
    # so the lookup cost matters.
    matches = ip1.loc[ip1['IP'] == source_ip, 'Bot']
    if matches.empty:
        return 'Other'
    # Keep the first match, like the original top-to-bottom scan did.
    return matches.iloc[0]

In [None]:
# this function is applied to the data one row at a time: it takes the row's [Source IP] and [Destination IP] and checks
# whether that pair exists together in one row of ip2; if so, the row is given the Botnet label stored there, otherwise 'Other'
def find_class2(row):
    """Return the bot label for a data row based on its (source, destination) IP pair.

    Input:
        row: one row of the flow data; must provide 'Source IP' and
             ' Destination IP' entries (the latter with its leading space,
             as the column is named before the names are stripped).

    Returns:
        The 'Bot' value of the first row of the global ``ip2`` table whose
        'SrcIP' and 'DestIP' both equal the row's addresses, or 'Other'
        when no such row exists.
    """
    source_ip = str(row['Source IP'])
    dest_ip = str(row[' Destination IP'])
    # Both addresses must match within the same ip2 row. The comparison is
    # vectorised instead of a Python-level iterrows() loop because this
    # function runs once per data row.
    pair_mask = (ip2['SrcIP'] == source_ip) & (ip2['DestIP'] == dest_ip)
    matches = ip2.loc[pair_mask, 'Bot']
    if matches.empty:
        return 'Other'
    # Keep the first match, like the original top-to-bottom scan did.
    return matches.iloc[0]

### Now we prepare two lists to hold possible labels using the previous two functions

In [None]:
# Candidate labels from the source IP alone: find_class1 is called once per row (axis=1)
labels1 = data.apply(find_class1, axis=1)

In [None]:
# Candidate labels from the (source IP, destination IP) pair: find_class2 is called once per row (axis=1)
labels2 = data.apply(find_class2, axis=1)

In [None]:
# Plain Python list of the source-IP based labels, in row order
ls1 = labels1.tolist()

In [None]:
# Plain Python list of the IP-pair based labels, in row order
ls2 = labels2.tolist()

In [None]:
# Final per-row labels; starts out empty
label = []

In [None]:
# now if a label is "Other" in both lists, then it's Normal
# if it's "Other" in one list only, then we assign the label from the other list
# (if both lists carry a real label, the one from the first list is kept)
# The list is rebuilt from scratch here, so re-running this cell cannot
# append a second copy of every label.
label = [
    a if a != 'Other' else (b if b != 'Other' else 'Normal')
    for a, b in zip(ls1, ls2)
]

In [None]:
# check unique labels: 'Normal' plus whichever 'Bot' values were matched
set(label)

## now add the BotNet_Label column to the dataset

In [None]:
# label holds one entry per row of data (it was built from the row-wise
# apply results), so it can be assigned directly as a new column
data['BotNet_Label'] = label

In [None]:
# display the columns to make sure BotNet_Label is there!
data.columns

In [None]:
# Here you can see the number of rows and the number of columns
data.shape

In [None]:
# strip surrounding whitespace (e.g. the leading space in ' Destination IP') from every column name
data = data.rename(columns=str.strip)

## This section helps you check highly correlated features

In [None]:
#remove unimportant columns
# NOTE: 'Destination IP' is addressed without its leading space here, so this
# cell relies on the column names having been stripped in the previous step.
data.drop(['Source IP','Destination IP','label'],inplace=True,axis=1)

In [None]:
# Save the labelled data (IP columns and the original 'label' column already
# removed); index=False keeps the DataFrame index out of the file.
data.to_csv('ISCX_ISCX_Botnet-Training-LABELLED.pcap.csv',index=False)

In [None]:
# This function is taken from: https://stackoverflow.com/a/31384328
# here we plot a correlation heatmap to check highly correlated features
%matplotlib inline
def plot_corr(df,size=10):
    '''Plot a graphical correlation matrix for each pair of columns in the dataframe.

    Only numeric (and boolean) columns are correlated; other columns, such as
    a column of string labels, are left out of the plot.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot'''
    # Imported inside the function, as in the original cell, so the function
    # carries its own plotting dependency.
    import matplotlib.pyplot as plt

    # Select the numeric columns explicitly: recent pandas versions no longer
    # drop non-numeric columns silently in DataFrame.corr() but raise instead.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # One tick per column, labelled with the column name, on both axes.
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)

In [None]:
# Here I can see the highly correlated features
# notice: bright color means high correlation, dark color means low correlation
plot_corr(data,size=30)

In [None]:
# or we can display the correlation matrix
# (restricted to numeric columns: BotNet_Label holds strings, which
# DataFrame.corr() rejects in recent pandas versions)
data.select_dtypes(include=['number', 'bool']).corr()