## How to Label the ISCX Data


### 1) Load the Data

In [1]:
import pandas as pd
import numpy as np


In [2]:
%%time
#load the data
data = pd.read_csv('ISCX_ISCX_Botnet.csv')

CPU times: user 940 ms, sys: 190 ms, total: 1.13 s
Wall time: 1.16 s


In [3]:
data.head()

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,8.6.0.1,0,8.0.6.4,0,0,61804233,0.0,0.210342,5150353.0,4942438.0,...,0.0,910704.5,280640.074851,1109147.0,712262.0,9003695.5,4108267.0,15120887.0,5587976.0,ISCX
1,147.32.84.180,1040,147.32.84.171,139,6,14119,129470.925703,1416.530916,743.1053,1697.691,...,184.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
2,147.32.84.180,1041,147.32.84.19,139,6,40800,7205.882353,196.078431,5828.571,11137.09,...,1459.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
3,147.32.84.180,1042,147.32.84.19,139,6,44000,6681.818182,181.818182,6285.714,11020.03,...,1409.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX
4,147.32.84.180,1043,147.32.84.171,139,6,56403,62656.241689,567.345709,1819.452,3344.01,...,291.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ISCX


### Shuffle the data

In [4]:
data = data.sample(frac=1).reset_index(drop=True)

In [5]:
data.columns

Index(['Source IP', ' Source Port', ' Destination IP', ' Destination Port',
       ' Protocol', ' Flow Duration', ' Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min',
       'Active Mean', ' Active Std', ' Active Max', ' Active Min', 'Idle Mean',
       ' Idle Std', ' Idle Max', ' Idle Min', 'label'],
      dtype='object')

In [6]:
data.shape

(309206, 29)

In [7]:
#take a look at the data
#data.head()

In [8]:
#take the first 10000 rows to save time
## feel free to use the entire dataset in your own time
data = data.iloc[:10000,:]
data.shape

(10000, 29)

## Max and Min values in features
The resulting feature values after applying FlowMeter to transform the data from PCAP into csv could be as high as +infinity or as low as -infinity .. if that's the case .. it's a good idea to replace those values with reasonable values such as a very hight or very lowe number

In [9]:
## check the max and min values in the features
print(data.max())
#print(data.min())

Source IP            99.163.132.209
 Source Port                  65507
 Destination IP        99.92.68.235
 Destination Port             65535
 Protocol                        17
 Flow Duration            119999894
 Flow Bytes/s           4.46667e+07
 Flow Packets/s               2e+06
 Flow IAT Mean          1.18755e+08
 Flow IAT Std           1.87895e+11
 Flow IAT Max           3.59954e+09
 Flow IAT Min           1.18755e+08
Fwd IAT Mean            1.19496e+08
 Fwd IAT Std            1.87895e+11
 Fwd IAT Max            3.59954e+09
 Fwd IAT Min            1.19496e+08
Bwd IAT Mean            1.19548e+08
 Bwd IAT Std            1.59473e+11
 Bwd IAT Max            1.19548e+08
 Bwd IAT Min            1.19548e+08
Active Mean             1.00452e+08
 Active Std             5.91558e+07
 Active Max             1.00452e+08
 Active Min             1.00452e+08
Idle Mean                2.1598e+09
 Idle Std               2.03611e+09
 Idle Max               3.59954e+09
 Idle Min               7.20

In [10]:
## Here we replace the +inf or the -inf with a reasonable value
## you can try with the highest possible value in Python: sys.float_info.max
data.replace(np.inf, 1000000000000.0, inplace=True)
#data.replace(-np.inf, -1000000000000.0, inplace=True)

### 2) Load list of IP addresses and their corresponding Botnet Names

Data taken from: https://www.unb.ca/cic/datasets/botnet.html

#### Also write functions to apply labelling according to Source and Destination IP addresses

In [11]:
# load ip addresses and the labels
ip1 = pd.read_csv('bots1.csv')
ip2 = pd.read_csv('bots2.csv')

In [12]:
#this function goes through the data one row at a time, checks the source IP and checkes if it exists in ip1
#if so, then this row is given the Botnet label
def find_class1(row):
    sourceIP = str(row['Source IP'])
    #destIP = str(row[' Destination IP'])
    #print(sourceIP+' -- '+destIP)
    for index, ip_row in ip1.iterrows():
        #print('\tcompare to: ' + ip_row['SrcIP']+' -- '+ip_row['DestIP'])
        if sourceIP == ip_row['IP']:
            s = ip_row['Bot']
            return s
    
    return 'Other'

In [13]:
# this function goes through the data one row at a time, checks the source and dest IPs and checkes if they both
# exist in ip2, if so, then this row is given the Botnet label
def find_class2(row):
    sourceIP = str(row['Source IP'])
    destIP = str(row[' Destination IP'])
    #print(sourceIP+' -- '+destIP)
    for index, ip_row in ip2.iterrows():
        #print('\tcompare to: ' + ip_row['SrcIP']+' -- '+ip_row['DestIP'])
        if sourceIP == ip_row['SrcIP'] and destIP == ip_row['DestIP']:
            s = ip_row['Bot']
            return s
    
    return 'Other'        

In [14]:
labels1 = data.apply(find_class1, axis=1)

In [15]:
len(labels1[labels1 == 'Other']) 

4981

In [16]:
labels2 = data.apply(find_class2, axis=1)

In [17]:
len(labels2[labels2 == 'Other'])

9850

In [18]:
#len(ls1)

In [19]:
ls1 = list(labels1.values)

In [20]:
ls2 = list(labels2.values)

In [21]:
len(ls1)

10000

In [22]:
label = list()

In [23]:
# now if a label is "Other" in both lists, then it's Normal
# if it's "Other" in one list only, then we assign the label from the other list
for a, b in zip(ls1, ls2):
    if a == 'Other' and b == 'Other':
        label.append('Normal')
    else:
        if a == 'Other':
            label.append(b)
        else:
            label.append(a)

In [24]:
label

['Normal',
 'Virut',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Virut',
 'Weasel Bot',
 'Weasel Bot',
 'Murlo',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Weasel Bot',
 'Virut',
 'Weasel Bot',
 'Normal',
 'Weasel Bot',
 'Virut',
 'Virut',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Neris',
 'Weasel Bot',
 'Weasel Bot',
 'Normal',
 'Weasel Bot',
 'Weasel Bot',
 'Neris',
 'Neris',
 'Virut',
 'Normal',
 'Normal',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Virut',
 'Virut',
 'IRC',
 'Normal',
 'Normal',
 'Virut',
 'Normal',
 'Virut',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Virut',
 'Normal',
 'Normal',
 'Virut',
 'Murlo',
 'Normal',
 'Virut',
 'Virut',
 'Weasel Bot',
 'Weasel Bot',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Neris',
 'IRC',
 'Neris',
 'Weasel Bot',
 'Normal',
 'IRC',
 'Virut',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Murlo',
 'Normal',
 'Weasel Bot',
 'Weasel Bot',
 'Normal',
 'Normal',
 'Weasel Bot',
 'Normal',
 'Neris',

#### Here we add 'BotNet_Label' column to the data after we filled it as above

In [25]:
data['BotNet_Label'] = label

In [26]:
#remove spaces from column names
data = data.rename(columns=lambda x: x.strip())

In [27]:
data.columns

Index(['Source IP', 'Source Port', 'Destination IP', 'Destination Port',
       'Protocol', 'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
       'Idle Std', 'Idle Max', 'Idle Min', 'label', 'BotNet_Label'],
      dtype='object')

In [28]:
# Explore BotNet_Label values
data['BotNet_Label'].value_counts()

Normal                    4831
Weasel Bot                2220
Virut                     1370
Neris                      744
Murlo                      401
IRC                        150
Menti                      148
Zero access                 66
TBot                        27
Black hole 2                16
Zeus                        14
Black hole 3                 5
Sogou                        3
IRCbot and black hole1       2
Weasel Botmaster             2
Smoke bot                    1
Name: BotNet_Label, dtype: int64

In [29]:
#remove unimportant columns
data.drop(['Source IP','Destination IP','label'],inplace=True,axis=1)

### This is how to apply one-hot encoding using Pandas

In [30]:
df_src_port = pd.get_dummies(data['Source Port'],prefix='SrcPort')
df_dest_port = pd.get_dummies(data['Destination Port'],prefix='DestPort')
df_protocol = pd.get_dummies(data['Protocol'],prefix='Protocol')

In [31]:
#df_src_port.head()

In [32]:
data = pd.concat([data, df_src_port,df_dest_port,df_protocol], axis=1)
data.shape

(10000, 7641)

In [33]:
data.drop(['Source Port','Destination Port','Protocol'],inplace=True,axis=1)
data.shape

(10000, 7638)

### Save the Data .. it is ready for further analysis and machine learning

In [34]:
%%time
data.to_csv('ISCX_Botnet_Labelled.csv',index=False)

CPU times: user 12.5 s, sys: 151 ms, total: 12.6 s
Wall time: 12.7 s
