# Process Labelled Botnet Traffic Data
Here you can find Python code to process Labelled Botnet Traffic Data

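The code shows you how to load the labelled dataset, impute missing values, one-hot encode the port and protocol columns, remove highly correlated features, detect and remove outliers, and split the result into one sub-dataset per label.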

In [None]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # display floats with 3 fixed decimals instead of scientific notation
import numpy as np
from time import time
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Fraction of samples the outlier detector is told to flag as outliers
# (0.5 means half of the samples are removed).
outliers_fraction = 0.5

# Seeded random number generator, so that runs are reproducible.
rng = np.random.RandomState(seed=442)

## Step 2.1: Load the labelled Dataset
Notice that we have already labelled the dataset according to the guidelines provided on the ISCX web-page

In [None]:
# Read the labelled ISCX botnet training CSV into `data` and report
# how long the load took, followed by the (rows, columns) shape.
load_start = time()
data = pd.read_csv('ISCX_ISCX_Botnet-Training-LABELLED.pcap.csv')
load_seconds = time() - load_start
print('data loaded in %f'%load_seconds,flush=True)
print(data.shape)

## Step 2.2: Randomly Choose a Subset from the Big Dataset
* This is only for demonstration purposes
* The original dataset is **too big** and it takes a long time to finish the following steps

In [None]:
# Keep a fixed-size random subset so the remaining steps run quickly.
# A real application must use the full dataset instead.
subset_size = 5000
data = data.sample(n=subset_size, random_state=4646465)
print(data.shape)

In [None]:
# Preview the first rows: the data still has the raw source port and
# destination port columns (they have not been one-hot encoded yet).
data.head()

## Step 2.3: Missing Value Imputation
* The CSV file resulting from Flow Generator can have missing values
* Here we replace them with the **average** (i.e. the mean)
* We go through the columns one by one and do the imputation using the mean of each column
* We can use the **median** instead of the mean
* We can explore other missing value imputation techniques

In [None]:
# Missing value imputation: replace every missing value in a column with
# the mean of that column, then save the result for later use.
t0 = time()

# '?' marks a missing value in the CSV; turn it into a real NaN first.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
data.replace('?', np.nan, inplace=True)
print('symbol ? replaced with NaN',flush=True)

# NOTE(review): every column except 'BotNet_Label' is mean-imputed here,
# including 'Source Port', 'Destination Port' and 'Protocol', which are
# one-hot encoded as categorical values later - confirm this is intended.
for c in data.columns:
    if c != 'BotNet_Label':
        # Anything that cannot be parsed as a number becomes NaN ...
        data[c] = pd.to_numeric(data[c], errors='coerce')
        # ... and every NaN is filled with the column mean
        # (the median can be used instead of the mean).
        data[c] = data[c].fillna(data[c].mean())

# Save the data so we can use it later.
data.to_csv('no_nans.csv',index=False)
t1 = time()
print('NaN values replaced with mean in %f'%(t1-t0),flush=True)

## Step 2.4: Perform one hot encoding to convert source port, dest port and protocol columns to binary numeric columns
* Although these columns contain numbers, they are in reality categorical 
* If we leave them as numbers, the machine learning algorithms will use them as numeric columns and results will not be reliable
* This is why we perform one-hot encoding to transform each of them into a set of binary (0/1) indicator columns

In [None]:
# One-hot encode the source port, destination port and protocol columns.
# These columns hold numbers but are categorical; left as they are, the
# algorithms would treat them as numerical quantities.
t0 = time()

categorical_cols = ['Source Port', 'Destination Port', 'Protocol']

# Build the dummy (indicator) columns for each categorical column.
# The prefix keeps the three groups apart: without it, the same value in
# two columns (for example source port 80 and destination port 80) would
# yield two columns with the same name, which cannot be told apart.
# dtype is pinned so the indicators are numeric 0/1 on every pandas version.
dummy_frames = [
    pd.get_dummies(data[col], prefix=col, dtype=np.uint8)
    for col in categorical_cols
]

# Put the indicator columns in front of the remaining data, and drop the
# original categorical columns now that they are encoded.
data = pd.concat(dummy_frames + [data.drop(columns=categorical_cols)], axis=1)

# Make sure every column label is a string.
data.columns = data.columns.astype(str)

t1 = time()
print('One hot encoding finished in %f'%(t1-t0),flush=True)
print(data.shape)

In [None]:
# List the column labels of the data after one-hot encoding.
data.columns

In [None]:
# Preview the first rows: the one-hot encoding has been applied, so each
# source port and destination port value now has a separate column.
data.head()

## Step 2.5: Remove Highly Correlated features
* The paper on the ISCX Flow Generator explains the features that this tool extracts from PCAP files
* Some of these features are highly correlated and it is better to remove them because they can affect the performance of machine learning algorithms

In [None]:
# Separate the label from the features, clean up the column names and
# drop the highly correlated features, ready for outlier detection.
prep_start = time()

# Take the label column out of the feature matrix and keep it in Y.
Y = data.pop('BotNet_Label')

# Strip single quotes from the column names, if there are any.
data.columns = [name.replace('\'', '') for name in data.columns]

# Features that are highly correlated with others and are therefore removed.
correlated_features = [
    'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Max', 'Bwd IAT Min',
    'Active Max', 'Active Min',
    'Idle Mean', 'Idle Max', 'Idle Min',
]
data.drop(columns=correlated_features, inplace=True)

prep_seconds = time() - prep_start
print('X (data) and Y matrices prepared in %f'%prep_seconds,flush=True)

## Step 2.6: Outlier Detection and Removal
* Here we detect and remove outliers because they usually influence models and affect results 

In [None]:
# Detect outliers with Local Outlier Factor, remove them, and save the
# remaining rows (features plus label) to 'no_outliers.csv'.

# n_jobs=-1 uses all available processors instead of a hard-coded worker
# count that only suits one particular machine.
clf = LocalOutlierFactor(n_neighbors=50, contamination=outliers_fraction, n_jobs=-1)

# fit_predict labels each row: -1 for an outlier, 1 for an inlier.
y_pred = clf.fit_predict(data)

# Re-attach the label column that was set aside before fitting.
data['BotNet_Label'] = Y

inlier_mask = y_pred != -1

print('To apply mask and removed outliters',flush=True)
# Take an explicit copy of the kept rows, so that later modifications act
# on an independent frame and do not raise SettingWithCopyWarning.
data = data.loc[inlier_mask].copy()
print('data to be saved',flush=True)
data.to_csv('no_outliers.csv',index=False)

## Step 2.7: Split the Big Dataset into smaller sub-datasets According to Label
* We need to have a separate dataset for each Botnet so we can perform transfer learning and other ML algorithms
* Later we can add **Normal** data to each Botnet dataset
* We must make sure Normal data we add to each Botnet dataset is **non-overlapping**

In [None]:
# Save one sub-dataset (CSV file) per label found in 'BotNet_Label'.
import os

# to_csv does not create missing directories, so make sure the output
# directory exists before writing into it.
os.makedirs('Subdatasets', exist_ok=True)

labels = list(data['BotNet_Label'].unique())
for label in labels:
    tlbl = label.replace(" ", "_")  # spaces in a label become _ in the file name
    tdata = data[data['BotNet_Label']==label]
    tdata.to_csv('Subdatasets/ISCX_Testing_'+tlbl+'.csv',index=False)
    print('Done: ',label,len(tdata))

print('all done', flush=True)