In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
dataFrame = pd.read_csv('capture20110815-2.csv')   #Very small capture (use this for speed of testing)
#dataFrame = pd.read_csv('combinedCapture.csv')

In [3]:
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,?>,147.32.84.118,1567,RA_,0.0,0.0,1,60,60,flow=Background
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,->,147.32.84.229,80,SRA_SA,0.0,0.0,6,388,208,flow=Background-TCP-Established
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,->,147.32.84.229,13363,SRA_SA,0.0,0.0,6,388,208,flow=Background-TCP-Established
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,->,147.32.84.229,443,SRA_SA,0.0,0.0,6,388,208,flow=Background-TCP-Established
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,->,147.32.84.229,443,SR_SA,0.0,0.0,5,308,122,flow=Background-TCP-Established


In [4]:
#Deletes row's where the column values are null,nan,nat, or blank
def deleteNullRow(dataFrame, column):
    newDataFrame = dataFrame
    
    #dataframe dropna won't replace empty values only NaN and NaT so convert blank space to NaN then drop
    newDataFrame[column].replace('', np.nan, inplace=True)
    newDataFrame = newDataFrame.dropna(subset=[column])
    return newDataFrame

In [5]:
# From: https://github.com/mgarzon/cybersec/blob/master/MalwareDetection.ipynb
def preprocessData(dataFrame):

    '''
    This function is used to perform
    the necessary operations to 
    convert the raw data into a
    clean data set.
    '''
    
    #Outputting number of rows and column names before preprocessing
    print("----------Before pre-processing-----------")
    print("Number of rows: " + str(len(dataFrame.index)))
    print("The columns are: " + str(list(dataFrame)))
    
    
    
    #dropping columns specified
    listOfFeaturesToDrop = [
    'Dir',
    'sTos',
    'dTos']
    dataFrame = dataFrame.drop(listOfFeaturesToDrop, axis=1)

    #Dropping all null value rows from specified columns
    dataFrame = deleteNullRow(dataFrame,'Sport')
    dataFrame = deleteNullRow(dataFrame,'SrcAddr')
    dataFrame = deleteNullRow(dataFrame,'Dport')
    dataFrame = deleteNullRow(dataFrame,'DstAddr')
    
    
    # TODO
    #dp.convertColumnToTimeStamp(dataFrame,'StartTime') # ?? already a timestamp
    
    
    #Outputting number of rows and column names after preprocessing
    print("\n----------After pre-processing-----------")
    print("Number of rows: " + str(len(dataFrame.index)))
    print("The columns are: " + str(list(dataFrame)))
    return dataFrame

In [6]:
dataFrame = preprocessData(dataFrame)

----------Before pre-processing-----------
Number of rows: 531
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']

----------After pre-processing-----------
Number of rows: 530
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'DstAddr', 'Dport', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']


In [7]:
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1,60,60,flow=Background
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6,388,208,flow=Background-TCP-Established
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6,388,208,flow=Background-TCP-Established
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6,388,208,flow=Background-TCP-Established
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,122,flow=Background-TCP-Established


In [8]:
from sklearn import preprocessing 

#Function to perform discretization on the data
def discretizeData(dataFrame):
    
    dfNew = dataFrame
    
    # Binning technique from
    # https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
    quantile_list = [0, .25, .5, .75, 1.] # Change the quantile_list for more or less accuracy
    
    dfNew['TotBytesDisc'] = ""
    dfNew['SrcBytesDisc'] = ""
    dfNew['TotBytesDisc'] = pd.qcut(dataFrame['TotBytes'], quantile_list)
    dfNew['SrcBytesDisc'] = pd.qcut(dataFrame['SrcBytes'], quantile_list)
    
    # Bin Src/Dest port
    # According to 0-1023(WELLKNOWN_PORTNUMBER)
    #              1024-49151(REGISTERED_PORTNUMBER)
    #              49152-65535(DYNAMIC_PORTNUMBER)
    Sport = dataFrame['Sport']#[0x0303].astype('int64')
    Sport = Sport.apply(lambda x: int(x, 16) if x[0] == '0' and x[1] == 'x' else int(x, 10)) # TODO, there has to be better way
    dfNew['SportDisc'] = ""
    dfNew['SportDisc'] = pd.cut(Sport, [0, 1023, 49151, 65535])
    
    Dport = dataFrame['Dport']#[0x0303].astype('int64')
    Dport = Dport.apply(lambda x: int(x, 16) if x[0] == '0' and x[1] == 'x' else int(x, 10))
    dfNew['DportDisc'] = ""
    dfNew['DportDisc'] = pd.cut(Dport, [0, 1023, 49151, 65535])

    
    #LabelEncoder for unique values for Proto column and stored as column ProtoDisc
    le = preprocessing.LabelEncoder()
    le.fit(dfNew.Proto.unique())
    dfNew["ProtoDisc"] = ""
    dfNew.ProtoDisc = le.transform(dfNew.Proto)
    
    
    #Encoding "label" column to "labelDisc"
    #0 = Background/Normal             1=Botnet
    dfNew["LabelDisc"] = ""
    dfNew['LabelDisc'] = dfNew['Label']
    dfNew['LabelDisc'] = dfNew.LabelDisc.str.replace(r'(^.*Background.*$)', '0')
    dfNew['LabelDisc'] = dfNew.LabelDisc.str.replace(r'(^.*Normal.*$)', '0')
    dfNew['LabelDisc'] = dfNew.LabelDisc.str.replace(r'(^.*Botnet.*$)', '1')
    
    
    return dfNew
    


In [9]:
dataFrame = discretizeData(dataFrame)

In [10]:
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label,TotBytesDisc,SrcBytesDisc,SportDisc,DportDisc,ProtoDisc,LabelDisc
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1,60,60,flow=Background,"(59.999, 244.0]","(59.999, 122.0]","(1023, 49151]","(1023, 49151]",1,0
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6,388,208,flow=Background-TCP-Established,"(244.0, 1007.5]","(122.0, 548.0]","(49151, 65535]","(0, 1023]",1,0
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6,388,208,flow=Background-TCP-Established,"(244.0, 1007.5]","(122.0, 548.0]","(1023, 49151]","(1023, 49151]",1,0
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6,388,208,flow=Background-TCP-Established,"(244.0, 1007.5]","(122.0, 548.0]","(49151, 65535]","(0, 1023]",1,0
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,122,flow=Background-TCP-Established,"(244.0, 1007.5]","(59.999, 122.0]","(1023, 49151]","(0, 1023]",1,0


In [11]:
#helper function to count the distinct values of second column
#where SRCaddr's match in rolling window of size windowSize
def countDistinctMatchingForSrcAddr(sliceDF):
    SrcAddr = sliceDF["SrcAddr"].iloc[-1]     #SrcAddr of the rolling window to calculate for
    DstAddr = sliceDF["DstAddr"].iloc[-1]
    
    returnData = pd.DataFrame()
    
    srcAddrRows = sliceDF[sliceDF.SrcAddr == SrcAddr]
    destAddrRows = sliceDF[sliceDF.DstAddr == DstAddr]
    srcAndDestRows = srcAddrRows[srcAddrRows.DstAddr == DstAddr]
    
    # SrcAddr statistics
    returnData["SrcAddr_App"] = [srcAddrRows.shape[0]]   #counting total SRCAddr matches
    returnData["Src_Dport_unique"] =  srcAddrRows.Dport.nunique() #only counting distinct dports by using set
    returnData["Src_DstAddr_unique"] =  srcAddrRows.DstAddr.nunique()
    returnData["Src_Sport_unique"] =  srcAddrRows.Sport.nunique()
    returnData["Src_TotPkts_mean"] = srcAddrRows.TotPkts.mean()
    returnData["Src_TotBytesDisc_mode"] = srcAddrRows.TotBytesDisc.mode() # not quite mean but close enough
    
    # DstAddr statistics
    returnData["DstAddr_App"] = [destAddrRows.shape[0]]   #counting total SRCAddr matches
    returnData["Dst_Dport_unique"] =  destAddrRows.Dport.nunique()
    returnData["Dst_SrcAddr_unique"] =  destAddrRows.SrcAddr.nunique()
    returnData["Dst_Sport_unique"] =  destAddrRows.Sport.nunique()
    returnData["Dst_TotPkts_mean"] = destAddrRows.TotPkts.mean()
    returnData["Dst_TotBytesDisc_mode"] = destAddrRows.TotBytesDisc.mode() # not quite mean but close enough
    
    # Src+Dstaddr statistics
    returnData["SrcDst_Sport_unique"] =  srcAndDestRows.Sport.nunique()
    returnData["SrcDst_Dport_unique"] =  srcAndDestRows.Dport.nunique()
    
    return returnData

In [50]:
#Function to generate connection based features for the source address
def generateSrcAddrFeaturesConnectionBased(dataFrame, windowSize):
    
    dfNew = dataFrame
    
    #How many times the SRCADDRESS has appeared within the last X netflows (SrcAddr_Dis)
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, count the distinct destination ports (Src_Dist_Des_Port) 
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, count the distinct destination addresses (Src_Dist_Des_Addr)
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, count the distinct source ports (Src_Dist_Src_Port)
    #For any of the flow records that SRCADDRESS AND DSTADDRESS has appeared within the last X netflows, count the distinct source ports   
    #For any of the flow records that SRCADDRESS AND DSTADDRESS has appeared within the last X netflows, count the distinct destinations ports
    
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, average the packets
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, average the bytes
    
    additionalCol = []
    for i in range(windowSize - 1, len(dfNew.index)):
        #Feature generation feedback every 10000 generated rows
        if (i%10000 == 0):
            print(i)
            
        window = dfNew[i - (windowSize-1):i+1]
        
        slice_df = countDistinctMatchingForSrcAddr(window)
        additionalCol.append(slice_df)
    
    # Set the right index
    newCol = pd.concat(additionalCol, axis=0)
    del additionalCol
    newCol.index = np.arange(windowSize - 1, windowSize + len(newCol) - 1)
    dfNew = dfNew.join(newCol)
    
    #Dropping beginning rows of size: windowsize since all the generated features are null
    dfNew = deleteNullRow(dfNew, 'SrcAddr_App')
    
    return dfNew

In [51]:
#Window size 10 for testing, actual use 10,000
dataFrame1 = generateSrcAddrFeaturesConnectionBased(dataFrame,20)

In [52]:
dataFrame1.head(10)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,...,Src_TotPkts_mean,Src_TotBytesDisc_mode,DstAddr_App,Dst_Dport_unique,Dst_SrcAddr_unique,Dst_Sport_unique,Dst_TotPkts_mean,Dst_TotBytesDisc_mode,SrcDst_Sport_unique,SrcDst_Dport_unique
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1,60,...,,,,,,,,,,
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6,388,...,,,,,,,,,,
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6,388,...,,,,,,,,,,
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6,388,...,,,,,,,,,,
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,...,,,,,,,,,,
5,2011/08/15 16:45:27.991372,12.542819,tcp,115.127.24.116,3196,147.32.84.229,13363,SR_SA,5,308,...,,,,,,,,,,
6,2011/08/15 16:45:30.295050,13.308726,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,...,,,,,,,,,,
7,2011/08/15 16:45:57.756664,1.413248,tcp,77.52.60.161,3767,147.32.84.118,6881,S_RA,4,244,...,,,,,,,,,,
8,2011/08/15 16:47:50.502720,5.147244,tcp,77.52.60.161,3823,147.32.84.118,6881,S_RA,4,244,...,,,,,,,,,,
9,2011/08/15 16:47:53.430662,3.001157,tcp,147.32.84.59,52956,77.75.72.72,80,SR_A,3,186,...,,,,,,,,,,


In [43]:
# Time benchmarking
%prun generateSrcAddrFeaturesConnectionBased(dataFrame, 50)

 

In [44]:
# Line-by-line time benchmarking
#%load_ext line_profiler
%lprun -f generateSrcAddrFeaturesConnectionBased generateSrcAddrFeaturesConnectionBased(dataFrame, 50)

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [56]:
%lprun -f countDistinctMatchingForSrcAddr countDistinctMatchingForSrcAddr(dataFrame[0:100])