In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Load the netflow capture into a DataFrame. The small capture below is the
# default so the rest of the notebook runs quickly while testing.
dataFrame = pd.read_csv('capture20110815-2.csv')   #Very small capture (use this for speed of testing)
# Alternative input (disabled): swap the line above for this one to load 'combinedCapture.csv'.
#dataFrame = pd.read_csv('combinedCapture.csv')

In [3]:
# Preview the first five raw rows to check the columns that were loaded.
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,?>,147.32.84.118,1567,RA_,0.0,0.0,1,60,60,flow=Background
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,->,147.32.84.229,80,SRA_SA,0.0,0.0,6,388,208,flow=Background-TCP-Established
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,->,147.32.84.229,13363,SRA_SA,0.0,0.0,6,388,208,flow=Background-TCP-Established
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,->,147.32.84.229,443,SRA_SA,0.0,0.0,6,388,208,flow=Background-TCP-Established
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,->,147.32.84.229,443,SR_SA,0.0,0.0,5,308,122,flow=Background-TCP-Established


In [4]:
#Deletes rows where the column value is null, NaN, NaT, or an empty string
def deleteNullRow(dataFrame, column):
    """Return the rows of ``dataFrame`` whose ``column`` value is present.

    A value counts as missing when pandas considers it null (None/NaN/NaT)
    or when it equals the empty string ''.

    The input frame is never modified: the filter is a boolean mask rather
    than an in-place ``replace`` on a column selection, which would either
    write through to the caller's data or, with Copy-on-Write enabled, have
    no effect at all.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the column that must be populated.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows with a populated ``column``; the
        original index labels are kept.
    """
    values = dataFrame[column]

    # dropna() only recognises NaN/NaT/None, so empty strings are tested separately
    isMissing = values.isna() | (values == '')
    return dataFrame[~isMissing]

In [5]:
# From: https://github.com/mgarzon/cybersec/blob/master/MalwareDetection.ipynb
def preprocessData(dataFrame):
    '''
    Turn the raw capture into a clean data set.

    Drops the 'Dir', 'sTos' and 'dTos' columns, removes every row that has
    no value in 'Sport', 'SrcAddr', 'Dport' or 'DstAddr', and prints the row
    count and column names before and after so the effect is visible in the
    cell output. Returns the cleaned frame.
    '''

    def _printSummary(banner, frame):
        # Row count and column names of the frame, under the given banner
        print(banner)
        print("Number of rows: " + str(len(frame.index)))
        print("The columns are: " + str(list(frame)))

    _printSummary("----------Before pre-processing-----------", dataFrame)

    # Columns that are removed from the data set
    cleaned = dataFrame.drop(['Dir', 'sTos', 'dTos'], axis=1)

    # Address and port columns must be populated on every remaining row
    for requiredColumn in ('Sport', 'SrcAddr', 'Dport', 'DstAddr'):
        cleaned = deleteNullRow(cleaned, requiredColumn)

    # TODO
    #dp.convertColumnToTimeStamp(dataFrame,'StartTime') # ?? already a timestamp

    _printSummary("\n----------After pre-processing-----------", cleaned)
    return cleaned

In [6]:
# Clean the capture; preprocessData prints the row count and columns before and after.
dataFrame = preprocessData(dataFrame)

----------Before pre-processing-----------
Number of rows: 531
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']

----------After pre-processing-----------
Number of rows: 530
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'DstAddr', 'Dport', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']


In [7]:
# Preview the cleaned frame returned by preprocessData.
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1,60,60,flow=Background
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6,388,208,flow=Background-TCP-Established
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6,388,208,flow=Background-TCP-Established
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6,388,208,flow=Background-TCP-Established
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,122,flow=Background-TCP-Established


In [8]:
from sklearn import preprocessing 

#Helper to read one port value that may be decimal text, '0x'-prefixed hex text, or already numeric
def _parsePortNumber(value):
    """Convert a single Sport/Dport cell to an int.

    Strings starting with '0x' (case-insensitive) are read as hexadecimal,
    all other strings as decimal. Non-string values are passed to ``int()``
    so numeric columns work as well.
    """
    if isinstance(value, str):
        text = value.strip()
        # Slicing instead of indexing keeps one-character ports such as '0' safe
        if text[:2].lower() == '0x':
            return int(text, 16)
        return int(text, 10)
    return int(value)


#Function to perform discretization on the data
def discretizeData(dataFrame):
    """Return a copy of ``dataFrame`` with discretized/encoded feature columns.

    Added columns (in this order):

    * ``TotBytesDisc`` / ``SrcBytesDisc`` - quantile bin of 'TotBytes' / 'SrcBytes'
    * ``SportDisc`` / ``DportDisc``       - port range bin of 'Sport' / 'Dport'
    * ``ProtoDisc``                       - integer code of 'Proto' (LabelEncoder)
    * ``LabelDisc``                       - '0' for Background/Normal labels,
                                            '1' for Botnet labels (as strings)

    The input frame is not modified.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain the columns 'TotBytes', 'SrcBytes', 'Sport', 'Dport',
        'Proto' and 'Label'.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the six columns above appended.
    """
    # Work on a copy so the caller's frame (and any earlier cell output) stays valid
    dfNew = dataFrame.copy()

    # Binning technique from
    # https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
    quantile_list = [0, .25, .5, .75, 1.] # Change the quantile_list for more or less accuracy

    # duplicates='drop' merges identical quantile edges instead of raising when
    # many flows share the same byte count; with distinct edges the bins are unchanged
    dfNew['TotBytesDisc'] = pd.qcut(dfNew['TotBytes'], quantile_list, duplicates='drop')
    dfNew['SrcBytesDisc'] = pd.qcut(dfNew['SrcBytes'], quantile_list, duplicates='drop')

    # Bin Src/Dest port
    # According to 0-1023(WELLKNOWN_PORTNUMBER)
    #              1024-49151(REGISTERED_PORTNUMBER)
    #              49152-65535(DYNAMIC_PORTNUMBER)
    # NOTE(review): the first bin is the half-open interval (0, 1023], so a port
    # value of exactly 0 falls outside every bin and becomes NaN - confirm whether
    # that is intended before widening the bin (it would change the bin labels).
    portBins = [0, 1023, 49151, 65535]
    dfNew['SportDisc'] = pd.cut(dfNew['Sport'].apply(_parsePortNumber), portBins)
    dfNew['DportDisc'] = pd.cut(dfNew['Dport'].apply(_parsePortNumber), portBins)

    #LabelEncoder for unique values for Proto column and stored as column ProtoDisc
    dfNew['ProtoDisc'] = preprocessing.LabelEncoder().fit_transform(dfNew['Proto'])

    #Encoding "label" column to "labelDisc"
    #0 = Background/Normal             1=Botnet
    # The rules are applied in order; regex=True is explicit because newer pandas
    # treats the pattern as a literal string by default, which would leave every
    # label unchanged.
    labelRules = (
        (r'(^.*Background.*$)', '0'),
        (r'(^.*Normal.*$)', '0'),
        (r'(^.*Botnet.*$)', '1'),
    )
    labelCodes = dfNew['Label']
    for pattern, code in labelRules:
        labelCodes = labelCodes.str.replace(pattern, code, regex=True)
    dfNew['LabelDisc'] = labelCodes

    return dfNew
    


In [9]:
# Add the binned/encoded columns (TotBytesDisc, SrcBytesDisc, SportDisc, DportDisc, ProtoDisc, LabelDisc).
dataFrame = discretizeData(dataFrame)

In [10]:
# Preview the frame with the new *Disc columns appended.
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label,TotBytesDisc,SrcBytesDisc,SportDisc,DportDisc,ProtoDisc,LabelDisc
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1,60,60,flow=Background,"(59.999, 244.0]","(59.999, 122.0]","(1023, 49151]","(1023, 49151]",1,0
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6,388,208,flow=Background-TCP-Established,"(244.0, 1007.5]","(122.0, 548.0]","(49151, 65535]","(0, 1023]",1,0
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6,388,208,flow=Background-TCP-Established,"(244.0, 1007.5]","(122.0, 548.0]","(1023, 49151]","(1023, 49151]",1,0
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6,388,208,flow=Background-TCP-Established,"(244.0, 1007.5]","(122.0, 548.0]","(49151, 65535]","(0, 1023]",1,0
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,122,flow=Background-TCP-Established,"(244.0, 1007.5]","(59.999, 122.0]","(1023, 49151]","(0, 1023]",1,0


In [11]:
#helper function that summarises, for one rolling window, the flows that
#share the SrcAddr (and SrcAddr + DstAddr) of the window's last row
def countDistinctMatchingForSrcAddr(sliceDF):
    """Compute source-address features for the last flow of a window.

    The last row of ``sliceDF`` is the flow being described. Rows of the
    window with the same 'SrcAddr' (and, for the ``SrcDst_*`` columns, also
    the same 'DstAddr') are aggregated.

    Parameters
    ----------
    sliceDF : pandas.DataFrame
        Non-empty window with the columns 'SrcAddr', 'DstAddr', 'Sport',
        'Dport', 'TotPkts' and 'TotBytesDisc'.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns:

        * ``SrcAddr_App``          - number of rows with the same SrcAddr
        * ``Src_Dist_Dst_Port``    - distinct Dport values among those rows
        * ``Src_Dist_Dst_Addr``    - distinct DstAddr values among those rows
        * ``Src_Dist_Src_Port``    - distinct Sport values among those rows
        * ``SrcDst_Dist_Src_Port`` - distinct Sport values among rows matching
                                     both SrcAddr and DstAddr
        * ``SrcDst_Dist_Dst_Port`` - distinct Dport values among rows matching
                                     both SrcAddr and DstAddr
        * ``Src_AVG_Packets``      - mean TotPkts over the SrcAddr rows
        * ``Src_AVG_Bytes``        - first mode of TotBytesDisc over the SrcAddr rows
    """
    lastSrcAddr = sliceDF["SrcAddr"].iloc[-1]     #SrcAddr of the rolling window to calculate for
    lastDstAddr = sliceDF["DstAddr"].iloc[-1]

    # Src Address matches last one
    srcAddrRows = sliceDF[sliceDF["SrcAddr"] == lastSrcAddr]
    # Src + Dst address match last one
    srcAndDstRows = srcAddrRows[srcAddrRows["DstAddr"] == lastDstAddr]

    # len(set(...)) counts each distinct value once
    returnData = pd.DataFrame({
        "SrcAddr_App": [len(srcAddrRows)],
        "Src_Dist_Dst_Port": [len(set(srcAddrRows["Dport"]))],
        "Src_Dist_Dst_Addr": [len(set(srcAddrRows["DstAddr"]))],
        "Src_Dist_Src_Port": [len(set(srcAddrRows["Sport"]))],
        "SrcDst_Dist_Src_Port": [len(set(srcAndDstRows["Sport"]))],
        "SrcDst_Dist_Dst_Port": [len(set(srcAndDstRows["Dport"]))],
    })
    returnData["Src_AVG_Packets"] = srcAddrRows["TotPkts"].mean()
    # mode() returns a Series indexed from 0; assigning it to this one-row frame
    # keeps only the entry at index 0, i.e. the first mode.
    returnData["Src_AVG_Bytes"] = srcAddrRows["TotBytesDisc"].mode() # not quite mean but close enough

    return returnData

In [12]:
#helper function that summarises, for one rolling window, the flows that
#share the DstAddr of the window's last row
def countDistinctMatchingForDestAddr(sliceDF):
    """Compute destination-address features for the last flow of a window.

    The last row of ``sliceDF`` is the flow being described. Rows of the
    window with the same 'DstAddr' are aggregated.

    Parameters
    ----------
    sliceDF : pandas.DataFrame
        Non-empty window with the columns 'SrcAddr', 'DstAddr', 'Sport',
        'Dport', 'TotPkts' and 'TotBytesDisc'.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns:

        * ``DstAddr_App``        - number of rows with the same DstAddr
        * ``Dst_Dist_Dst_Port``  - distinct Dport values among those rows
        * ``Dst_SrcAddrMatches`` - distinct SrcAddr values among those rows
        * ``Dst_Dist_Src_Port``  - distinct Sport values among those rows
        * ``Dst_AVG_Packets``    - mean TotPkts over those rows
        * ``Dst_AVG_Bytes``      - first mode of TotBytesDisc over those rows
    """
    lastDstAddr = sliceDF["DstAddr"].iloc[-1]     #DstAddr of the rolling window to calculate for

    # Dst Address matches last one
    destAddrRows = sliceDF[sliceDF["DstAddr"] == lastDstAddr]

    # len(set(...)) counts each distinct value once
    returnData = pd.DataFrame({
        "DstAddr_App": [len(destAddrRows)],
        "Dst_Dist_Dst_Port": [len(set(destAddrRows["Dport"]))],
        "Dst_SrcAddrMatches": [len(set(destAddrRows["SrcAddr"]))],
        "Dst_Dist_Src_Port": [len(set(destAddrRows["Sport"]))],
    })
    returnData["Dst_AVG_Packets"] = destAddrRows["TotPkts"].mean()
    # mode() returns a Series indexed from 0; assigning it to this one-row frame
    # keeps only the entry at index 0, i.e. the first mode.
    returnData["Dst_AVG_Bytes"] = destAddrRows["TotBytesDisc"].mode() # not quite mean but close enough

    return returnData

In [17]:
#Function to generate connection based features for the source and destination address
def generateSrcAddrFeaturesConnectionBased(dataFrame, windowSize):
    """Append rolling-window connection features to every flow.

    For each row at position ``i >= windowSize - 1`` the window made of that
    row and the ``windowSize - 1`` rows before it is passed to
    ``countDistinctMatchingForSrcAddr`` and ``countDistinctMatchingForDestAddr``;
    the columns they return are attached to row ``i``. The first
    ``windowSize - 1`` rows have no full window and get NaN in every feature
    column.

    Source-address features (see countDistinctMatchingForSrcAddr):
      - how many times the SrcAddr appeared within the window
      - distinct destination ports / destination addresses / source ports
        among the flows with that SrcAddr
      - distinct source ports / destination ports among the flows with that
        SrcAddr AND DstAddr
      - average packets and most common byte bin of the flows with that SrcAddr

    Destination-address features (see countDistinctMatchingForDestAddr) are
    the analogous values for the DstAddr.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Flows in the order the windows should be built. Not modified.
        Index labels are assumed to be unique (gaps are fine).
    windowSize : int
        Number of consecutive flows per window, at least 1.

    Returns
    -------
    pandas.DataFrame
        ``dataFrame`` with the feature columns joined on, same rows and index.

    Raises
    ------
    ValueError
        If ``windowSize`` is smaller than 1 or larger than the number of rows,
        because then no complete window exists.
    """
    rowCount = len(dataFrame)
    if windowSize < 1:
        raise ValueError("windowSize must be at least 1, got " + str(windowSize))
    if rowCount < windowSize:
        raise ValueError(
            "windowSize (" + str(windowSize) + ") is larger than the number of rows ("
            + str(rowCount) + ")"
        )

    # Collect the one-row results and concatenate once after the loop;
    # concatenating inside the loop re-copies all earlier rows on every iteration.
    srcFeatureRows = []
    dstFeatureRows = []
    for lastPos in range(windowSize - 1, rowCount):
        # Positional slice: the window ends at (and includes) row lastPos
        window = dataFrame.iloc[lastPos - (windowSize - 1):lastPos + 1]
        srcFeatureRows.append(countDistinctMatchingForSrcAddr(window))
        dstFeatureRows.append(countDistinctMatchingForDestAddr(window))

    features = pd.concat(
        [
            pd.concat(srcFeatureRows, ignore_index=True),
            pd.concat(dstFeatureRows, ignore_index=True),
        ],
        axis=1,
    )

    # Label each feature row with the index label of the row that closes its
    # window. Using the frame's own labels (rather than assuming 0..n-1) keeps
    # the join correct after rows have been dropped and the index has gaps.
    features.index = dataFrame.index[windowSize - 1:]

    return dataFrame.join(features)
    

In [18]:
# Build the rolling-window connection features.
# Window size 10 is only for testing; the intended window for actual use is 10,000.
dataFrame1 = generateSrcAddrFeaturesConnectionBased(dataFrame,10)

In [19]:
# Inspect the result: rows before the first complete window (the first windowSize - 1 rows) show NaN features.
dataFrame1.head(50)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,...,SrcDst_Dist_Src_Port,SrcDst_Dist_Dst_Port,Src_AVG_Packets,Src_AVG_Bytes,DstAddr_App,Dst_Dist_Dst_Port,Dst_SrcAddrMatches,Dst_Dist_Src_Port,Dst_AVG_Packets,Dst_AVG_Bytes
0,2011/08/15 16:43:28.078942,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1,60,...,,,,,,,,,,
1,2011/08/15 16:43:32.283576,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6,388,...,,,,,,,,,,
2,2011/08/15 16:43:32.456441,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6,388,...,,,,,,,,,,
3,2011/08/15 16:43:32.850648,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6,388,...,,,,,,,,,,
4,2011/08/15 16:45:09.305002,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,...,,,,,,,,,,
5,2011/08/15 16:45:27.991372,12.542819,tcp,115.127.24.116,3196,147.32.84.229,13363,SR_SA,5,308,...,,,,,,,,,,
6,2011/08/15 16:45:30.295050,13.308726,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5,308,...,,,,,,,,,,
7,2011/08/15 16:45:57.756664,1.413248,tcp,77.52.60.161,3767,147.32.84.118,6881,S_RA,4,244,...,,,,,,,,,,
8,2011/08/15 16:47:50.502720,5.147244,tcp,77.52.60.161,3823,147.32.84.118,6881,S_RA,4,244,...,,,,,,,,,,
9,2011/08/15 16:47:53.430662,3.001157,tcp,147.32.84.59,52956,77.75.72.72,80,SR_A,3,186,...,1.0,1.0,3.0,"(59.999, 244.0]",1.0,1.0,1.0,1.0,3.0,"(59.999, 244.0]"


In [None]:
# Time benchmarking: profile one feature-generation run (window size 10) with IPython's %prun magic
%prun generateSrcAddrFeaturesConnectionBased(dataFrame,10)

In [None]:
# Line-by-line time benchmarking
#%load_ext line_profiler
%lprun -f generateSrcAddrFeaturesConnectionBased generateSrcAddrFeaturesConnectionBased(dataFrame,10)