In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [173]:
dataFrame = pd.read_csv('capture20110815-2.csv')   #Very small capture (use this for speed of testing)
#dataFrame = pd.read_csv('combinedCapture.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [174]:
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,43:28.1,0.0,tcp,114.33.245.44,6881,?>,147.32.84.118,1567,RA_,0.0,0.0,1.0,60.0,60.0,flow=Background
1,43:32.3,13.431962,tcp,212.93.105.52,49237,->,147.32.84.229,80,SRA_SA,0.0,0.0,6.0,388.0,208.0,flow=Background-TCP-Established
2,43:32.5,13.350228,tcp,212.93.105.52,14906,->,147.32.84.229,13363,SRA_SA,0.0,0.0,6.0,388.0,208.0,flow=Background-TCP-Established
3,43:32.9,13.01009,tcp,212.93.105.52,60349,->,147.32.84.229,443,SRA_SA,0.0,0.0,6.0,388.0,208.0,flow=Background-TCP-Established
4,45:09.3,20.990047,tcp,115.127.24.116,3198,->,147.32.84.229,443,SR_SA,0.0,0.0,5.0,308.0,122.0,flow=Background-TCP-Established


In [175]:
#Deletes row's where the column values are null,nan,nat, or blank
def deleteNullRow(dataFrame, column):
    newDataFrame = dataFrame
    
    #dataframe dropna won't replace empty values only NaN and NaT so convert blank space to NaN then drop
    newDataFrame[column].replace('', np.nan, inplace=True)
    newDataFrame = newDataFrame.dropna(subset=[column])
    return newDataFrame

In [176]:
# From: https://github.com/mgarzon/cybersec/blob/master/MalwareDetection.ipynb
def preprocessData(dataFrame):

    '''
    This function is used to perform
    the necessary operations to 
    convert the raw data into a
    clean data set.
    '''
    
    #Outputting number of rows and column names before preprocessing
    print("----------Before pre-processing-----------")
    print("Number of rows: " + str(len(dataFrame.index)))
    print("The columns are: " + str(list(dataFrame)))
    
    
    
    #dropping columns specified
    listOfFeaturesToDrop = [
    'Dir',
    'sTos',
    'dTos']
    dataFrame = dataFrame.drop(listOfFeaturesToDrop, axis=1)

    #Dropping all null value rows from specified columns
    dataFrame = deleteNullRow(dataFrame,'Sport')
    dataFrame = deleteNullRow(dataFrame,'SrcAddr')
    dataFrame = deleteNullRow(dataFrame,'Dport')
    dataFrame = deleteNullRow(dataFrame,'DstAddr')
    
    
    # TODO
    #dp.convertColumnToTimeStamp(dataFrame,'StartTime') # ?? already a timestamp
    
    
    #Outputting number of rows and column names after preprocessing
    print("\n----------After pre-processing-----------")
    print("Number of rows: " + str(len(dataFrame.index)))
    print("The columns are: " + str(list(dataFrame)))
    return dataFrame

In [177]:
dataFrame = preprocessData(dataFrame)

----------Before pre-processing-----------
Number of rows: 129832
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']

----------After pre-processing-----------
Number of rows: 498
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'DstAddr', 'Dport', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']


In [178]:
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label
0,43:28.1,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1.0,60.0,60.0,flow=Background
1,43:32.3,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6.0,388.0,208.0,flow=Background-TCP-Established
2,43:32.5,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6.0,388.0,208.0,flow=Background-TCP-Established
3,43:32.9,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6.0,388.0,208.0,flow=Background-TCP-Established
4,45:09.3,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5.0,308.0,122.0,flow=Background-TCP-Established


In [179]:
from sklearn import preprocessing 

#Function to perform discretization on the data
def discretizeData(dataFrame):
    
    dfNew = dataFrame
    
    # Binning technique from
    # https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
    quantile_list = [0, .25, .5, .75, 1.] # Change the quantile_list for more or less accuracy
    
    dfNew['TotBytesDisc'] = ""
    dfNew['SrcBytesDisc'] = ""
    dfNew['TotBytesDisc'] = pd.qcut(dataFrame['TotBytes'], quantile_list)
    dfNew['SrcBytesDisc'] = pd.qcut(dataFrame['SrcBytes'], quantile_list)
    
    # Bin Src/Dest port
    # According to 0-1023(WELLKNOWN_PORTNUMBER)
    #              1024-49151(REGISTERED_PORTNUMBER)
    #              49152-65535(DYNAMIC_PORTNUMBER)
    Sport = dataFrame['Sport']#[0x0303].astype('int64')
    Sport = Sport.apply(lambda x: int(x, 16) if x[0] == '0' and x[1] == 'x' else int(x, 10)) # TODO, there has to be better way
    dfNew['SportDisc'] = ""
    dfNew['SportDisc'] = pd.cut(Sport, [0, 1023, 49151, 65535])
    
    Dport = dataFrame['Dport']#[0x0303].astype('int64')
    Dport = Dport.apply(lambda x: int(x, 16) if x[0] == '0' and x[1] == 'x' else int(x, 10))
    dfNew['DportDisc'] = ""
    dfNew['DportDisc'] = pd.cut(Dport, [0, 1023, 49151, 65535])

    
    #LabelEncoder for unique values for Proto column and stored as column ProtoDisc
    le = preprocessing.LabelEncoder()
    le.fit(dfNew.Proto.unique())
    dfNew["ProtoDisc"] = ""
    dfNew.ProtoDisc = le.transform(dfNew.Proto)
    
    
    #Encoding "label" column to "labelDisc"
    #0 = Background/Normal             1=Botnet
    dfNew["LabelDisc"] = ""
    dfNew['LabelDisc'] = dfNew['Label']
    dfNew['LabelDisc'] = dfNew.LabelDisc.str.replace(r'(^.*Background.*$)', '0')
    dfNew['LabelDisc'] = dfNew.LabelDisc.str.replace(r'(^.*Normal.*$)', '0')
    dfNew['LabelDisc'] = dfNew.LabelDisc.str.replace(r'(^.*Botnet.*$)', '1')
    
    
    return dfNew
    


In [180]:
dataFrame = discretizeData(dataFrame)

In [181]:
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label,TotBytesDisc,SrcBytesDisc,SportDisc,DportDisc,ProtoDisc,LabelDisc
0,43:28.1,0.0,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1.0,60.0,60.0,flow=Background,"(59.999, 244.0]","(59.999, 122.0]","(1023, 49151]","(1023, 49151]",1,0
1,43:32.3,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6.0,388.0,208.0,flow=Background-TCP-Established,"(244.0, 1138.0]","(122.0, 574.5]","(49151, 65535]","(0, 1023]",1,0
2,43:32.5,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6.0,388.0,208.0,flow=Background-TCP-Established,"(244.0, 1138.0]","(122.0, 574.5]","(1023, 49151]","(1023, 49151]",1,0
3,43:32.9,13.01009,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6.0,388.0,208.0,flow=Background-TCP-Established,"(244.0, 1138.0]","(122.0, 574.5]","(49151, 65535]","(0, 1023]",1,0
4,45:09.3,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5.0,308.0,122.0,flow=Background-TCP-Established,"(244.0, 1138.0]","(59.999, 122.0]","(1023, 49151]","(0, 1023]",1,0


In [186]:
#helper function to count the distinct values of second column
#where SRCaddr's match in rolling window of size windowSize
def countDistinctMatchingForSrcAddr(index, sliceDF):
    SrcAddr = sliceDF["SrcAddr"].iloc[-1]     #SrcAddr of the rolling window to calculate for
    DstAddr = sliceDF["DstAddr"].iloc[-1]
    
    #[SrcAddr_App, Src_Dist_Des_Port, Src_Dist_Des_Addr,Src_Dist_Src_Port, SrcDst_Dist_Src_Port, SrcDst_Dist_Dst_Port]
    returnArray = [None] * 6    #array to return all the values for rolling window calcs

    SrcMatches = 0     #SrcAddr_Dis
    DestPortMatches = [] #To hold all destinationPorts where SrcAddrMatches
    DestAddrMatches = [] #To hold all destinationPorts where SrcAddrMatches
    SrcPortMatches = [] #To hold all destinationPorts where SrcAddrMatches
    
    SrcAndDstSrcPorts = []
    SrcAndDstDstPorts = []

    #interate over rows to check for matching SrcAddr and find values in other cols
    for row in sliceDF.itertuples(index=True, name='Pandas'):
        #SrcAddress Match
        if (getattr(row, "SrcAddr") == SrcAddr):
            DestPortMatches.append(getattr(row, "Dport"))
            DestAddrMatches.append(getattr(row, "DstAddr"))
            SrcPortMatches.append(getattr(row, "Sport"))
            SrcMatches += 1
            
        #SrcAddr match and DstAddr match
        if ((getattr(row, "SrcAddr") == SrcAddr) and (getattr(row, "DstAddr") == DstAddr)):
            SrcAndDstSrcPorts.append(getattr(row, "Sport"))
            SrcAndDstDstPorts.append(getattr(row, "Dport"))

    returnArray[0] = SrcMatches   #counting total SRCAddr matches
    returnArray[1] =  len(set(DestPortMatches)) #only counting distinct dports by using set
    returnArray[2] =  len(set(DestAddrMatches))
    returnArray[3] =  len(set(SrcPortMatches))
    returnArray[4] =  len(set(SrcAndDstSrcPorts))
    returnArray[5] =  len(set(SrcAndDstDstPorts))

    return returnArray

In [187]:
#Function to generate connection based features for the source address
def generateSrcAddrFeaturesConnectionBased(dataFrame, windowSize):
    
    dfNew = dataFrame
    
    
    dfNew["SrcAddr_App"] = np.nan
    dfNew["Src_Dist_Dst_Port"] = np.nan
    dfNew["Src_Dist_Dst_Addr"] = np.nan
    dfNew["Src_Dist_Src_Port"] = np.nan
    
    dfNew["SrcDst_Dist_Src_Port"] = np.nan
    dfNew["SrcDst_Dist_Dst_Port"] = np.nan
    
    #How many times the SRCADDRESS has appeared within the last X netflows (SrcAddr_Dis)
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, count the distinct destination ports (Src_Dist_Des_Port) 
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, count the distinct destination addresses (Src_Dist_Des_Addr)
    #For any of the flow records that SRCADDRESS has appeared within the last X netflows, count the distinct source ports (Src_Dist_Src_Port)
    #For any of the flow records that SRCADDRESS AND DSTADDRESS has appeared within the last X netflows, count the distinct source ports   
    #For any of the flow records that SRCADDRESS AND DSTADDRESS has appeared within the last X netflows, count the distinct destinations ports
    for i in range(windowSize-1, len(dfNew)+1):
        if (i>= windowSize-1):
            slice_df_Array = countDistinctMatchingForSrcAddr(i, dfNew[i - (windowSize-1):i+1])
            dfNew.loc[i,"SrcAddr_App"] = slice_df_Array[0]
            dfNew.loc[i,"Src_Dist_Dst_Port"] = slice_df_Array[1]
            dfNew.loc[i,"Src_Dist_Dst_Addr"] = slice_df_Array[2]
            dfNew.loc[i,"Src_Dist_Src_Port"] = slice_df_Array[3]
            dfNew.loc[i,"SrcDst_Dist_Src_Port"] = slice_df_Array[4]
            dfNew.loc[i,"SrcDst_Dist_Dst_Port"] = slice_df_Array[5]
    
    
    return dfNew
    

In [188]:
#Window size 10 for testing, actual use 10,000
dataFrame = generateSrcAddrFeaturesConnectionBased(dataFrame,10)

In [189]:
dataFrame.head(500)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,...,SportDisc,DportDisc,ProtoDisc,LabelDisc,SrcAddr_App,Src_Dist_Dst_Port,Src_Dist_Dst_Addr,Src_Dist_Src_Port,SrcDst_Dist_Src_Port,SrcDst_Dist_Dst_Port
0,43:28.1,0.000000,tcp,114.33.245.44,6881,147.32.84.118,1567,RA_,1.0,60.0,...,"(1023, 49151]","(1023, 49151]",1.0,0,,,,,,
1,43:32.3,13.431962,tcp,212.93.105.52,49237,147.32.84.229,80,SRA_SA,6.0,388.0,...,"(49151, 65535]","(0, 1023]",1.0,0,,,,,,
2,43:32.5,13.350228,tcp,212.93.105.52,14906,147.32.84.229,13363,SRA_SA,6.0,388.0,...,"(1023, 49151]","(1023, 49151]",1.0,0,,,,,,
3,43:32.9,13.010090,tcp,212.93.105.52,60349,147.32.84.229,443,SRA_SA,6.0,388.0,...,"(49151, 65535]","(0, 1023]",1.0,0,,,,,,
4,45:09.3,20.990047,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5.0,308.0,...,"(1023, 49151]","(0, 1023]",1.0,0,,,,,,
5,45:28.0,12.542819,tcp,115.127.24.116,3196,147.32.84.229,13363,SR_SA,5.0,308.0,...,"(1023, 49151]","(1023, 49151]",1.0,0,,,,,,
6,45:30.3,13.308726,tcp,115.127.24.116,3198,147.32.84.229,443,SR_SA,5.0,308.0,...,"(1023, 49151]","(0, 1023]",1.0,0,,,,,,
7,45:57.8,1.413248,tcp,77.52.60.161,3767,147.32.84.118,6881,S_RA,4.0,244.0,...,"(1023, 49151]","(1023, 49151]",1.0,0,,,,,,
8,47:50.5,5.147244,tcp,77.52.60.161,3823,147.32.84.118,6881,S_RA,4.0,244.0,...,"(1023, 49151]","(1023, 49151]",1.0,0,,,,,,
9,47:53.4,3.001157,tcp,147.32.84.59,52956,77.75.72.72,80,SR_A,3.0,186.0,...,"(49151, 65535]","(0, 1023]",1.0,0,1.0,1.0,1.0,1.0,1.0,1.0
