In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [68]:
# Load the capture data from 'combinedCapture.csv' into a DataFrame.
dataFrame = pd.read_csv('combinedCapture.csv')

In [69]:
# Preview the first five rows of the loaded data.
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/16 13:52:03.078339,2.925519,tcp,147.32.86.58,3184,->,77.75.73.9,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt
1,2011/08/16 13:52:17.354431,1.0463,tcp,147.32.3.51,4420,->,147.32.84.46,10010,S_RA,0.0,0.0,4,244,124,flow=Background-TCP-Attempt
2,2011/08/16 13:52:32.449542,2.89805,tcp,147.32.86.58,3246,->,77.75.73.9,80,SR_A,0.0,0.0,3,182,122,flow=Background-TCP-Attempt
3,2011/08/16 13:58:28.305790,8.979935,tcp,212.95.7.124,8811,->,147.32.84.229,13363,SRA_SA,0.0,0.0,7,508,208,flow=Background-TCP-Established
4,2011/08/16 13:58:55.543606,5.832105,tcp,109.166.138.80,58325,->,147.32.84.229,13363,SRA_SA,0.0,0.0,8,538,294,flow=Background-TCP-Established


In [70]:
#Deletes rows where the column value is null, NaN, NaT, or blank
def deleteNullRow(dataFrame, column):
    '''
    Return only the rows of dataFrame that have a usable value in `column`.

    A value is treated as missing when it is NaN/NaT/None or the empty
    string ''. The input frame is never modified; the surviving rows are
    returned as a new DataFrame that keeps the original index labels.

    Parameters:
        dataFrame: the pandas DataFrame to filter.
        column: label of the column to inspect.

    Returns:
        A new DataFrame without the rows whose `column` value is missing.

    Raises:
        KeyError: if `column` is not a column of dataFrame.
    '''
    columnValues = dataFrame[column]

    # dropna() only recognises NaN/NaT/None, so blank strings are filtered
    # out explicitly here instead of being rewritten in the caller's frame.
    hasValue = columnValues.notna() & (columnValues != '')
    return dataFrame.loc[hasValue]

In [71]:
# From: https://github.com/mgarzon/cybersec/blob/master/MalwareDetection.ipynb
def preprocessData(dataFrame):
    '''
    Turn the raw capture frame into a cleaned data set.

    The 'Dir', 'sTos' and 'dTos' columns are dropped, then every row with
    a missing 'Sport', 'SrcAddr', 'Dport' or 'DstAddr' value is removed
    (via deleteNullRow). The row count and column names are printed before
    and after the clean-up, and the cleaned frame is returned.
    '''

    def reportShape(frame):
        # Print the row count and the column names of the given frame
        print("Number of rows: " + str(len(frame.index)))
        print("The columns are: " + str(list(frame)))

    print("----------Before pre-processing-----------")
    reportShape(dataFrame)

    # Columns that are removed from the data set
    unusedColumns = ['Dir', 'sTos', 'dTos']
    cleaned = dataFrame.drop(unusedColumns, axis=1)

    # Rows lacking any of these address/port values are discarded
    for requiredColumn in ('Sport', 'SrcAddr', 'Dport', 'DstAddr'):
        cleaned = deleteNullRow(cleaned, requiredColumn)

    # TODO
    #dp.convertColumnToTimeStamp(dataFrame,'StartTime') # ?? already a timestamp

    print("\n----------After pre-processing-----------")
    reportShape(cleaned)
    return cleaned

In [72]:
# Clean the loaded data; the result is rebound to the same `dataFrame` name.
dataFrame = preprocessData(dataFrame)

----------Before pre-processing-----------
Number of rows: 114077
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']

----------After pre-processing-----------
Number of rows: 113196
The columns are: ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'DstAddr', 'Dport', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']


In [73]:
# Preview the first five rows after pre-processing.
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label
0,2011/08/16 13:52:03.078339,2.925519,tcp,147.32.86.58,3184,77.75.73.9,80,SR_A,3,182,122,flow=Background-TCP-Attempt
1,2011/08/16 13:52:17.354431,1.0463,tcp,147.32.3.51,4420,147.32.84.46,10010,S_RA,4,244,124,flow=Background-TCP-Attempt
2,2011/08/16 13:52:32.449542,2.89805,tcp,147.32.86.58,3246,77.75.73.9,80,SR_A,3,182,122,flow=Background-TCP-Attempt
3,2011/08/16 13:58:28.305790,8.979935,tcp,212.95.7.124,8811,147.32.84.229,13363,SRA_SA,7,508,208,flow=Background-TCP-Established
4,2011/08/16 13:58:55.543606,5.832105,tcp,109.166.138.80,58325,147.32.84.229,13363,SRA_SA,8,538,294,flow=Background-TCP-Established


In [75]:
from sklearn import preprocessing 

#Convert one raw port value (decimal or '0x'-prefixed hexadecimal) to an int
def _parsePortNumber(value):
    '''
    Parse a single port value into an int.

    Strings starting with '0x'/'0X' are read as hexadecimal, any other
    string as decimal. Non-string values are passed straight to int().

    Raises:
        ValueError: if the value cannot be interpreted as a number.
    '''
    if isinstance(value, str):
        text = value.strip()
        # Slicing (rather than indexing text[1]) keeps one-character
        # strings such as '0' from raising IndexError.
        base = 16 if text[:2].lower() == '0x' else 10
        return int(text, base)
    return int(value)


#Function to perform discretization on the data
def discretizeData(dataFrame):
    '''
    Return a discretized copy of dataFrame.

    The input frame is not modified. In the returned frame:
      * 'TotBytes' and 'SrcBytes' are replaced by their quartile bins.
      * 'Sport' and 'Dport' are replaced by port-range bins.
      * 'ProtoDisc' is added: an integer code per distinct 'Proto' value.
      * 'LabelDisc' is added: the string '0' for labels containing
        'Background' or 'Normal', '1' for labels containing 'Botnet';
        labels matching none of these keep their original text.

    Parameters:
        dataFrame: pandas DataFrame with the columns 'TotBytes',
            'SrcBytes', 'Sport', 'Dport', 'Proto' and 'Label'.

    Returns:
        A new DataFrame with the columns described above.
    '''
    # Work on a copy so the caller's frame is left untouched
    dfNew = dataFrame.copy()

    # Binning technique from
    # https://towardsdatascience.com/understanding-feature-engineering-part-1-continuous-numeric-data-da4e47099a7b
    quantile_list = [0, .25, .5, .75, 1.] # Change the quantile_list for more or less accuracy
    for byteColumn in ('TotBytes', 'SrcBytes'):
        dfNew[byteColumn] = pd.qcut(dfNew[byteColumn], quantile_list)

    # Bin Src/Dest port
    # According to 0-1023(WELLKNOWN_PORTNUMBER)
    #              1024-49151(REGISTERED_PORTNUMBER)
    #              49152-65535(DYNAMIC_PORTNUMBER)
    # NOTE(review): pd.cut bins are right-closed by default, so the first
    # bin is (0, 1023] and a port of exactly 0 becomes NaN. The edges are
    # kept as they were so the bin labels stay the same - confirm whether
    # port 0 should be included.
    portBinEdges = [0, 1023, 49151, 65535]
    for portColumn in ('Sport', 'Dport'):
        portNumbers = dfNew[portColumn].apply(_parsePortNumber)
        dfNew[portColumn] = pd.cut(portNumbers, portBinEdges)

    #LabelEncoder for unique values for Proto column and stored as column ProtoDisc
    dfNew['ProtoDisc'] = preprocessing.LabelEncoder().fit_transform(dfNew['Proto'])

    #Encoding "label" column to "labelDisc"
    #0 = Background/Normal             1=Botnet
    # regex=True is passed explicitly: without it, pandas >= 2.0 treats the
    # patterns as literal text and no label would be replaced.
    labelPatterns = (
        (r'(^.*Background.*$)', '0'),
        (r'(^.*Normal.*$)', '0'),
        (r'(^.*Botnet.*$)', '1'),
    )
    labelDisc = dfNew['Label']
    for pattern, code in labelPatterns:
        labelDisc = labelDisc.str.replace(pattern, code, regex=True)
    dfNew['LabelDisc'] = labelDisc

    return dfNew
    


In [76]:
# Discretize the pre-processed data; the result is rebound to `dataFrame`.
dataFrame = discretizeData(dataFrame)

In [77]:
# Preview the first five rows after discretization.
dataFrame.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,DstAddr,Dport,State,TotPkts,TotBytes,SrcBytes,Label,ProtoDisc,LabelDisc
0,2011/08/16 13:52:03.078339,2.925519,tcp,147.32.86.58,"(1023, 49151]",77.75.73.9,"(0, 1023]",SR_A,3,"(59.999, 214.0]","(81.0, 442.0]",flow=Background-TCP-Attempt,3,0
1,2011/08/16 13:52:17.354431,1.0463,tcp,147.32.3.51,"(1023, 49151]",147.32.84.46,"(1023, 49151]",S_RA,4,"(214.0, 266.0]","(81.0, 442.0]",flow=Background-TCP-Attempt,3,0
2,2011/08/16 13:52:32.449542,2.89805,tcp,147.32.86.58,"(1023, 49151]",77.75.73.9,"(0, 1023]",SR_A,3,"(59.999, 214.0]","(81.0, 442.0]",flow=Background-TCP-Attempt,3,0
3,2011/08/16 13:58:28.305790,8.979935,tcp,212.95.7.124,"(1023, 49151]",147.32.84.229,"(1023, 49151]",SRA_SA,7,"(266.0, 678.0]","(81.0, 442.0]",flow=Background-TCP-Established,3,0
4,2011/08/16 13:58:55.543606,5.832105,tcp,109.166.138.80,"(49151, 65535]",147.32.84.229,"(1023, 49151]",SRA_SA,8,"(266.0, 678.0]","(81.0, 442.0]",flow=Background-TCP-Established,3,0


In [78]:
#dataFrame[["TotBytes"]].describe()

#fig, ax = plt.subplots()
#dataFrame['SrcBytes'].hist(bins=50, color='#A9C5D3', edgecolor='black', grid=False)
#ax.set_title('Bytes', fontsize=12)
#ax.set_xlabel('TotBytes', fontsize=12)
#ax.set_ylabel('SrcBytes', fontsize=12)


#np.array([[1, 2, 3]]).hist()

#pd.DataFrame([1, 2, 3, 1, 4])