In [2]:
import pandas as pd
import glob
import re
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
import datetime

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
# Gather the path of every .csv file found (recursively) under the dataset folder.
path = "Dataset/MachineLearningCSV/MachineLearningCVE/"
file_names = glob.glob(path + "**/*.csv", recursive=True)

# Echo each discovered file, one per line.
for csv_file in file_names:
    print(csv_file)


Dataset/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Dataset/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [7]:
# Load each CSV into its own DataFrame, collected in a list (same order as file_names).
dataframes = [pd.read_csv(csv_path) for csv_path in file_names]


In [8]:
# Report the (rows, columns) shape of each loaded file.
for frame in dataframes:
    print(frame.shape)

(288602, 79)
(225745, 79)
(529918, 79)
(692703, 79)
(191033, 79)
(445909, 79)
(170366, 79)
(286467, 79)


In [9]:
# Verify that each pair of consecutive dataframes has identical column names
# (same names, same order). After the loop, `same_col_names` holds the
# element-wise comparison for the last pair that was checked; it stays at the
# placeholder 0 when there are fewer than two dataframes to compare.
same_col_names = 0
for i in range(len(dataframes) - 1):
    current_cols = dataframes[i].columns
    next_cols = dataframes[i + 1].columns

    if len(current_cols) != len(next_cols):
        # Element-wise `==` on Index objects of different lengths raises
        # ValueError, so a column-count mismatch is reported explicitly.
        same_col_names = np.array([False])
    else:
        same_col_names = np.asarray(current_cols == next_cols)

    if not same_col_names.all():
        print("file num {} and {} doesnot contain same column names ".format(i, i + 1))
        break

same_col_names

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [10]:
# Combine all per-file tables into one dataset with a single concat call
# (concatenating inside a loop re-copies the accumulated frame on every pass).
# An empty list yields an empty DataFrame, as the loop-based version did.
dataset = pd.concat(dataframes) if dataframes else pd.DataFrame()

# Drop duplicate rows. NOTE: keep=False discards *every* copy of a duplicated
# row, including its first occurrence; keep="first" would retain one copy.
# The index is then renumbered because concat keeps each file's own row index.
dataset = dataset.drop_duplicates(keep=False).reset_index(drop=True)

In [11]:
# Inspect the (rows, columns) shape of the combined, de-duplicated dataset.
dataset.shape

(2427193, 79)

In [12]:
# Drop the reference to the list of per-file dataframes; their rows are already combined in `dataset`.
del dataframes

In [None]:
#dtypes of all columns
# dataset.info()

In [13]:
# Strip leading and trailing whitespace from every column name.
dataset.columns = list(map(str.strip, dataset.columns))


In [14]:
# Output labels: the distinct traffic classes (the attack types plus BENIGN).
label_names = dataset['Label'].unique()
# Show how many distinct labels there are, then the labels themselves.
print(len(label_names), label_names, sep="\n")

15
['BENIGN' 'Infiltration' 'DDoS' 'DoS slowloris' 'DoS Slowhttptest'
 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed' 'Bot' 'FTP-Patator' 'SSH-Patator'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'PortScan']


In [15]:
# Build cleaned label names, applying the substitutions in this order:
#   1. drop the Unicode replacement character (code point 65533),
#   2. turn the double space that step 1 can leave behind into one underscore,
#   3. turn every remaining single space into an underscore,
#   4. turn hyphens into underscores.
labels = [
    str(raw_name)
    .replace(chr(65533), "")
    .replace("  ", "_")
    .replace(" ", "_")
    .replace("-", "_")
    for raw_name in label_names
]
labels

['BENIGN',
 'Infiltration',
 'DDoS',
 'DoS_slowloris',
 'DoS_Slowhttptest',
 'DoS_Hulk',
 'DoS_GoldenEye',
 'Heartbleed',
 'Bot',
 'FTP_Patator',
 'SSH_Patator',
 'Web_Attack_Brute_Force',
 'Web_Attack_XSS',
 'Web_Attack_Sql_Injection',
 'PortScan']

In [16]:
# Map each original label (key) to its cleaned counterpart (value); the two
# sequences are index-aligned because `labels` was derived from `label_names`.
replacing_dict = dict(zip(label_names, labels))

In [17]:
# Replace each raw value in the "Label" column with its cleaned name via replacing_dict.
dataset["Label"] = dataset["Label"].replace(replacing_dict)

In [18]:
# List the distinct values now present in the "Label" column to confirm the replacement.
dataset['Label'].unique()

array(['BENIGN', 'Infiltration', 'DDoS', 'DoS_slowloris',
       'DoS_Slowhttptest', 'DoS_Hulk', 'DoS_GoldenEye', 'Heartbleed',
       'Bot', 'FTP_Patator', 'SSH_Patator', 'Web_Attack_Brute_Force',
       'Web_Attack_XSS', 'Web_Attack_Sql_Injection', 'PortScan'],
      dtype=object)

In [None]:
# # Saving cleaned dataset.
# dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/Dataset_clean.csv", index=False)

In [19]:
# Preview the first five rows of the dataset.
dataset.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,60148,83,1,2,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,123,99947,1,1,48,48,48,48,48.0,0.0,...,40,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,123,37017,1,1,48,48,48,48,48.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,0,111161336,147,0,0,0,0,0,0.0,0.0,...,0,1753752.625,2123197.578,4822992,95,9463032.7,2657727.996,13600000,5700287,BENIGN
4,123,38407,1,1,48,48,48,48,48.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [20]:
#REMOVING NULL

In [21]:
# Check whether the dataset contains any null (NaN) value at all.
dataset.isnull().values.any()

True

In [22]:
# Total number of null cells in the dataset (counts individual values, not rows).
dataset.isnull().values.sum()

334

As the number of null values is very small (about 0.01% of the dataset), we can safely remove the rows that contain them.

In [23]:
# Discard every row that contains at least one null value.
dataset = dataset.dropna()

In [24]:
# Re-count the null cells after dropna(); the total should now be zero.
dataset.isnull().values.sum()

0

In [25]:
# Check that every value in all columns except the last one is finite (no inf / NaN).
np.all(np.isfinite(dataset.iloc[:,:-1]))

False

In [26]:
# Count the infinite (+inf / -inf) values in all columns except the last one.
np.isinf(dataset.iloc[:,:-1]).values.sum()

2264

In [27]:
# Turn +inf and -inf into NaN so that a subsequent dropna() removes those rows.
dataset = dataset.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [28]:
# Calculating the number of infinite values still present
# np.isinf(dataset.iloc[:,:-1]).values.sum()

In [29]:
# Discard the rows whose infinite values were converted to NaN.
dataset = dataset.dropna()

In [30]:
# Shape of the dataset after removing rows with null or infinite values.
dataset.shape

(2425727, 79)

Normalizing the data for the model

In [32]:
# Save the cleaned dataset as "Dataset_clean.csv" (relative path), without writing the row index.
dataset.to_csv("Dataset_clean.csv", index=False)