In [None]:
#from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

from tensorflow.keras.utils import to_categorical
from tensorflow import feature_column
from tensorflow.keras import layers

from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split


# Show up to 100 columns when a wide DataFrame is rendered.
pd.set_option("display.max_columns", 100)

# List the compute devices TensorFlow can see on this machine.
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
print(local_devices)




> #### Importing the data into pandas DataFrames separately:

In [None]:
# Folder prefix shared by all per-day CSV exports.
_CSV_DIR = "\\MachineLearningCVE\\"


def _read_capture(file_name):
    """Read one per-day CSV from the shared folder into a DataFrame."""
    return pd.read_csv(_CSV_DIR + file_name, low_memory=False)


monday = _read_capture("Monday-WorkingHours.pcap_ISCX.csv")
tuesday = _read_capture("Tuesday-WorkingHours.pcap_ISCX.csv")
wednesday = _read_capture("Wednesday-workingHours.pcap_ISCX.csv")
thursdayMorning = _read_capture("Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")
thursdayAfternoon = _read_capture("Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv")
fridayMorning = _read_capture("Friday-WorkingHours-Morning.pcap_ISCX.csv")
fridayAfternoonPortscan = _read_capture("Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv")
fridayAfternoonDDos = _read_capture("Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")

> #### Combining DataFrames:

In [None]:
# Stack the per-day frames row-wise. ignore_index=True gives the combined frame a fresh
# 0..n-1 index; without it every file's own 0-based labels are kept, so index labels repeat
# and any later label-based alignment becomes ambiguous.
dataset = pd.concat(
    [monday, tuesday, wednesday, thursdayMorning, thursdayAfternoon, fridayMorning, fridayAfternoonPortscan, fridayAfternoonDDos],
    axis=0,
    ignore_index=True,
)

> #### Checking the row counts:

In [None]:
# Total number of rows across the individual per-day frames.
sum(
    frame.shape[0]
    for frame in (
        monday, tuesday, wednesday, thursdayAfternoon,
        thursdayMorning, fridayMorning, fridayAfternoonPortscan, fridayAfternoonDDos,
    )
)

In [None]:
# (rows, columns) of the combined frame; the row count should match the per-day total above.
dataset.shape

> #### Descriptive statistics summaries

In [None]:
#dataset.describe()

In [None]:
#datasetSummary.index

In [None]:
# NOTE(review): only the last expression of a cell is rendered, so this list is built and discarded.
list(dataset.columns)
# First rows of the combined frame.
dataset.head()

> #### Number of lines with 'Infinity' and 'NaN' values of 'Flow Packets / s' feature
> If the share of affected rows is small (< 1%), those rows are removed during preprocessing.
> The pre-processing approach follows this article: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8804816&tag=1

In [None]:
# Shape of the subset whose ' Flow Packets/s' is infinite.
# The column may have been parsed either as text ('Infinity') or as float (inf) depending on
# how read_csv inferred it, so it is coerced to numeric first instead of compared to a string.
_flow_packets_numeric = pd.to_numeric(dataset[' Flow Packets/s'], errors='coerce')
dataset.loc[np.isinf(_flow_packets_numeric)].shape

In [None]:
# Shape of the subset whose ' Flow Packets/s' is missing.
# Missing values are stored as real NaN, which never compares equal to the string 'NaN',
# so isna() is required to find them.
dataset.loc[dataset[' Flow Packets/s'].isna()].shape

In [None]:
# Keep only rows with a finite ' Flow Packets/s'.
# Coercing to numeric makes the filter work whether the column was parsed as text
# ('Infinity') or as float (inf); values that cannot be parsed become NaN and are dropped too.
_flow_packets_numeric = pd.to_numeric(dataset[' Flow Packets/s'], errors='coerce')
datasetv1 = dataset.loc[np.isfinite(_flow_packets_numeric)]

In [None]:
# (rows, columns) remaining after the ' Flow Packets/s' filter.
datasetv1.shape

> #### This article uses min-max Normalization. 
> #### The **entity embedding technique** was used to use categorical data in DNN. https://arxiv.org/pdf/1910.02203.pdf

> #### "*Source IP, Destination IP, Source Port, Destination Port*" features are categorical data.

> #### Entity embedding whitepaper -> https://arxiv.org/pdf/1604.06737v1.pdf 

> #### Listing the attributes that have only a single unique value:

In [None]:
# Print every column of the filtered frame that carries fewer than two distinct values,
# together with that distinct-value count.
for column_name in dataset.columns:
    distinct_values = datasetv1[column_name].nunique()
    if distinct_values <= 1:
        print(column_name, distinct_values)
   

> #### İlgili çalışmada veri seti olarak 'TrafficLabelling' olarak hazırlanmış veri seti kullanılmış. Bu veri seti içerisinde nitelik olarak Source IP ve Destination IP gibi değerler bulunmaktadır. Bu değerler veri toplama esnasında kurulan sentetik ortamın niteliklerini yansıtacağı bu sebeple kullanılan veri seti üzerinde yüksek doğruluk verirken farklı bir networkde detection'a bir katkı sağlamayacağı varsayımı doğrultusunda çalışmada Machine Learning için özelleştirilmiş olan veri seti dosyası kullanılmıştır.(https://arxiv.org/pdf/1910.02203.pdf) 

> #### TrafficLabelling formatındaki veri seti için yukarıda belirtilen çalışmada kullanılan "Source IP, Destination IP, Source Port, Protocol" gibi featureların kullanımının yanlış olduğunun gösterilmesi:

> #### Dropping single value features in Dataset:

In [None]:
# Columns holding exactly one distinct value carry no information for the model.
# They are collected first and removed in a single drop, which yields a new frame
# and leaves datasetv1 untouched.
constant_columns = [
    column_name
    for column_name in datasetv1.columns
    if datasetv1[column_name].nunique() == 1
]
datasetv2 = datasetv1.drop(columns=constant_columns)

In [None]:
# Column count before and after dropping the single-value columns.
datasetv1.shape[1], datasetv2.shape[1]

> #### z-score normalization:

In [None]:
# The column types need to be numeric; print each column name with its dtype to check.
for column_name, column_dtype in datasetv2.dtypes.items():
    print(column_name, '      ', column_dtype)
    

In [None]:
# Cast every column except the label column to float64, producing a new frame.
feature_columns = [column_name for column_name in datasetv2.columns if column_name != ' Label']
datasetv3 = datasetv2.astype({column_name: 'float64' for column_name in feature_columns})
# Number of columns that were converted.
count = len(feature_columns)
      


In [None]:
# Print each column name with its dtype to confirm the float64 conversion.
for column_name, column_dtype in datasetv3.dtypes.items():
    print(column_name, '      ', column_dtype)

In [None]:
# First rows after the float64 conversion.
datasetv3.head()

In [None]:
from scipy.stats import zscore

# Standardise each numeric column independently; non-numeric columns are left out
# of the result.
numeric_cols = datasetv3.select_dtypes(include=[np.number]).columns
datasetv4 = datasetv3.loc[:, numeric_cols].apply(zscore)

In [None]:
# Re-attach the label column to the standardised features.
# NOTE(review): this assignment aligns on the index; it presumably relies on datasetv4 and
# datasetv3 sharing the same index in the same order — confirm if rows are ever reordered.
datasetv4[' Label'] = datasetv3[' Label']

> #### Converting the values in the label column to numerical values:

In [None]:
# Distinct label values, in order of first appearance.
datasetv4[' Label'].unique()

In [None]:
# Encode each distinct label as an integer code in order of first appearance (0, 1, 2, ...),
# stored as float64. pd.factorize derives the codes from the data itself, so the cell no
# longer depends on there being exactly 15 labels, and the plain column assignment is
# guaranteed to write back to the frame (a chained in-place replace is not).
label_codes, label_names = pd.factorize(datasetv4[' Label'])
datasetv4[' Label'] = label_codes.astype('float64')

> #### Column names have been changed because spaces in column names are troublesome in TensorFlow:

In [None]:
# Trim surrounding whitespace from every column name and turn inner spaces into underscores.
datasetv4.columns = [
    column_name.strip().replace(' ', '_') for column_name in datasetv4.columns
]

In [None]:
# First rows of the standardised frame with the cleaned column names.
datasetv4.head()
#datasetv4.describe()

In [None]:
# Fixed seed so the 80/20 split (and every metric derived from it) is the same on each run.
SPLIT_SEED = 42
train, test = train_test_split(datasetv4, test_size=0.2, random_state=SPLIT_SEED)

print(len(train), 'train examples')
print(len(test), 'test examples')
print(len(datasetv4), 'total')
train.shape

In [None]:
# Show the Python type of the training split.
type(train)

In [None]:
# Number of training rows per label code.
train['Label'].value_counts()

In [None]:
# Number of test rows per label code.
test['Label'].value_counts()

In [None]:
# Separate features from labels for the training split.
train_set = train.copy()
train_label = train_set.pop('Label')
# Binary target: code 0 stays 0, every other label code becomes 1.
# NOTE(review): assumes code 0 is the benign class, i.e. the first label encountered
# during encoding was the benign one — confirm.
train_label_binary = (train_label != 0).astype('float64')

# Same separation for the test split.
test_set = test.copy()
test_label = test_set.pop('Label')
test_label_binary = (test_label != 0).astype('float64')

# Add a trailing axis of length 1 so the features are shaped (rows, features, 1).
# The shape is derived from the data instead of hard-coded row/column counts, so the
# cell keeps working when the split size or the set of retained columns changes.
train_set = np.expand_dims(train_set.to_numpy(), axis=-1)
train_label = train_label.to_numpy()
train_label_binary = train_label_binary.to_numpy()

test_set = np.expand_dims(test_set.to_numpy(), axis=-1)
test_label = test_label.to_numpy()
test_label_binary = test_label_binary.to_numpy()




#test_label_binary = to_categorical(test_label_binary)
#train_label_binary = to_categorical(train_label_binary)



> ## Multi-class Model

In [None]:
# Multi-class classifier: two Conv1D + max-pooling stages, then three dense layers and a
# 15-way softmax output. The input shape is taken from the prepared training array so it
# cannot drift out of sync with the number of feature columns.
model = tf.keras.Sequential([
  tf.keras.layers.Conv1D(16, 2, activation='relu', input_shape=train_set.shape[1:]),
  tf.keras.layers.MaxPooling1D(2),
  tf.keras.layers.Conv1D(16, 2, activation='relu'),
  tf.keras.layers.MaxPooling1D(2),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(512, activation='relu'),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(15, activation='softmax')
])

# Labels are integer class codes (not one-hot), hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
model.fit(train_set, train_label, epochs=5)

In [None]:
# Evaluate the multi-class model on the held-out split.
# NOTE(review): the model is compiled with an accuracy metric, so this presumably holds
# [loss, accuracy] rather than a bare loss value — confirm.
test_loss = model.evaluate(test_set, test_label)

In [None]:
# Predicted class = index of the highest softmax probability.
# (Sequential.predict_classes no longer exists in current TensorFlow releases.)
y_pred = np.argmax(model.predict(test_set), axis=-1)
# Pin the matrix size to the model's output width so it stays square over all classes
# even when a rare class is absent from both the test labels and the predictions.
con_mat = tf.math.confusion_matrix(
    labels=test_label,
    predictions=y_pred,
    num_classes=model.output_shape[-1],
).numpy()

In [None]:
# Row-normalise the confusion matrix (each row divided by its own total), rounded to 2 decimals.
row_totals = con_mat.sum(axis=1, keepdims=True)
con_mat_norm = np.round(con_mat.astype('float') / row_totals, 2)
# Label both axes with the original label values, in order of first appearance.
class_names = datasetv3[' Label'].unique()
con_mat_df = pd.DataFrame(con_mat_norm, index=class_names, columns=class_names)
con_mat_df

In [None]:
# Heatmap of the normalised multi-class confusion matrix.
figure, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(con_mat_df, annot=True, cmap=plt.cm.Blues, ax=ax)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
# Run the layout pass after the axis labels exist so they are included in the fit.
figure.tight_layout()
plt.show()

In [None]:
# TODO: add class labels, see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
# y_pred holds multi-class predictions, so it must be scored against the multi-class
# ground truth (test_label); scoring it against the binary labels mixes two label spaces.
for averaging in ('macro', 'micro', 'weighted'):
    print(precision_recall_fscore_support(test_label, y_pred, average=averaging))

> ## Binary Model

In [None]:
# Binary (benign vs. attack) classifier with the same convolutional front end as the
# multi-class model and a 2-way output. The input shape is taken from the prepared
# training array so it cannot drift out of sync with the number of feature columns.
modelv2 = tf.keras.Sequential([
  tf.keras.layers.Conv1D(16, 2, activation='relu', input_shape=train_set.shape[1:]),
  tf.keras.layers.MaxPooling1D(2),
  tf.keras.layers.Conv1D(16, 2, activation='relu'),
  tf.keras.layers.MaxPooling1D(2),
  tf.keras.layers.Flatten(),
 # tf.keras.layers.Dense(512, activation='relu'),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  # The two outputs are mutually exclusive classes scored with a categorical loss, so they
  # must form a probability distribution: softmax, not two independent sigmoids.
  tf.keras.layers.Dense(2, activation='softmax')
])

modelv2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
modelv2.summary()
modelv2.fit(train_set, train_label_binary, epochs=5)

In [None]:
# Evaluate the binary model on the held-out split.
# NOTE(review): the model is compiled with an accuracy metric, so this presumably holds
# [loss, accuracy] rather than a bare loss value — confirm.
test_lossv2 = modelv2.evaluate(test_set, test_label_binary)

In [None]:
# Predicted class = index of the larger of the two output scores.
# (Sequential.predict_classes no longer exists in current TensorFlow releases.)
y_predv2 = np.argmax(modelv2.predict(test_set), axis=-1)
# Pin the matrix to 2x2 so the Negative/Positive labelling applied later always fits,
# even if the model predicts a single class for every row.
con_matv2 = tf.math.confusion_matrix(
    labels=test_label_binary,
    predictions=y_predv2,
    num_classes=2,
).numpy()

In [None]:

# Precision / recall / F-score of the binary model under each averaging scheme.
for averaging in ('macro', 'micro', 'weighted'):
    binary_scores = precision_recall_fscore_support(test_label_binary, y_predv2, average=averaging)
    print(binary_scores)

In [None]:
# Row-normalise the binary confusion matrix (each row divided by its own total), rounded to 2 decimals.
row_totals_v2 = con_matv2.sum(axis=1, keepdims=True)
con_mat_normv2 = np.round(con_matv2.astype('float') / row_totals_v2, 2)
binary_names = ['Negative', 'Positive']
con_mat_dfv2 = pd.DataFrame(con_mat_normv2, index=binary_names, columns=binary_names)
con_mat_dfv2

In [None]:
# Heatmap of the normalised binary confusion matrix.
figure, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(con_mat_dfv2, annot=True, cmap=plt.cm.Blues, ax=ax)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
# Run the layout pass after the axis labels exist so they are included in the fit.
figure.tight_layout()
plt.show()