In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, BatchNormalization, Dropout
from keras import optimizers
from keras import backend as K
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
import glob
import tensorflow as tf 
import pcap_processor
import numpy as np
import pandas as pd
import cic_2017_setup
import cic_2018_setup

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
full_data, training_data = cic_2017_setup.setup()


Columns (14) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (14,15) have mixed types. Specify dtype option on import or set low_memory=False.



In [3]:
#one hot encode labels
def one_hot_encode_malicious_type(labels):
    labelencoder = LabelEncoder()
    y_labels_as_int_list = []
    for label in labels:
        if label == "Infiltration":
            y_labels_as_int_list.append(1)
        elif label == "DoS Hulk":
            y_labels_as_int_list.append(2)
        elif label == "DDoS":
            y_labels_as_int_list.append(3)
        elif label == "DoS GoldenEye":
            y_labels_as_int_list.append(4)
        elif label == "FTP-Patator":
            y_labels_as_int_list.append(5)
        elif label == "SSH-Patator":
            y_labels_as_int_list.append(6)
        elif label == "DoS slowloris":
            y_labels_as_int_list.append(7)
        elif label == "DoS Slowhttptest":
            y_labels_as_int_list.append(8)
        elif label == "Bot":
            y_labels_as_int_list.append(9)
        elif label == "Web Attack � Brute Force":
            y_labels_as_int_list.append(10)
        elif label == "Web Attack � XSS":
            y_labels_as_int_list.append(11)
        elif label == "Web Attack � Sql Injection ":
            y_labels_as_int_list.append(12)
        elif label == "Heartbleed":
            y_labels_as_int_list.append(13)   
        else:
            y_labels_as_int_list.append(0)
    y_labels_encoded = to_categorical(y_labels_as_int_list)
    #y_labels_encoded = labelencoder.fit_transform(y_labels_as_int_list)
    return y_labels_encoded

def one_hot_encode_class_type(labels):
    labelencoder = LabelEncoder()
    y_labels_as_int_list = []
    for label in labels:
        if label != "BENIGN":
            y_labels_as_int_list.append(1)
        else:
            y_labels_as_int_list.append(0)
    y_labels_encoded = to_categorical(y_labels_as_int_list)
    #_labels_encoded = labelencoder.fit_transform(y_labels_as_int_list)
    return y_labels_encoded
            

In [None]:
nn_input_dim = len(pca_training_data)
nn_input_dim

In [None]:
neural_network = Sequential()
neural_network.add(Dense(32, input_dim=nn_input_dim, activation="relu"))
neural_network.add(BatchNormalization())
neural_network.add(Dense(64, activation="relu"))
neural_network.add(Dropout(.2))
neural_network.add(BatchNormalization())
neural_network.add(Dense(96, activation="relu"))
neural_network.add(Dropout(.2))
neural_network.add(BatchNormalization())
neural_network.add(Dense(2))
neural_network.add(Activation(tf.nn.softmax))

In [24]:
def focal_loss(y_true, y_pred):
    gamma = 2.0
    alpha = 0.25
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
    return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))-K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0))




optimizer = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
neural_network.compile(loss=[focal_loss], optimizer=optimizer, metrics=["accuracy"])

In [25]:
neural_network.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                2528      
_________________________________________________________________
batch_normalization_1 (Batch (None, 32)                128       
_________________________________________________________________
dense_2 (Dense)              (None, 64)                2112      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 64)                256       
_________________________________________________________________
dense_3 (Dense)              (None, 96)                6240      
_________________________________________________________________
dropout_2 (Dropout)          (None, 96)                0         
__________

In [4]:
def training_kfold(x, y, neural_network, n=5, epochs=10, batch_size=512, shuffle=True, random_state=None):
    
    from sklearn.model_selection import StratifiedKFold

    kfold = StratifiedKFold(n_splits=n, shuffle=True, random_state = random_state)
    cvscores = []
    
    for train, test in kfold.split(x, y):
    
        y_labels_encoded = one_hot_encode_class_type(y)
        neural_network.model.fit(x.iloc[train], y_labels_encoded[train], epochs=epochs, batch_size=batch_size)
        scores = neural_network.evaluate(x.iloc[test], y_labels_encoded[test])
        print("%s: %.2f%%" % (neural_network.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    print(cvscores)
    return neural_network

def training_standard(x, y, neural_network, epochs=10, batch_size=512):
    '''
    class_weight = {0: 1,
                    1: 50,
                    2: 50,
                    3: 50,
                    4: 50,
                    5: 50,
                    6: 50,
                    7: 50,
                    8: 50,
                    9: 50,
                    10: 50,
                    11: 50,
                    12: 50,
                    13: 50,
                    14: 50,
                    15: 50}
    '''
    #class_weight = {[1,0]: 1,
    #                [0,1]: 100}
    
    #normalize x
    #x = (x - x.mean()) / (x.max() - x.min())
    y_labels_encoded = one_hot_encode_class_type(y)
    neural_network.model.fit(x, y_labels_encoded, epochs=epochs, batch_size=batch_size, shuffle=True)
    return neural_network

- Combine lables and data dataframes
- Save all maclicious traffic to maclicious_traffic
- Drop non-benign traffic from data
- Append 2/3 of random benign samples to training_data
- Train on training_data

In [5]:
# Drop non-benign traffic from data

training_labels = training_data["labels"]
training_data = training_data.drop(labels="labels", axis=1)

full_labels = full_data["labels"]
full_data = full_data.drop(labels="labels", axis=1)


In [10]:
le = LabelEncoder()
le.fit(training_labels)
#print(list(le.classes_))
#print(len(le.classes_))
y_classes = le.transform(training_labels)
print(list(le.inverse_transform([7, 7, 0])))


['FTP-Patator', 'FTP-Patator', 'BENIGN']


In [11]:
explore_training_lables = pd.DataFrame(one_hot_encode_class_type(training_labels))
explore_training_lables.columns = ["benign", "malicious"]
print(explore_training_lables["benign"].value_counts())
print(explore_training_lables["malicious"].value_counts())
explore_training_lables

0.0    612584
1.0    223885
Name: benign, dtype: int64
1.0    612584
0.0    223885
Name: malicious, dtype: int64


Unnamed: 0,benign,malicious
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
5,0.0,1.0
6,0.0,1.0
7,0.0,1.0
8,0.0,1.0
9,0.0,1.0


In [17]:
#X_std.drop(labels="labels", axis=1, inplace=True)
pca_training_labels = training_labels
X_std = StandardScaler().fit_transform(training_data)
X_std = pd.DataFrame(X_std)
print(pca_training_labels)
X_std["labels"] = pca_training_labels
X_std.dropna(inplace=True)
pca_training_labels = X_std["labels"]
X_std.drop(labels="labels", axis=1, inplace=True)
#X_std.dtype
sklearn_pca = PCA(n_components=5)
pca_training_data = sklearn_pca.fit_transform(X_std)


Data with input dtype int64, float64 were all converted to float64 by StandardScaler.


Data with input dtype int64, float64 were all converted to float64 by StandardScaler.



11347     FTP-Patator
11348     FTP-Patator
11349     FTP-Patator
11350     FTP-Patator
11351     FTP-Patator
11352     FTP-Patator
11354     FTP-Patator
11355     FTP-Patator
11356     FTP-Patator
11357     FTP-Patator
11358     FTP-Patator
11359     FTP-Patator
11360     FTP-Patator
11361     FTP-Patator
11362     FTP-Patator
11363     FTP-Patator
11364     FTP-Patator
11365     FTP-Patator
11366     FTP-Patator
11367     FTP-Patator
11583     FTP-Patator
11584     FTP-Patator
11585     FTP-Patator
11586     FTP-Patator
11587     FTP-Patator
11588     FTP-Patator
11589     FTP-Patator
11590     FTP-Patator
11591     FTP-Patator
11592     FTP-Patator
             ...     
131185         BENIGN
314419         BENIGN
65681          BENIGN
45821          BENIGN
430983         BENIGN
154800         BENIGN
69972          BENIGN
43546            DDoS
134319           DDoS
360960         BENIGN
28352            DDoS
401740         BENIGN
215181         BENIGN
93530            DDoS
21782     

ValueError: cannot reindex from a duplicate axis

In [None]:
pca_training_data

In [15]:
neural_network = training_standard(training_data, training_labels, neural_network, epochs=15, batch_size=64)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
158912/836469 [====>.........................] - ETA: 1:11 - loss: 9.4560 - acc: 0.7335

KeyboardInterrupt: 

In [None]:
explore_training_lables = pd.DataFrame(training_labels)
explore_training_lables.columns = ["benign", "malicious"]
print(explore_training_lables["benign"].value_counts())
print(explore_training_lables["malicious"].value_counts())

In [None]:
neural_network.predict(x=training_data, y=y_classes, batch_size=128)

In [31]:
def clean_y(y):
    y = y.strip()
    y = y.replace("�", "")
    y = y.replace(" ","_")
    y = y.lower()
    return y
    
def clean_x(x):
    nan_rows = x[x.isnull().any(1)]
    x = x.apply(lambda x: x.fillna(x.mean()),axis=0)
    return x

def training_knn(x, y, n_neighbors):
    
    #y = one_hot_encode_class_type(y)
    
    from sklearn import preprocessing
    from sklearn.neighbors import KNeighborsClassifier
    
    x = clean_x(x)
    
    y = [clean_y(label) for label in y]
    
    lable_encoder = preprocessing.LabelEncoder()
    lable_encoder.fit(y)
    
    neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
    neigh.fit(x, lable_encoder.transform(y))

    score = neigh.score(x,y)
    
    print(score)
    
    return neigh


In [32]:
knn = training_knn(training_data, training_labels, 10)
#print(training_data.dtypes)
#print(training_labels.unique())

0.0



elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [34]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [35]:
knn.predict(clean_x(malicious_traffic.sample(5)))

TypeError: ('Could not convert PortScanDDoSPortScanDDoSDoS Hulk to numeric', 'occurred at index labels')