In [1]:
import os
import time
import nest_asyncio
nest_asyncio.apply()

import pyshark

import numpy as np
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD

import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
%load_ext autotime

time: 142 µs (started: 2021-08-01 13:27:50 +02:00)


In [3]:
## Loading data

time: 167 µs (started: 2021-08-01 13:27:50 +02:00)


In [4]:
def loadDataset(path, extraId=""):
    cap = pyshark.FileCapture(path, use_json=True, include_raw=True)
    cap.load_packets()
    raw_data_list = [c.get_raw_packet() for c in cap]

    dataset = []
    
    for raw_data in raw_data_list:
        arr = np.zeros((75, 20), dtype=np.uint8)
        x, y = 0, 0
        for e in raw_data:
            arr[x, y] = e
            x += 1
            if x == 75:
                x = 0
                y += 1
            if y == 20:
                break
        dataset.append(arr.flatten())  
    return np.asarray(dataset)

time: 686 µs (started: 2021-08-01 13:27:50 +02:00)


In [5]:
badQueries = loadDataset('./dataset/badHttpQueriesFiltered.pcap')

badCount = len(badQueries)

time: 1min 24s (started: 2021-08-01 13:27:50 +02:00)


In [6]:
badCount

153824

time: 4.04 ms (started: 2021-08-01 13:29:14 +02:00)


In [7]:
validQueries = loadDataset('./dataset/goodHttpQueriesFiltered.pcap')

validCount = len(validQueries)

time: 1min 23s (started: 2021-08-01 13:29:14 +02:00)


In [8]:
validCount

148962

time: 1.67 ms (started: 2021-08-01 13:30:38 +02:00)


In [9]:
## Preparing the dataset

time: 367 µs (started: 2021-08-01 13:30:38 +02:00)


In [10]:
yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(validQueries))]

time: 6.9 ms (started: 2021-08-01 13:30:38 +02:00)


In [11]:
X = np.concatenate((badQueries, validQueries), axis=0)
y = yBad + yGood

time: 304 ms (started: 2021-08-01 13:30:38 +02:00)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #splitting data

time: 418 ms (started: 2021-08-01 13:30:38 +02:00)


In [13]:
## KNN

time: 198 µs (started: 2021-08-01 13:30:39 +02:00)


In [36]:
def knn_ball_tree_model():
    knn = KNeighborsClassifier(1500, algorithm='ball_tree', n_jobs=-1)
    knn.fit(X_train, y_train) 
    return knn
def knn_kd_tree_model():
    knn = KNeighborsClassifier(1500, algorithm='kd_tree', n_jobs=-1)
    knn.fit(X_train, y_train) 
    return knn

time: 1.24 ms (started: 2021-08-01 13:43:22 +02:00)


In [15]:
## Logistic Regression
def lr_model():
    lr = LogisticRegression(n_jobs=-1)
    lr.fit(X_train, y_train)
    return lr

time: 280 µs (started: 2021-08-01 13:30:39 +02:00)


In [16]:
## SVM

time: 139 µs (started: 2021-08-01 13:30:39 +02:00)


In [17]:
def svm_model():
    svm = SVC(kernel = 'linear', C = 1) #C to improve model 
    svm.fit(X_train, y_train) 
    return svm

time: 290 µs (started: 2021-08-01 13:30:39 +02:00)


In [18]:
## Decision Tree

time: 173 µs (started: 2021-08-01 13:30:39 +02:00)


In [19]:
def dtree_model():
    dtree = DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    return dtree

time: 544 µs (started: 2021-08-01 13:30:39 +02:00)


In [20]:
## Random Forest

time: 152 µs (started: 2021-08-01 13:30:39 +02:00)


In [21]:
def rfc_model():
    rfc = RandomForestClassifier(1500, n_jobs=-1)
    rfc.fit(X_train, y_train)
    return rfc

time: 363 µs (started: 2021-08-01 13:30:39 +02:00)


In [22]:
## Multiclass LDA

time: 160 µs (started: 2021-08-01 13:30:39 +02:00)


In [23]:
def mlda_model():
    mlda = LinearDiscriminantAnalysis()
    mlda.fit(X_train, y_train)
    return mlda

time: 280 µs (started: 2021-08-01 13:30:39 +02:00)


In [24]:
## Gradient Boosting

time: 156 µs (started: 2021-08-01 13:30:39 +02:00)


In [25]:
def gbc_model():
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    return gbc

time: 264 µs (started: 2021-08-01 13:30:39 +02:00)


In [26]:
## Bagging

time: 167 µs (started: 2021-08-01 13:30:39 +02:00)


In [27]:
def bc_model():
    bc = BaggingClassifier()
    bc.fit(X_train, y_train)
    return bc

time: 1.96 ms (started: 2021-08-01 13:30:39 +02:00)


In [28]:
## Extra Tree

time: 506 µs (started: 2021-08-01 13:30:39 +02:00)


In [29]:
def etc_model():
    etc = ExtraTreesClassifier(1000, max_features=2, max_depth=None, min_samples_split=2, n_jobs=-1)
    etc.fit(X_train, y_train)
    return etc

time: 472 µs (started: 2021-08-01 13:30:39 +02:00)


In [30]:
def etc_pipe_model():
    etc_pipe = Pipeline([
        ('feature_selection', SelectFromModel(ExtraTreesClassifier(1000))),
        ('classification', ExtraTreesClassifier(1000, max_features=2, max_depth=None, min_samples_split=2, n_jobs=-1))
    ])
    etc_pipe.fit(X_train, y_train)
    return etc_pipe

time: 370 µs (started: 2021-08-01 13:30:39 +02:00)


In [31]:
## RFECV

time: 6.52 ms (started: 2021-08-01 13:30:39 +02:00)


In [32]:
def rfecv_model():
    rfecv = RFECV(estimator = ExtraTreesClassifier(1500), n_jobs=-1, verbose = 1)
    rfecv.fit(X_train, y_train)
    return rfecv

time: 705 µs (started: 2021-08-01 13:30:39 +02:00)


In [33]:
## Metrics

time: 152 µs (started: 2021-08-01 13:30:39 +02:00)


In [40]:
classifiers = [
    'knn_ball_tree', 
    'knn_kd_tree',
#    'svm',  # Kernel freeze
    'dtree', 
    'rfc', 
#    'mlda', # Kernel crash - no more ram ...
    'gbc', 
    'bc', 
    'etc', 
#    'etc_pipe', # Kernel crash
#    'rfecv', # Kernel crash - no more ram ...
]

time: 419 µs (started: 2021-08-01 16:37:25 +02:00)


In [None]:
for classifier in classifiers :
    model_fn = globals()[classifier + "_model"]
    
    print(classifier)
    print("> Training")
    
    start_time = time.time()
    model = model_fn()
    training_duration = round(time.time() - start_time, 3)
    print(f"< {training_duration}s")
    
    print("> Testing")
    start_time = time.time()
    y_pred = model.predict(X_test)
    count_misclassified = (y_test != y_pred).sum()
    total_y = len(y_test)
    testing_duration = round(time.time() - start_time)
    print(f"< {testing_duration}s")

    print("> Result")
    print(f"Misclassified samples: {count_misclassified}/{total_y}")
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}'.format(accuracy))
          
    print()

knn_ball_tree
> Training
< 73.117s
> Testing
< 4372s
> Result
Misclassified samples: 26959/60558
Accuracy: 0.55

knn_kd_tree
> Training
< 92.289s
> Testing
< 4691s
> Result
Misclassified samples: 26956/60558
Accuracy: 0.55

dtree
> Training
< 5.69s
> Testing
< 0s
> Result
Misclassified samples: 0/60558
Accuracy: 1.00

rfc
> Training
< 249.879s
> Testing
< 4s
> Result
Misclassified samples: 0/60558
Accuracy: 1.00

gbc
> Training
< 646.379s
> Testing
< 1s
> Result
Misclassified samples: 0/60558
Accuracy: 1.00

bc
> Training
< 31.88s
> Testing
< 4s
> Result
Misclassified samples: 0/60558
Accuracy: 1.00

etc
> Training
< 275.184s
> Testing
< 5s
> Result
Misclassified samples: 0/60558
Accuracy: 1.00
