In [None]:
import os
import time
import nest_asyncio
nest_asyncio.apply()

import pyshark

import numpy as np
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD

import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `autotime` IPython extension (presumably reports per-cell run time -- TODO confirm).
%load_ext autotime

In [None]:
## Loading data

In [None]:
def loadDataset(path, extraId=""):
    """Read a capture file and encode every packet as a fixed-length byte vector.

    Each packet's raw bytes are truncated or zero-padded to 1500 values and
    laid out exactly as if they had been written column by column into a
    75x20 ``uint8`` grid that is then flattened row by row; i.e. byte ``i``
    ends up at flat index ``(i % 75) * 20 + i // 75``.

    Args:
        path: Path of the capture file handed to ``pyshark.FileCapture``.
        extraId: Currently unused; kept so existing callers keep working.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` and shape ``(n_packets, 1500)``.
        A capture without packets yields shape ``(0, 1500)`` so the result can
        always be concatenated with another dataset along axis 0.
    """
    rows, cols = 75, 20
    size = rows * cols

    cap = pyshark.FileCapture(path, use_json=True, include_raw=True)
    try:
        cap.load_packets()
        raw_data_list = [c.get_raw_packet() for c in cap]
    finally:
        # Always release the capture, even if reading a packet fails.
        cap.close()

    dataset = np.zeros((len(raw_data_list), size), dtype=np.uint8)
    for i, raw_data in enumerate(raw_data_list):
        # Keep at most `size` bytes; shorter packets stay zero-padded.
        payload = np.frombuffer(bytes(raw_data[:size]), dtype=np.uint8)
        padded = np.zeros(size, dtype=np.uint8)
        padded[:payload.size] = payload
        # reshape(cols, rows).T reproduces the column-by-column fill of a
        # (rows, cols) grid; ravel() then flattens it row by row.
        dataset[i] = padded.reshape(cols, rows).T.ravel()
    return dataset

In [None]:
# Encoded packets of the "bad" HTTP query capture, one row per packet.
badQueries = loadDataset('./dataset/badHttpQueriesFiltered.pcap')
# Number of rows, i.e. packets, that were loaded.
badCount = badQueries.shape[0]

In [None]:
# Display how many "bad" query packets were loaded.
badCount

In [None]:
# Encoded packets of the "good" HTTP query capture, one row per packet.
validQueries = loadDataset('./dataset/goodHttpQueriesFiltered.pcap')
# Number of rows, i.e. packets, that were loaded.
validCount = validQueries.shape[0]

In [None]:
# Display how many "good" query packets were loaded.
validCount

In [None]:
## Preparing the dataset

In [None]:
# Label lists, one entry per sample: 1 marks a malicious query, 0 a clean one.
yBad = [1] * len(badQueries)
yGood = [0] * len(validQueries)

In [None]:
# Malicious samples come first, clean samples second; the labels in `y`
# are joined in the same order so that row i of X matches y[i].
X = np.concatenate([badQueries, validQueries])
y = [*yBad, *yGood]

In [None]:
# Hold out 20% of the samples for testing. The fixed random_state makes the
# split -- and therefore every score computed from it -- reproducible when the
# notebook is re-run on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
## KNN

In [None]:
def knn_model():
    """Fit a k-nearest-neighbours classifier on the training split.

    The classifier is created with ``n_neighbors=1500`` and trained on the
    notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return KNeighborsClassifier(n_neighbors=1500).fit(X_train, y_train)

In [None]:
## Logistic Regression
def lr_model():
    """Fit a logistic-regression classifier with default settings.

    Trains on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return LogisticRegression().fit(X_train, y_train)

In [None]:
## SVM

In [None]:
def svm_model():
    """Fit a linear-kernel support vector classifier on the training split.

    Trains on the notebook-level ``X_train`` / ``y_train`` with ``C=1``.

    Returns:
        The fitted ``SVC`` estimator.
    """
    # C is the knob to adjust when tuning this model.
    classifier = SVC(C=1, kernel='linear')
    return classifier.fit(X_train, y_train)

In [None]:
## Decision Tree

In [None]:
def dtree_model():
    """Fit a decision-tree classifier with default settings.

    Trains on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
## Random Forest

In [None]:
def rfc_model():
    """Fit a random forest of 1500 trees on the training split.

    Trains on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(n_estimators=1500)
    return forest.fit(X_train, y_train)

In [None]:
## Multiclass LDA

In [None]:
def mlda_model():
    """Fit a linear discriminant analysis classifier with default settings.

    Trains on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``LinearDiscriminantAnalysis`` estimator.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return LinearDiscriminantAnalysis().fit(X_train, y_train)

In [None]:
## Gradient Boosting

In [None]:
def gbc_model():
    """Fit a gradient-boosting classifier with default settings.

    Trains on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
## Bagging

In [None]:
def bc_model():
    """Fit a bagging classifier with default settings.

    Trains on the notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``BaggingClassifier``.
    """
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return BaggingClassifier().fit(X_train, y_train)

In [None]:
## Extra Trees

In [None]:
def etc_model():
    """Fit an extra-trees ensemble of 1000 trees on the training split.

    Every split considers 2 features (``max_features=2``); tree depth is
    unbounded and a node needs at least 2 samples to be split. Trains on the
    notebook-level ``X_train`` / ``y_train``.

    Returns:
        The fitted ``ExtraTreesClassifier``.
    """
    ensemble = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=2,
        max_depth=None,
        min_samples_split=2,
    )
    return ensemble.fit(X_train, y_train)

In [None]:
def etc_pipe_model():
    """Fit a two-stage pipeline: feature selection followed by extra trees.

    Stage ``'feature_selection'`` wraps a 1000-tree ``ExtraTreesClassifier``
    in ``SelectFromModel``; stage ``'classification'`` is a second 1000-tree
    ``ExtraTreesClassifier`` (``max_features=2``, unbounded depth,
    ``min_samples_split=2``). Trains on the notebook-level
    ``X_train`` / ``y_train``.

    Returns:
        The fitted ``Pipeline``.
    """
    selector = SelectFromModel(ExtraTreesClassifier(n_estimators=1000))
    final_estimator = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=2,
        max_depth=None,
        min_samples_split=2,
    )
    steps = [
        ('feature_selection', selector),
        ('classification', final_estimator),
    ]
    return Pipeline(steps).fit(X_train, y_train)

In [None]:
## RFECV

In [None]:
def rfecv_model():
    """Fit cross-validated recursive feature elimination around extra trees.

    The wrapped estimator is a 1500-tree ``ExtraTreesClassifier``; ``RFECV``
    runs with ``n_jobs=-1`` and ``verbose=1``. Trains on the notebook-level
    ``X_train`` / ``y_train``.

    Returns:
        The fitted ``RFECV`` object.
    """
    base_estimator = ExtraTreesClassifier(n_estimators=1500)
    selector = RFECV(estimator=base_estimator, n_jobs=-1, verbose=1)
    return selector.fit(X_train, y_train)

In [None]:
## Metrics

In [2]:
# Short names of the models to evaluate. The evaluation loop resolves each
# entry to the factory function "<name>_model" defined in an earlier cell.
# Entries that are commented out were disabled for the reason given beside them.
# NOTE(review): `lr` (lr_model) is defined above but not listed here --
# confirm whether that omission is intentional.
classifiers = [
    'knn', 
    #'svm',  # Disabled: the kernel freezes
    'dtree', 
    'rfc', 
#    'mlda', # Disabled: the kernel crashes, out of RAM
    'gbc', 
    'bc', 
    'etc', 
    'etc_pipe', 
#    'rfecv', # Disabled: the kernel crashes, out of RAM
]

In [None]:
# Train and score every model listed in `classifiers`, printing for each one
# the training time, the prediction time and the accuracy on the test split.
for classifier in classifiers:
    # Each entry names a factory function "<name>_model" defined in an earlier cell.
    model_fn = globals()[classifier + "_model"]

    print(classifier)
    print("> Training")

    # perf_counter() is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    model = model_fn()
    training_duration = round(time.perf_counter() - start_time, 3)
    print(f"< {training_duration}s")

    print("> Testing")
    start_time = time.perf_counter()
    y_pred = model.predict(X_test)
    # Stop the clock right after predict() so only inference is measured, and
    # keep millisecond precision like the training time (whole-second rounding
    # reports fast models as "0s").
    testing_duration = round(time.perf_counter() - start_time, 3)
    print(f"< {testing_duration}s")

    # y_test may be a plain Python list; convert it so the element-wise
    # comparison does not depend on reflected list/ndarray operators.
    y_true = np.asarray(y_test)
    count_misclassified = int((y_true != y_pred).sum())
    total_y = len(y_true)

    print("> Result")
    print(f"Misclassified samples: {count_misclassified}/{total_y}")
    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy: {:.2f}'.format(accuracy))

    print()