In [1]:
import os
import time
import nest_asyncio
nest_asyncio.apply()

import pyshark

import numpy as np
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD

import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autotime extension (ipython-autotime package); it prints the
# "time: ..." line seen after every subsequent cell.
%load_ext autotime

time: 186 µs (started: 2021-08-02 18:19:30 +02:00)


In [3]:
## Loading data

time: 227 µs (started: 2021-08-02 18:19:30 +02:00)


In [4]:
# Every packet is laid out on a 75 x 20 byte grid (1500 bytes in total).
_GRID_ROWS = 75
_GRID_COLS = 20


def _packet_to_features(raw_packet, rows=_GRID_ROWS, cols=_GRID_COLS):
    """Convert one raw packet into a fixed-length uint8 feature vector.

    The packet is truncated to ``rows * cols`` bytes and zero-padded on the
    right when shorter.  Byte ``i`` is placed at grid position
    ``[i % rows, i // rows]`` (the grid is filled column by column) and the
    grid is then flattened row by row.

    Returns:
        1-D ``np.uint8`` array of length ``rows * cols``.
    """
    size = rows * cols
    payload = np.frombuffer(bytes(raw_packet[:size]), dtype=np.uint8)
    padded = np.zeros(size, dtype=np.uint8)
    padded[:payload.size] = payload
    # reshape(cols, rows)[y, x] == padded[y * rows + x]; transposing gives
    # grid[x, y], and flatten() reads that grid in row-major order.
    return padded.reshape(cols, rows).T.flatten()


def loadDataset(path, extraId=""):
    """Load a capture file and return one feature row per packet.

    Args:
        path: Path of the capture file handed to ``pyshark.FileCapture``.
        extraId: Unused; kept so existing callers keep working.

    Returns:
        ``np.uint8`` array of shape ``(n_packets, 1500)``.  An empty capture
        yields shape ``(0, 1500)`` so the result can still be concatenated
        with other datasets.
    """
    cap = pyshark.FileCapture(path, use_json=True, include_raw=True)
    try:
        cap.load_packets()
        raw_data_list = [c.get_raw_packet() for c in cap]
    finally:
        # Always release the capture's tshark subprocess, even when reading
        # or parsing fails part-way through.
        cap.close()

    dataset = np.zeros((len(raw_data_list), _GRID_ROWS * _GRID_COLS), dtype=np.uint8)
    for row, raw_data in enumerate(raw_data_list):
        dataset[row] = _packet_to_features(raw_data)
    return dataset

time: 762 µs (started: 2021-08-02 18:19:30 +02:00)


In [5]:
# Malicious HTTP queries: one feature row per captured packet.
badQueries = loadDataset(path='./dataset/badHttpQueriesFiltered.pcap')

# Number of malicious samples (rows of the feature matrix).
badCount = badQueries.shape[0]

time: 1min 22s (started: 2021-08-02 18:19:30 +02:00)


In [6]:
# Display how many samples were loaded from the bad-queries capture.
badCount

153824

time: 3.72 ms (started: 2021-08-02 18:20:53 +02:00)


In [7]:
# Legitimate HTTP queries: one feature row per captured packet.
validQueries = loadDataset(path='./dataset/goodHttpQueriesFiltered.pcap')

# Number of legitimate samples (rows of the feature matrix).
validCount = validQueries.shape[0]

time: 1min 26s (started: 2021-08-02 18:20:53 +02:00)


In [8]:
# Display how many samples were loaded from the good-queries capture.
validCount

148962

time: 3.7 ms (started: 2021-08-02 18:22:19 +02:00)


In [9]:
## Preparing the dataset

time: 170 µs (started: 2021-08-02 18:22:19 +02:00)


In [10]:
# Labels: 1 for malicious, 0 for clean. Kept as plain Python lists because
# they are joined with list concatenation further down.
yBad = [1] * len(badQueries)
yGood = [0] * len(validQueries)

time: 7.67 ms (started: 2021-08-02 18:22:19 +02:00)


In [11]:
# Stack malicious rows first, then clean rows; labels follow the same order.
X = np.concatenate([badQueries, validQueries])
y = [*yBad, *yGood]

time: 212 ms (started: 2021-08-02 18:22:19 +02:00)


In [12]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# shuffle (and therefore every metric below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

time: 416 ms (started: 2021-08-02 18:22:19 +02:00)


In [13]:
## Model

time: 194 µs (started: 2021-08-02 18:22:19 +02:00)


In [14]:
# Fix random_state so that tie-breaking between equally good splits is
# deterministic and the fitted tree is the same on every run.
dtree = DecisionTreeClassifier(random_state=42)

time: 634 µs (started: 2021-08-02 18:22:20 +02:00)


In [15]:
## Train

time: 340 µs (started: 2021-08-02 18:22:20 +02:00)


In [16]:
# Fit the tree on the training split; fit() returns the estimator itself,
# which is why the cell displays its repr.
dtree.fit(X=X_train, y=y_train)

DecisionTreeClassifier()

time: 7.8 s (started: 2021-08-02 18:22:20 +02:00)


In [17]:
## Metrics

time: 227 µs (started: 2021-08-02 18:22:27 +02:00)


In [18]:
# Predict the held-out split and count the element-wise disagreements.
# y_test is a plain list, so the comparison goes through NumPy explicitly.
y_pred = dtree.predict(X_test)
count_misclassified = np.not_equal(y_test, y_pred).sum()

time: 113 ms (started: 2021-08-02 18:22:27 +02:00)


In [19]:
# Summarise hold-out performance: raw error count and overall accuracy.
# NOTE(review): if this reports a perfect score, check for leakage. The
# features are whole raw frames, so the tree may be keying on capture-specific
# header bytes rather than on the HTTP query itself - TODO confirm.
accuracy = accuracy_score(y_test, y_pred)
total_y = len(y_test)
print("> Result")
print(f"Misclassified samples: {count_misclassified}/{total_y}")
print('Accuracy: {:.2f}'.format(accuracy))

> Result
Misclassified samples: 0/60558
Accuracy: 1.00
time: 17.4 ms (started: 2021-08-02 18:22:27 +02:00)


In [20]:
## Testing

time: 173 µs (started: 2021-08-02 18:22:27 +02:00)


In [21]:
def frame_is_bad(frame):
    """Classify a single feature vector with the fitted global ``dtree``.

    Args:
        frame: One sample, wrapped into a batch of size one before being
            passed to ``dtree.predict``.

    Returns:
        ``"yes"`` when the predicted label is truthy, ``"no"`` otherwise.
    """
    batch = np.asarray([frame])
    predicted_label = dtree.predict(batch)[0]
    if predicted_label:
        return "yes"
    return "no"

time: 310 µs (started: 2021-08-02 18:22:27 +02:00)


In [22]:
# Spot-check held-out sample 0: model verdict vs. ground-truth label.
print("Is bad ?", frame_is_bad(X_test[0]))
print("Expected:", ("no", "yes")[bool(y_test[0])])

Is bad ? no
Expected: no
time: 1.72 ms (started: 2021-08-02 18:22:27 +02:00)


In [23]:
# Spot-check held-out sample 1: model verdict vs. ground-truth label.
print("Is bad ?", frame_is_bad(X_test[1]))
print("Expected:", ("no", "yes")[bool(y_test[1])])

Is bad ? yes
Expected: yes
time: 2.21 ms (started: 2021-08-02 18:22:27 +02:00)


In [24]:
# Spot-check held-out sample 2: model verdict vs. ground-truth label.
print("Is bad ?", frame_is_bad(X_test[2]))
print("Expected:", ("no", "yes")[bool(y_test[2])])

Is bad ? yes
Expected: yes
time: 998 µs (started: 2021-08-02 18:22:28 +02:00)


In [25]:
# Spot-check held-out sample 3: model verdict vs. ground-truth label.
print("Is bad ?", frame_is_bad(X_test[3]))
print("Expected:", ("no", "yes")[bool(y_test[3])])

Is bad ? no
Expected: no
time: 780 µs (started: 2021-08-02 18:22:28 +02:00)


In [26]:
# Spot-check held-out sample 4: model verdict vs. ground-truth label.
print("Is bad ?", frame_is_bad(X_test[4]))
print("Expected:", ("no", "yes")[bool(y_test[4])])

Is bad ? no
Expected: no
time: 2.05 ms (started: 2021-08-02 18:22:28 +02:00)
