In [1]:
import os
import nest_asyncio
nest_asyncio.apply()

import numpy as np
import pyshark
import xxhash
from tqdm import tqdm

In [2]:
%load_ext autotime

time: 277 µs (started: 2021-10-08 15:21:30 +00:00)


In [3]:
# Root folder that receives the per-packet .npy files.
# (The "trimed" spelling is the actual on-disk name, so it is kept as is.)
TRIMED_DATASET_FOLDER = "trimed_dataset"

# One sub-folder per class label.
MALICIOUS_DATASET_FOLDER = f"{TRIMED_DATASET_FOLDER}/malicious"
GOOD_DATASET_FOLDER = f"{TRIMED_DATASET_FOLDER}/good"

time: 2.35 ms (started: 2021-10-08 15:21:30 +00:00)


In [4]:
def _packet_to_flags(raw_packet: bytes, size: int = 1500) -> np.ndarray:
    """Turn raw packet bytes into a fixed-length boolean vector.

    Element ``i`` is True when byte ``i`` of the packet is non-zero. Packets
    shorter than ``size`` are padded with False, longer ones are truncated so
    that an oversized frame can no longer raise IndexError.
    """
    # NOTE(review): a bool dtype keeps only "byte != 0", not the byte value.
    # It is preserved here so files stay compatible with those already written;
    # switch to np.uint8 if the real byte values are wanted downstream.
    flags = np.zeros(size, dtype=np.bool_)  # np.bool alias was removed in NumPy 1.24
    payload = np.frombuffer(bytes(raw_packet)[:size], dtype=np.uint8)
    flags[:payload.size] = payload != 0
    return flags


def dataset_preprocessing(pcap_filepath: str, is_malicious: bool) -> None:
    """Export every frame of a pcap file as an individual ``.npy`` file.

    Each frame is written to ``<out_folder>/<hash>.npy`` where ``hash`` is the
    xxh64 digest of the capture path followed by the frame index, and
    ``out_folder`` is MALICIOUS_DATASET_FOLDER or GOOD_DATASET_FOLDER depending
    on ``is_malicious``. Frames whose output file already exists are skipped,
    so the function can be re-run safely.

    Args:
        pcap_filepath: Path of the capture file to read with pyshark.
        is_malicious: Selects the output folder (True -> malicious).
    """
    out_folder = MALICIOUS_DATASET_FOLDER if is_malicious else GOOD_DATASET_FOLDER
    # np.save does not create missing directories.
    os.makedirs(out_folder, exist_ok=True)

    h = xxhash.xxh64()
    cap = pyshark.FileCapture(pcap_filepath, use_json=True, include_raw=True)
    try:
        # No total is passed: len(cap) is evaluated before any packet has been
        # read, so it does not give the real packet count.
        with tqdm() as pbar:
            pbar.set_description(f"Processing {pcap_filepath}")
            for index, frame in enumerate(cap):
                # Keep the digest input exactly "path + index" so that files
                # produced by earlier runs are still recognised and skipped.
                h.update(pcap_filepath)
                h.update(str(index))
                frame_hash = h.intdigest()
                h.reset()

                out_file = f"{out_folder}/{frame_hash}.npy"
                if not os.path.exists(out_file):
                    np.save(out_file, _packet_to_flags(frame.get_raw_packet()))
                pbar.update(1)
    finally:
        # Release the capture (and its tshark subprocess) even on failure.
        cap.close()
            
def dataset_preprocessing_list(filepaths: list[str], is_malicious: bool) -> None:
    """Run ``dataset_preprocessing`` on each capture path, in the given order.

    Args:
        filepaths: Paths of the pcap files to process.
        is_malicious: Label forwarded unchanged to every call.
    """
    for pcap_filepath in filepaths:
        dataset_preprocessing(pcap_filepath, is_malicious)

time: 10.9 ms (started: 2021-10-08 15:21:30 +00:00)


In [5]:
# Export the frames of the "bad" HTTP captures with the malicious label.
dataset_preprocessing_list(
    [
        "dataset/badHttpQueries_00000_20210727145629.pcap",
        "dataset/badHttpQueries_00001_20210727145652.pcap",
        "dataset/badHttpQueries_00002_20210727145716.pcap",
        "dataset/badHttpQueries_00003_20210727145738.pcap",
        "dataset/badHttpQueries_00004_20210727145801.pcap",
        "dataset/badHttpQueries_00005_20210727145823.pcap",
    ],
    is_malicious=True,
)

Processing dataset/badHttpQueries_00000_20210727145629.pcap: : 50000it [00:25, 1946.47it/s]
Processing dataset/badHttpQueries_00001_20210727145652.pcap: : 50000it [00:23, 2086.20it/s]
Processing dataset/badHttpQueries_00002_20210727145716.pcap: : 50000it [00:24, 2067.55it/s]
Processing dataset/badHttpQueries_00003_20210727145738.pcap: : 50000it [00:24, 2080.61it/s]
Processing dataset/badHttpQueries_00004_20210727145801.pcap: : 50000it [00:23, 2094.59it/s]
Processing dataset/badHttpQueries_00005_20210727145823.pcap: : 13750it [00:07, 1946.16it/s]

time: 2min 8s (started: 2021-10-08 15:21:30 +00:00)





In [6]:
# Export the frames of the "good" HTTP captures with the non-malicious label.
dataset_preprocessing_list(
    [
        "dataset/goodHttpQueries_00000_20210727145319.pcap",
        "dataset/goodHttpQueries_00001_20210727145342.pcap",
        "dataset/goodHttpQueries_00002_20210727145404.pcap",
        "dataset/goodHttpQueries_00003_20210727145427.pcap",
        "dataset/goodHttpQueries_00004_20210727145449.pcap",
        "dataset/goodHttpQueries_00005_20210727145511.pcap",
    ],
    is_malicious=False,
)

Processing dataset/goodHttpQueries_00000_20210727145319.pcap: : 50000it [00:24, 2070.37it/s]
Processing dataset/goodHttpQueries_00001_20210727145342.pcap: : 50000it [00:24, 2017.83it/s]
Processing dataset/goodHttpQueries_00002_20210727145404.pcap: : 50000it [00:23, 2106.55it/s]
Processing dataset/goodHttpQueries_00003_20210727145427.pcap: : 50000it [00:23, 2114.15it/s]
Processing dataset/goodHttpQueries_00004_20210727145449.pcap: : 50000it [00:24, 2061.70it/s]
Processing dataset/goodHttpQueries_00005_20210727145511.pcap: : 5402it [00:02, 1804.49it/s]

time: 2min 3s (started: 2021-10-08 15:23:39 +00:00)



