In [1]:
import os
import nest_asyncio
nest_asyncio.apply()

import numpy as np
import pyshark
import xxhash
from tqdm import tqdm

In [2]:
# autotime reports the wall-clock duration of every cell after it finishes
# (the "time: ..." lines in the recorded outputs).
%load_ext autotime
# Executes dataset.ipynb in this kernel. Presumably that is where
# MALICIOUS_DATASET_FOLDER and GOOD_DATASET_FOLDER (used by
# dataset_preprocessing below) are defined — TODO confirm.
%run dataset.ipynb

time: 320 ms (started: 2021-11-01 16:31:56 +00:00)


In [19]:
def dataset_preprocessing(pcap_filepath, is_malicious, max_len=1500, dtype=bool):
    """Dump every frame of a capture file into its own fixed-size ``.npy`` array.

    The raw bytes of each frame are copied to the front of a zero-filled
    array of ``max_len`` elements, which is saved as
    ``<out_folder>/<hash>.npy``. The hash is the xxh64 digest of the capture
    path followed by the frame index, so re-running the function skips
    frames whose output file already exists.

    Args:
        pcap_filepath: Path of the capture file handed to pyshark.
        is_malicious: When true the arrays go to MALICIOUS_DATASET_FOLDER,
            otherwise to GOOD_DATASET_FOLDER.
        max_len: Length of the output array. Frames longer than this are
            skipped rather than truncated.
        dtype: Element type of the output array. With the default ``bool``
            every non-zero byte is stored as ``True``, so the byte values
            themselves are not preserved.
            NOTE(review): pass ``np.uint8`` if downstream code needs the
            actual bytes — confirm against the consumer of these files.
    """
    out_folder = MALICIOUS_DATASET_FOLDER if is_malicious else GOOD_DATASET_FOLDER
    # np.save does not create missing directories.
    os.makedirs(out_folder, exist_ok=True)

    cap = pyshark.FileCapture(pcap_filepath, use_json=True, include_raw=True)
    try:
        # Packets are not loaded up front (load_packets is never called), so
        # len(cap) does not give the real frame count here; the bar is left
        # without a total instead of being given a misleading one.
        with tqdm(desc=f"Processing {pcap_filepath}") as pbar:
            for index, frame in enumerate(cap):
                pbar.update(1)

                # Same key as before (path, then index) so files written by
                # earlier runs are still recognised and skipped.
                hasher = xxhash.xxh64()
                hasher.update(pcap_filepath)
                hasher.update(str(index))
                out_file = f"{out_folder}/{hasher.intdigest()}.npy"
                if os.path.exists(out_file):
                    continue

                if len(frame) > max_len:
                    continue

                payload = frame.get_raw_packet()
                # Guard against a raw payload longer than the reported frame
                # length, which would otherwise overflow the buffer.
                if len(payload) > max_len:
                    continue

                raw_data = np.zeros(max_len, dtype=dtype)
                # Vectorised copy; the remainder of the array stays zero.
                raw_data[:len(payload)] = np.frombuffer(payload, dtype=np.uint8)
                np.save(out_file, raw_data)
    finally:
        # Always release the capture, even if a frame fails to parse.
        cap.close()
            
def dataset_preprocessing_list(filepaths, is_malicious):
    """Run dataset_preprocessing on each capture in turn with one shared label.

    Args:
        filepaths: Iterable of capture file paths, processed in order.
        is_malicious: Label passed unchanged to every dataset_preprocessing call.
    """
    for pcap_filepath in filepaths:
        dataset_preprocessing(pcap_filepath, is_malicious)

time: 2.03 ms (started: 2021-11-01 16:48:24 +00:00)


In [4]:
# badHttpQueries captures, processed with is_malicious=True.
dataset_preprocessing_list(
    filepaths=[
        "dataset/badHttpQueries_00000_20210727145629.pcap",
        "dataset/badHttpQueries_00001_20210727145652.pcap",
        "dataset/badHttpQueries_00002_20210727145716.pcap",
        "dataset/badHttpQueries_00003_20210727145738.pcap",
        "dataset/badHttpQueries_00004_20210727145801.pcap",
        "dataset/badHttpQueries_00005_20210727145823.pcap",
    ],
    is_malicious=True,
)

Processing dataset/badHttpQueries_00000_20210727145629.pcap: : 50000it [00:23, 2106.32it/s]
Processing dataset/badHttpQueries_00001_20210727145652.pcap: : 50000it [00:23, 2106.32it/s]
Processing dataset/badHttpQueries_00002_20210727145716.pcap: : 50000it [00:23, 2084.91it/s]
Processing dataset/badHttpQueries_00003_20210727145738.pcap: : 50000it [00:24, 2018.42it/s]
Processing dataset/badHttpQueries_00004_20210727145801.pcap: : 50000it [00:24, 2017.77it/s]
Processing dataset/badHttpQueries_00005_20210727145823.pcap: : 13750it [00:07, 1932.04it/s]

time: 2min 8s (started: 2021-11-01 16:31:56 +00:00)





In [5]:
# goodHttpQueries captures, processed with is_malicious=False.
dataset_preprocessing_list(
    filepaths=[
        "dataset/goodHttpQueries_00000_20210727145319.pcap",
        "dataset/goodHttpQueries_00001_20210727145342.pcap",
        "dataset/goodHttpQueries_00002_20210727145404.pcap",
        "dataset/goodHttpQueries_00003_20210727145427.pcap",
        "dataset/goodHttpQueries_00004_20210727145449.pcap",
        "dataset/goodHttpQueries_00005_20210727145511.pcap",
    ],
    is_malicious=False,
)

Processing dataset/goodHttpQueries_00000_20210727145319.pcap: : 50000it [00:24, 2048.36it/s]
Processing dataset/goodHttpQueries_00001_20210727145342.pcap: : 50000it [00:24, 2007.42it/s]
Processing dataset/goodHttpQueries_00002_20210727145404.pcap: : 50000it [00:24, 2046.72it/s]
Processing dataset/goodHttpQueries_00003_20210727145427.pcap: : 50000it [00:24, 2031.90it/s]
Processing dataset/goodHttpQueries_00004_20210727145449.pcap: : 50000it [00:24, 2030.39it/s]
Processing dataset/goodHttpQueries_00005_20210727145511.pcap: : 5402it [00:02, 1800.83it/s]

time: 2min 5s (started: 2021-11-01 16:34:04 +00:00)





In [20]:
# fic-s2 host captures (adm01, dc01, l01, pc01), processed with is_malicious=False.
dataset_preprocessing_list(
    filepaths=[
        "dataset/fic-s2-adm01-out_00000_20211101154337.pcapng",
        "dataset/fic-s2-dc01-out_00000_20211101144008.pcapng",
        "dataset/fic-s2-l01-out_00000_20211101143136.pcapng",
        "dataset/fic-s2-l01-out_00001_20211101150744.pcapng",
        "dataset/fic-s2-l01-out_00002_20211101150803.pcapng",
        "dataset/fic-s2-l01-out_00003_20211101150824.pcapng",
        "dataset/fic-s2-l01-out_00004_20211101150901.pcapng",
        "dataset/fic-s2-l01-out_00005_20211101150918.pcapng",
        "dataset/fic-s2-l01-out_00006_20211101150936.pcapng",
        "dataset/fic-s2-l01-out_00007_20211101150951.pcapng",
        "dataset/fic-s2-l01-out_00008_20211101151006.pcapng",
        "dataset/fic-s2-pc01-out_00000_20211101154413.pcapng",
        "dataset/fic-s2-pc01-out_00001_20211101162626.pcapng",
        "dataset/fic-s2-pc01-out_00002_20211101162740.pcapng",
        "dataset/fic-s2-pc01-out_00003_20211101164316.pcapng",
    ],
    is_malicious=False,
)

Processing dataset/fic-s2-adm01-out_00000_20211101154337.pcapng: : 6217it [00:01, 3191.72it/s]
Processing dataset/fic-s2-dc01-out_00000_20211101144008.pcapng: : 13921it [05:44, 40.37it/s]  
Processing dataset/fic-s2-l01-out_00000_20211101143136.pcapng: : 50000it [00:21, 2350.99it/s]
Processing dataset/fic-s2-l01-out_00001_20211101150744.pcapng: : 50000it [00:21, 2377.24it/s]
Processing dataset/fic-s2-l01-out_00002_20211101150803.pcapng: : 50000it [00:21, 2377.25it/s]
Processing dataset/fic-s2-l01-out_00003_20211101150824.pcapng: : 50000it [00:21, 2358.56it/s]
Processing dataset/fic-s2-l01-out_00004_20211101150901.pcapng: : 50000it [00:21, 2369.99it/s]
Processing dataset/fic-s2-l01-out_00005_20211101150918.pcapng: : 50000it [00:20, 2383.55it/s]
Processing dataset/fic-s2-l01-out_00006_20211101150936.pcapng: : 50000it [00:21, 2348.44it/s]
Processing dataset/fic-s2-l01-out_00007_20211101150951.pcapng: : 50000it [00:21, 2365.36it/s]
Processing dataset/fic-s2-l01-out_00008_20211101151006.pca

time: 10min 4s (started: 2021-11-01 16:48:27 +00:00)



