In [1]:
# imports
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import pandas as pd

from utils import dtype

In [2]:
# Dask cluster setup: start a local cluster sized from os.cpu_count()
# and connect a client to it.
cluster = LocalCluster(
    n_workers=os.cpu_count(),
)
client = Client(cluster)

In [4]:
# Directory holding the raw CSE-CIC-IDS-2018 CSV files, relative to this notebook.
ids2018_raw_dir = os.path.join('..', '..', 'data', 'CSE-CIC-IDS-2018', 'raw_data')

# The Tuesday 20-02-2018 file is read on its own; every other file is read
# through the 'uniform_files' glob.
dataset_part2: dd.DataFrame = dd.read_csv(
    os.path.join(ids2018_raw_dir, 'Tuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'),
    dtype=dtype,
)
dataset_part1: dd.DataFrame = dd.read_csv(
    os.path.join(ids2018_raw_dir, 'uniform_files', '*.csv'),
    dtype=dtype,
)

# Columns that exist only in part 2 are added to part 1 as None so that both
# frames expose the same set of columns before they are concatenated.
extra_cols = dataset_part2.columns.difference(dataset_part1.columns)
for col in extra_cols:
    dataset_part1[col] = None

dataset: dd.DataFrame = dd.concat([dataset_part1, dataset_part2])  # type: ignore

# Parse the timestamp strings, reading the day before the month.
dataset['Timestamp'] = dd.to_datetime(dataset['Timestamp'], dayfirst=True)

In [5]:
# attack labelling: load the attack table and derive each attack's time window

# note: this csv was generated from the table provided by the authors at: https://www.unb.ca/cic/datasets/ids-2018.html
attack_list: pd.DataFrame = pd.read_csv(
    os.path.join('..', '..', 'data', 'CSE-CIC-IDS-2018', 'attack_list.csv')
)

# The start time gets ':00' appended and the finish time ':59', so the window
# covers the whole of the first and last listed minute.
attack_list['start datetime'] = pd.to_datetime(
    attack_list['Date'] + ' ' + attack_list['Attack Start Time'] + ':00',
    dayfirst=True,
)
attack_list['end datetime'] = pd.to_datetime(
    attack_list['Date'] + ' ' + attack_list['Attack Finish Time'] + ':59',
    dayfirst=True,
)

# Every flow defaults to 'Benign'; label_attack overwrites the rows that match an attack.
dataset['attack name'] = 'Benign'
def label_attack(partition, attacks=None):
    """Set 'attack name' for the flows of one partition that belong to a known attack.

    A flow is assigned an attack's 'Attack Name' when its 'Timestamp' lies within
    that attack's ['start datetime', 'end datetime'] window (both ends inclusive)
    and its 'Label' equals the attack's 'Attack Type'. Rows matching no attack keep
    whatever 'attack name' they already had. If several attacks match a row, the
    one appearing last in the table wins.

    Parameters
    ----------
    partition : pandas.DataFrame
        One partition with 'Timestamp', 'Label' and 'attack name' columns.
    attacks : pandas.DataFrame, optional
        Attack table with 'Attack Name', 'Attack Type', 'start datetime' and
        'end datetime' columns. Defaults to the module-level ``attack_list``.

    Returns
    -------
    pandas.DataFrame
        A labelled copy of ``partition``; the input frame is left untouched.
    """
    if attacks is None:
        attacks = attack_list

    # Work on a copy: functions handed to map_partitions should not mutate the
    # partition they receive, as the same object may be reused by other tasks.
    labelled = partition.copy()
    timestamps = labelled['Timestamp']
    labels = labelled['Label']

    for _, attack in attacks.iterrows():
        # Series.between is inclusive on both ends by default.
        in_window = timestamps.between(attack['start datetime'], attack['end datetime'])
        is_part_of_attack = in_window & (labels == attack['Attack Type'])
        labelled.loc[is_part_of_attack, 'attack name'] = attack['Attack Name']
    return labelled
# Apply the attack labelling to every partition of the dataset.
metalabelled_dataset: dd.DataFrame = dataset.map_partitions(  # type: ignore
    label_attack,
)

In [6]:
# Write the labelled dataset as multiple CSV part files, without the index column.
metalabelled_dataset.to_csv(
    os.path.join('..', '..', 'data', 'CSE-CIC-IDS-2018', 'meta_labelled_data'),
    single_file=False,
    index=False,
)

['/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/000.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/001.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/002.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/003.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/004.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/005.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/006.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/007.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/008.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/009.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/010.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/011.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_labelled_data/012.part',
 '/Users/admin/Documents/Code/Git/FYP/data/meta_lab