In [1]:
# imports
import os

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import pandas as pd
import numpy as np

from utils import dtype_cic_ids2018

In [2]:
# Read the raw CSE-CIC-IDS-2018 flow CSVs. The Tuesday 20-02-2018 file is read
# on its own (dataset_part2); everything under 'uniform_files' is globbed into
# dataset_part1.
dataset_part1: dd.DataFrame = dd.read_csv(
    os.path.join('..', '..', 'data','CSE-CIC-IDS-2018','raw_data', 'uniform_files', '*.csv'),
    dtype=dtype_cic_ids2018,
)
dataset_part2: dd.DataFrame = dd.read_csv(
    os.path.join('..', '..', 'data','CSE-CIC-IDS-2018', 'raw_data', 'Tuesday-20-02-2018_TrafficForML_CICFlowMeter.csv'),
    dtype=dtype_cic_ids2018,
)

# Any column that exists only in dataset_part2 is added to dataset_part1 as an
# all-None placeholder, so dataset_part1 has every column dataset_part2 has
# before the two frames are stacked.
extra_cols = dataset_part2.columns.difference(dataset_part1.columns)
for col in extra_cols:
    dataset_part1[col] = None

dataset: dd.DataFrame = dd.concat([dataset_part1, dataset_part2])  # type: ignore


In [3]:
# Loads already meta-labelled data (useful for relabelling)
# dataset: dd.DataFrame = dd.read_csv(os.path.join('..', '..', 'data','CSE-CIC-IDS-2018', 'meta_labelled_data', '*.part'), dtype=dtype_cic_ids2018)

In [4]:
# Convert the 'Timestamp' column to datetimes; dayfirst=True makes ambiguous
# numeric dates parse as day/month rather than month/day.
dataset['Timestamp'] = dd.to_datetime(dataset['Timestamp'], dayfirst=True)

def parse_date_time(part):  # datetime disambiguation
    """Shift every timestamp whose hour is before 08:00 forward by 12 hours.

    NOTE(review): presumably the raw captures log afternoon times on a 12-hour
    clock without an AM/PM marker, so hours 0-7 really mean 12:00-19:59 --
    confirm against the dataset documentation.

    Args:
        part: pandas DataFrame (one dask partition) holding a datetime-typed
            'Timestamp' column.

    Returns:
        A new DataFrame with the adjusted 'Timestamp' column. The frame passed
        in is left unmodified.
    """
    # Work on a copy: the function is applied through map_partitions, and
    # mutating the input in place could leak the shift into any other
    # computation that still references the same partition object.
    part = part.copy()
    mask = part['Timestamp'].dt.hour < 8
    part.loc[mask, 'Timestamp'] += pd.Timedelta(hours=12)
    return part

# Apply parse_date_time to every partition of the dask frame.
dataset = dataset.map_partitions(parse_date_time)

In [5]:
# dataset['Label'].isna().any().compute()

In [6]:
# attack labelling script

# note: this csv was generated from the table provided by the authors at: https://www.unb.ca/cic/datasets/ids-2018.html
attack_list: pd.DataFrame = pd.read_csv(
    os.path.join('..', '..', 'data','CSE-CIC-IDS-2018', 'attack_list.csv')
)

# Build the attack window bounds from the textual date and time columns:
# ':00' is appended to the start time and ':59' to the finish time.
attack_list['start datetime'] = pd.to_datetime(
    attack_list['Date'] + ' ' + attack_list['Attack Start Time'] + ':00',
    dayfirst=True,
)
attack_list['end datetime'] = pd.to_datetime(
    attack_list['Date'] + ' ' + attack_list['Attack Finish Time'] + ':59',
    dayfirst=True,
)
# 'Date' is converted last: the two statements above concatenate it as text.
attack_list['Date'] = pd.to_datetime(attack_list['Date'], dayfirst=True).dt.date

# Lookup Series indexed by 'Label' whose values are the 'Attack Category'.
cat_list = pd.read_csv(
    os.path.join('..', '..', 'data','CSE-CIC-IDS-2018', 'category_list.csv')
)
cat_list = pd.Series(cat_list['Attack Category'].values, index=cat_list['Label'])

# Seed both meta-label columns with 'Benign' for every flow; label_attacks
# overwrites them per partition. The assignment order fixes the column order.
dataset['attack name'] = 'Benign'
dataset['attack category'] = 'Benign'
def label_attacks(partition, add_categories=True):
    """Attach 'attack name' (and optionally 'attack category') to one partition.

    For every row of the module-level ``attack_list`` the flows whose 'Label'
    equals the attack's label are matched by time:

    * 'Infilteration' and 'Bot' attacks must fall inside the attack's
      [start datetime, end datetime] window;
    * every other attack only has to occur on the attack's calendar date.

    Matching flows receive the attack's 'Attack Name'. Flows that are not
    labelled 'Benign' but matched no attack end up as 'Unknown'.

    Args:
        partition: pandas DataFrame with 'Timestamp' (datetime), 'Label' and
            'attack name' columns.
        add_categories: when True, 'attack category' is set by mapping 'Label'
            through the module-level ``cat_list`` Series.

    Returns:
        A new DataFrame carrying the labels; the input partition is not
        modified.
    """
    # Copy first so the partition handed in by dask is never mutated in place.
    partition = partition.copy()

    timestamps = partition['Timestamp']
    flow_dates = timestamps.dt.date  # loop-invariant, computed once
    labels = partition['Label']

    for _, attack in attack_list.iterrows():
        if attack['Label'] in ['Infilteration', 'Bot']:
            # These attacks are matched on the exact attack window.
            is_correct_date = (attack['start datetime'] <= timestamps) & (timestamps <= attack['end datetime'])
        else:
            is_correct_date = flow_dates == attack['Date']
        is_part_of_attack = (labels == attack['Label']) & is_correct_date
        partition.loc[is_part_of_attack, 'attack name'] = attack['Attack Name']  # attack label

    if add_categories:
        # The mapping does not depend on the attack row, so it runs once here
        # instead of once per attack.
        # note: dask sends test data to this function so it's important 'foo' is in category_list
        partition['attack category'] = labels.map(cat_list)  # add category label

    unmatched = (labels != 'Benign') & (partition['attack name'] == 'Benign')
    partition.loc[unmatched, 'attack name'] = 'Unknown'
    return partition

# meta = {**dtype_cic_ids2018, **{'attack category': 'str', 'attack name': 'str', 'Timestamp': 'datetime64[ns]'}}
# Pass an explicit `meta` to map_partitions, built from a copy of the first
# partition of `dataset`.
# NOTE(review): the literal 'string' assigned below is a placeholder value, not
# a dtype name; presumably only the resulting column dtype matters to dask --
# confirm. Both columns already exist on `dataset` at this point.
meta_frame = dataset.partitions[0].copy()
meta_frame['attack name'] = 'string'
meta_frame['attack category'] = 'string'
# Apply label_attacks to every partition using the schema described above.
metalabelled_dataset: dd.DataFrame = dataset.map_partitions(label_attacks, meta=meta_frame) # type: ignore

In [7]:
# code to convert to one file in case it's ever necessary
# dat2 = dd.read_csv(os.path.join('..', '..', 'data','CSE-CIC-IDS-2018', 'meta_labelled_data', '*.part'), dtype=dtype_cic_ids2018)
# dat2.to_csv(os.path.join('..', '..', 'data', 'CSE-CIC-IDS-2018', 'meta_labelled_data2'), single_file=True, index=False)

In [8]:
# Write the labelled dataset into the 'meta_labelled_data' directory as
# multiple part files (single_file=False), without the index column.
metalabelled_dataset.to_csv(os.path.join('..', '..', 'data', 'CSE-CIC-IDS-2018', 'meta_labelled_data'), single_file=False, index=False)

['/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/000.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/001.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/002.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/003.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/004.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/005.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/006.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/007.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/008.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/009.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/010.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/011.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/012.part',
 '/home/calvin/FYP/data/CSE-CIC-IDS-2018/meta_labelled_data/013.part',
 '/hom