In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from cuml.neighbors import NearestNeighbors
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter
from cuml.preprocessing.TargetEncoder import TargetEncoder
from sklearn.preprocessing import LabelEncoder
import gc
from tqdm.auto import tqdm

In [2]:
def save_balanced_test_train(path, drop_cols=['traffic_type'], target_col="traffic_type", cat_features=None):
    """Split a parquet dataset into train/test, clean the training split with ENN, and save both.

    Steps:
      1. Read ``path`` and separate the features from the label ``target_col``.
      2. Make a stratified 80/20 train/test split (``random_state=42``).
      3. Undersample only the majority class of the training split with
         ``EditedNearestNeighbours`` (3-neighbour cuML search, ``kind_sel="mode"``).
         The test split is never resampled.
      4. Optionally target-encode ``cat_features`` with cuML's ``TargetEncoder``,
         fitted on the balanced training split and then applied to the test split.
      5. Write ``<stem>_train.parquet`` and ``<stem>_test.parquet`` next to ``path``.

    Parameters
    ----------
    path : pathlib.Path or str
        Location of the source parquet file.
    drop_cols : list of str, str or None
        Columns removed from the feature matrix. ``target_col`` is always
        removed as well, even if it is not listed here, so the label can never
        leak into the features.
    target_col : str
        Name of the label column in the source file.
    cat_features : list of str or None
        Columns to target-encode. ``None`` skips the encoding step entirely.

    Returns
    -------
    None
        The two splits are written to disk; nothing is returned.

    Notes
    -----
    The label column in both output files is always named ``"traffic_type"``,
    whatever ``target_col`` was in the source file.
    """
    path = Path(path)  # accept plain strings as well as Path objects
    df = pd.read_parquet(path)

    # Build a fresh list (never mutate the shared default argument) and make
    # sure the label itself is part of it, otherwise the target leaks into X.
    if drop_cols is None:
        feature_drop_cols = []
    elif isinstance(drop_cols, str):
        feature_drop_cols = [drop_cols]
    else:
        feature_drop_cols = list(drop_cols)
    if target_col not in feature_drop_cols:
        feature_drop_cols.append(target_col)

    X = df.drop(feature_drop_cols, axis=1)
    y = df[target_col].astype("category")
    del df

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    del X, y

    print("Shape of X_train before balancing:")
    print(X_train.shape)

    print("Distribution of classes before balancing")
    print(dict(Counter(y_train)))

    print("Balancing now")
    # sampling_strategy="majority": only the most frequent class is edited.
    enn = EditedNearestNeighbours(
        sampling_strategy="majority",
        n_neighbors=NearestNeighbors(n_neighbors=3),
        kind_sel="mode"
    )
    X_balanced, y_balanced = enn.fit_resample(X_train, y_train)
    del X_train, y_train

    print("Shape of X_train after balancing:")
    print(X_balanced.shape)

    print("Distribution of classes after balancing")
    print(dict(Counter(y_balanced)))

    if cat_features is not None:
        # Rebind to new objects instead of mutating the split outputs in place.
        # The fresh 0..n-1 index is what lets the positionally indexed encoder
        # output line up with the test rows below.
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        FOLDS = 10
        SMOOTH = 0.001
        # NOTE(review): the encoding target is the integer category code of the
        # label; for multiclass labels this treats class codes as an ordinal
        # quantity - confirm this is intended.
        y_codes = y_balanced.cat.codes
        print("Starting target encode on categorical columns")
        for col in tqdm(cat_features):
            encoder = TargetEncoder(n_folds=FOLDS, smooth=SMOOTH)

            # Assumes the resampler returned X_balanced with a default
            # RangeIndex so the new Series aligns row by row - TODO confirm.
            X_balanced[col] = pd.Series(encoder.fit_transform(X_balanced[col].to_numpy(), y_codes))
            # The test split only sees statistics learned from the training split.
            X_test[col] = pd.Series(encoder.transform(X_test[col].to_numpy()))

    train_path = path.parent / (path.stem + "_train.parquet")
    test_path = path.parent / (path.stem + "_test.parquet")

    pd.concat([X_balanced, y_balanced.rename("traffic_type")], axis=1).to_parquet(train_path)
    del X_balanced, y_balanced
    pd.concat([X_test, y_test.rename("traffic_type")], axis=1).to_parquet(test_path)
    del X_test, y_test
    # The frames can be large; release them before the next dataset is processed.
    gc.collect()

In [3]:
# BaIoT Danmini, multiclass labels: default label column, no target encoding requested.
save_balanced_test_train(
    path=Path("data/BaIoT/danmini_multiclass.parquet"),
)

Shape of X_train before balancing:
(814638, 115)
Distribution of classes before balancing
{'bl_junk': 23255, 'mirai_ack': 81756, 'bl_udp': 84699, 'bl_combo': 47774, 'mirai_udp': 190132, 'benign': 39638, 'mirai_syn': 98058, 'mirai_scan': 86148, 'mirai_udpplain': 65586, 'bl_scan': 23879, 'bl_tcp': 73713}
Balancing now
[I] [01:48:42.259266] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(720665, 115)
Distribution of classes after balancing
{'benign': 39638, 'bl_combo': 47774, 'bl_junk': 23255, 'bl_scan': 23879, 'bl_tcp': 73713, 'bl_udp': 84699, 'mirai_ack': 81756, 'mirai_scan': 86148, 'mirai_syn': 98058, 'mirai_udp': 96159, 'mirai_udpplain': 65586}


In [4]:
# BaIoT Danmini, binary labels: default label column, no target encoding requested.
save_balanced_test_train(
    path=Path("data/BaIoT/danmini_binary.parquet"),
)

Shape of X_train before balancing:
(814638, 115)
Distribution of classes before balancing
{'malicious': 775000, 'benign': 39638}
Balancing now
[I] [01:49:16.084399] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(813932, 115)
Distribution of classes after balancing
{'benign': 39638, 'malicious': 774294}


In [5]:
# DoHBrw, binary labels: the listed columns are target-encoded after balancing.
save_balanced_test_train(
    path=Path("data/DoHBrw/binary.parquet"),
    cat_features=[
        'SourceIP',
        'DestinationIP',
        'SourcePort',
        'DestinationPort',
        'DoH',
    ],
)

Shape of X_train before balancing:
(927392, 34)
Distribution of classes before balancing
{'malicious': 199749, 'benign': 727643}
Balancing now
[I] [01:50:29.858825] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(923229, 34)
Distribution of classes after balancing
{'benign': 723480, 'malicious': 199749}
Starting target encode on categorical columns


  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# DoHBrw, multiclass labels: the listed columns are target-encoded after balancing.
save_balanced_test_train(
    path=Path("data/DoHBrw/multiclass.parquet"),
    cat_features=[
        'SourceIP',
        'DestinationIP',
        'SourcePort',
        'DestinationPort',
        'DoH',
    ],
)

Shape of X_train before balancing:
(927392, 34)
Distribution of classes before balancing
{'benign': 727643, 'dns2tcp': 133854, 'dnscat2': 28661, 'iodine': 37234}
Balancing now
[I] [01:51:02.505382] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(923281, 34)
Distribution of classes after balancing
{'benign': 723532, 'dns2tcp': 133854, 'dnscat2': 28661, 'iodine': 37234}
Starting target encode on categorical columns


  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# CCD-INIDv1, binary labels: the listed columns are target-encoded after balancing.
save_balanced_test_train(
    path=Path("data/CCD-INIDv1/binary.parquet"),
    cat_features=[
        'expiration_id',
        'src_ip',
        'src_ip_is_private',
        'src_port',
        'dst_ip',
        'dst_ip_is_private',
        'dst_port',
        'protocol',
        'ip_version',
        'vlan_id',
        'application_name',
        'application_category_name',
        'application_is_guessed',
        'requested_server_name',
        'client_fingerprint',
        'splt_direction',
        'splt_ps',
        'splt_piat_ms',
    ],
)

Shape of X_train before balancing:
(73332, 81)
Distribution of classes before balancing
{'normal': 37785, 'attack': 35547}
Balancing now
[I] [01:51:32.224007] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(73332, 81)
Distribution of classes after balancing
{'attack': 35547, 'normal': 37785}
Starting target encode on categorical columns


  0%|          | 0/18 [00:00<?, ?it/s]

In [8]:
# CCD-INIDv1, multiclass labels: the label lives in `atk_type` here, so it is
# both the column dropped from the features and the target column.
save_balanced_test_train(
    path=Path("data/CCD-INIDv1/multiclass.parquet"),
    drop_cols=['atk_type'],
    target_col='atk_type',
    cat_features=[
        'expiration_id',
        'src_ip',
        'src_ip_is_private',
        'src_port',
        'dst_ip',
        'dst_ip_is_private',
        'dst_port',
        'protocol',
        'ip_version',
        'vlan_id',
        'application_name',
        'application_category_name',
        'application_is_guessed',
        'requested_server_name',
        'client_fingerprint',
        'splt_direction',
        'splt_ps',
        'splt_piat_ms',
    ],
)

Shape of X_train before balancing:
(73332, 81)
Distribution of classes before balancing
{'none': 37785, 'arp_dos': 9255, 'mitm': 6139, 'hydra_brute': 8454, 'udp_dos': 3935, 'slowloris': 7764}
Balancing now
[I] [01:51:34.374873] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(73332, 81)
Distribution of classes after balancing
{'arp_dos': 9255, 'hydra_brute': 8454, 'mitm': 6139, 'none': 37785, 'slowloris': 7764, 'udp_dos': 3935}
Starting target encode on categorical columns


  0%|          | 0/18 [00:00<?, ?it/s]