In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from cuml.neighbors import NearestNeighbors
from imblearn.over_sampling import BorderlineSMOTE, SMOTENC
from collections import Counter

In [61]:
def save_balanced_test_train(
    path,
    drop_cols=['traffic_type'],
    target_col="traffic_type",
    smote=None,
    test_size=0.2,
    random_state=42,
):
    """Split a parquet dataset, oversample the training part and save both parts.

    The file at ``path`` is read, split into stratified train/test subsets,
    the training subset is resampled with ``smote`` and the results are
    written next to the input as ``<stem>_train.parquet`` (balanced) and
    ``<stem>_test.parquet`` (untouched hold-out). Shapes and class counts
    before and after resampling are printed.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the input parquet file.
    drop_cols : str or list-like of column labels
        Columns excluded from the feature matrix. ``target_col`` is always
        excluded as well, whether or not it is listed here.
    target_col : column label
        Column holding the class labels.
    smote : sampler exposing ``fit_resample(X, y)``, optional
        Resampler applied to the training subset only. When omitted, a fresh
        ``BorderlineSMOTE`` is created for this call, using
        ``NearestNeighbors(n_neighbors=5)`` for ``k_neighbors``,
        ``NearestNeighbors(n_neighbors=10)`` for ``m_neighbors`` and
        ``random_state`` as its seed.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int or None
        Seed for the split and for the default sampler. A sampler passed in
        explicitly keeps whatever seed it was constructed with.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the loaded frame (or, from
        pandas, if an entry of ``drop_cols`` is missing).
    """
    path = Path(path)

    # A bare string would otherwise be iterated character by character.
    if isinstance(drop_cols, str):
        drop_cols = [drop_cols]
    # Work on a copy (the default list is shared between calls) and make sure
    # the label never stays among the features; dict.fromkeys de-duplicates
    # while keeping order.
    excluded_cols = list(dict.fromkeys([*drop_cols, target_col]))

    # Build the default sampler per call instead of once at definition time,
    # so that fitted state is not carried over between datasets.
    if smote is None:
        smote = BorderlineSMOTE(
            random_state=random_state,
            k_neighbors=NearestNeighbors(n_neighbors=5),
            m_neighbors=NearestNeighbors(n_neighbors=10),
        )

    df = pd.read_parquet(path)
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in {path}")
    y = df[target_col]
    X = df.drop(excluded_cols, axis=1)
    del df  # release the full frame before the memory-heavy steps below

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    del X, y

    print("Shape of X_train before balancing:")
    print(X_train.shape)

    print("Distribution of classes before balancing")
    print(dict(Counter(y_train)))

    print("Balancing now")
    # Only the training subset is resampled; the test subset stays as drawn.
    X_balanced, y_balanced = smote.fit_resample(X_train, y_train)
    del X_train, y_train

    print("Shape of X_train after balancing:")
    print(X_balanced.shape)

    print("Distribution of classes after balancing")
    print(dict(Counter(y_balanced)))

    # NOTE(review): the column-wise concat assumes the sampler returns pandas
    # objects whose indexes line up (pandas in, pandas out) - confirm when
    # plugging in a different sampler.
    pd.concat([X_balanced, y_balanced], axis=1).to_parquet(path.with_name(path.stem + "_train.parquet"))
    del X_balanced, y_balanced
    pd.concat([X_test, y_test], axis=1).to_parquet(path.with_name(path.stem + "_test.parquet"))
    del X_test, y_test

In [3]:
# BaIoT danmini, multiclass labels: rely on the function's default
# drop/target column and default sampler.
save_balanced_test_train(
    path=Path("data/BaIoT/danmini_multiclass.parquet"),
)

Shape of X_train before balancing:
(814638, 115)
Distribution of classes before balancing
{'bl_junk': 23255, 'mirai_ack': 81756, 'bl_udp': 84699, 'bl_combo': 47774, 'mirai_udp': 190132, 'benign': 39638, 'mirai_syn': 98058, 'mirai_scan': 86148, 'mirai_udpplain': 65586, 'bl_scan': 23879, 'bl_tcp': 73713}
Balancing now
[I] [19:47:06.609649] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [19:47:06.609981] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(2091452, 115)
Distribution of classes after balancing
{'bl_junk': 190132, 'mirai_ack': 190132, 'bl_udp': 190132, 'bl_combo': 190132, 'mirai_udp': 190132, 'benign': 190132, 'mirai_syn': 190132, 'mirai_scan': 190132, 'mirai_udpplain': 190132, 'bl_scan': 190132, 'bl_tcp': 190132}


In [4]:
# BaIoT danmini, binary labels: rely on the function's default
# drop/target column and default sampler.
save_balanced_test_train(
    path=Path("data/BaIoT/danmini_binary.parquet"),
)

Shape of X_train before balancing:
(814638, 115)
Distribution of classes before balancing
{'malicious': 775000, 'benign': 39638}
Balancing now
[I] [19:48:41.661612] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [19:48:41.662867] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(1550000, 115)
Distribution of classes after balancing
{'malicious': 775000, 'benign': 775000}


In [5]:
# DoHBrw, binary labels. SMOTENC is used because feature column 33 (a
# positional index into the feature matrix) is declared categorical.
# random_state seeds SMOTENC's own sampling so that the saved train file
# does not change from run to run.
save_balanced_test_train(
    Path("data/DoHBrw/binary.parquet"),
    smote=SMOTENC(
        categorical_features=[33],
        k_neighbors=NearestNeighbors(n_neighbors=5),
        random_state=42,
    ),
)

Shape of X_train before balancing:
(927392, 34)
Distribution of classes before balancing
{'malicious': 199749, 'benign': 727643}
Balancing now
[I] [19:49:02.972501] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [19:49:04.391752] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(1455286, 34)
Distribution of classes after balancing
{'malicious': 727643, 'benign': 727643}


In [7]:
# DoHBrw, multiclass labels. SMOTENC is used because feature column 33 (a
# positional index into the feature matrix) is declared categorical.
# random_state seeds SMOTENC's own sampling so that the saved train file
# does not change from run to run.
save_balanced_test_train(
    Path("data/DoHBrw/multiclass.parquet"),
    smote=SMOTENC(
        categorical_features=[33],
        k_neighbors=NearestNeighbors(n_neighbors=5),
        random_state=42,
    ),
)

Shape of X_train before balancing:
(927392, 34)
Distribution of classes before balancing
{'benign': 727643, 'dns2tcp': 133854, 'dnscat2': 28661, 'iodine': 37234}
Balancing now
[I] [20:33:34.534668] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:33:35.778197] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(2910572, 34)
Distribution of classes after balancing
{'benign': 727643, 'dns2tcp': 727643, 'dnscat2': 727643, 'iodine': 727643}


In [59]:
# CCD-INIDv1, binary labels (default 'traffic_type' target). The listed
# positions are the feature columns SMOTENC must treat as categorical.
# random_state seeds SMOTENC's own sampling so that the saved train file
# does not change from run to run.
save_balanced_test_train(
    Path("data/CCD-INIDv1/binary.parquet"),
    smote=SMOTENC(
        categorical_features=[0, 2, 5, 7, 8, 9, 76, 77, 78, 79, 80],
        k_neighbors=NearestNeighbors(n_neighbors=5),
        random_state=42,
    ),
)

Shape of X_train before balancing:
(73332, 81)
Distribution of classes before balancing
{1: 37785, 0: 35547}
Balancing now
[I] [20:47:53.978307] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:47:54.203360] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(75570, 81)
Distribution of classes after balancing
{1: 37785, 0: 37785}


In [62]:
# CCD-INIDv1, multiclass labels stored in 'atk_type'. The listed positions
# are the feature columns SMOTENC must treat as categorical.
# random_state seeds SMOTENC's own sampling so that the saved train file
# does not change from run to run.
save_balanced_test_train(
    Path("data/CCD-INIDv1/multiclass.parquet"),
    drop_cols=['atk_type'],
    target_col='atk_type',
    smote=SMOTENC(
        categorical_features=[0, 2, 5, 7, 8, 9, 76, 77, 78, 79, 80],
        k_neighbors=NearestNeighbors(n_neighbors=5),
        random_state=42,
    ),
)

Shape of X_train before balancing:
(73332, 81)
Distribution of classes before balancing
{3: 37785, 0: 9255, 2: 6139, 1: 8454, 5: 3935, 4: 7764}
Balancing now
[I] [20:48:22.772535] Unused keyword parameter: n_jobs during cuML estimator initialization
[I] [20:48:22.986860] Unused keyword parameter: n_jobs during cuML estimator initialization
Shape of X_train after balancing:
(226710, 81)
Distribution of classes after balancing
{3: 37785, 0: 37785, 2: 37785, 1: 37785, 5: 37785, 4: 37785}
