In [4]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

sys.path.append(os.path.abspath(".."))
from common import common

In [5]:
# Root directory taken from the project-local `common` module; get_tii_ssrc_df()
# builds the path of the sampled CSV relative to it.
base_path = common.base_path

In [6]:
def get_tii_ssrc_df():
    """Read the sampled TII-SSRC CSV and return it with its headers and config.

    Returns:
        tuple: ``(frame, column_labels, config)``. ``frame`` is the CSV exactly
        as loaded, ``column_labels`` is its column index, and ``config`` holds
        the target column name together with the (initially empty) lists of
        numerical, categorical and ordinal columns.
    """
    raw_df = pd.read_csv(f'{base_path}/datasources/tii-ssrc/sampled_data.csv')

    config = dict(
        TARGET_COLUMN='Traffic Type',
        # Columns to be standardized.
        NUMERICAL_COLUMNS=[],
        # Columns to be one hot encoded.
        CATEGORICAL_COLUMNS=[],
        # Columns to be label encoded.
        ORDINAL_COLUMNS=[],
    )

    # Column headers of the raw frame.
    column_labels = raw_df.columns

    # Reports the most frequent value(s) of the target column.
    print('Normal class: ', raw_df[config['TARGET_COLUMN']].mode())
    return (raw_df, column_labels, config)

In [9]:
def get_processed_tii_ssrc_df():
    """Load the TII-SSRC sample, clean it and label-encode the target column.

    Steps: drop flow-identifier columns, round to 3 decimals and drop duplicate
    rows, drop the 'Label' / 'Traffic Subtype' columns, fit (but do not apply)
    a scaling / one-hot preprocessor on the feature columns, then run the
    project's label / one-hot encoders driven by ``config``.

    Returns:
        tuple: ``(all_df, main_labels, config)`` where ``all_df`` is the
        processed frame, ``main_labels`` is the list of its column names and
        ``config`` is extended with ``TARGET_DICT`` (encoded index -> class
        name), ``INV_TARGET_DICT`` (class name -> encoded index) and
        ``NORMAL_TARGET``.

    Raises:
        KeyError: if 'Bruteforce' is not one of the encoded target classes.
    """
    all_df, main_labels, config = get_tii_ssrc_df()
    target_column = config['TARGET_COLUMN']

    # Flow identifiers and timestamps are removed before any further processing.
    DROP_COLUMNS = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp']
    all_df = all_df.drop(columns=DROP_COLUMNS)

    # Duplicates are removed while 'Label' and 'Traffic Subtype' are still
    # present, so two rows only collapse when those columns match as well.
    all_df = all_df.round(3)
    all_df = all_df.drop_duplicates()
    all_df = all_df.drop(columns=['Label', 'Traffic Subtype'])

    # The target must never be part of the feature lists: it is an object
    # column at this point and would otherwise be one-hot encoded as a feature.
    numerical_cols = [
        col for col in all_df.select_dtypes(include=[np.number]).columns
        if col != target_column
    ]
    # 'Protocol' is stored as a number but is treated as a category.
    numerical_cols.remove('Protocol')
    print('numerical_cols', numerical_cols)
    categorical_cols = [
        col for col in all_df.select_dtypes(include=[object]).columns
        if col != target_column
    ]
    categorical_cols.append('Protocol')
    print('categorical_cols', categorical_cols)

    # Pipelines for Numerical and Categorical Data Transformations
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Column Transformer combining both pipelines
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Only fits the preprocessor; the returned frame is not transformed by it.
    # NOTE(review): the fitted preprocessor is neither applied nor returned —
    # confirm whether it should be used or the fit removed.
    preprocessor.fit(all_df)

    # Label-encode the target; `le.classes_` provides the class names in
    # encoded order.
    le, all_df = common.label_encode(all_df, [target_column])

    config['TARGET_DICT'] = dict(enumerate(le.classes_))
    config['INV_TARGET_DICT'] = {v: k for k, v in config['TARGET_DICT'].items()}
    print('TARGET_DICT', config['TARGET_DICT'])

    # Kept under its own name so the target encoder `le` is not shadowed.
    ordinal_le, all_df = common.label_encode(all_df, config['ORDINAL_COLUMNS'])

    # NOTE(review): 'Bruteforce' is used as the "normal" class here — confirm
    # this is the intended reference class.
    config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['Bruteforce']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # One Hot Encoder (driven by config['CATEGORICAL_COLUMNS'], not by the
    # categorical_cols list computed above).
    ohe, all_df = common.one_hot_encode(all_df, config['CATEGORICAL_COLUMNS'])

    main_labels = list(all_df.columns)
    print('main_labels', main_labels)

    return (all_df, main_labels, config)

In [10]:
# Run the full preprocessing once: processed frame, its column names, and config.
df, m, c = get_processed_tii_ssrc_df()

Normal class:  0    DoS
Name: Traffic Type, dtype: object
numerical_cols ['Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG F

KeyError: 'Deny'

In [12]:
# Step-by-step copy of the first half of get_processed_tii_ssrc_df(), kept at
# module level so the intermediate frame can be inspected in the next cells.
all_df, main_labels, config = get_tii_ssrc_df()
target_column = config['TARGET_COLUMN']

# Flow identifiers and timestamps are removed before any further processing.
DROP_COLUMNS = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp']
all_df = all_df.drop(columns=DROP_COLUMNS)

# Duplicates are removed while 'Label' and 'Traffic Subtype' are still present,
# so two rows only collapse when those columns match as well.
all_df = all_df.round(3)
all_df = all_df.drop_duplicates()
all_df = all_df.drop(columns=['Label', 'Traffic Subtype'])

# The target must never be part of the feature lists: it is an object column at
# this point and would otherwise be one-hot encoded as a feature.
numerical_cols = [
    col for col in all_df.select_dtypes(include=[np.number]).columns
    if col != target_column
]
# 'Protocol' is stored as a number but is treated as a category.
numerical_cols.remove('Protocol')
print('numerical_cols', numerical_cols)
categorical_cols = [
    col for col in all_df.select_dtypes(include=[object]).columns
    if col != target_column
]
categorical_cols.append('Protocol')
print('categorical_cols', categorical_cols)

# Pipelines for Numerical and Categorical Data Transformations
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Column Transformer combining both pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Only fits the preprocessor; all_df itself is not transformed by it.
preprocessor.fit(all_df)

# Label-encode the target; `le` is read by a later cell to build TARGET_DICT.
le, all_df = common.label_encode(all_df, [target_column])

Normal class:  0    DoS
Name: Traffic Type, dtype: object
numerical_cols ['Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG F

In [13]:
# Inspect the frame after de-duplication and after the target column has been
# passed through common.label_encode.
all_df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Traffic Type
0,6.0,18921.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000,0.000,...,20.0,0.0,0.000,0.0,0.0,0.000000e+00,0.000,0.0,0.0,4
1,6.0,3728459.0,2.0,0.0,100.0,0.0,50.0,50.0,50.000,0.000,...,20.0,0.0,0.000,0.0,0.0,0.000000e+00,0.000,0.0,0.0,3
2,6.0,2430207.0,2.0,1.0,100.0,0.0,50.0,50.0,50.000,0.000,...,20.0,0.0,0.000,0.0,0.0,0.000000e+00,0.000,0.0,0.0,3
3,6.0,14657232.0,2.0,1.0,100.0,0.0,50.0,50.0,50.000,0.000,...,20.0,0.0,0.000,0.0,0.0,1.462543e+07,0.000,14625427.0,14625427.0,3
4,6.0,10131029.0,2.0,0.0,2712.0,0.0,1356.0,1356.0,1356.000,0.000,...,20.0,0.0,0.000,0.0,0.0,1.013103e+07,0.000,10131029.0,10131029.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342238,6.0,1754144.0,2.0,1.0,60.0,0.0,30.0,30.0,30.000,0.000,...,40.0,0.0,0.000,0.0,0.0,0.000000e+00,0.000,0.0,0.0,3
1342239,6.0,30093094.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,...,20.0,0.0,0.000,0.0,0.0,3.009309e+07,0.000,30093094.0,30093094.0,4
1342240,6.0,12165313.0,2.0,1.0,1000.0,0.0,500.0,500.0,500.000,0.000,...,20.0,0.0,0.000,0.0,0.0,1.202213e+07,0.000,12022128.0,12022128.0,3
1342242,17.0,37036295.0,7.0,0.0,2150.0,0.0,500.0,50.0,307.143,240.535,...,8.0,3353826.5,3756054.348,6009758.0,697895.0,1.010955e+07,1739275.006,12111202.0,8966905.0,3


In [18]:
# Encoded index -> class name, in the order exposed by the target encoder `le`.
config['TARGET_DICT'] = dict(enumerate(le.classes_))
config['INV_TARGET_DICT'] = {v: k for k, v in config['TARGET_DICT'].items()}
print('TARGET_DICT', config['TARGET_DICT'])

# Bound to its own name so `le` keeps pointing at the target encoder; rebinding
# `le` here would make a re-run of this cell build TARGET_DICT from the wrong
# encoder.
ordinal_le, all_df = common.label_encode(all_df, config['ORDINAL_COLUMNS'])

# NOTE(review): 'Bruteforce' is used as the "normal" class here — confirm this
# is the intended reference class.
config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['Bruteforce']
print('NORMAL_TARGET', config['NORMAL_TARGET'])

# One Hot Encoder
ohe, all_df = common.one_hot_encode(all_df, config['CATEGORICAL_COLUMNS'])

main_labels = list(all_df.columns)
print('main_labels', main_labels)

TARGET_DICT {0: 'Audio', 1: 'Background', 2: 'Bruteforce', 3: 'DoS', 4: 'Information Gathering', 5: 'Mirai', 6: 'Text', 7: 'Video'}
NORMAL_TARGET 2
main_labels ['Protocol', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance