In [None]:
import numpy as np
import pandas as pd
import sys
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

sys.path.append(os.path.abspath(".."))
from common import common

In [5]:
# Path prefix supplied by the project-local `common` module; the CSV loader
# below looks up `datasources/tii-ssrc/sampled_data.csv` underneath it.
base_path = common.base_path

In [6]:
def get_tii_ssrc_df():
    """Read the sampled TII-SSRC CSV and return it with its columns and a config.

    The file is read from ``{base_path}/datasources/tii-ssrc/sampled_data.csv``.
    As a side effect the mode of the target column is printed.

    Returns:
        tuple: ``(frame, columns, config)`` where ``frame`` is the DataFrame as
        read from disk, ``columns`` is ``frame.columns`` and ``config`` is a
        new dict holding the target column name and three empty column lists.
    """
    raw_frame = pd.read_csv(f'{base_path}/datasources/tii-ssrc/sampled_data.csv')
    column_index = raw_frame.columns

    settings = {
        'TARGET_COLUMN': 'Traffic Type',
        # Numerical columns (meant to be standardized)
        'NUMERICAL_COLUMNS': [],
        # Categorical columns (meant to be one hot encoded)
        'CATEGORICAL_COLUMNS': [],
        # Ordinal columns (meant to be label encoded)
        'ORDINAL_COLUMNS': [],
    }

    # The most frequent target value is reported as the "normal" class.
    print('Normal class: ', raw_frame[settings['TARGET_COLUMN']].mode())
    return raw_frame, column_index, settings

In [9]:
def get_processed_tii_ssrc_df():
    """Load the TII-SSRC sample, prune columns, dedupe and encode the target.

    Steps, in order:
      1. Load the frame and config via ``get_tii_ssrc_df``.
      2. Drop the flow-identifier columns, round to 3 decimals, drop duplicate
         rows, then drop ``Label`` and ``Traffic Subtype``.
      3. Build numerical / categorical column lists (``Protocol`` is moved from
         the numerical list to the categorical one) and fit a
         ``ColumnTransformer`` on the frame.
      4. Label-encode the target column through ``common.label_encode`` and
         record the index<->label mappings and ``NORMAL_TARGET`` in the config.
      5. Run ``common.label_encode`` / ``common.one_hot_encode`` on the
         (possibly empty) ``ORDINAL_COLUMNS`` / ``CATEGORICAL_COLUMNS`` lists.

    Several diagnostic lines are printed along the way.

    Returns:
        tuple: ``(frame, main_labels, config)`` where ``main_labels`` is the
        list of the final frame's column names and ``config`` additionally
        carries ``TARGET_DICT``, ``INV_TARGET_DICT`` and ``NORMAL_TARGET``.

    Raises:
        ValueError: if ``Protocol`` is not among the numeric columns.
        KeyError: if ``'Bruteforce'`` is not one of the encoded target classes.

    NOTE(review): the fitted ColumnTransformer is neither applied to the frame
    nor returned, so the returned data is not imputed/scaled by it - confirm
    whether a ``transform`` step is missing or the fit can be removed.
    """
    frame, _, config = get_tii_ssrc_df()
    label_column = config['TARGET_COLUMN']

    # Column pruning and de-duplication. Rounding happens first so rows that
    # differ only beyond 3 decimals collapse; 'Label' and 'Traffic Subtype'
    # are still present while duplicates are detected and are removed after.
    identifier_columns = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp']
    frame = (
        frame.drop(columns=identifier_columns)
        .round(3)
        .drop_duplicates()
        .drop(columns=['Label', 'Traffic Subtype'])
    )

    # 'Protocol' is numeric in the data but handled as a category.
    numeric_features = frame.select_dtypes(include=[np.number]).columns.to_list()
    numeric_features.remove('Protocol')
    print('numerical_cols', numeric_features)
    # NOTE(review): the target column has not been encoded yet at this point,
    # so if it is object-typed it ends up in this list - confirm intended.
    categorical_features = [*frame.select_dtypes(include=[object]).columns, 'Protocol']
    print('categorical_cols', categorical_features)

    # Mean-impute then standardize numerical features.
    scale_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    # Mode-impute then one-hot encode categorical features.
    encode_categorical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    column_preprocessor = ColumnTransformer(
        transformers=[
            ('num', scale_numeric, numeric_features),
            ('cat', encode_categorical, categorical_features),
        ]
    )
    # Only fitted here; no transform is performed on the frame.
    column_preprocessor.fit(frame)

    # Encode the target and remember both directions of the mapping.
    target_encoder, frame = common.label_encode(frame, [label_column])
    index_to_label = dict(enumerate(target_encoder.classes_))
    config['TARGET_DICT'] = index_to_label
    config['INV_TARGET_DICT'] = {label: index for index, label in index_to_label.items()}
    print('TARGET_DICT', config['TARGET_DICT'])

    # Ordinal columns come from the config (the loader leaves the list empty).
    _, frame = common.label_encode(frame, config['ORDINAL_COLUMNS'])

    # NOTE(review): 'Bruteforce' is hard-coded as the normal class - confirm
    # it matches the mode reported by the loader.
    config['NORMAL_TARGET'] = config['INV_TARGET_DICT']['Bruteforce']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # Categorical columns also come from the config list, not from the
    # dtype-derived list above.
    _, frame = common.one_hot_encode(frame, config['CATEGORICAL_COLUMNS'])

    main_labels = list(frame.columns)
    print('main_labels', main_labels)

    return frame, main_labels, config