In [4]:
import numpy as np
import pandas as pd
import sys
import os
import re
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

sys.path.append(os.path.abspath(".."))
from common import common

In [5]:
# Path prefix shared by the project, read from the local `common` module;
# get_botnet_df looks for its CSV files under f'{base_path}/datasources/botnet/'.
base_path = common.base_path

In [6]:
def get_botnet_df(sample_size=50000, random_state=None, data_dir=None):
    """Load the reduced botnet CSV files and return a random sample of their rows.

    Walks ``data_dir`` recursively, reads every file whose name matches
    ``reduced_*.csv`` and concatenates the results into a single frame with a
    fresh integer index.

    Parameters
    ----------
    sample_size : int or None, default 50000
        Maximum number of rows to return. It is capped at the number of rows
        actually loaded, so small datasets no longer raise ``ValueError``.
        ``None`` returns every loaded row without sampling.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``; pass an integer for a reproducible
        sample. ``None`` keeps the original unseeded behaviour.
    data_dir : str or None, default None
        Directory to search. ``None`` means ``f'{base_path}/datasources/botnet/'``.

    Returns
    -------
    tuple
        ``(sampled_df, main_labels, config)`` where ``main_labels`` is the
        column index of the loaded frame and ``config`` is the column-role
        dictionary defined below.

    Raises
    ------
    FileNotFoundError
        If no ``reduced_*.csv`` file is found under ``data_dir``.
    """
    config = {
        'TARGET_COLUMN': 'category',

        # List of numerical columns (these are to be standardized)
        'NUMERICAL_COLUMNS': ['pkSeqID', 'flgs_number', 'proto_number', 'pkts', 'bytes', 'state_number', 'seq', 'spkts', 'dpkts',
                              'sbytes', 'dbytes', 'TnBPSrcIP', 'TnBPDstIP', 'TnP_PSrcIP', 'TnP_PDstIP', 'TnP_PerProto', 'TnP_Per_Dport',
                              'N_IN_Conn_P_DstIP', 'N_IN_Conn_P_SrcIP', 'Pkts_P_State_P_Protocol_P_DestIP', 'Pkts_P_State_P_Protocol_P_SrcIP',
                              'stime', 'ltime', 'dur', 'mean', 'stddev', 'sum', 'min', 'max', 'rate', 'srate', 'drate',
                              'AR_P_Proto_P_SrcIP', 'AR_P_Proto_P_DstIP', 'AR_P_Proto_P_Sport', 'AR_P_Proto_P_Dport'],
        # List of categorical columns (these are to be one hot encoded)
        'CATEGORICAL_COLUMNS': ['flgs', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'state'],
        # List of ordinal columns (these are to be label encoded)
        'ORDINAL_COLUMNS': [],
    }
    target_column = config['TARGET_COLUMN']

    if data_dir is None:
        data_dir = f'{base_path}/datasources/botnet/'

    # Collect the per-file frames and concatenate once: calling pd.concat
    # inside the loop re-copies every previously loaded row on each iteration.
    frames = []
    for dirname, dirnames, filenames in os.walk(data_dir):
        # os.walk yields entries in an OS-dependent order; sorting (in place for
        # the sub-directories) makes the row order stable across machines.
        dirnames.sort()
        for filename in sorted(filenames):
            if re.match(r'reduced_.*\.csv$', filename):
                csv_path = os.path.join(dirname, filename)
                print(csv_path)
                frames.append(pd.read_csv(csv_path, low_memory=False))

    if not frames:
        # Fail with an explicit message instead of a KeyError on the target column.
        raise FileNotFoundError(f'No reduced_*.csv files found under {data_dir}')

    all_df = pd.concat(frames, axis=0, ignore_index=True)

    # Headers of column
    main_labels = all_df.columns

    # Reports the most frequent value(s) of the target column.
    print('Normal class: ', all_df[target_column].mode())

    if sample_size is None:
        return (all_df, main_labels, config)

    # DataFrame.sample raises when n exceeds the number of rows, so cap it.
    n_rows = min(sample_size, len(all_df))
    return (all_df.sample(n=n_rows, random_state=random_state), main_labels, config)

In [7]:
def get_processed_botnet_df():
    """Load the botnet sample and encode it.

    Steps, in order:
      1. fetch the sampled frame and config from ``get_botnet_df``;
      2. drop the ``attack`` and ``subcategory`` columns;
      3. label-encode the target column via ``common.label_encode`` and store
         the id -> label and label -> id mappings in ``config``;
      4. record the encoded id of ``'DDoS'`` as ``config['NORMAL_TARGET']``;
      5. one-hot encode ``config['CATEGORICAL_COLUMNS']`` via
         ``common.one_hot_encode``.

    Returns
    -------
    tuple
        ``(encoded_df, main_labels, config)`` where ``main_labels`` is the
        column index of the encoded frame.
    """
    # The column labels returned by the loader are recomputed after encoding,
    # so they are not needed here.
    botnet_df, _, config = get_botnet_df()
    label_col = config['TARGET_COLUMN']
    print(botnet_df.shape)

    # Drop column
    botnet_df = botnet_df.drop(columns='attack').drop(columns='subcategory')

    # Label Encoder
    le, botnet_df = common.label_encode(botnet_df, [label_col])

    id_to_label = dict(enumerate(le.classes_))
    label_to_id = {label: idx for idx, label in id_to_label.items()}
    config['TARGET_DICT'] = id_to_label
    config['INV_TARGET_DICT'] = label_to_id
    print('TARGET_DICT', config['TARGET_DICT'])

    # 'DDoS' is hard-coded as the "normal" class here.
    # NOTE(review): presumably because it is the majority class reported by
    # get_botnet_df -- confirm this is intended rather than the 'Normal' label.
    config['NORMAL_TARGET'] = label_to_id['DDoS']
    print('NORMAL_TARGET', config['NORMAL_TARGET'])

    # One Hot Encoder
    print(config)
    ohe, botnet_df = common.one_hot_encode(botnet_df, config['CATEGORICAL_COLUMNS'])

    encoded_labels = botnet_df.columns
    print('main_labels', encoded_labels)

    return (botnet_df, encoded_labels, config)

In [None]:
# Run the full load-and-encode pipeline: df is the encoded frame, m its column
# labels and c the config dict returned by get_processed_botnet_df.
df,m,c = get_processed_botnet_df()

/Users/suyeetan/Downloads/CS5344_Project/work//datasources/botnet/reduced_data_1.csv
/Users/suyeetan/Downloads/CS5344_Project/work//datasources/botnet/reduced_data_2.csv
/Users/suyeetan/Downloads/CS5344_Project/work//datasources/botnet/reduced_data_3.csv
/Users/suyeetan/Downloads/CS5344_Project/work//datasources/botnet/reduced_data_4.csv
Normal class:  0    DDoS
Name: category, dtype: object
(50000, 46)
TARGET_DICT {0: 'DDoS', 1: 'DoS', 2: 'Normal', 3: 'Reconnaissance', 4: 'Theft'}
NORMAL_TARGET 0
{'TARGET_COLUMN': 'category', 'NUMERICAL_COLUMNS': ['pkSeqID', 'flgs_number', 'proto_number', 'pkts', 'bytes', 'state_number', 'seq', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'TnBPSrcIP', 'TnBPDstIP', 'TnP_PSrcIP', 'TnP_PDstIP', 'TnP_PerProto', 'TnP_Per_Dport', 'N_IN_Conn_P_DstIP', 'N_IN_Conn_P_SrcIP', 'Pkts_P_State_P_Protocol_P_DestIP', 'Pkts_P_State_P_Protocol_P_SrcIP', 'stime', 'ltime', 'dur', 'mean', 'stddev', 'sum', 'min', 'max', 'rate', 'srate', 'drate', 'AR_P_Proto_P_SrcIP', 'AR_P_Proto_P