In [None]:
import pandas as pd
import sys
import os

sys.path.append(os.path.abspath(".."))
from common import common

In [35]:
# Base directory taken from the shared `common` module; the CTU-13 CSV path
# (`{base_path}/datasources/ctu13/sampled_data.csv`) is built relative to it.
base_path = common.base_path

In [36]:
def get_ctu13_df(n_samples=50000, random_state=None):
    """Load the CTU-13 sample CSV and return a random subset plus its metadata.

    Reads ``{base_path}/datasources/ctu13/sampled_data.csv``, prints the mode
    of the target column computed on the whole file, and draws a random
    sample of rows.

    Args:
        n_samples: Upper bound on the number of rows to return. The request
            is capped at the number of rows in the CSV, because
            ``DataFrame.sample`` raises ``ValueError`` when asked for more
            rows than exist (sampling without replacement). ``None`` returns
            every row, in shuffled order.
        random_state: Forwarded to ``DataFrame.sample``. Pass an int to make
            the draw reproducible; the default ``None`` keeps it random.

    Returns:
        A ``(df, main_labels, config)`` tuple where ``df`` is the sampled
        frame (original row index preserved), ``main_labels`` is the column
        index of the CSV and ``config`` is the dataset description dict
        defined below.
    """
    config = {
        'TARGET_COLUMN': 'Label',
        'NORMAL_TARGET': 0,

        # List of numerical columns (these are to be standardized)
        'NUMERICAL_COLUMNS': ['Dur', 'TotPkts', 'TotBytes', 'SrcBytes'],
        # List of categorical columns (these are to be one hot encoded)
        'CATEGORICAL_COLUMNS': [],
        # List of ordinal columns (these are to be label encoded)
        'ORDINAL_COLUMNS': [],
        # Integer code -> class name, and its inverse.
        'TARGET_DICT': {
            0: 'Other',
            1: 'Botnet',
            2: 'Normal'
        },
        'INV_TARGET_DICT': {
            'Other': 0,
            'Botnet': 1,
            'Normal': 2
        }
    }
    target_column = config['TARGET_COLUMN']
    all_df = pd.read_csv(f'{base_path}/datasources/ctu13/sampled_data.csv')

    # Headers of column
    main_labels = all_df.columns

    print('Normal class: ', all_df[target_column].mode())

    # Never request more rows than the file holds; sampling without
    # replacement would otherwise fail on small files.
    available = len(all_df)
    n_rows = available if n_samples is None else min(n_samples, available)
    sampled_df = all_df.sample(n=n_rows, random_state=random_state)
    return (sampled_df, main_labels, config)

In [None]:
def get_processed_ctu13_df():
    """Load the CTU-13 sample and convert it into a fully numeric feature frame.

    The frame returned by ``get_ctu13_df`` is transformed as follows:

    * columns whose name starts with ``Unnamed`` are discarded;
    * the columns listed in ``config['NUMERICAL_COLUMNS']`` are kept as-is;
    * ``Dir``, ``Proto``, ``sTos`` and ``dTos`` are one-hot encoded into a
      fixed set of indicator columns: one per known value plus a
      ``<column>_others`` flag for every other value (including missing ones);
    * ``Label`` is replaced by an integer code from
      ``config['INV_TARGET_DICT']``: ``Botnet`` if the text contains
      ``'Botnet'``, otherwise ``Normal`` if it contains ``'Normal'``,
      otherwise ``Other``.

    The output schema is fixed: every indicator column is always present,
    even when a category does not occur in the sample.

    Returns:
        A ``(df, main_labels, config)`` tuple: the processed frame, its
        column index, and the config dict from ``get_ctu13_df``.
    """
    df, _, config = get_ctu13_df()

    # Discard columns whose name starts with 'Unnamed'.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    # Categorical column -> values that get their own indicator column.
    # Everything else (including NaN) is counted in '<column>_others'.
    one_hot_values = [
        ('Dir', ['   ->', '  <->']),
        ('Proto', ['icmp', 'tcp', 'udp']),
        ('sTos', [0.0, 9.0]),
        ('dTos', [0.0, 9.0]),
    ]

    encoded = df[config['NUMERICAL_COLUMNS']].copy()

    # Indicators are computed from the RAW column values. Replacing Proto/Dir
    # by integer codes first would make every string comparison below fail
    # and collapse the whole column into '<column>_others'.
    for column, known_values in one_hot_values:
        raw_values = df[column]
        for value in known_values:
            encoded[f'{column}_{value}'] = (raw_values == value).astype(int)
        encoded[f'{column}_others'] = (~raw_values.isin(known_values)).astype(int)

    # Label text -> integer class. 'Botnet' is applied last so that it wins
    # when both markers appear; na=False sends missing labels to 'Other'.
    codes = config['INV_TARGET_DICT']
    label_text = df['Label'].astype(str)
    is_botnet = label_text.str.contains('Botnet', regex=False, na=False)
    is_normal = label_text.str.contains('Normal', regex=False, na=False)
    encoded['Label'] = (
        pd.Series(codes['Other'], index=df.index)
        .mask(is_normal, codes['Normal'])
        .mask(is_botnet, codes['Botnet'])
    )

    main_labels = encoded.columns
    print('main_labels', main_labels)

    return (encoded, main_labels, config)