In [34]:
import numpy as np
import pandas as pd
import sys
import os
import re
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

sys.path.append(os.path.abspath(".."))
from common import common

In [35]:
base_path = common.base_path

In [36]:
def get_ctu13_df():
    config = {
        'TARGET_COLUMN': 'Label',
        'NORMAL_TARGET': 0,
        
        # List of numerical columns (these are to be standardized)
        'NUMERICAL_COLUMNS': ['Dur', 'TotPkts', 'TotBytes', 'SrcBytes'],
        # List of categorical columns (these are to be one hot encoded)
        'CATEGORICAL_COLUMNS': [],
        # List of ordinal columns (these are to be label encoded)
        'ORDINAL_COLUMNS': [],
        'TARGET_DICT': {
            0: 'Other', 
            1: 'Botnet',
            2: 'Normal'
        },
        'INV_TARGET_DICT': {
            'Other': 0, 
            'Botnet': 1,
            'Normal': 2
        }
    }
    target_column = config['TARGET_COLUMN']
    all_df = pd.read_csv(f'{base_path}/datasources/ctu13/sampled_data.csv')
            
    # Headers of column
    main_labels = all_df.columns
    
    print('Normal class: ', all_df[target_column].mode())
    return (all_df.sample(50000), main_labels, config)

In [37]:
def get_processed_ctu13_df():
    df, main_labels, config = get_ctu13_df()
    # print('main_labels', main_labels)
    target_column = config['TARGET_COLUMN']

    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

    lst=[]
    for i in df['Label']:
        if 'Botnet' in i:
            lst.append(1)
        elif 'Normal' in i:
            lst.append(2)
        else:
            lst.append(0)
    df['Label'] = lst

    protocol_number = []
    for i in df['Proto']:
        if i == 'udp':
            protocol_number.append(17)
        elif i == 'tcp':
            protocol_number.append(6)
        elif i == 'icmp':
            protocol_number.append(1)
        else:
            protocol_number.append(0)
    
    df['Proto'] = protocol_number

    direction_number = []
    for i in df['Dir']:
        if i == '  <->':
            direction_number.append(1)
        elif i == '   ->':
            direction_number.append(2)
        else:
            direction_number.append(0)
    
    df['Dir'] = direction_number

    one_hot_lst = ['Dir','Proto','sTos','dTos']
    column_list = ['Dur','TotPkts','TotBytes','SrcBytes',
                   'Dir_   ->','Dir_  <->','Dir_others',
    	             'Proto_icmp','Proto_tcp','Proto_udp','Proto_others',
                   'sTos_0.0','sTos_9.0','sTos_others','dTos_0.0','dTos_9.0','dTos_others','Label']
    
    def one_hot_encoding(df):
        df_new = pd.DataFrame()
        other_list = []
        for i in one_hot_lst:
            if i == 'Dir':
                for j in df[i]:
                    if j == '  <->' or j == '   ->':
                        other_list.append(0)
                    else:
                        other_list.append(1)
                other_name = 'Dir_others'
                df[other_name] = other_list
                other_list.clear()
                df_new = pd.get_dummies(df, columns=[i])
            elif i == 'Proto':
                for j in df[i]:
                    if j == 'tcp' or j == 'udp' or j == 'icmp':
                        other_list.append(0)
                    else:
                        other_list.append(1)
                other_name = 'Proto_others'
                df_new[other_name] = other_list
                other_list.clear()
                df_new = pd.get_dummies(df_new, columns=[i])
            elif i == 'sTos':
                for j in df[i]:
                    if j == 0 or j == 9:
                        other_list.append(0)
                    else:
                        other_list.append(1)
                other_name = 'sTos_others'
                df_new[other_name] = other_list
                other_list.clear()
                df_new = pd.get_dummies(df_new, columns=[i])
            elif i == 'dTos':
                for j in df[i]:
                    if j == 0 or j == 9:
                        other_list.append(0)
                    else:
                        other_list.append(1)
                other_name = 'dTos_others'
                df_new[other_name] = other_list
                other_list.clear()
                df_new = pd.get_dummies(df_new, columns=[i])
        for col in df_new.columns:
            if col not in column_list:
                df_new = df_new.drop([col], axis=1)
        return df_new
    
    df = one_hot_encoding(df)
    
    main_labels = df.columns
    print('main_labels', main_labels)
    
    return (df, main_labels, config)

In [31]:
# df,m,c = get_processed_ctu13_df()

Normal class:  0    flow=To-Background-UDP-CVUT-DNS-Server
Name: Label, dtype: object
main_labels Index(['Dur', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label', 'Dir_others',
       'Proto_others', 'sTos_others', 'sTos_0.0', 'dTos_others', 'dTos_0.0'],
      dtype='object')


In [33]:
# df['Label'].value_counts()

Label
0    48029
1     1080
2      891
Name: count, dtype: int64

In [25]:
# df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [26]:
# lst=[]
# for i in df['Label']:
#     if 'Botnet' in i:
#         lst.append(1)
#     elif 'Normal' in i:
#         lst.append(2)
#     else:
#         lst.append(0)
# df['Label'] = lst
# df['Label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Label'] = lst


Label
0    48001
1     1095
2      904
Name: count, dtype: int64

In [27]:
# protocol_number = []
# for i in df['Proto']:
#     if i == 'udp':
#         protocol_number.append(17)
#     elif i == 'tcp':
#         protocol_number.append(6)
#     elif i == 'icmp':
#         protocol_number.append(1)
#     else:
#         protocol_number.append(0)

# df['Proto'] = protocol_number

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Proto'] = protocol_number


In [22]:
# direction_number = []
# for i in df['Dir']:
#     if i == '  <->':
#         direction_number.append(1)
#     elif i == '   ->':
#         direction_number.append(2)
#     else:
#         direction_number.append(0)

# df['Dir'] = direction_number
# df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Dir'] = direction_number


Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
7109554,2011/08/10 11:56:27.103838,0.000362,17,147.32.84.59,64513,0,147.32.80.9,53,CON,0.0,0.0,2,251,67,0
2512358,2011/08/16 20:57:42.985615,0.000234,17,147.32.85.25,37929,0,147.32.80.9,53,CON,0.0,0.0,2,208,79,0
2672987,2011/08/14 22:01:43.465835,8.255054,6,147.32.86.89,4524,0,77.75.72.72,80,FSPA_FSPA,0.0,0.0,19,8264,5441,0
3565069,2011/08/18 13:00:19.903390,0.267941,17,67.232.114.157,28230,0,147.32.84.229,13363,CON,0.0,0.0,2,243,178,0
2767785,2011/08/13 14:47:28.430614,3599.998291,17,147.32.87.7,427,0,195.113.44.15,1091,REQ,0.0,,12,1320,1320,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3879846,2011/08/11 09:59:13.758366,0.000162,17,147.32.84.138,53625,0,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
5789739,2011/08/15 13:06:44.889975,0.000546,17,147.32.84.138,36342,0,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
3210439,2011/08/14 00:58:05.912871,0.000239,17,147.32.84.138,42488,0,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
5171888,2011/08/18 11:57:10.813866,0.000358,17,147.32.84.138,41048,0,147.32.80.9,53,CON,0.0,0.0,2,214,81,0


In [29]:
# one_hot_lst = ['Dir','Proto','sTos','dTos']
# column_list = ['Dur','TotPkts','TotBytes','SrcBytes',
#                'Dir_   ->','Dir_  <->','Dir_others',
# 	             'Proto_icmp','Proto_tcp','Proto_udp','Proto_others',
#                'sTos_0.0','sTos_9.0','sTos_others','dTos_0.0','dTos_9.0','dTos_others','Label']

# def one_hot_encoding(df):
#     df_new = pd.DataFrame()
#     other_list = []
#     for i in one_hot_lst:
#         if i == 'Dir':
#             for j in df[i]:
#                 if j == '  <->' or j == '   ->':
#                     other_list.append(0)
#                 else:
#                     other_list.append(1)
#             other_name = 'Dir_others'
#             df[other_name] = other_list
#             other_list.clear()
#             df_new = pd.get_dummies(df, columns=[i])
#         elif i == 'Proto':
#             for j in df[i]:
#                 if j == 'tcp' or j == 'udp' or j == 'icmp':
#                     other_list.append(0)
#                 else:
#                     other_list.append(1)
#             other_name = 'Proto_others'
#             df_new[other_name] = other_list
#             other_list.clear()
#             df_new = pd.get_dummies(df_new, columns=[i])
#         elif i == 'sTos':
#             for j in df[i]:
#                 if j == 0 or j == 9:
#                     other_list.append(0)
#                 else:
#                     other_list.append(1)
#             other_name = 'sTos_others'
#             df_new[other_name] = other_list
#             other_list.clear()
#             df_new = pd.get_dummies(df_new, columns=[i])
#         elif i == 'dTos':
#             for j in df[i]:
#                 if j == 0 or j == 9:
#                     other_list.append(0)
#                 else:
#                     other_list.append(1)
#             other_name = 'dTos_others'
#             df_new[other_name] = other_list
#             other_list.clear()
#             df_new = pd.get_dummies(df_new, columns=[i])
#     for col in df_new.columns:
#         if col not in column_list:
#             df_new = df_new.drop([col], axis=1)
#     return df_new

# df = one_hot_encoding(df)
# df

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes,Label,Dir_others,Proto_others,sTos_others,sTos_0.0,dTos_others,dTos_0.0
1520572,0.000201,2,214,81,0,1,1,0,True,0,True
726258,0.000236,2,214,81,0,1,1,0,True,0,True
3069910,14.990603,12,3243,1080,0,1,1,0,True,0,True
1799679,18.679251,20,1480,767,0,1,1,0,True,0,True
4579341,0.000282,2,331,86,0,1,1,0,True,0,True
...,...,...,...,...,...,...,...,...,...,...,...
2051747,0.000000,1,62,62,0,1,1,0,True,1,False
536633,7.990914,10,1310,1310,0,1,1,0,True,1,False
6559289,0.761299,15,3140,1751,0,1,1,0,True,0,True
2407868,0.000201,2,207,66,0,1,1,0,True,0,True
