In [16]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [17]:
# ---------------------------------------------------------------------------
# Configuration for the KDD dataset
# ---------------------------------------------------------------------------

# Name of the label column in the loaded frame.
TARGET_COLUMN = "attack"

# Label of the benign class. This is the string form; the preparation cell
# further down rebinds NORMAL_TARGET to the integer code that LabelEncoder
# assigns to 'normal'.
NORMAL_TARGET = "normal"

# TARGET_DICT is not declared here: it is rebuilt from the fitted LabelEncoder
# in the preparation cell. Kept for reference, the coarse attack-category
# grouping that was once considered for it:
#   dos   : back, land, neptune, pod, smurf, teardrop
#   u2r   : buffer_overflow, loadmodule, perl, rootkit
#   r2l   : ftp_write, guess_passwd, imap, multihop, phf, spy,
#           warezclient, warezmaster
#   probe : ipsweep, nmap, portsweep, satan

# Numerical columns (these are to be standardized).
NUMERICAL_COLUMNS = [
    # basic connection features
    "duration",
    "src_bytes",
    "dst_bytes",
    "wrong_fragment",
    "urgent",
    # content features
    "hot",
    "num_failed_logins",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    # traffic features
    "count",
    "srv_count",
    "serror_rate",
    "rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    # destination-host features
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
]

# Categorical columns (these are to be one hot encoded).
CATEGORICAL_COLUMNS = ["protocol_type", "service", "flag"]

# Ordinal columns (these are to be label encoded); none for this dataset.
ORDINAL_COLUMNS = list()

In [14]:
%run common.ipynb

In [10]:
def get_kdd_df():
    """Load the KDD 'corrected' file into a DataFrame.

    Column names are parsed from ``./datasets/kdd/kddcup.names``: the first
    line of that file is skipped and, for every following line containing a
    colon, the text before the first ``:`` is taken as a column name. The
    label column (``TARGET_COLUMN``) is appended last.

    Returns:
        tuple: ``(all_df, main_labels)`` where ``all_df`` is the frame read
        from ``./datasets/kdd/corrected`` and ``main_labels`` is the list of
        column names (features followed by the label column).

    Raises:
        OSError: if either dataset file cannot be opened.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open("./datasets/kdd/kddcup.names", 'r') as names_file:
        name_lines = names_file.read().splitlines()

    # Skip the header line; keeping only lines with a colon makes the parse
    # independent of a trailing newline or stray blank lines.
    cols = [line.split(':', 1)[0] for line in name_lines[1:] if ':' in line]
    # Use the shared constant so the appended name always matches the column
    # indexed below.
    cols.append(TARGET_COLUMN)

    all_df = pd.read_csv("./datasets/kdd/corrected", names=cols)

    main_labels = cols

    # The mode is the most frequent label, which is not necessarily the
    # benign class, so it is reported under an accurate name.
    print('Most frequent class: ', all_df[TARGET_COLUMN].mode().tolist())
    print('Feature names: ', main_labels)
    return (all_df, main_labels)

In [18]:
# Dataset KDD: load, encode the label column, one-hot encode the categoricals.
all_df, main_labels = get_kdd_df()

# Raw labels carry a trailing '.', e.g. 'normal.'; strip only that character
# rather than blindly dropping the last one.
encoder = LabelEncoder()
all_df[TARGET_COLUMN] = all_df[TARGET_COLUMN].str.rstrip('.')
all_df[TARGET_COLUMN] = encoder.fit_transform(all_df[TARGET_COLUMN])

# Integer code -> class name, and the inverse lookup.
TARGET_DICT = {index: label for index, label in enumerate(encoder.classes_)}
INV_TARGET_DICT = {v: k for k, v in TARGET_DICT.items()}
# From here on NORMAL_TARGET is the encoded integer, not the string 'normal'.
NORMAL_TARGET = INV_TARGET_DICT['normal']
print('NORMAL_TARGET', NORMAL_TARGET)
print('TARGET_DICT', TARGET_DICT)

# one_hot_encode is not defined in the cells shown here (presumably it comes
# from common.ipynb); it is unpacked as (encoder, frame).
ohe, all_df = one_hot_encode(all_df, CATEGORICAL_COLUMNS)
# Refresh the label list so it reflects the one-hot encoded columns; kept as
# a list to match what get_kdd_df returned.
main_labels = list(all_df.columns)
print('main_labels', main_labels)

all_df.head()

['normal.' 'snmpgetattack.' 'named.' 'xlock.' 'smurf.' 'ipsweep.'
 'multihop.' 'xsnoop.' 'sendmail.' 'guess_passwd.' 'saint.'
 'buffer_overflow.' 'portsweep.' 'pod.' 'apache2.' 'phf.' 'udpstorm.'
 'warezmaster.' 'perl.' 'satan.' 'xterm.' 'mscan.' 'processtable.' 'ps.'
 'nmap.' 'rootkit.' 'neptune.' 'loadmodule.' 'imap.' 'back.' 'httptunnel.'
 'worm.' 'mailbomb.' 'ftp_write.' 'teardrop.' 'land.' 'sqlattack.'
 'snmpguess.']
Normal class:  0    smurf.
Name: attack, dtype: object
Feature names:  ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_

Unnamed: 0,duration,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16
1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16
2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,16
3,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,28
4,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,28
