### Dataset creation

In [1]:
import pandas as pd
import numpy as np

- load data and labels (normal and different attack types)

In [2]:
# Read the gzipped CSV; it has no header row, so columns are numbered 0..n-1 for now.
df_data = pd.read_csv("../../datasets/kddcup/kddcup.data_10_percent.gz", header=None)

# The last column carries the label (normal or an attack type): keep it as a
# one-column frame and leave only the feature columns in df_data.
df_labels = df_data.iloc[:, -1].to_frame()
df_data = df_data.iloc[:, :-1]

- read column names and types

In [3]:
col_names=[]
col_datatypes=[]

# Each remaining line of the names file has the form "<name>: <datatype>."
with open("../../datasets/kddcup/kddcup.names") as file:
    next(file)    ### skip first line
    for line in file:
        line = line.strip()
        if not line:    ### tolerate blank lines (e.g. an empty line at the end of the file)
            continue
        # split only once so that a ": " inside the datatype cannot break the unpacking
        name, datatype = line.split(": ", 1)
        col_names.append(name)
        # drop the trailing "." whether or not the line ended with a newline
        col_datatypes.append(datatype.rstrip("."))

df_data.columns = col_names
df_labels.columns = ["labels"]

In [4]:
df_data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0


- check integrity of the data

In [5]:
# Prints True if any cell of the feature frame is missing (NaN/None).
print(df_data.isnull().values.any())

# isnull() is an alias of isna(), so this repeats the same check and always prints the same value.
print(df_data.isna().values.any())

False
False


### Enumerate labels 

- ### Benign ("normal.") samples are labelled "anomaly" because they are the minority class; attack samples are labelled "normal" (as stated in the paper)

In [6]:
# Rows whose original label contains "normal" (the benign traffic) are tagged "anomaly";
# every other row (any attack type) is tagged "normal".
is_benign = df_labels["labels"].str.contains("normal")
df_data["label"] = np.where(is_benign, "anomaly", "normal")

In [7]:
### class sizes under the new naming ("normal" = attack rows, "anomaly" = benign rows)
label_column = df_data["label"]
print("number of normal:", np.count_nonzero(label_column == "normal"))
print("number of anomaly:", np.count_nonzero(label_column == "anomaly"))

number of normal: 396743
number of anomaly: 97278


In [8]:
## row positions whose derived label is "normal"
## (with the inverted naming used in this notebook these are the attack rows)

np.where(df_data["label"] == "normal")

(array([   744,    745,   4049, ..., 490962, 490963, 490964]),)

In [9]:
### spot-check rows 740-749: derived label (last column of df_data) next to the original label
### rows originally labelled "normal." must show "anomaly", attack rows must show "normal"

print(df_data.iloc[740:750, -1])
print(df_labels.iloc[740:750, -1])


740    anomaly
741    anomaly
742    anomaly
743    anomaly
744     normal
745     normal
746    anomaly
747    anomaly
748    anomaly
749    anomaly
Name: label, dtype: object
740             normal.
741             normal.
742             normal.
743             normal.
744    buffer_overflow.
745    buffer_overflow.
746             normal.
747             normal.
748             normal.
749             normal.
Name: labels, dtype: object


In [10]:
### split the columns: object (string) columns are one-hot encoded, all others are normalized

cols_toOneHot = df_data.select_dtypes(include='object').columns.tolist()

cols_all = df_data.columns.tolist()
# A comprehension instead of a set difference: it keeps the columns in their original
# order, so the list is the same on every run (set iteration order is not).
cols_toNormalize = [col for col in cols_all if col not in cols_toOneHot]

In [11]:
print("columns to normalize:\n%s\n" % cols_toNormalize)
print("columns to one-hot:\n%s\n" % cols_toOneHot)

columns to normalize:
['src_bytes', 'num_access_files', 'dst_host_srv_count', 'urgent', 'num_compromised', 'su_attempted', 'num_shells', 'wrong_fragment', 'land', 'dst_host_srv_diff_host_rate', 'duration', 'num_failed_logins', 'num_root', 'hot', 'srv_serror_rate', 'dst_bytes', 'count', 'dst_host_same_srv_rate', 'same_srv_rate', 'srv_rerror_rate', 'dst_host_srv_rerror_rate', 'srv_diff_host_rate', 'num_outbound_cmds', 'dst_host_diff_srv_rate', 'num_file_creations', 'dst_host_same_src_port_rate', 'logged_in', 'is_guest_login', 'root_shell', 'dst_host_srv_serror_rate', 'rerror_rate', 'is_host_login', 'diff_srv_rate', 'serror_rate', 'dst_host_rerror_rate', 'dst_host_serror_rate', 'srv_count', 'dst_host_count']

columns to one-hot:
['protocol_type', 'service', 'flag', 'label']



In [12]:
len(cols_toNormalize)

38

### Dataset preprocessing in numerical columns

In [13]:
df_data.duration.dtype == "object"

False

In [14]:
def checkStats(df_data):
    """Print min, max and standard deviation of every numerical column.

    Non-numerical columns are skipped based on their own dtype, so the function
    does not depend on any notebook-level variable. The printed column number is
    the column's position in ``df_data`` (skipped columns still consume a number).

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame whose numerical columns are summarised.
    """
    for i, col in enumerate(df_data.columns.tolist()):
        column = df_data[col]
        if not pd.api.types.is_numeric_dtype(column):    ### skip if column is not numerical
            continue

        values = column.values
        # .values.std() is numpy's population standard deviation (ddof=0)
        print("column:%d min:%f max:%f std:%f" % (i, values.min(), values.max(), values.std()))

In [15]:
## before normalization

checkStats(df_data)

column:0 min:0.000000 max:58329.000000 std:707.745756
column:4 min:0.000000 max:693375640.000000 std:988217.100872
column:5 min:0.000000 max:5155468.000000 std:33039.967812
column:6 min:0.000000 max:1.000000 std:0.006673
column:7 min:0.000000 max:3.000000 std:0.134805
column:8 min:0.000000 max:3.000000 std:0.005510
column:9 min:0.000000 max:30.000000 std:0.782102
column:10 min:0.000000 max:5.000000 std:0.015520
column:11 min:0.000000 max:1.000000 std:0.355344
column:12 min:0.000000 max:884.000000 std:1.798324
column:13 min:0.000000 max:1.000000 std:0.010551
column:14 min:0.000000 max:2.000000 std:0.007793
column:15 min:0.000000 max:993.000000 std:2.012716
column:16 min:0.000000 max:28.000000 std:0.096416
column:17 min:0.000000 max:2.000000 std:0.011020
column:18 min:0.000000 max:8.000000 std:0.036482
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:0.000000 std:0.000000
column:21 min:0.000000 max:1.000000 std:0.037211
column:22 min:0.000000 max:511.000000 std

In [16]:
### normalization

### min-max scaling: subtract the column minimum and divide by the column range
### (locals are named col_min / col_max so the builtins min() and max() are not
### overwritten for the rest of the kernel session)
for col in cols_toNormalize:
    col_min = df_data[col].min()
    col_max = df_data[col].max()
    col_range = col_max - col_min

    # a constant column has zero range: leave it untouched instead of dividing by zero
    if col_range == 0:
        continue

    df_data[col] = (df_data[col] - col_min) / col_range



# Alternative z-score normalization, kept as a bare string (not executed);
# being the last expression of the cell, the string is what the cell displays.
"""
### mean substr and std dev div
for col in cols_toNormalize:
    mean = df_data[col].mean()
    std = df_data[col].std()
    
    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!
        continue
    
    df_data[col] = (df_data[col] - mean) / std
"""

"\n### mean substr and std dev div\nfor col in cols_toNormalize:\n    mean = df_data[col].mean()\n    std = df_data[col].std()\n    \n    if mean == 0 or std == 0:     ### columns 'num_outbound_cmds', 'is_host_login' has zero mean and std dev!!!\n        continue\n    \n    df_data[col] = (df_data[col] - mean) / std\n"

In [17]:
checkStats(df_data)

column:0 min:0.000000 max:1.000000 std:0.012134
column:4 min:0.000000 max:1.000000 std:0.001425
column:5 min:0.000000 max:1.000000 std:0.006409
column:6 min:0.000000 max:1.000000 std:0.006673
column:7 min:0.000000 max:1.000000 std:0.044935
column:8 min:0.000000 max:1.000000 std:0.001837
column:9 min:0.000000 max:1.000000 std:0.026070
column:10 min:0.000000 max:1.000000 std:0.003104
column:11 min:0.000000 max:1.000000 std:0.355344
column:12 min:0.000000 max:1.000000 std:0.002034
column:13 min:0.000000 max:1.000000 std:0.010551
column:14 min:0.000000 max:1.000000 std:0.003896
column:15 min:0.000000 max:1.000000 std:0.002027
column:16 min:0.000000 max:1.000000 std:0.003443
column:17 min:0.000000 max:1.000000 std:0.005510
column:18 min:0.000000 max:1.000000 std:0.004560
column:19 min:0.000000 max:0.000000 std:0.000000
column:20 min:0.000000 max:0.000000 std:0.000000
column:21 min:0.000000 max:1.000000 std:0.037211
column:22 min:0.000000 max:1.000000 std:0.417118
column:23 min:0.000000 max:

### One hot encoding of required columns

In [18]:
### obtain one hot encoding

# dtype is pinned to uint8 (the dtype shown in the outputs below). pandas >= 2.0
# defaults to bool dummies, and a frame mixing bool and float columns converts to an
# object array via .values instead of a numeric one.
df_data = pd.get_dummies(df_data, columns=cols_toOneHot, dtype=np.uint8)

- Check the resulting dataset

In [19]:
df_data.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_anomaly,label_normal
0,0.0,2.610418e-07,0.001057,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,1,0
1,0.0,3.446905e-07,9.4e-05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,1,0
2,0.0,3.389216e-07,0.000259,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,1,0
3,0.0,3.158461e-07,0.000259,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,1,0
4,0.0,3.129617e-07,0.000394,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,1,0


In [20]:
### shape is correct

df_data.shape

(494021, 120)

In [21]:
df_data.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,label_anomaly,label_normal
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,0.000823,4.363595e-06,0.000168,4.5e-05,0.002144,5e-06,0.001151,3e-05,0.148247,1.2e-05,...,2.2e-05,0.001828,0.17612,0.000115,4.9e-05,2e-05,0.76604,0.000217,0.196911,0.803089
std,0.012134,0.001425228,0.006409,0.006673,0.044935,0.001837,0.02607,0.003104,0.355345,0.002034,...,0.004719,0.042714,0.380923,0.010741,0.00697,0.004499,0.423347,0.014715,0.397665,0.397665
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,6.489989e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
50%,0.0,7.499542e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,1.488371e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
### check again

checkStats(df_data)

column:0 min:0.000000 max:1.000000 std:0.012134
column:1 min:0.000000 max:1.000000 std:0.001425
column:2 min:0.000000 max:1.000000 std:0.006409
column:3 min:0.000000 max:1.000000 std:0.006673
column:4 min:0.000000 max:1.000000 std:0.044935
column:5 min:0.000000 max:1.000000 std:0.001837
column:6 min:0.000000 max:1.000000 std:0.026070
column:7 min:0.000000 max:1.000000 std:0.003104
column:8 min:0.000000 max:1.000000 std:0.355344
column:9 min:0.000000 max:1.000000 std:0.002034
column:10 min:0.000000 max:1.000000 std:0.010551
column:11 min:0.000000 max:1.000000 std:0.003896
column:12 min:0.000000 max:1.000000 std:0.002027
column:13 min:0.000000 max:1.000000 std:0.003443
column:14 min:0.000000 max:1.000000 std:0.005510
column:15 min:0.000000 max:1.000000 std:0.004560
column:16 min:0.000000 max:0.000000 std:0.000000
column:17 min:0.000000 max:0.000000 std:0.000000
column:18 min:0.000000 max:1.000000 std:0.037211
column:19 min:0.000000 max:1.000000 std:0.417118
column:20 min:0.000000 max:1.0

In [23]:
## check integrity of the data again (after normalization and one-hot encoding)

# Prints True if any cell is missing (NaN/None).
print(df_data.isnull().values.any())

# isnull() is an alias of isna(), so this repeats the same check and always prints the same value.
print(df_data.isna().values.any())

False
False


In [24]:
### check again: after one-hot encoding the last column is label_normal and the
### second to last is label_anomaly; compare both with the original labels for rows 740-749
print(df_data.iloc[740:750, -1])

print(df_data.iloc[740:750, -2])

print(df_labels.iloc[740:750, -1])

740    0
741    0
742    0
743    0
744    1
745    1
746    0
747    0
748    0
749    0
Name: label_normal, dtype: uint8
740    1
741    1
742    1
743    1
744    0
745    0
746    1
747    1
748    1
749    1
Name: label_anomaly, dtype: uint8
740             normal.
741             normal.
742             normal.
743             normal.
744    buffer_overflow.
745    buffer_overflow.
746             normal.
747             normal.
748             normal.
749             normal.
Name: labels, dtype: object


### Save dataset

In [25]:
# Force a float64 array: if the frame ever holds bool dummy columns (the pandas >= 2.0
# get_dummies default) .values is an object array, which would be pickled on save and
# compared element-wise as Python objects. copy=False avoids a copy when it is already float64.
dataset = df_data.values.astype(np.float64, copy=False)

In [26]:
dataset.shape

(494021, 120)

In [29]:
# Persist the whole preprocessed array as a compressed .npz under the key "dataset".
# The last two columns are the one-hot labels (label_anomaly, label_normal).
np.savez_compressed("../../datasets/kddcup/kdd99_dataset.npz", dataset=dataset)

### Produce train and test splits

In [55]:
# Seed handed to train_test_split and embedded in the output file names.
# NOTE(review): None leaves the shuffle unseeded, so the split is presumably different
# on every run and the files are named "...randomState_None.npz" — set an int to reproduce.
random_state = None

In [56]:
from sklearn.model_selection import train_test_split

# 50/50 row-wise split of the full array; the one-hot label columns are still the last two columns.
x_train, x_test = train_test_split(dataset, test_size=0.5, random_state=random_state)

In [57]:
x_train.shape

(247010, 120)

In [58]:
x_test.shape

(247011, 120)

-  The "normal" class here is the attack traffic, because non-attack rows are the minority

In [59]:
# Partition the training half with boolean masks on the label_normal flag.
normal_flag = x_train[:, -1]   ##last column is the normal column
x_train_normal = x_train[normal_flag == 1]
x_train_anomaly = x_train[normal_flag == 0]

In [60]:
x_train_normal.shape

(198377, 120)

In [61]:
x_train_anomaly.shape   

(48633, 120)

In [62]:
# Train only on rows of the "normal" class: x_train is rebound to that subset, so the
# "anomaly" rows of the training half are no longer part of x_train from here on.
x_train = x_train_normal

- x_train now consists only of "normal" rows

In [63]:
x_train.shape

(198377, 120)

In [64]:
x_test.shape

(247011, 120)

- Assign arbitrary output as labels

In [66]:
# Placeholder targets: every training row gets the same 4-wide vector [1, 0, 0, 0].
placeholder_row = np.array([1.0, 0.0, 0.0, 0.0])
y_train = np.tile(placeholder_row, (len(x_train), 1))
y_train.shape

(198377, 4)

In [70]:
# File names carry the seed used for the split (the literal text "None" when unseeded).
# They are relative names, so the files land in the current working directory, not in
# ../../datasets/kddcup/ where the full dataset was saved.
train_name = "kdd99_train-randomState_"+str(random_state)+".npz"
test_name = "kdd99_test-randomState_"+str(random_state)+".npz"

# NOTE(review): x_train and x_test still contain the two one-hot label columns as their
# last columns; presumably the consumer strips them before training — confirm.
np.savez_compressed(train_name, x_train=x_train, y_train=y_train)
np.savez_compressed(test_name, x_test=x_test)