# Dataset : KDDCup 1999

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
from keras.layers import InputLayer, Dense, Dropout

import matplotlib.pyplot as plt
import math

from time import time

pd.options.display.max_columns = 1000

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Load data

In [2]:
df = pd.read_csv("/data/kddcup.data", header=None)

The file kddcup.data does not have a header. In the description file, you can find the column names. Parse the columns names.

In [3]:
columns = [f.split(":")[0] for f in """
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
""".split("\n") if len(f)>0]

columns.append("Category")
print(columns)

['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Category']


Set the column names in the dataframe

In [4]:
df.columns = columns

In [5]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Category
0,0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


Take 70% data into training keeping aside the rest for testing. Do not shuffle the data since it is a chornological dataset.

In [6]:
training_size = int(df.shape[0] * 0.7)

Find any columns that have 0 standard deviation in the training set and eliminate it from analysis. 0 standard deviation will create devide by zero problem when calculating the Z-score for the columns.

In [7]:
df.iloc[:training_size, :].describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0,3428901.0
mean,17.98946,1336.918,1438.338,4.957857e-06,0.0003858379,7.290966e-06,0.01484645,4.112105e-05,0.1762547,0.009721191,8.778323e-05,3.995449e-05,0.01470529,0.001266295,8.45752e-05,0.001073522,0.0,2.916386e-07,0.0009624075,359.0027,336.7904,0.1210919,0.1211769,0.0176913,0.017748,0.8816183,0.01436075,0.03370236,228.5571,214.8149,0.8554925,0.01907444,0.6781845,0.007311792,0.1211629,0.120989,0.0178789,0.01766435
std,552.4748,379681.8,770914.1,0.002226619,0.03276238,0.004148088,0.5147092,0.00838351,0.3810368,4.518491,0.009368861,0.008383516,4.594144,0.124434,0.009322081,0.03639156,0.0,0.0005400358,0.03100777,216.2727,238.0757,0.3253301,0.3259454,0.129789,0.1309576,0.3111246,0.08070478,0.1513811,69.41384,87.54385,0.3338119,0.08883612,0.4579518,0.04097323,0.3252498,0.3258422,0.1276196,0.128635
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,255.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,122.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
50%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,511.0,511.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,511.0,511.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,1309937000.0,1.0,3.0,5.0,77.0,5.0,1.0,7479.0,1.0,2.0,7468.0,40.0,2.0,9.0,0.0,1.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


num_outbound_cmds has 0 standard deviation, so eliminate it.

In [8]:
df.drop(columns=["num_outbound_cmds"], inplace=True)

Look at the column types. We would focus on numeric columns. For 3 categorical feature columns, you can do one hot encoding.

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 41 columns):
duration                       int64
protocol_type                  object
service                        object
flag                           object
src_bytes                      int64
dst_bytes                      int64
land                           int64
wrong_fragment                 int64
urgent                         int64
hot                            int64
num_failed_logins              int64
logged_in                      int64
num_compromised                int64
root_shell                     int64
su_attempted                   int64
num_root                       int64
num_file_creations             int64
num_shells                     int64
num_access_files               int64
is_host_login                  int64
is_guest_login                 int64
count                          int64
srv_count                      int64
serror_rate                    fl

Find only numeric columns

In [10]:
df_numeric = df.select_dtypes(include=[np.float64, np.int64])
df_numeric.shape

(4898431, 37)

Column names for the numeric columns

In [11]:
num_columns = df_numeric.columns
num_columns

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'is_host_login', 'is_guest_login',
       'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
       'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
       'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
       'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
       'dst_host_serror_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'],
      dtype='object')

Form training and test dataframes.

In [12]:
df_train = df.iloc[:training_size, :]
df_test = df.iloc[training_size:, :]

Find mean and standard deviation for training dataset.

In [13]:
feature_mean = df_numeric.values[:training_size].mean(axis = 0)
feature_std = df_numeric.values[:training_size].std(axis = 0)

Verify that there is 0 standard deviation for any column

In [14]:
feature_std

array([5.52474763e+02, 3.79681720e+05, 7.70913959e+05, 2.22661900e-03,
       3.27623707e-02, 4.14808705e-03, 5.14709113e-01, 8.38350879e-03,
       3.81036739e-01, 4.51849039e+00, 9.36885919e-03, 8.38351443e-03,
       4.59414297e+00, 1.24434017e-01, 9.32207915e-03, 3.63915531e-02,
       5.40035691e-04, 3.10077613e-02, 2.16272661e+02, 2.38075635e+02,
       3.25330051e-01, 3.25945337e-01, 1.29788998e-01, 1.30957597e-01,
       3.11124558e-01, 8.07047659e-02, 1.51381121e-01, 6.94138253e+01,
       8.75438367e+01, 3.33811862e-01, 8.88361104e-02, 4.57951771e-01,
       4.09732266e-02, 3.25249721e-01, 3.25842135e-01, 1.27619624e-01,
       1.28634967e-01])

Build the model

In [15]:
from keras.layers import LSTM, Dense

In [16]:
n_x = 37# Dimension of each transactional record
sequence_length = 10 # number of time steps that we look into to make prediction

size_of_hidden_state_within_cell = 12 

model = keras.Sequential()
model.add(InputLayer((sequence_length, n_x)))
model.add(LSTM(size_of_hidden_state_within_cell, activation="tanh"))
model.add(Dropout(0.8))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer="adam")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10, 37)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 12)                2400      
_________________________________________________________________
dropout_1 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 13        
Total params: 2,413
Trainable params: 2,413
Non-trainable params: 0
_________________________________________________________________


Find baseline accuracy. As it shows below, the baseline accuracy is 0.84. So, our model should give better accuracy thatn 0.84.

In [17]:
pd.Series(df.Category == "normal.").value_counts()/df.shape[0]

False    0.80141
True     0.19859
Name: Category, dtype: float64

Refresh the numpy slicing behavior

In [18]:
[0, 1, 2, 3, 4, 5, 6, 7, 8][2:8], [0, 1, 2, 3, 4, 5, 6, 7, 8][8] 

([2, 3, 4, 5, 6, 7], 8)

# Batching of records

```
0, 1, 2, 3, ... 100 - these are the index of the record. Each record of dim 37
```
Goal is to predict the label for record i

Logistic regression
- to do this we look at features of only record i

In RNN
- to do this we look at the features of record, i, i - 1, i - 2, ... i - sequence_len



```


Batch size: 16, and sequence length: 10


Batch 0: 
    0: X: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] y: [9]
    1: X: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] y: [10]
    ...
    15: X: [15, 16, 17, 18, 19, 20, 21, 22, 23, 24] y: [24]

Batch 1: 
    0: X: [16, 17, 18, 19, 20, 21, 22, 23, 24, 25], y: [25]
    1: X: [17, 18, 19, 20, 21, 22, 23, 24, 25, 26], y: [26]
    ...
    15: X: [31, 32, 33, 34, 35, 36, 37, 38, 39, 40], y: [40]
....

Total number of batches: (100 - 10)/16 [(total_count - sequence_len)/batch_size]

Last Batch:
    0: X: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89], y: [89]
    ...
    9: X: [89, 90, 91, 92, 93, 94, 95, 96, 97, 98], y: [98]
    10: X: [90, 91, 92, 93, 94, 95, 96, 97, 98, 99], y: [99]

```
Last batch has 11 ... fewer than the batch size



Let's build out generator function, that takes a dataframe and create batches of sequential data that is ready for RNN models.

In [19]:
def generator(df, batch_size = 128, sequence_length = 10):
    X = df[num_columns].values
    X = (X - feature_mean)/feature_std
    y = np.where(df.Category == "normal.", 0, 1)
    size = X.shape[0]
    i = sequence_length - 1
    X_batch = []
    y_batch = []
    num_batches = 0
    while True:
        X_batch.append(X[i-sequence_length+1:i+1])
        y_batch.append(y[i])
        
        if (len(X_batch) == batch_size) or (i == size - 1):
            X_batch = np.array(X_batch)
            y_batch = np.array(y_batch)
            num_batches += 1
            #print("i", i, "batch size", len(X_batch), "num_batches", num_batches,)
            yield X_batch, y_batch
            X_batch = []
            y_batch = []
            if i == size - 1:
                i = sequence_length - 1
        else:    
            i += 1
        
count = 0
df = df_train.iloc[:100, :]
for batch in generator(df, 16):
    print(batch[0].shape, batch[1].shape)
    count += 1
    # count > math.ceil((df.shape[0] - sequence_length)/16)
    if count >= 20:
        break

(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)
(16, 10, 37) (16,)


Use the generator to fit the model.

In [20]:
tbaord = keras.callbacks.TensorBoard(log_dir="/tmp/tf/logs/%d" % time())
df = df_train #.iloc[:100, :]
batch_size = 128
batch_count = math.ceil((df.shape[0] - sequence_length)/batch_size)
print("Batch count", batch_count)
model.fit_generator(generator(df, batch_size)
                    , steps_per_epoch=batch_count
                    , epochs=10
                    , verbose=1
                    , callbacks=[tbaord])

Batch count 26789
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x181aa4f710>

Evaluate the performance of the model

In [21]:
batch_size = 256
df = df_test

batch_count = math.ceil((df.shape[0] - sequence_length)/batch_size)
model.evaluate_generator(generator(df, batch_size), batch_count)

[0.018972053827949795, 0.9980383698397491]

Calculate prediction probabilities, classes and confusion matrix.

In [22]:
batch_size = 256
df = df_test
batch_count = math.ceil((df.shape[0] - sequence_length)/batch_size)

y_test, y_test_prob = [], []
count = 0 
for batch in generator(df, batch_size):
    count += 1
    y_test += list(batch[1])
    y_test_prob += list(model.predict(batch[0]).flatten())
    if count >= batch_count:
        break
y_test = np.array(y_test)
y_test_prob = np.array(y_test_prob)
y_test.shape, y_test_prob.shape

((1469696,), (1469696,))

In [26]:
import sklearn 
y_test_pred = np.where(y_test_prob > 0.5, 1, 0)
sklearn.metrics.confusion_matrix(y_test, y_test_pred)

array([[ 222089,    2683],
       [    200, 1244724]])

Results after 2 epochs
```
array([[ 221800,    2972],
       [    453, 1244471]])
```

In [44]:
n_x = len(num_columns)

tf.set_random_seed(1)
np.random.seed(1)

model = keras.Sequential()
model.add(InputLayer((n_x, )))
model.add(Dense(units=10, activation="relu"))
model.add(Dropout(rate=0.5))
model.add(Dense(units=5, activation="relu"))
model.add(Dropout(rate=0.5))
model.add(Dense(units=1, activation="sigmoid"))

model.compile(loss=keras.losses.binary_crossentropy
            , metrics=["accuracy"], optimizer="adam")

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 37)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 10)                380       
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 5)                 55        
_________________________________________________________________
dropout_3 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 6         
Total params: 441
Trainable params: 441
Non-trainable params: 0
_________________________________________________________________


In [45]:
def build_dataset(df):
    X = df[num_columns].values
    X = (X - feature_mean)/feature_std
    y = np.where(df.Category == "normal.", 0, 1)
    return X, y

X_train, y_train = build_dataset(df_train)
X_test, y_test = build_dataset(df_test)

In [46]:
tensor_board = keras.callbacks.TensorBoard(log_dir="/tmp/tf/logs/%d" % time())

model.fit(X_train, y_train, batch_size=256
          , validation_data = (X_test, y_test)
          , epochs=10
          , verbose = 1
          , callbacks=[tensor_board])

Train on 3428901 samples, validate on 1469530 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1833bb8c88>

In [47]:
y_test_pred = model.predict_classes(X_test)
sklearn.metrics.confusion_matrix(y_test, y_test_pred)

array([[ 221769,    7699],
       [    770, 1239292]])