In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

def load_data(data_path, file_name):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the file.
    file_name : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` called with default options.
    """
    return pd.read_csv(os.path.join(data_path, file_name))

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that pulls a fixed set of columns out of a DataFrame.

    It lets a scikit-learn pipeline start from a DataFrame: ``transform``
    indexes the input with ``attribute_names`` and hands back the underlying
    array via ``.values``.
    """

    def __init__(self, attribute_names):
        # Column label(s) used to index the DataFrame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as an array."""
        selected = X[self.attribute_names]
        return selected.values

In [2]:
# Directory holding the NSL-KDD files, relative to the working directory.
KDD_DATA_PATH = "data/NSL-KDD"
kdd20 = load_data(KDD_DATA_PATH, "KDD20Train.csv")

# NOTE(review): load_data calls pd.read_csv with default options, so the first
# row of the file is consumed as a header before the names below overwrite it.
# Confirm the CSV really starts with a header row; otherwise one record is lost.

# Positional column names (43 in total); the last two are the label column and
# an extra trailing column.
kdd20.columns = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack_type', 'unknown number',
]

In [3]:
# Output: DataFrame
# Features: every column except the label and the trailing "unknown number" column.
kdd20_data = kdd20.drop(columns=["attack_type", "unknown number"])
# Labels: an independent copy of the attack_type column.
kdd20_labels = kdd20.loc[:, "attack_type"].copy()

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
    
# Categorical columns are one-hot encoded; every remaining column is treated as
# numeric and standardized. The categorical list is defined once and reused so
# the two pipelines cannot drift apart.
cat_attribs = ["protocol_type", "service", "flag"]
kdd20_data_num = kdd20_data.drop(cat_attribs, axis=1)
num_attribs = list(kdd20_data_num)

# Numeric branch: select the numeric columns, then scale to zero mean / unit variance.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the categorical columns, then one-hot encode them.
# The former `n_values='auto'` argument was deprecated in scikit-learn 0.20 and
# removed in 0.22; the default constructor infers the categories from the data.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder()),
])

# Run both branches on the same input and stack their outputs side by side.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

kdd20_prepared = full_pipeline.fit_transform(kdd20_data)

### Five Classes
- normal - 0
- DoS - 1
- probe - 2
- U2R - 3
- R2L - 4

In [5]:
# Attack name -> five-class integer label:
#   0 = normal, 1 = DoS, 2 = probe, 3 = U2R, 4 = R2L
attack_class_by_name = {
    'back': 1,
    'buffer_overflow': 3,
    'ftp_write': 4,
    'guess_passwd': 4,
    'imap': 4,
    'ipsweep': 2,
    'land': 1,
    'loadmodule': 3,
    'multihop': 4,
    'neptune': 1,
    'nmap': 2,
    'perl': 3,
    'phf': 4,
    'pod': 1,
    'portsweep': 2,
    'rootkit': 3,
    'satan': 2,
    'smurf': 1,
    'spy': 4,
    'teardrop': 1,
    'warezclient': 4,
    'warezmaster': 4,
    'normal': 0,
}

# Fail fast on any label missing from the table instead of letting a raw string
# slip through and break integer indexing during training.
unmapped_labels = [
    label for label in kdd20_labels.unique()
    if label not in attack_class_by_name
]
if unmapped_labels:
    raise ValueError('unmapped attack types: %r' % (unmapped_labels,))

# Explicit int64 cast: the labels are later used as array indices, so the dtype
# must not depend on pandas' implicit downcasting.
kdd20_five_labels = kdd20_labels.map(attack_class_by_name).astype('int64')

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    kdd20_prepared,
    kdd20_five_labels,
    test_size=0.3,
    random_state=0,
)

## Softmax Linear Classification

In [7]:
# Train a linear softmax classifier with full-batch gradient descent.
X = X_train.toarray()
y = y_train.values

D = X.shape[1] # dimensionality, read from the data rather than hard-coded
K = 5 # number of classes

# initialize parameters randomly (dedicated seeded generator so reruns start
# from the same weights without touching NumPy's global random state)
rng = np.random.RandomState(0)
W = 0.01 * rng.randn(D,K)
b = np.zeros((1,K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength

# gradient descent loop
num_examples = X.shape[0]
rows = np.arange(num_examples) # row index for picking each example's true class

for i in range(1000):
    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b

    # compute the class probabilities; subtracting the per-row maximum leaves
    # the softmax unchanged but keeps np.exp from overflowing
    shifted_scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[rows, y])
    data_loss = np.sum(correct_logprobs) / num_examples
    reg_loss = 0.5 * reg * np.sum(W * W)
    loss = data_loss + reg_loss

    if i % 100 == 0:
        print('iteration %d: loss %f' % (i, loss))

    # compute the gradient on scores: (probs - one_hot(y)) / N.
    # probs is not needed again this iteration, so it is modified in place.
    dscores = probs
    dscores[rows, y] -= 1
    dscores /= num_examples

    # backpropagate the gradient to the parameters (W,b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)

    dW += reg * W # regularization gradient

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db

iteration 0: loss 1.602569
iteration 100: loss 0.128250
iteration 200: loss 0.114309
iteration 300: loss 0.109089
iteration 400: loss 0.106420
iteration 500: loss 0.104824
iteration 600: loss 0.103782
iteration 700: loss 0.103058
iteration 800: loss 0.102535
iteration 900: loss 0.102143


In [8]:
# evaluate test set accuracy
X = X_test.toarray()
y = y_test.values

# The class with the highest linear score is the prediction for each row.
scores = X.dot(W) + b
predicted_class = scores.argmax(axis=1)
print('test accuracy: %.4f' % ((predicted_class == y).mean(),))

test accuracy: 0.9734


## Simple Neural Network

In [9]:
# Train a two-layer network (ReLU hidden layer + softmax output) with
# full-batch gradient descent. K (number of classes) comes from the softmax cell.
X = X_train.toarray()
y = y_train.values

D = X.shape[1] # input dimensionality, read from the data

# initialize parameters randomly (dedicated seeded generator so reruns start
# from the same weights without touching NumPy's global random state)
rng = np.random.RandomState(0)
h = 100 # size of hidden layer
W = 0.01 * rng.randn(D,h)
b = np.zeros((1,h))
W2 = 0.01 * rng.randn(h,K)
b2 = np.zeros((1,K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength

# gradient descent loop
num_examples = X.shape[0]
rows = np.arange(num_examples) # row index for picking each example's true class

for i in range(1000):
    # evaluate class scores, [N x K]
    hidden_layer = np.maximum(0, np.dot(X, W) + b) # note, ReLU activation
    scores = np.dot(hidden_layer, W2) + b2

    # compute the class probabilities; subtracting the per-row maximum leaves
    # the softmax unchanged but keeps np.exp from overflowing
    shifted_scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted_scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]

    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[rows, y])
    data_loss = np.sum(correct_logprobs) / num_examples
    reg_loss = 0.5 * reg * np.sum(W * W) + 0.5 * reg * np.sum(W2 * W2)
    loss = data_loss + reg_loss
    if i % 100 == 0:
        print('iteration %d: loss %f' % (i, loss))

    # compute the gradient on scores: (probs - one_hot(y)) / N.
    # probs is not needed again this iteration, so it is modified in place.
    dscores = probs
    dscores[rows, y] -= 1
    dscores /= num_examples

    # backpropagate the gradient to the parameters
    # first backprop into parameters W2 and b2
    dW2 = np.dot(hidden_layer.T, dscores)
    db2 = np.sum(dscores, axis=0, keepdims=True)
    # next backprop into hidden layer
    dhidden = np.dot(dscores, W2.T)
    # backprop the ReLU non-linearity: no gradient flows through inactive units
    dhidden[hidden_layer <= 0] = 0
    # finally into W,b
    dW = np.dot(X.T, dhidden)
    db = np.sum(dhidden, axis=0, keepdims=True)

    # add regularization gradient contribution
    dW2 += reg * W2
    dW += reg * W

    # perform a parameter update
    W += -step_size * dW
    b += -step_size * db
    W2 += -step_size * dW2
    b2 += -step_size * db2

iteration 0: loss 1.610169
iteration 100: loss 0.093974
iteration 200: loss 0.068944
iteration 300: loss 0.060447
iteration 400: loss 0.057150
iteration 500: loss 0.055441
iteration 600: loss 0.054439
iteration 700: loss 0.053742
iteration 800: loss 0.053241
iteration 900: loss 0.052863


In [10]:
# evaluate test set accuracy
X = X_test.toarray()
y = y_test.values

# Forward pass only: ReLU hidden layer, then linear output scores.
hidden_layer = np.maximum(0, X.dot(W) + b)
scores = hidden_layer.dot(W2) + b2
predicted_class = scores.argmax(axis=1)
print('testing accuracy: %.4f' % ((predicted_class == y).mean(),))

testing accuracy: 0.9921
