# Anomaly Detection in Keras

- [Source](https://github.com/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_14_03_anomaly.ipynb)
    - Instructor: [Jeff Heaton](https://sites.wustl.edu/jeffheaton/), McKelvey School of Engineering, [Washington University in St. Louis](https://engineering.wustl.edu/Programs/Pages/default.aspx)
    - For more information visit the [class website](https://sites.wustl.edu/jeffheaton/t81-558/).

In [119]:
import pandas as pd
from tensorflow.keras.utils import get_file

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn import metrics
import numpy as np
from IPython.display import display, HTML
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [58]:
try:
    path = get_file(''kddcup.data_10_percent.gz'', origin='http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
except:
    print('Error downloading')
    raise
    
print(path) 

C:\Users\PARK\.keras\datasets\kddcup.data_10_percent.gz


In [109]:
df = pd.read_csv(path, header = None)

print("Read {} rows.".format(len(df)))

df.dropna(inplace = True, axis = 1)

df.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]

df.head()

Read 494021 rows.


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [110]:
df_columns = df.columns.tolist()#.tolist().delete('protocol_type','service','flag')
categorical_columns_index = [1,2,3,6,11,20,21,41]

numerical_columns = np.delete(df_columns, categorical_columns_index)
categorical_columns = ['protocol_type', 'service', 'flag', 'land',
                       'is_host_login', 'is_guest_login']

In [111]:
std = StandardScaler()

for col in numerical_columns:
    df[col] = std.fit_transform(df[[col]])

def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis = 1, inplace = True)

In [112]:
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_text_dummy(df, 'logged_in')
encode_text_dummy(df, 'is_host_login')
encode_text_dummy(df, 'is_guest_login')

df.dropna(inplace = True, axis =1)
df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,...,flag-S1,flag-S2,flag-S3,flag-SF,flag-SH,logged_in-0,logged_in-1,is_host_login-0,is_guest_login-0,is_guest_login-1
0,-0.067792,-0.002879,0.138664,0,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,...,0,0,0,1,0,0,1,1,1,0
1,-0.067792,-0.00282,-0.011578,0,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,...,0,0,0,1,0,0,1,1,1,0
2,-0.067792,-0.002824,0.014179,0,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,...,0,0,0,1,0,0,1,1,1,0
3,-0.067792,-0.00284,0.014179,0,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,...,0,0,0,1,0,0,1,1,1,0
4,-0.067792,-0.002842,0.035214,0,-0.04772,-0.002571,-0.044136,-0.009782,-0.005679,-0.010552,...,0,0,0,1,0,0,1,1,1,0


In [113]:
normal_mask = df['outcome'] == 'normal.'
attack_mask = df['outcome'] != 'normal.'

df.drop(columns = 'outcome', inplace = True)

df_normal = df[normal_mask]
df_attack = df[attack_mask]

print("Normal Count: ", len(df_normal))
print("Attack Count:", len(df_attack))

Normal Count:  97278
Attack Count: 396743


In [114]:
x_normal = df_normal.values
x_attack = df_attack.values

x_normal_train, x_normal_test = train_test_split(x_normal, test_size = 0.25, random_state = 42)

print("Normal Train Count:", len(x_normal_train))
print("Normal Test Count:", len(x_normal_test))

Normal Train Count: 72958
Normal Test Count: 24320


In [133]:
model = Sequential()
model.add(Dense(25, input_dim = x_normal.shape[1], activation = 'relu'))
model.add(Dense(3, activation = 'relu'))
model.add(Dense(25, activation = 'relu'))
model.add(Dense(x_normal.shape[1]))

model.compile(loss = 'mean_squared_error', optimizer = 'adam')
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_28 (Dense)             (None, 25)                3025      
_________________________________________________________________
dense_29 (Dense)             (None, 3)                 78        
_________________________________________________________________
dense_30 (Dense)             (None, 25)                100       
_________________________________________________________________
dense_31 (Dense)             (None, 120)               3120      
Total params: 6,323
Trainable params: 6,323
Non-trainable params: 0
_________________________________________________________________


In [127]:
model.fit(x_normal_train, x_normal_train, batch_size = 50,
          validation_split = 0.25,
          verbose = 1, epochs = 30,
         callbacks = [EarlyStopping(monitor = 'val_loss', patience = 3)])

Train on 54718 samples, validate on 18240 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30


<tensorflow.python.keras.callbacks.History at 0x28c2bf786d8>

In [128]:
pred = model.predict(x_normal_test)
score1 = np.sqrt(metrics.mean_squared_error(pred, x_normal_test))
pred = model.predict(x_normal)
score2 = np.sqrt(metrics.mean_squared_error(pred, x_normal))
pred = model.predict(x_attack)
score3 = np.sqrt(metrics.mean_squared_error(pred, x_attack))

print(f"Insample Normal Score (RMSE): {score1}".format(score1))
print(f"Out of Sample Normal Score (RMSE): {score2}")
print(f"Attack Underway Score (RMSE): {score3}")

Insample Normal Score (RMSE): 0.5065291448357727
Out of Sample Normal Score (RMSE): 0.45399143728813646
Attack Underway Score (RMSE): 0.5344689181824828
