# KDDCup99 10%Data Evaluation
- Import the KDDCup99 10% data set from the network and check anomaly-detection performance.
- To run this notebook you need Python 3.6 or later, tensorflow, pandas, numpy and sklearn.

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from dagmm_v2 import DAGMM

2023-02-21 23:39:17.958224: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data Import

In [2]:
url_base = "http://kdd.ics.uci.edu/databases/kddcup99"


def resolve_kdd_source(filename):
    """Return where to read `filename` from.

    A copy in the current working directory takes precedence (keeps offline
    runs working); otherwise the file is addressed under `url_base`, so the
    notebook can fetch it from the network.
    """
    return filename if os.path.exists(filename) else f"{url_base}/{filename}"


# KDDCup 10% Data
url_data = resolve_kdd_source("kddcup.data_10_percent.gz")
# info data (column names, col types)
url_info = resolve_kdd_source("kddcup.names")

In [3]:
# Load the column metadata: one "<column name>:<type>" record per line,
# skipping the first line of the file.
df_info = pd.read_csv(
    url_info,
    sep=":",
    skiprows=1,
    index_col=False,
    names=["colname", "type"],
)
# Keep the feature names as an array and display them.
colnames = df_info["colname"].values
colnames

array(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'], dtype=object)

In [4]:
# Map each column's declared type to a pandas dtype name:
# "continuous" columns are read as float, everything else as str.
coltypes = np.where(df_info["type"].str.contains("continuous"), "float", "str")
# Rebuild the name list from df_info instead of appending to `colnames` in
# place, so re-running this cell never adds the "status" label column twice.
colnames = np.append(df_info["colname"].values, ["status"])

In [5]:
# The trailing "status" label column is read as a string.
# NOTE: re-running this cell appends another "str"; zip() below stops at the
# shorter of the two sequences, so the dtype mapping is unaffected.
coltypes = np.append(coltypes, ["str"])

# Read the connection records, naming and typing every column explicitly.
column_dtypes = {name: kind for name, kind in zip(colnames, coltypes)}
df = pd.read_csv(
    url_data,
    names=colnames,
    index_col=False,
    dtype=column_dtypes,
)
# Show which labels occur in the data.
df["status"].unique()

array(['normal.', 'buffer_overflow.', 'loadmodule.', 'perl.', 'neptune.',
       'smurf.', 'guess_passwd.', 'pod.', 'teardrop.', 'portsweep.',
       'ipsweep.', 'land.', 'ftp_write.', 'back.', 'imap.', 'satan.',
       'phf.', 'nmap.', 'multihop.', 'warezmaster.', 'warezclient.',
       'spy.', 'rootkit.'], dtype=object)

In [6]:
# One-hot encode the string-typed feature columns; the last column ("status")
# is the label and is left out of the feature matrix.
# dtype=float keeps the result a homogeneous float matrix: newer pandas emits
# bool dummy columns by default, and mixing those with the float features
# would make `.values` return an object array.
X = pd.get_dummies(df.iloc[:, :-1], dtype=float).values



In [7]:
# Create target flag.
# A record is flagged as an anomaly (1) when its status is "normal.";
# every other status is flagged as not an anomaly (0).
y = np.where(df["status"].eq("normal."), 1, 0)

In [8]:
# Preview the loaded frame (the rendered output shows only the first and last rows).
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,status
0,0.0,tcp,http,SF,181.0,5450.0,0,0.0,0.0,0.0,...,9.0,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal.
1,0.0,tcp,http,SF,239.0,486.0,0,0.0,0.0,0.0,...,19.0,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
2,0.0,tcp,http,SF,235.0,1337.0,0,0.0,0.0,0.0,...,29.0,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0.0,tcp,http,SF,219.0,1337.0,0,0.0,0.0,0.0,...,39.0,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
4,0.0,tcp,http,SF,217.0,2032.0,0,0.0,0.0,0.0,...,49.0,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0.0,tcp,http,SF,310.0,1881.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494017,0.0,tcp,http,SF,282.0,2286.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494018,0.0,tcp,http,SF,203.0,1200.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494019,0.0,tcp,http,SF,291.0,1200.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [9]:
# Split data: hold out half of the records for testing, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.50, random_state=123
)
# Train only on the records whose flag is 0 (not labelled as anomalies).
is_non_anomaly = y_train == 0
X_train = X_train[is_non_anomaly]
y_train = y_train[is_non_anomaly]

## Fit Data to DAGMM Model
The following points differ from the original paper:
- $\lambda_2$ is set to 0.0001 (paper: 0.005)
- Add small value($10^{-6}$) to diagonal elements of GMM covariance (paper: no additional value)

A standard scaler is applied to the input data (the default behaviour of this DAGMM implementation).

In [10]:
# Configure the DAGMM model.
# NOTE(review): no lambda weights are passed, so dagmm_v2's defaults apply —
# confirm they match the lambda_2 = 0.0001 stated in this notebook's markdown.
model = DAGMM(
    # Compression-network settings (comp_* arguments).
    comp_hiddens=[60, 30, 10, 1],
    comp_activation=tf.nn.tanh,
    # Estimation-network settings (est_* arguments).
    est_hiddens=[10, 4],
    est_activation=tf.nn.tanh,
    est_dropout_ratio=0.5,
    # Optimisation settings; the seed is fixed for reproducibility.
    learning_rate=0.0001,
    epoch_size=200,
    minibatch_size=1024,
    random_seed=1111,
)

In [11]:
# Fit the model on the training records that were kept after filtering (flag == 0).
# NOTE(review): in the saved run this call raised a TypeError (a keras Softmax
# layer object could not be converted to a Tensor) and the cells below have no
# execution count. The failure presumably originates inside dagmm_v2 — fix it
# there and re-run before trusting anything downstream.
model.fit(X_train)

  z = tf.compat.v1.layers.dense(z, size, activation=self.activation,
  z = tf.compat.v1.layers.dense(z, self.hidden_layer_sizes[-1],
  z = tf.compat.v1.layers.dense(z, size, activation=self.activation,
  x_dash = tf.compat.v1.layers.dense(z, self.input_size,
  z = tf.compat.v1.layers.dense(z, size, activation=self.activation,
  z = tf.compat.v1.layers.dropout(z, dropout_ratio,
  logits = tf.compat.v1.layers.dense(z, size, activation=None, name="logits")


TypeError: Failed to convert elements of <keras.layers.activation.softmax.Softmax object at 0x7f6bd45f47f0> to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.

## Apply model to test data

In [None]:
# Score the held-out records. The following cells treat the returned values
# as per-sample energies, where larger means more anomalous — confirm against dagmm_v2.
y_pred = model.predict(X_test)

In [None]:
# Energy threshold to detect anomalies = 80th percentile of the test-set energies.
anomaly_energy_threshold = np.percentile(y_pred, 80)
print(f"Energy threshold to detect anomaly : {anomaly_energy_threshold:.3f}")

In [None]:
# Flag test records whose energy reaches the threshold:
# 1 = detected as anomaly, 0 = not detected.
reaches_threshold = y_pred >= anomaly_energy_threshold
y_pred_flag = np.where(reaches_threshold, 1, 0)

In [None]:
# Compare the detected flags with the true flags (binary averaging) and
# report the three scores; the fourth return value is discarded.
prec, recall, fscore, _ = precision_recall_fscore_support(
    y_test, y_pred_flag, average="binary"
)
print(
    f" Precision = {prec:.3f}",
    f" Recall    = {recall:.3f}",
    f" F1-Score  = {fscore:.3f}",
    sep="\n",
)