# Anomaly Detection System

### Library Imports

In [36]:
#data manipulation 
import pandas as pd
import numpy as np
import sklearn
import fastparquet

#data visualization
import matplotlib.pyplot as plt

#modeling
import tensorflow as tf

#statistical analysis
from scipy import stats

pd.set_option('display.max_columns', None)

## Read Datasets

In [37]:
# Load the UNSW-NB15 CSV files; the paths are relative to the current working directory.
unsw_testing_set = pd.read_csv('datasets/UNSW-NB15/UNSW_NB15_testing-set.csv') # testing set, used for evaluation further down
unsw_training_set = pd.read_csv('datasets/UNSW-NB15/UNSW_NB15_training-set.csv') # training set, shuffled and used to fit the model
unsw_features = pd.read_csv('datasets/UNSW-NB15/NUSW-NB15_features.csv', encoding='cp1252') # features file, decoded as cp1252; not referenced again in the visible cells

#unsw_df_1 = pd.read_csv('datasets/UNSW-NB15/UNSW-NB15_1.csv')
#unsw_df_2 = pd.read_csv('datasets/UNSW-NB15/UNSW-NB15_2.csv')
#unsw_df_3 = pd.read_csv('datasets/UNSW-NB15/UNSW-NB15_3.csv')
#unsw_df_4 = pd.read_csv('datasets/UNSW-NB15/UNSW-NB15_4.csv')

In [38]:
# Peek at the first three rows of the testing set to check the column layout.
unsw_testing_set.head(3)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,252,254,14158.94238,8495.365234,0,0,24.2956,8.375,30.177547,11.830604,255,621772692,2202533631,255,0.0,0.0,0.0,43,43,0,0,1,0,1,1,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,252,8395.112305,503571.3125,2,17,49.915,15.432865,61.426934,1387.77833,255,1417884146,3077387971,255,0.0,0.0,0.0,52,1106,0,0,43,1,1,1,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,252,1572.271851,60929.23047,1,6,231.875571,102.737203,17179.58686,11420.92623,255,2116150707,2963114973,255,0.111897,0.061458,0.050439,46,824,0,0,7,1,2,1,1,3,0,0,0,2,6,0,Normal,0


## Preprocess Data

In [39]:
# Shuffle the training rows into a random order and renumber the index from 0.
# A fixed random_state keeps the row order (and everything derived from it)
# repeatable across a Restart & Run All.
shuffled_data = unsw_training_set.sample(frac=1, random_state=42).reset_index(drop=True)

In [40]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): 'id' is still a column at this point, so rows that differ only
# in 'id' are not treated as duplicates - confirm this is intended.
shuffled_data = shuffled_data.loc[~shuffled_data.duplicated()]

In [41]:
# Split the shuffled frame into the binary target and the model inputs.
# Nulls are replaced with 0 by building new objects rather than filling in
# place, so the parent frame `shuffled_data` is never modified through a
# derived Series and the cell gives the same result when re-run.
y_shuffled_target = shuffled_data['label'].fillna(0)

# 'id', 'label' and 'proto' are removed from the inputs.
# NOTE(review): 'attack_cat' stays in the feature frame and is one-hot encoded
# later; because it names the attack category it very likely reveals the label.
# Confirm whether it should be dropped here (the encoder's column list and the
# test-set preparation would need the matching change).
x_shuffled_target = shuffled_data.drop(columns=['id', 'label', 'proto']).fillna(0)

In [42]:
# Show the first few target values to verify the labels after the split.
y_shuffled_target.head()

0    0
1    1
2    1
3    0
4    0
Name: label, dtype: int64

### Encode Categorical Features

In [43]:
import category_encoders as ce

# One-hot encode the string-valued columns. The fitted `encoder` is kept
# because the evaluation section reuses it to transform the test features.
categorical_columns = ['service', 'state', 'attack_cat']
encoder = ce.OneHotEncoder(cols=categorical_columns)
df_encoded = encoder.fit_transform(x_shuffled_target)

### Scale Data

In [44]:
from sklearn.preprocessing import StandardScaler

# Learn per-column mean/variance on the encoded training features, then
# standardize them.
# NOTE(review): the training cell further down reshapes `df_encoded.values`
# directly, so `X_scaled` is not consumed by any later visible cell - confirm
# whether the model was meant to be trained on the scaled data.
scaler = StandardScaler()
scaler.fit(df_encoded)
X_scaled = scaler.transform(df_encoded)

## Developing the Model

In [45]:
# Shape of one model input: (time steps, features per step).
# Every record is fed to the network as a single-step sequence (the training
# and test arrays are reshaped to (rows, 1, 69)), so the declared number of
# time steps must be 1 to match the data actually passed to the model.
sequence_length = 1 #number of time steps per input sequence
feature_dimentionality = 69 #number of features per time step (columns after one-hot encoding)

In [290]:
# Stacked-LSTM binary classifier built with the Keras Sequential API.
# The input shape is declared with an explicit Input object instead of the
# `input_shape=` layer argument, which Keras warns about for Sequential models.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(sequence_length, feature_dimentionality)),

    # Two recurrent layers that return the full sequence, each followed by
    # batch normalization.
    tf.keras.layers.LSTM(256, return_sequences=True),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.BatchNormalization(),

    # Last recurrent layer returns only its final output, so the dense head
    # receives one vector per sample.
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dropout(0.3),

    # Dense head ending in a single sigmoid unit (probability of label 1).
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

  super().__init__(**kwargs)


In [291]:
# Adam optimizer with a learning rate of 1e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [292]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss during fit/evaluate.
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [293]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [294]:
# (rows, columns) of the one-hot encoded training features.
df_encoded.shape

(82332, 69)

In [295]:
# Present every row as a one-step sequence: (samples, 1, features).
# The sample count is inferred (-1) and the feature count is read from the
# frame, so the cell keeps working if rows are dropped or the encoder emits a
# different number of columns.
x_data = df_encoded.values.reshape((-1, 1, df_encoded.shape[1]))

# `lstm` holds the object returned by fit() for the 10-epoch training run.
lstm = model.fit(x_data, y_shuffled_target, epochs=10, batch_size=32)


Epoch 1/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.7215 - loss: 0.6078
Epoch 2/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7518 - loss: 0.5090
Epoch 3/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7526 - loss: 0.4968
Epoch 4/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.7566 - loss: 0.4885
Epoch 5/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7592 - loss: 0.4822
Epoch 6/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7595 - loss: 0.4821
Epoch 7/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.7593 - loss: 0.4801
Epoch 8/10
[1m2573/2573[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.7606 - loss: 0.4760
Epoch 9/10
[1m2573/257

## Evaluation and Performance

In [296]:
# Shuffle the testing rows and renumber the index from 0.
test_shuffled_data = unsw_testing_set.sample(frac=1).reset_index(drop=True)

# Same target/feature split as for training; nulls become 0 without in-place
# mutation so the source frame is left untouched.
y_test = test_shuffled_data['label'].fillna(0)
x_test = test_shuffled_data.drop(columns=['id', 'label', 'proto']).fillna(0)

#encode categorical values in x test with the encoder fitted on the training data
x_test_ohe = encoder.transform(x_test)

#reshape x test to (samples, 1, features); sizes are derived from the data
#instead of being hard-coded
x_test_reshaped = x_test_ohe.values.reshape((-1, 1, x_test_ohe.shape[1]))


In [297]:
# (rows, columns) of the encoded test features; the column count should match the training frame.
x_test_ohe.shape

(175341, 69)

In [298]:
# Same shape check as the previous cell (duplicate cell).
x_test_ohe.shape

(175341, 69)

In [299]:
# First five rows of the (pre-encoding) training features.
x_shuffled_target.head()

Unnamed: 0,dur,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat
0,1.081198,-,FIN,10,8,2516,354,15.7233,254,252,16759.19,2293.751953,2,1,108.134444,139.788281,5899.10695,206.223656,255,1402870757,3955881499,255,0.237535,0.102677,0.134858,252,44,0,0,2,1,2,2,2,2,0,0,0,2,2,0,Normal
1,9e-06,-,INT,2,0,168,0,111111.1072,254,0,74666660.0,0.0,0,0,0.009,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,84,0,0,0,3,2,1,1,1,1,0,0,0,2,1,0,Reconnaissance
2,8e-06,dns,INT,2,0,114,0,125000.0003,254,0,57000000.0,0.0,0,0,0.008,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,57,0,0,0,2,2,3,2,2,2,0,0,0,2,2,0,Generic
3,0.6897,-,FIN,10,8,1612,354,24.648398,254,252,16830.51,3595.766357,2,1,76.633333,83.527711,4360.10806,142.322906,255,1449866213,2030364865,255,0.153697,0.063012,0.090685,161,44,0,0,6,1,2,2,1,6,0,0,0,2,6,0,Normal
4,0.490562,http,FIN,10,8,808,1112,34.654132,62,252,11872.1,15867.51563,2,2,54.506889,54.781285,2957.205464,80.540797,255,3922528077,153344301,255,0.165766,0.084503,0.081263,81,139,1,154,4,1,2,2,2,3,0,0,4,3,3,0,Normal


In [300]:
# First ten rows of the (pre-encoding) test features, for comparison with the training frame.
x_test.head(10)


Unnamed: 0,dur,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat
0,0.431684,ftp-data,FIN,8,8,364,852,34.74764,62,252,5911.732,13824.93,1,2,61.669145,59.198,4156.806659,125.171383,255,3138415674,4115457086,255,0.065162,0.014311,0.050851,46,107,0,0,1,1,2,1,1,2,0,0,0,2,1,0,Exploits
1,3e-06,dns,INT,2,0,114,0,333333.3215,254,0,152000000.0,0.0,0,0,0.003,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,57,0,0,0,14,2,12,12,12,14,0,0,0,12,14,0,Generic
2,8e-06,-,INT,2,0,1984,0,125000.0003,254,0,992000000.0,0.0,0,0,0.008,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,992,0,0,0,3,2,2,1,1,2,0,0,0,3,3,0,Fuzzers
3,0.001703,-,CON,4,4,544,304,4110.393314,31,29,1916618.0,1071051.0,0,0,0.44,0.206333,0.617304,0.284024,0,0,0,0,0.0,0.0,0.0,136,76,0,0,11,0,4,1,1,3,0,0,0,6,9,0,Normal
4,9e-06,-,INT,2,0,376,0,111111.1072,254,0,167111100.0,0.0,0,0,0.009,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,188,0,0,0,10,2,2,2,1,10,0,0,0,2,9,0,Fuzzers
5,0.00106,dns,CON,2,2,146,178,2830.188576,31,29,550943.4,671698.1,0,0,0.01,0.007,0.0,0.0,0,0,0,0,0.0,0.0,0.0,73,89,0,0,4,0,4,2,1,2,0,0,0,3,4,0,Normal
6,1.795238,http,FIN,12,24,894,21968,19.496022,62,252,3654.112,93817.09,2,9,163.117733,73.414174,10592.93665,6806.344272,255,560157143,1274455209,255,0.225018,0.106711,0.118307,75,915,1,10240,3,1,1,4,1,2,0,0,1,4,1,0,Exploits
7,0.590904,http,FIN,10,6,884,268,25.384834,254,252,10776.71,3032.642,2,1,62.649333,92.967,3181.487278,135.187266,255,3530067858,124632946,255,0.205579,0.126068,0.079511,88,45,1,0,2,1,1,1,1,1,0,0,1,1,1,0,Reconnaissance
8,5e-06,dns,INT,2,0,114,0,200000.0051,254,0,91200000.0,0.0,0,0,0.005,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,57,0,0,0,22,2,9,9,9,22,0,0,0,9,22,0,Generic
9,6e-06,dns,INT,2,0,114,0,166666.6608,254,0,76000000.0,0.0,0,0,0.006,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,57,0,0,0,40,2,34,34,16,40,0,0,0,34,40,0,Generic


In [301]:
# Score the trained model on the held-out test sequences; evaluate() yields the
# loss followed by the compiled accuracy metric.
test_metrics = model.evaluate(x_test_reshaped, y_test)
loss, accuracy = test_metrics

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m5480/5480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.8042 - loss: 0.3958
Test Loss: 0.3965193033218384
Test Accuracy: 80.36%


In [302]:
# Sigmoid outputs for every test sample, one column per sample row.
y_probability = model.predict(x_test_reshaped)

# Probabilities strictly above 0.5 become class 1, everything else class 0.
y_pred = np.greater(y_probability, 0.5).astype("int32")

# 1-D copy of the predictions for the sklearn metric functions.
y_flat = y_pred.flatten()

[1m5480/5480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step


In [303]:
# Show the flattened 0/1 predictions (NumPy abbreviates the long array).
print(y_flat)

[1 1 1 ... 1 1 0]


In [304]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [305]:
# Show the true test labels for a side-by-side look with the predictions.
print(y_test)

0         1
1         1
2         1
3         0
4         1
         ..
175336    1
175337    0
175338    1
175339    1
175340    1
Name: label, Length: 175341, dtype: int64


In [306]:
# Show the thresholded predictions in their original column-vector layout.
print(y_pred)


[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [0]]


In [307]:
# Precision, recall and F1 for the positive class (label 1), computed from the
# true test labels and the flattened 0/1 predictions.
precision, recall, f1s = (
    metric(y_test, y_flat)
    for metric in (precision_score, recall_score, f1_score)
)

In [308]:
# One value per line, in the order: precision, recall, F1.
print(precision, recall, f1s, sep="\n")

0.9233646164740144
0.7757853545721923
0.8431660048814251
