In [2]:
#REQUIRED LIBRARIES
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten
from tensorflow.keras.layers import CuDNNGRU,GRU
from sklearn.preprocessing import StandardScaler
import pandas as pd
from time import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from numpy import newaxis
from collections import Counter
from imblearn.under_sampling import TomekLinks
import os
import random
import lightgbm as gbm
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score

In [3]:
#REPRODUCIBLE RESULT
# One seed value is pushed into every random source this notebook touches:
# the hash-seed environment variable, the stdlib RNG, NumPy and TensorFlow.
seed_value = 2020
os.environ['PYTHONHASHSEED'] = str(seed_value)
for _seed_fn in (random.seed, np.random.seed, tf.set_random_seed):
    _seed_fn(seed_value)
del _seed_fn

In [4]:
#DATASET LOADING
# Read the numeric training and testing CSV files.
training = pd.read_csv("UNSW_NB15_total_numeric_training.csv")
testing = pd.read_csv("UNSW_NB15_total_numeric_testing.csv")

# Three candidate feature subsets. Only THRES is applied in this cell;
# WRAPPER and FILTER are defined but not referenced by the visible cells.
WRAPPER = [
    "service", "state", "sbytes", "dbytes", "sttl", "dttl", "sload", "sloss", "dloss",
    "smean", "dmean", "response_body_len", "ct_srv_src", "ct_dst_sport_ltm",
    "ct_src_dport_ltm", "ct_dst_src_ltm", "is_ftp_login", "ct_ftp_cmd", "ct_srv_dst",
]

FILTER = ["service", "sbytes", "sttl", "smean", "ct_dst_sport_ltm", "ct_src_dport_ltm", "proto"]

THRES = [
    'smean', 'sbytes', 'dmean', 'proto', 'ct_srv_dst', 'dbytes', 'ct_dst_src_ltm',
    'service', 'sload', 'response_body_len', 'dur', 'sttl', 'ct_srv_src', 'synack',
    'ct_src_ltm', 'sjit', 'sloss', 'dload', 'stcpb', 'djit', 'ackdat', 'dpkts',
    'tcprtt', 'dtcpb',
]

# Identifier and target columns are removed before the feature subset is picked.
_NON_FEATURE_COLS = ["id", "label", "attack_cat"]

# Features: the THRES columns. Target: attack_cat, kept as a one-column DataFrame.
x_tr = training.drop(columns=_NON_FEATURE_COLS)[THRES]
y_tr = training[["attack_cat"]]

x_ts = testing.drop(columns=_NON_FEATURE_COLS)[THRES]
y_ts = testing[["attack_cat"]]

In [None]:
#DATASET DIVIDING FOR TESTING
# Stratified 50/50 split of the training data into two parts (p1 and p2),
# with a fixed random_state so the split is repeatable.
_halves = train_test_split(
    x_tr,
    y_tr,
    test_size=0.5,
    random_state=34,
    stratify=y_tr,
)
x_tr_p1, x_tr_p2, y_tr_p1, y_tr_p2 = _halves

In [None]:
#LIGHTGBM FEATURE SELECTION PROCESS
# Train a multiclass LightGBM model on the first half of the training data;
# its per-feature importances are read by later cells to rank the features.
start = time()

# Take feature names from the frame actually being trained on. A hard-coded
# name list must have exactly one entry per column, so it stops matching as
# soon as x_tr is restricted to a subset such as THRES.
lgb_feature_names = list(x_tr_p1.columns)

# Columns to treat as categorical, limited to those present in the frame.
lgb_categorical = [
    col
    for col in ['service', 'state', 'sttl', 'dttl', 'is_ftp_login', 'ct_ftp_cmd']
    if col in lgb_feature_names
]

lgb_train = gbm.Dataset(x_tr_p1, y_tr_p1,
                        feature_name=lgb_feature_names,
                        categorical_feature=lgb_categorical)

# NOTE(review): the validation set is built from the same rows as the training
# set, so early stopping is driven by training loss -- confirm this is intended.
lgb_eval = gbm.Dataset(x_tr_p1, y_tr_p1, reference=lgb_train)
params = {
    'task': 'train',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 10,
    'learning_rate': 0.03,
    'verbose': 0,
    'tree_learner': 'voting',
}
evals = {}
clf = gbm.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval,
                evals_result=evals, early_stopping_rounds=3)
print("Training %.2f seconds:" % ((time() - start)))

In [None]:
%matplotlib inline
gbm.plot_importance(clf,figsize=(5, 5))

In [None]:
# Rank feature names by the booster's importance, highest first.
imp = clf.feature_importance()
name = clf.feature_name()
# argsort is ascending, so build the list in that order and flip it afterwards.
feature_sorted = [name[idx] for idx in np.argsort(imp)]
feature_sorted.reverse()
# f_set is the same list object as feature_sorted, not a copy.
f_set = feature_sorted

In [None]:
# Backward elimination over the ranked features: train on the current column
# set, score it, then drop the least important remaining feature and repeat.
# acc_array[i] holds the score obtained with (len(f_set) - i) features.

# The hyper-parameters never change between rounds, so build them once.
params = {
    'task': 'train',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 10,
    'learning_rate': 0.03,
    'verbose': 0,
    'tree_learner': 'voting',
}

acc_array = []
# Work on copies so that f_set and x_tr_p2 survive this cell and it can be re-run.
remaining_features = list(f_set)   # most important first
x_subset = x_tr_p2
while remaining_features:
    d_train = gbm.Dataset(x_subset, label=y_tr_p2)
    #training the model
    clf_2 = gbm.train(params, d_train, 50)
    # NOTE(review): predictions are scored on the same rows the model was
    # trained on -- confirm that a held-out score is not wanted here.
    y_pred = np.argmax(clf_2.predict(x_subset), axis=1)
    # precision_score takes (y_true, y_pred); with the arguments swapped the
    # per-class values are recall rather than precision.
    acc_score = precision_score(y_tr_p2, y_pred, average=None).mean()
    acc_array.append(acc_score)
    print ("Top ",len(remaining_features)," feature accuracy result:",acc_score)
    # Drop the least important remaining feature before the next round.
    x_subset = x_subset.drop([remaining_features.pop()], axis=1)

In [None]:
# acc_array[i] was measured with (len(acc_array) - i) features, so derive the
# feature count from the list length rather than a hard-coded total.
# Assumes acc_array is still in the order the elimination loop produced.
print ("Max accuracy",np.max(acc_array),"with",len(acc_array)-np.argmax(acc_array),"features")

In [None]:
import matplotlib.pyplot as plt
# acc_array runs from "all features" down to "one feature". Plot a reversed
# copy so the shared list is left untouched and re-running the cell cannot
# flip the curve.
acc_by_feature_count = acc_array[::-1]
# Start the x-axis at 1 so each point sits at its actual number of features.
plt.plot(range(1, len(acc_by_feature_count) + 1), acc_by_feature_count)
plt.ylabel('Accuracy')
plt.xlabel('Number of Features')
plt.show()

In [None]:
# Rebuild the importance-ordered feature list (highest importance first)
# from the booster trained earlier.
imp = clf.feature_importance()
name = clf.feature_name()
# Ascending argsort, walked backwards, gives descending importance.
_descending = np.argsort(imp)[::-1]
feature_sorted = [name[idx] for idx in _descending]
# f_set aliases feature_sorted (same list object).
f_set = feature_sorted

In [None]:
# Inputs for the recurrent model. All columns of x_tr / x_ts are used; an
# optional restriction to the top-ranked features (f_set[:23]) is left disabled.
x_tr_rnn = x_tr
x_ts_rnn = x_ts
y_tr_rnn = y_tr

# Class labels handed to TomekLinks as its sampling_strategy.
u_list = [0.0,5.0,6.0,7.0]
# random_state is no longer passed: it was deprecated for TomekLinks in
# imbalanced-learn 0.4 and removed in 0.6, where passing it raises a TypeError.
t = TomekLinks(sampling_strategy=u_list)
x_tr_rnn, y_tr_rnn = t.fit_resample(x_tr_rnn, y_tr_rnn)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to both the
# training and the test features.
scaler = StandardScaler().fit(x_tr_rnn)
x_tr_rnn = scaler.transform(x_tr_rnn)
x_ts_rnn = scaler.transform(x_ts_rnn)

In [None]:
#PREPROCESSING FOR RECURRENT NETWORK
# Features as float32 with a trailing axis of length 1: (rows, features, 1).
x = np.array(x_tr_rnn, dtype=np.float32)
x = x[:, :, newaxis]

# Labels as a flat int32 vector. y_tr_rnn may arrive as a one-column frame or
# (n, 1) array; without ravel() the fancy index below would broadcast
# arange(n) against an (n, 1) array and address an (n, n) grid instead of one
# cell per row.
y = np.array(y_tr_rnn, dtype=np.int32).ravel()

# Width of the one-hot label matrix (one column per class).
max_features = 10
y2 = np.zeros((y.shape[0], max_features), dtype=np.float32)
# One-hot encode: row i gets a 1.0 in column y[i].
y2[np.arange(y.shape[0]), y] = 1.0

In [None]:
# Test features as float32 with a trailing length-1 axis, matching the layout
# given to the training features.
_x_ts_f32 = np.array(x_ts_rnn, dtype=np.float32)
x_ts_rnn = _x_ts_f32[:, :, newaxis]

In [None]:
#GRUGBM_IDS
# NOTE(review): these calls re-seed NumPy and TensorFlow with 1 and 2,
# overriding the notebook-wide seed set in the seed cell -- confirm which
# value is intended.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
from tensorflow.keras import regularizers

# A single GRU layer with 120 units feeding a 10-way output layer.
model = Sequential()
model.add(CuDNNGRU(120, input_shape=(None, 1)))
# Softmax, not sigmoid: the targets are one-hot (exactly one class per row)
# and categorical_crossentropy expects the outputs to form a probability
# distribution over the classes.
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'],)
#TRAINING
start = time()
model.fit(x, y2, epochs=10)
print("Training %.2f seconds:" % ((time() - start)))

In [None]:
#TESTING DATA WITH GRU
# Per-class scores for every test row; the predicted class is the
# highest-scoring column.
pred_new = model.predict(x_ts_rnn)
pred_new_clas = pred_new.argmax(axis=1)

# Overall accuracy, confusion matrix and per-class report against y_ts.
acc = accuracy_score(y_ts, pred_new_clas)
conf = confusion_matrix(y_ts, pred_new_clas)
for _result in (acc, conf):
    print (_result)
print(classification_report(y_ts, pred_new_clas))

In [6]:
#DECISIONTREE / threshold
from sklearn.tree import DecisionTreeClassifier

# Decision tree with balanced class weights, trained on the unscaled features.
x_tr_dt, x_ts_dt = x_tr, x_ts
clf3 = DecisionTreeClassifier(class_weight='balanced', random_state=42)
clf3.fit(x_tr_dt, y_tr)

# Score the test set: accuracy, confusion matrix, per-class report.
dt_pred = clf3.predict(x_ts_dt)
for _metric in (accuracy_score, confusion_matrix):
    print (_metric(y_ts, dt_pred))
print(classification_report(y_ts, dt_pred))

0.725477335665
[[28205   555     8   151   797  7067    37    27   152     1]
 [   96    44   469    12    28     0    16    12     0     0]
 [   83    46   411    13     9     2     1    12     6     0]
 [  119   134  2628   557   488    58    29    45    30     1]
 [  382   194  2773   540  6521   228   158   247    79    10]
 [ 1713    92   966   158   408  2513    21    51   140     0]
 [    7    13    34    71   243    44 18434     4    14     7]
 [   14    12   326    33   267    18     7  2787    31     1]
 [   15     0     7    18    51    46     4     3   234     0]
 [    1     0     1     1    16     0     1     0     0    24]]
              precision    recall  f1-score   support

           0       0.92      0.76      0.83     37000
           1       0.04      0.06      0.05       677
           2       0.05      0.70      0.10       583
           3       0.36      0.14      0.20      4089
           4       0.74      0.59      0.65     11132
           5       0.25      

In [7]:
#KNN threshold
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbour baseline on standardized features.
clf_knn = KNeighborsClassifier(n_neighbors=5)
x_tr_knn = x_tr
x_ts_knn = x_ts

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler()
x_tr_knn = scaler.fit_transform(x_tr_knn)
x_ts_knn = scaler.transform(x_ts_knn)

# y_tr is a one-column DataFrame; pass the labels as a flat 1-D array so fit()
# does not emit the "column-vector y was passed" data-conversion warning.
clf_knn.fit(x_tr_knn, y_tr.values.ravel())
knn_pred = clf_knn.predict(x_ts_knn)
print (accuracy_score(y_ts,knn_pred))
print (confusion_matrix(y_ts,knn_pred))
print(classification_report(y_ts,knn_pred))

  # Remove the CWD from sys.path while we load stuff.


0.709505417092
[[27043   349     8   212  1353  7231     8   759    36     1]
 [   54    81    93   207   230    10     0     2     0     0]
 [   23    71    34   194   224    14     0    22     1     0]
 [  128   496   852   858  1441   162    16   119    17     0]
 [  579   557   810   990  7201   522     9   429    35     0]
 [ 1704   184   159   432   692  2737     3   143     8     0]
 [   41     3     2   103   382   106 18185    36    13     0]
 [  234    55   110    84   465   360     2  2174    12     0]
 [   35     0     0     8    33    70     2   130   100     0]
 [    3     0     0     1    28     5     0     5     0     2]]
              precision    recall  f1-score   support

           0       0.91      0.73      0.81     37000
           1       0.05      0.12      0.07       677
           2       0.02      0.06      0.03       583
           3       0.28      0.21      0.24      4089
           4       0.60      0.65      0.62     11132
           5       0.24      

In [8]:
#SVM threshold
from sklearn import svm

# Support-vector classifier baseline on standardized features.
clf_svm = svm.SVC(random_state=42)

# Fit the scaler on the training features only and reuse it for the test set.
scaler = StandardScaler()
x_tr_svm = scaler.fit_transform(x_tr)
x_ts_svm = scaler.transform(x_ts)

# y_tr is a one-column DataFrame; pass the labels as a flat 1-D array so fit()
# does not emit the "column-vector y was passed" data-conversion warning.
clf_svm.fit(x_tr_svm, y_tr.values.ravel())
svm_pred = clf_svm.predict(x_ts_svm)
print (accuracy_score(y_ts,svm_pred))
print (confusion_matrix(y_ts,svm_pred))
print(classification_report(y_ts,svm_pred))

  return f(**kwargs)


0.695719768741
[[23067    16     0    31  1824 10490     1  1565     6     0]
 [   56     2     0     0   592    19     0     8     0     0]
 [    3     0     0     0   515    25     0    40     0     0]
 [   51     0     0    55  3506   200    28   243     6     0]
 [  268     0     0    17  9825   626    12   381     3     0]
 [  527     0     0     0  1517  3748     1   268     1     0]
 [   13     0     0     2   475   170 18164    47     0     0]
 [   16     0     0     0   795   259    27  2399     0     0]
 [    3     0     0     0    34    93     0   233    15     0]
 [    1     0     0     0    30     7     0     1     0     5]]
              precision    recall  f1-score   support

           0       0.96      0.62      0.76     37000
           1       0.11      0.00      0.01       677
           2       0.00      0.00      0.00       583
           3       0.52      0.01      0.03      4089
           4       0.51      0.88      0.65     11132
           5       0.24      

  _warn_prf(average, modifier, msg_start, len(result))
