### Prepare data

In [19]:
import pandas as pd

df = pd.read_csv('train_data_v1.csv')

columns = ['flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'fwd_pkt_len_std', 'fwd_seg_size_avg', 'flow_iat_mean', 'flow_iat_max',
       'flow_iat_std', 'fwd_iat_tot', 'fwd_iat_max', 'fwd_iat_mean',
       'fwd_iat_std', 'bwd_iat_tot', 'bwd_iat_max', 'bwd_iat_std',
       'fin_flag_cnt', 'rst_flag_cnt', 'init_fwd_win_byts', 'win_byts_tot',
       'fwd_win_tot', 'fwd_win_mean', 'win_byts_std', 'fwd_win_std',
       'fwd_win_max', 'win_byts_min', 'fwd_win_min', 'zero_win_cnt',
       'idle_max', 'idle_mean', 'idle_std']

data_x = df[columns]
data_y = df['label']

### Data Split

In [20]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2)

### Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

DCmodel = DecisionTreeClassifier()
DCmodel.fit(x_train, y_train)
y_predict_DC = DCmodel.predict(x_test)
print('accuracy : '+ str(metrics.accuracy_score(y_test, y_predict_DC)))

accuracy : 0.9989646501996746


### RandomForest

In [22]:
from sklearn.ensemble import RandomForestClassifier

RFmodel = RandomForestClassifier(criterion='entropy')
RFmodel.fit(x_train, y_train)
y_predict_RF = RFmodel.predict(x_test)
print('accuracy : '+ str(metrics.accuracy_score(y_test, y_predict_RF)))

accuracy : 0.9994083715426711


### KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier

best_i = 1
best_acc = 0
for i in  range(1, 49, 2):
    KNNmodel = KNeighborsClassifier(n_neighbors=i)
    KNNmodel.fit(x_train, y_train)
    y_predict_KNN = KNNmodel.predict(x_test)
    score = metrics.accuracy_score(y_test, y_predict_KNN)
    if score > best_acc:
        best_acc = score
        best_i = i
    print('K = ' + str(i) + '=> accuracy : '+ str(score))
print('\nBest K = ' + str(best_i) + ',accuracy = ' + str(best_acc))
KNNmodel = KNeighborsClassifier(n_neighbors=best_i)
KNNmodel.fit(x_train, y_train)
y_predict_KNN = KNNmodel.predict(x_test)

K = 1=> accuracy : 0.9986688359710102
K = 3=> accuracy : 0.9987181383424543
K = 5=> accuracy : 0.9983730217423458
K = 7=> accuracy : 0.9976827885421289
K = 9=> accuracy : 0.9973869743134645
K = 11=> accuracy : 0.9971404624562441
K = 13=> accuracy : 0.9966967411132476
K = 15=> accuracy : 0.9963516245131391
K = 17=> accuracy : 0.9962037173988069
K = 19=> accuracy : 0.9959079031701424
K = 21=> accuracy : 0.995562786570034
K = 23=> accuracy : 0.9953162747128137
K = 25=> accuracy : 0.9951190652270374
K = 27=> accuracy : 0.9952669723413696
K = 29=> accuracy : 0.9949711581127052
K = 31=> accuracy : 0.9949218557412611
K = 33=> accuracy : 0.9944288320268205
K = 35=> accuracy : 0.9941823201696002
K = 37=> accuracy : 0.994034413055268
K = 39=> accuracy : 0.9936892964551595
K = 41=> accuracy : 0.993294877483607
K = 43=> accuracy : 0.9927032490262782
K = 45=> accuracy : 0.9925553419119459
K = 47=> accuracy : 0.9923088300547256

Best K = 3,accuracy = 0.9987181383424543


### XGBoost

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

XGBoostmodel = GradientBoostingClassifier()
XGBoostmodel.fit(x_train, y_train)
y_predict_XGBoost = XGBoostmodel.predict(x_test)
print('accuracy : '+ str(metrics.accuracy_score(y_test, y_predict_XGBoost)))

accuracy : 0.9991618596854509


### Naive Bayes

In [25]:
from sklearn.naive_bayes import GaussianNB 

NBmodel = GaussianNB()
NBmodel.fit(x_train, y_train)
y_predict_NB = NBmodel.predict(x_test)
print('accuracy : '+ str(metrics.accuracy_score(y_test, y_predict_NB)))

accuracy : 0.9561701917862249


### Precision and Recall

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import numpy as np

def print_precision_recall(cm):
    FP = cm.sum(axis=0) - np.diag(cm)  
    FN = cm.sum(axis=1) - np.diag(cm)
    TP = np.diag(cm)
    TN = cm.sum() - (FP + FN + TP)
    TPR = TP/(TP+FN)
    FPR = FP/(FP+TN)
    precision = TP / (TP+FP)
    recall = TP / (TP+FN)
    f_measure = 2*(recall*precision)/(recall+precision)
    print('precision = ' + str(precision))
    print('recall = ' + str(recall))
    print('f1-score = ' + str(f_measure))
    print('True positive rate : ' + str(TPR))
    print('False positive rate : ' + str(FPR))

cm_DC = confusion_matrix(y_test, y_predict_DC)
print('DecisionTreeClassifier')
print_precision_recall(cm_DC)
print('\n')

cm_RF = confusion_matrix(y_test, y_predict_RF)
print('RandomForestClassifier')
print_precision_recall(cm_RF)
print('\n')

cm_KNN = confusion_matrix(y_test, y_predict_KNN)
print('KNeighborsClassifier')
print_precision_recall(cm_KNN)
print('\n')

cm_XGBoost = confusion_matrix(y_test, y_predict_XGBoost)
print('XGBoostClassifier')
print_precision_recall(cm_XGBoost)
print('\n')

cm_NB = confusion_matrix(y_test, y_predict_NB)
print('NaivebayesClassifier')
print_precision_recall(cm_NB)
print('\n')


DecisionTreeClassifier
precision = [0.99991006 0.99809614 0.99758162]
recall = [0.99982014 0.99714693 0.99858786]
f1-score = [0.9998651  0.99762131 0.99808448]
True positive rate : [0.99982014 0.99714693 0.99858786]
False positive rate : [0.00010913 0.00049761 0.00078298]


RandomForestClassifier
precision = [0.99991008 0.99881094 0.99878959]
recall = [1.         0.99857347 0.99878959]
f1-score = [0.99995504 0.99869219 0.99878959]
True positive rate : [1.         0.99857347 0.99878959]
False positive rate : [0.00010913 0.000311   0.00039149]


KNeighborsClassifier
precision = [0.99964038 0.99833214 0.99697763]
recall = [0.99991007 0.99619591 0.99818439]
f1-score = [0.99977521 0.99726288 0.99758065]
True positive rate : [0.99991007 0.99619591 0.99818439]
False positive rate : [0.00043654 0.0004354  0.00097873]


XGBoostClassifier
precision = [0.99991008 0.99809796 0.99838579]
recall = [1.         0.99809796 0.99818439]
f1-score = [0.99995504 0.99809796 0.99828508]
True positive rate : [

### 模型導出

In [27]:
import pickle

with open("DCmodel.pickle", 'wb') as f:
    pickle.dump(DCmodel, f)

with open("RFmodel.pickle", "wb") as f:
    pickle.dump(RFmodel, f)

with open("KNNmodel.pickle", "wb") as f:
    pickle.dump(KNNmodel, f)

with open("XGBoostmodel.pickle", "wb") as f:
    pickle.dump(XGBoostmodel, f)

with open("NBmodel.pickle", "wb") as f:
    pickle.dump(NBmodel, f)

### Normal test

In [29]:
import pickle
with open("RFmodel.pickle", "rb") as f:
    model = pickle.load(f)
test = pd.read_csv("./model_data/testdata/normal_simulate_test.csv")

columns = ['flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'fwd_pkt_len_std', 'fwd_seg_size_avg', 'flow_iat_mean', 'flow_iat_max',
       'flow_iat_std', 'fwd_iat_tot', 'fwd_iat_max', 'fwd_iat_mean',
       'fwd_iat_std', 'bwd_iat_tot', 'bwd_iat_max', 'bwd_iat_std',
       'fin_flag_cnt', 'rst_flag_cnt', 'init_fwd_win_byts', 'win_byts_tot',
       'fwd_win_tot', 'fwd_win_mean', 'win_byts_std', 'fwd_win_std',
       'fwd_win_max', 'win_byts_min', 'fwd_win_min', 'zero_win_cnt',
       'idle_max', 'idle_mean', 'idle_std']

test_x = test[columns]

num = 0
benign = 0
read = 0
loris = 0
for i in model.predict(test_x):
    num += 1
    if(i == 'benign'):benign += 1
    elif(i == 'slowread'):read += 1
    elif(i == 'slowloris'):loris += 1

print("normal_test.csv")
print("----------------------------------")
print("Total : " + str(num))
print("predicted benign : " + str(benign))
print("predicted slowread : " + str(read))
print("predicted slowloris : " + str(loris))
print('benign accuracy : ' + str(benign/num))

normal_test.csv
----------------------------------
Total : 10103
predicted benign : 10103
predicted slowread : 0
predicted slowloris : 0
benign accuracy : 1.0


### Slowloris test

In [30]:
import pickle
with open("RFmodel.pickle", "rb") as f:
    model = pickle.load(f)
test = pd.read_csv("./model_data/testdata/slowloris_test.csv")

columns = ['flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'fwd_pkt_len_std', 'fwd_seg_size_avg', 'flow_iat_mean', 'flow_iat_max',
       'flow_iat_std', 'fwd_iat_tot', 'fwd_iat_max', 'fwd_iat_mean',
       'fwd_iat_std', 'bwd_iat_tot', 'bwd_iat_max', 'bwd_iat_std',
       'fin_flag_cnt', 'rst_flag_cnt', 'init_fwd_win_byts', 'win_byts_tot',
       'fwd_win_tot', 'fwd_win_mean', 'win_byts_std', 'fwd_win_std',
       'fwd_win_max', 'win_byts_min', 'fwd_win_min', 'zero_win_cnt',
       'idle_max', 'idle_mean', 'idle_std']

test_x = test[columns]

num = 0
benign = 0
read = 0
loris = 0
for i in model.predict(test_x):
    num += 1
    if(i == 'benign'):benign += 1
    elif(i == 'slowread'):read += 1
    elif(i == 'slowloris'):loris += 1

print("slowloris_test.csv")
print("----------------------------------")
print("Total : " + str(num))
print("predicted benign : " + str(benign))
print("predicted slowread : " + str(read))
print("predicted slowloris : " + str(loris))
print('slowloris accuracy : ' + str(loris/num))

slowloris_test.csv
----------------------------------
Total : 5901
predicted benign : 0
predicted slowread : 8
predicted slowloris : 5893
slowloris accuracy : 0.9986442975766819


### Slowread test

In [31]:
import pickle
with open("RFmodel.pickle", "rb") as f:
    model = pickle.load(f)
test = pd.read_csv("./model_data/testdata/slowread_test.csv")

columns = ['flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'fwd_pkt_len_std', 'fwd_seg_size_avg', 'flow_iat_mean', 'flow_iat_max',
       'flow_iat_std', 'fwd_iat_tot', 'fwd_iat_max', 'fwd_iat_mean',
       'fwd_iat_std', 'bwd_iat_tot', 'bwd_iat_max', 'bwd_iat_std',
       'fin_flag_cnt', 'rst_flag_cnt', 'init_fwd_win_byts', 'win_byts_tot',
       'fwd_win_tot', 'fwd_win_mean', 'win_byts_std', 'fwd_win_std',
       'fwd_win_max', 'win_byts_min', 'fwd_win_min', 'zero_win_cnt',
       'idle_max', 'idle_mean', 'idle_std']

test_x = test[columns]

num = 0
benign = 0
read = 0
loris = 0
for i in model.predict(test_x):
    num += 1
    if(i == 'benign'):benign += 1
    elif(i == 'slowread'):read += 1
    elif(i == 'slowloris'):loris += 1

print("slowread_test.csv")
print("----------------------------------")
print("Total : " + str(num))
print("predicted benign : " + str(benign))
print("predicted slowread : " + str(read))
print("predicted slowloris : " + str(loris))
print('slowread accuracy : ' + str(read/num))


slowread_test.csv
----------------------------------
Total : 5871
predicted benign : 0
predicted slowread : 5836
predicted slowloris : 35
slowread accuracy : 0.9940384942939874
