In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

In [2]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [3]:
path1 = "C:\\Users\\alaan\\NSL_KDD_Train.csv"

In [4]:
path2 = "C:\\Users\\alaan\\NSL_KDD_Test.csv"

In [5]:
df = pd.read_csv(path1,header=None, names = col_names)

df_test = pd.read_csv(path2, header=None, names = col_names)

print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


In [6]:
df.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [7]:
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178


**Step 1: Data preprocessing:**

In [8]:

print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64


In [9]:
# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


**LabelEncoder**

**Insert categorical features into a 2D numpy array**

In [10]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [11]:
# protocol type
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
print(unique_protocol2)

# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
print(unique_service2)


# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
print(unique_flag2)


# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2


#do it for test set
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2


['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'ser

**Transform categorical features into numbers using LabelEncoder()**

In [12]:
df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)

print(df_categorical_values.head())
print('--------------------')
print(df_categorical_values_enc.head())

# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

  protocol_type   service flag
0           tcp  ftp_data   SF
1           udp     other   SF
2           tcp   private   S0
3           tcp      http   SF
4           tcp      http   SF
--------------------
   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


**One-Hot-Encoding**

In [13]:
enc = OneHotEncoder(categories='auto')
df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols)


# test set
testdf_categorical_values_encenc = enc.fit_transform(testdf_categorical_values_enc)
testdf_cat_data = pd.DataFrame(testdf_categorical_values_encenc.toarray(),columns=testdumcols)

df_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Missing columns in the test set are added

In [14]:
trainservice=df['service'].tolist()
testservice= df_test['service'].tolist()
difference=list(set(trainservice) - set(testservice))
string = 'service_'
difference=[string + x for x in difference]
difference

['service_aol',
 'service_harvest',
 'service_urh_i',
 'service_http_2784',
 'service_http_8001',
 'service_red_i']

In [15]:
for col in difference:
    testdf_cat_data[col] = 0

print(df_cat_data.shape)    
print(testdf_cat_data.shape)

(125973, 84)
(22544, 84)


**New numeric columns are added to main dataframe**

In [17]:
newdf=df.join(df_cat_data)
newdf.drop('flag', axis=1, inplace=True)
newdf.drop('protocol_type', axis=1, inplace=True)
newdf.drop('service', axis=1, inplace=True)

# test data
newdf_test=df_test.join(testdf_cat_data)
newdf_test.drop('flag', axis=1, inplace=True)
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)

print(newdf.shape)
print(newdf_test.shape)

(125973, 123)
(22544, 123)


The dataset was divided into separate datasets for each attack category. Attack tags have been renamed for each. 0=Normal, 1=DoS, 2=Probe, 3=R2L, 4=U2R. In new datasets, the label column is replaced with new values.
 
 DoS : 
 
 Probe : 
 
 R2L :
 
 U2R :

In [18]:
labeldf=newdf['label']
labeldf_test=newdf_test['label']


# change the label column
newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})



# put the new label column back
newdf['label'] = newlabeldf
newdf_test['label'] = newlabeldf_test

In [19]:
to_drop_DoS = [0,1]
to_drop_Probe = [0,2]
to_drop_R2L = [0,3]
to_drop_U2R = [0,4]


# Filter all rows with label value other than itself. Normal attack type is reference point for all the other attacks
# job filter function

DoS_df=newdf[newdf['label'].isin(to_drop_DoS)];
Probe_df=newdf[newdf['label'].isin(to_drop_Probe)];
R2L_df=newdf[newdf['label'].isin(to_drop_R2L)];
U2R_df=newdf[newdf['label'].isin(to_drop_U2R)];



#test
DoS_df_test=newdf_test[newdf_test['label'].isin(to_drop_DoS)];
Probe_df_test=newdf_test[newdf_test['label'].isin(to_drop_Probe)];
R2L_df_test=newdf_test[newdf_test['label'].isin(to_drop_R2L)];
U2R_df_test=newdf_test[newdf_test['label'].isin(to_drop_U2R)];


print('Train:')
print('Dimensions of DoS:' ,DoS_df.shape)
print('Dimensions of Probe:' ,Probe_df.shape)
print('Dimensions of R2L:' ,R2L_df.shape)
print('Dimensions of U2R:' ,U2R_df.shape)
print()
print('Test:')
print('Dimensions of DoS:' ,DoS_df_test.shape)
print('Dimensions of Probe:' ,Probe_df_test.shape)
print('Dimensions of R2L:' ,R2L_df_test.shape)
print('Dimensions of U2R:' ,U2R_df_test.shape)

Train:
Dimensions of DoS: (113270, 123)
Dimensions of Probe: (78999, 123)
Dimensions of R2L: (68338, 123)
Dimensions of U2R: (67395, 123)

Test:
Dimensions of DoS: (17171, 123)
Dimensions of Probe: (12132, 123)
Dimensions of R2L: (12596, 123)
Dimensions of U2R: (9778, 123)


**Step 2: Feature Scaling**

In [20]:
# Split dataframes into X & Y
# X Properties , Y result variables

X_DoS = DoS_df.drop('label',1)
Y_DoS = DoS_df.label

X_Probe = Probe_df.drop('label',1)
Y_Probe = Probe_df.label

X_R2L = R2L_df.drop('label',1)
Y_R2L = R2L_df.label

X_U2R = U2R_df.drop('label',1)
Y_U2R = U2R_df.label

# test set
X_DoS_test = DoS_df_test.drop('label',1)
Y_DoS_test = DoS_df_test.label

X_Probe_test = Probe_df_test.drop('label',1)
Y_Probe_test = Probe_df_test.label

X_R2L_test = R2L_df_test.drop('label',1)
Y_R2L_test = R2L_df_test.label

X_U2R_test = U2R_df_test.drop('label',1)
Y_U2R_test = U2R_df_test.label


  after removing the cwd from sys.path.
  import sys
  # Remove the CWD from sys.path while we load stuff.
  del sys.path[0]


**As the column names will be deleted at this stage, we save the column names for later use.**

In [21]:
colNames=list(X_DoS)
colNames_test=list(X_DoS_test)

In [22]:
from sklearn import preprocessing

scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS=scaler1.transform(X_DoS) 

scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe=scaler2.transform(X_Probe)

scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L=scaler3.transform(X_R2L)

scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R=scaler4.transform(X_U2R) 

# test data
scaler5 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test=scaler5.transform(X_DoS_test) 

scaler6 = preprocessing.StandardScaler().fit(X_Probe_test)
X_Probe_test=scaler6.transform(X_Probe_test) 

scaler7 = preprocessing.StandardScaler().fit(X_R2L_test)
X_R2L_test=scaler7.transform(X_R2L_test) 

scaler8 = preprocessing.StandardScaler().fit(X_U2R_test)
X_U2R_test=scaler8.transform(X_U2R_test)

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

**Recursive Feature Elimination (RFE) , top 13 features (as a group)**

# Random Forest

In [92]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=10,n_jobs=2)
rfe = RFE(estimator=clf, n_features_to_select=13, step=1) #performing recursive feature elimination selecting 13 most important ones.

rfe.fit(X_DoS, Y_DoS.astype(int)) # This line fits the RFE on the training data X_DoS with the corresponding target labels Y_DoS, which are cast as integers using the astype() method.
X_rfeDoS=rfe.transform(X_DoS) #applies the feature selection to the original dataset X_DoS, which returns a new dataset X_rfeDoS that only contains the selected features.
true=rfe.support_ #retrieves the boolean mask indicating which features were selected by the RFE. This boolean mask has a True value for selected features and False value for the remaining features.
rfecolindex_DoS=[i for i, x in enumerate(true) if x] #line creates a list containing the indices of the selected features by iterating over mask true.
rfecolname_DoS=list(colNames[i] for i in rfecolindex_DoS) #line creates a list rfecolname_DoS containing the names of the selected features by mapping the indices in rfecolindex_DoS to the corresponding feature names in colNames.

In [23]:
rfe.fit(X_Probe, Y_Probe.astype(int))
X_rfeProbe=rfe.transform(X_Probe)
true=rfe.support_
rfecolindex_Probe=[i for i, x in enumerate(true) if x]
rfecolname_Probe=list(colNames[i] for i in rfecolindex_Probe)

In [24]:
rfe.fit(X_R2L, Y_R2L.astype(int))
X_rfeR2L=rfe.transform(X_R2L)
true=rfe.support_
rfecolindex_R2L=[i for i, x in enumerate(true) if x]
rfecolname_R2L=list(colNames[i] for i in rfecolindex_R2L)

In [25]:
rfe.fit(X_U2R, Y_U2R.astype(int))
X_rfeU2R=rfe.transform(X_U2R)
true=rfe.support_
rfecolindex_U2R=[i for i, x in enumerate(true) if x]
rfecolname_U2R=list(colNames[i] for i in rfecolindex_U2R)

**Summary of features selected by RFE**

In [26]:
print('Features selected for DoS:',rfecolname_DoS)
print()
print('Features selected for Probe:',rfecolname_Probe)
print()
print('Features selected for R2L:',rfecolname_R2L)
print()
print('Features selected for U2R:',rfecolname_U2R)


Features selected for DoS: ['src_bytes', 'dst_bytes', 'wrong_fragment', 'count', 'srv_count', 'serror_rate', 'same_srv_rate', 'dst_host_same_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_serror_rate', 'Protocol_type_icmp', 'service_ecr_i', 'flag_SF']

Features selected for Probe: ['src_bytes', 'dst_bytes', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'Protocol_type_tcp', 'service_eco_i', 'service_private']

Features selected for R2L: ['duration', 'src_bytes', 'dst_bytes', 'hot', 'is_guest_login', 'dst_host_count', 'dst_host_srv_count', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_rerror_rate', 'service_ftp_data']

Features selected for U2R: ['duration', 'src_bytes', 'dst_bytes', 'hot', 'num_compromised', 'root_shell', 'num_file_creations', 'c

In [27]:
print(X_rfeDoS.shape)
print(X_rfeProbe.shape)
print(X_rfeR2L.shape)
print(X_rfeU2R.shape)


(113270, 13)
(78999, 13)
(68338, 13)
(67395, 13)


**Step 4: Build the model:**

Classifier is trained for all features and for reduced features, for later comparison.

The classifier model itself is stored in the clf variable.

In [93]:
# all features
clf_DoS=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_Probe=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_R2L=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_U2R=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_U2R.fit(X_U2R, Y_U2R.astype(int))

RandomForestClassifier(n_estimators=10, n_jobs=2)

In [29]:
# selected features
clf_rfeDoS=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeProbe=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeR2L=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeU2R=RandomForestClassifier(n_estimators=10,n_jobs=2)
clf_rfeDoS.fit(X_rfeDoS, Y_DoS.astype(int))
clf_rfeProbe.fit(X_rfeProbe, Y_Probe.astype(int))
clf_rfeR2L.fit(X_rfeR2L, Y_R2L.astype(int))
clf_rfeU2R.fit(X_rfeU2R, Y_U2R.astype(int))

RandomForestClassifier(n_estimators=10, n_jobs=2)

**DoS**

In [94]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

scoring = ['accuracy', 'precision', 'recall', 'f1']
for score in scoring:
    scores = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=score)
    print("%s: %0.5f" % (score.capitalize(), scores.mean()))

Accuracy: 0.99785
Precision: 0.99852
Recall: 0.99705
F1: 0.99758


**Probe**

In [95]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99662
Precision_macro: 0.99555
Recall_macro: 0.99277
F1_macro: 0.99482


**U2R**

In [96]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99775 
Precision_macro: 0.94754 
Recall_macro: 0.87956 
F1_macro: 0.90277 


**R2L**

In [97]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.98079
Precision_macro: 0.97425
Recall_macro: 0.96954
F1_macro: 0.97286


**Using 13 Features for each category**


Confusion Matrices

DoS

In [51]:
# reduce test dataset to 13 features, use only features described in rfecolname_DoS etc.
X_DoS_test2=X_DoS_test[:,rfecolindex_DoS]
X_Probe_test2=X_Probe_test[:,rfecolindex_Probe]
X_R2L_test2=X_R2L_test[:,rfecolindex_R2L]
X_U2R_test2=X_U2R_test[:,rfecolindex_U2R]
X_U2R_test2.shape

(9778, 13)

**Cross Validation: Accuracy, Precision, Recall, F-measure**

**DoS**


In [52]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_rfeDoS, X_DoS_test2, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99767
Precision_macro: 0.99798
Recall_macro: 0.99797
F1_macro: 0.99793


**Probe**

In [53]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_rfeProbe, X_Probe_test2, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99341
Precision_macro: 0.99127
Recall_macro: 0.98730
F1_macro: 0.98860


**R2L**

In [54]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_rfeR2L, X_R2L_test2, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.97714
Precision_macro: 0.97288
Recall_macro: 0.96616
F1_macro: 0.96872


**U2R**

In [55]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_rfeU2R, X_U2R_test2, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99714
Precision_macro: 0.95548
Recall_macro: 0.81393
F1_macro: 0.87059


# KNeighbors


In [31]:
from sklearn.neighbors import KNeighborsClassifier

clf_KNN_DoS=KNeighborsClassifier()
clf_KNN_Probe=KNeighborsClassifier()
clf_KNN_R2L=KNeighborsClassifier()
clf_KNN_U2R=KNeighborsClassifier()

clf_KNN_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_KNN_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_KNN_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_KNN_U2R.fit(X_U2R, Y_U2R.astype(int))


KNeighborsClassifier()

**Cross Validation: Accuracy, Precision, Recall, F-measure**

**DoS**

In [36]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_KNN_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99715 
Precision_macro: 0.99711 
Recall_macro: 0.99709 
F1_macro: 0.99710 


**Probe**

In [37]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_KNN_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99077 
Precision_macro: 0.98606 
Recall_macro: 0.98508 
F1_macro: 0.98553 


**R2L**

In [38]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_KNN_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.96705 
Precision_macro: 0.95265 
Recall_macro: 0.95439 
F1_macro: 0.95344 


**U2R**

In [39]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_KNN_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99703 
Precision_macro: 0.93143 
Recall_macro: 0.85073 
F1_macro: 0.87831 


# SVM


In [41]:
from sklearn.svm import SVC

clf_SVM_DoS=SVC(kernel='linear', C=1.0, random_state=0)
clf_SVM_Probe=SVC(kernel='linear', C=1.0, random_state=0)
clf_SVM_R2L=SVC(kernel='linear', C=1.0, random_state=0)
clf_SVM_U2R=SVC(kernel='linear', C=1.0, random_state=0)

clf_SVM_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_SVM_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_SVM_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_SVM_U2R.fit(X_U2R, Y_U2R.astype(int))

SVC(kernel='linear', random_state=0)

**DoS**

In [46]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_SVM_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99371 
Precision_macro: 0.99342 
Recall_macro: 0.99380 
F1_macro: 0.99360 


**Probe**

In [47]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_SVM_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.98450 
Precision_macro: 0.96907 
Recall_macro: 0.98365 
F1_macro: 0.97613 


**R2L**

In [48]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_SVM_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.96793 
Precision_macro: 0.94854 
Recall_macro: 0.96264 
F1_macro: 0.95529 


**U2R**

In [49]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_SVM_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99632 
Precision_macro: 0.91056 
Recall_macro: 0.82909 
F1_macro: 0.84869 


## GAUSSIAN NAIVE BAYES

In [50]:
from sklearn.naive_bayes import GaussianNB

In [51]:
clf_GNB_DoS=GaussianNB()
clf_GNB_Probe=GaussianNB()
clf_GNB_R2L=GaussianNB()
clf_GNB_U2R=GaussianNB()

In [52]:
clf_GNB_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_GNB_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_GNB_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_GNB_U2R.fit(X_U2R, Y_U2R.astype(int))

GaussianNB()

In [53]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GNB_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.86733
Precision_macro: 0.90081
Recall_macro: 0.84830
F1_macro: 0.85795


In [54]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GNB_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.97898
Precision_macro: 0.97323
Recall_macro: 0.96051
F1_macro: 0.96654


In [55]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GNB_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.93562
Precision_macro: 0.89097
Recall_macro: 0.95508
F1_macro: 0.91620


In [118]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GNB_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.97259
Precision_macro: 0.60157
Recall_macro: 0.97911
F1_macro: 0.66091


### DECISION TREE

In [57]:
from sklearn.tree import DecisionTreeClassifier
clf_DT_DoS = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
clf_DT_Probe = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
clf_DT_R2L = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
clf_DT_U2R = DecisionTreeClassifier(criterion="entropy", max_depth = 4)

In [58]:
clf_DT_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_DT_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_DT_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_DT_U2R.fit(X_U2R, Y_U2R.astype(int))

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [59]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_DT_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99167
Precision_macro: 0.99107
Recall_macro: 0.99225
F1_macro: 0.99171


In [60]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_DT_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.97824
Precision_macro: 0.97620
Recall_macro: 0.95509
F1_macro: 0.96502


In [61]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_DT_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.93300
Precision_macro: 0.88727
Recall_macro: 0.95559
F1_macro: 0.91331


In [62]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_DT_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99632
Precision_macro: 0.91824
Recall_macro: 0.81372
F1_macro: 0.83802


### LOGISTIC REGRESSION

In [63]:
from sklearn.linear_model import LogisticRegression
clf_LR_DoS= LogisticRegression(max_iter=1200000)
clf_LR_Probe= LogisticRegression(max_iter=1200000)
clf_LR_R2L = LogisticRegression(max_iter=1200000)
clf_LR_U2R= LogisticRegression(max_iter=1200000)

In [64]:
clf_LR_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_LR_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_LR_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_LR_U2R.fit(X_U2R, Y_U2R.astype(int))

LogisticRegression(max_iter=1200000)

In [65]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_LR_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99400
Precision_macro: 0.99367
Recall_macro: 0.99414
F1_macro: 0.99390


In [66]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_LR_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.98384
Precision_macro: 0.97049
Recall_macro: 0.97967
F1_macro: 0.97497


In [67]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_LR_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.96570
Precision_macro: 0.94470
Recall_macro: 0.96071
F1_macro: 0.95232


In [68]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_LR_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99683
Precision_macro: 0.93066
Recall_macro: 0.83763
F1_macro: 0.86486


### GRADIENT BOOSTING CLASSIFIER

In [69]:
from sklearn.ensemble import GradientBoostingClassifier
clf_GBC_DoS = GradientBoostingClassifier(random_state=0)
clf_GBC_Probe = GradientBoostingClassifier(random_state=0)
clf_GBC_R2L = GradientBoostingClassifier(random_state=0)
clf_GBC_U2R = GradientBoostingClassifier(random_state=0)

In [70]:
clf_GBC_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_GBC_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_GBC_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_GBC_U2R.fit(X_U2R, Y_U2R.astype(int))

GradientBoostingClassifier(random_state=0)

In [71]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GBC_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99808
Precision_macro: 0.99809
Recall_macro: 0.99801
F1_macro: 0.99804


In [72]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GBC_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99646
Precision_macro: 0.99545
Recall_macro: 0.99344
F1_macro: 0.99444


In [73]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GBC_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.98118
Precision_macro: 0.97315
Recall_macro: 0.97367
F1_macro: 0.97337


In [74]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_GBC_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99775
Precision_macro: 0.93356
Recall_macro: 0.90549
F1_macro: 0.91415


### ARITIFICIAL NEURAL NETWORK

In [75]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [76]:
clf_ANN_DoS = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_ANN_Probe = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_ANN_R2L = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
clf_ANN_U2R = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

In [77]:
clf_ANN_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_ANN_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_ANN_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_ANN_U2R.fit(X_U2R, Y_U2R.astype(int))

MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

In [78]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ANN_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99668
Precision_macro: 0.99659
Recall_macro: 0.99666
F1_macro: 0.99662


In [79]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ANN_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99242
Precision_macro: 0.98793
Recall_macro: 0.98844
F1_macro: 0.98814


In [80]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ANN_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.97420
Precision_macro: 0.95542
Recall_macro: 0.97401
F1_macro: 0.96423


In [81]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ANN_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f" % (metric.capitalize(), scores.mean()))

Accuracy: 0.99806
Precision_macro: 0.95707
Recall_macro: 0.90445
F1_macro: 0.92435


# Ensemble Learning using RF, KNN AND SVM

In [103]:
from sklearn.ensemble import VotingClassifier

clf_voting_DoS = VotingClassifier(estimators=[('rf', clf_DoS), ('knn', clf_KNN_DoS), ('svm', clf_SVM_DoS)], voting='hard')
clf_voting_Probe = VotingClassifier(estimators=[('rf', clf_Probe), ('knn', clf_KNN_Probe), ('svm', clf_SVM_Probe)], voting='hard')
clf_voting_R2L = VotingClassifier(estimators=[('rf', clf_R2L), ('knn', clf_KNN_R2L), ('svm', clf_SVM_R2L)], voting='hard')
clf_voting_U2R = VotingClassifier(estimators=[('rf', clf_U2R), ('knn', clf_KNN_U2R), ('svm', clf_SVM_U2R)], voting='hard')

clf_voting_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_voting_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_voting_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_voting_U2R.fit(X_U2R, Y_U2R.astype(int))


VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     n_jobs=2)),
                             ('knn', KNeighborsClassifier()),
                             ('svm', SVC(kernel='linear', random_state=0))])

**DoS**

In [104]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99814 
Precision_macro: 0.99771 
Recall_macro: 0.99784 
F1_macro: 0.99804 


**Probe**

In [105]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99283 
Precision_macro: 0.98769 
Recall_macro: 0.98994 
F1_macro: 0.98790 


**R2L**

In [106]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.97198 
Precision_macro: 0.95829 
Recall_macro: 0.96291 
F1_macro: 0.96066 


**U2R**

In [107]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99744 
Precision_macro: 0.94865 
Recall_macro: 0.88054 
F1_macro: 0.88594 


In [None]:
#pip install tensorflow

### HYBBRID MODEL USING GBC, SVM AND KNN

In [108]:
from sklearn.ensemble import VotingClassifier

clf_voting1_DoS = VotingClassifier(estimators=[('gbc', clf_GBC_DoS), ('knn', clf_KNN_DoS), ('svm', clf_SVM_DoS)], voting='hard')
clf_voting1_Probe = VotingClassifier(estimators=[('gbc', clf_GBC_Probe), ('knn', clf_KNN_Probe), ('svm', clf_SVM_Probe)], voting='hard')
clf_voting1_R2L = VotingClassifier(estimators=[('gbc', clf_GBC_R2L), ('knn', clf_KNN_R2L), ('svm', clf_SVM_R2L)], voting='hard')
clf_voting1_U2R = VotingClassifier(estimators=[('gbc', clf_GBC_U2R), ('knn', clf_KNN_U2R), ('svm', clf_SVM_U2R)], voting='hard')

clf_voting1_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_voting1_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_voting1_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_voting1_U2R.fit(X_U2R, Y_U2R.astype(int))

VotingClassifier(estimators=[('gbc',
                              GradientBoostingClassifier(random_state=0)),
                             ('knn', KNeighborsClassifier()),
                             ('svm', SVC(kernel='linear', random_state=0))])

In [109]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting1_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99790 
Precision_macro: 0.99793 
Recall_macro: 0.99780 
F1_macro: 0.99787 


In [110]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting1_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99283 
Precision_macro: 0.98786 
Recall_macro: 0.98978 
F1_macro: 0.98880 


In [111]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting1_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.97325 
Precision_macro: 0.95821 
Recall_macro: 0.96706 
F1_macro: 0.96249 


In [112]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting1_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99734 
Precision_macro: 0.93833 
Recall_macro: 0.87335 
F1_macro: 0.89419 


### ADA BOOST

In [83]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [84]:
clf_ADA_DoS = AdaBoostClassifier(base_estimator=clf_DT_DoS, n_estimators=50)
clf_ADA_Probe = AdaBoostClassifier(base_estimator=clf_DT_Probe, n_estimators=50)
clf_ADA_R2L = AdaBoostClassifier(base_estimator=clf_DT_R2L, n_estimators=50)
clf_ADA_U2R = AdaBoostClassifier(base_estimator=clf_DT_U2R, n_estimators=50)

In [86]:
clf_ADA_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_ADA_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_ADA_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_ADA_U2R.fit(X_U2R, Y_U2R.astype(int))

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=4))

In [87]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ADA_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99819 
Precision_macro: 0.99820 
Recall_macro: 0.99806 
F1_macro: 0.99781 


In [88]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ADA_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99670 
Precision_macro: 0.99695 
Recall_macro: 0.99510 
F1_macro: 0.99612 


In [89]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ADA_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.97944 
Precision_macro: 0.97192 
Recall_macro: 0.97020 
F1_macro: 0.97096 


In [90]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_ADA_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99857 
Precision_macro: 0.96552 
Recall_macro: 0.94022 
F1_macro: 0.94055 


### HYBRID MODEL USING RF, ANN AND ADABOOST

In [98]:
from sklearn.ensemble import VotingClassifier

clf_voting2_DoS = VotingClassifier(estimators=[('rf', clf_DoS), ('ann', clf_ANN_DoS), ('ada', clf_ADA_DoS)], voting='hard')
clf_voting2_Probe = VotingClassifier(estimators=[('rf', clf_Probe), ('ann', clf_ANN_Probe), ('ada', clf_ADA_Probe)], voting='hard')
clf_voting2_R2L = VotingClassifier(estimators=[('rf', clf_R2L), ('ann', clf_ANN_R2L), ('ada', clf_ADA_R2L)], voting='hard')
clf_voting2_U2R = VotingClassifier(estimators=[('rf', clf_U2R), ('ann', clf_ANN_U2R), ('ada', clf_ADA_U2R)], voting='hard')

clf_voting2_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_voting2_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_voting2_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_voting2_U2R.fit(X_U2R, Y_U2R.astype(int))


VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(n_estimators=10,
                                                     n_jobs=2)),
                             ('ann',
                              MLPClassifier(hidden_layer_sizes=(100, 50),
                                            max_iter=500, random_state=42)),
                             ('ada',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                                                       max_depth=4)))])

In [99]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting2_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99825 
Precision_macro: 0.99846 
Recall_macro: 0.99830 
F1_macro: 0.99804 


In [100]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting2_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99679 
Precision_macro: 0.99711 
Recall_macro: 0.99417 
F1_macro: 0.99495 


In [101]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting2_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.98095 
Precision_macro: 0.97385 
Recall_macro: 0.97315 
F1_macro: 0.97405 


In [102]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting2_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99816 
Precision_macro: 0.96188 
Recall_macro: 0.90337 
F1_macro: 0.93696 


### hybrid model USING ann, ada and GBC

In [113]:
from sklearn.ensemble import VotingClassifier

clf_voting3_DoS = VotingClassifier(estimators=[('GBC', clf_GBC_DoS), ('ann', clf_ANN_DoS), ('ada', clf_ADA_DoS)], voting='hard')
clf_voting3_Probe = VotingClassifier(estimators=[('GBC', clf_GBC_Probe), ('ann', clf_ANN_Probe), ('ada', clf_ADA_Probe)], voting='hard')
clf_voting3_R2L = VotingClassifier(estimators=[('GBC', clf_GBC_R2L), ('ann', clf_ANN_R2L), ('ada', clf_ADA_R2L)], voting='hard')
clf_voting3_U2R = VotingClassifier(estimators=[('GBC', clf_GBC_U2R), ('ann', clf_ANN_U2R), ('ada', clf_ADA_U2R)], voting='hard')

clf_voting3_DoS.fit(X_DoS, Y_DoS.astype(int))
clf_voting3_Probe.fit(X_Probe, Y_Probe.astype(int))
clf_voting3_R2L.fit(X_R2L, Y_R2L.astype(int))
clf_voting3_U2R.fit(X_U2R, Y_U2R.astype(int))

VotingClassifier(estimators=[('GBC',
                              GradientBoostingClassifier(random_state=0)),
                             ('ann',
                              MLPClassifier(hidden_layer_sizes=(100, 50),
                                            max_iter=500, random_state=42)),
                             ('ada',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                                                       max_depth=4)))])

In [114]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting3_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99837 
Precision_macro: 0.99846 
Recall_macro: 0.99831 
F1_macro: 0.99840 


In [115]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting3_Probe, X_Probe_test, Y_Probe_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99687 
Precision_macro: 0.99613 
Recall_macro: 0.99355 
F1_macro: 0.99496 


In [116]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting3_R2L, X_R2L_test, Y_R2L_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.98071 
Precision_macro: 0.97003 
Recall_macro: 0.97616 
F1_macro: 0.97255 


In [117]:
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
for metric in metrics:
    scores = cross_val_score(clf_voting3_U2R, X_U2R_test, Y_U2R_test, cv=10, scoring=metric)
    print("%s: %0.5f " % (metric.capitalize(), scores.mean()))

Accuracy: 0.99888 
Precision_macro: 0.98406 
Recall_macro: 0.93432 
F1_macro: 0.95146 
