In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
import os
from numpy.random import seed
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense
from keras.models import Model

Using TensorFlow backend.


In [2]:
col_names=["id","dur","proto","service","state",
           "spkts","dpkts","sbytes","dbytes","rate",
           "sttl","dttl","sload","dload","sloss",
           "dloss","sinpkt","dinpkt","sjit","djit",
           "swin","stcpb","dtcpb","dwin","tcprtt",
           "synack","ackdat","smean","dmean","trans_depth",
           "response_body_len","ct_srv_src","ct_state_ttl","ct_dst_ltm","ct_src_dport_ltm",
           "ct_dst_sport_ltm","ct_dst_src_ltm","is_ftp_login","ct_ftp_cmd","ct_flw_http_mthd",
           "ct_src_ltm","ct_srv_dst","is_sm_ips_ports","attack_cat","label"
]

df_train=pd.read_csv("UNSW_NB15_training-set.csv",header=None,names=col_names)
df_test=pd.read_csv("UNSW_NB15_testing-set.csv",header=None,names=col_names)

print('Dimensions of the Training set:',df_train.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (82332, 45)
Dimensions of the Test set: (175341, 45)


In [4]:
df_train.head(5)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [5]:
# df_train.describe()

In [6]:
print('Label distribution Training set:')
print(df_train['attack_cat'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['attack_cat'].value_counts())

Label distribution Training set:
Normal            37000
Generic           18871
Exploits          11132
Fuzzers            6062
DoS                4089
Reconnaissance     3496
Analysis            677
Backdoor            583
Shellcode           378
Worms                44
Name: attack_cat, dtype: int64

Label distribution Test set:
Normal            56000
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: attack_cat, dtype: int64


In [7]:
print('Training set:')
for col_name in df_train.columns:
    if df_train[col_name].dtypes == 'object' :
        unique_cat = len(df_train[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(df_train['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'proto' has 131 categories
Feature 'service' has 13 categories
Feature 'state' has 7 categories
Feature 'attack_cat' has 10 categories

Distribution of categories in service:
-       47153
dns     21367
http     8287
smtp     1851
ftp      1552
Name: service, dtype: int64


In [8]:
# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Test set:
Feature 'proto' has 133 categories
Feature 'service' has 13 categories
Feature 'state' has 9 categories
Feature 'attack_cat' has 10 categories


In [9]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['proto', 'service', 'state']
# insert code to get a list of categorical columns into a variable, categorical_columns
categorical_columns=['proto', 'service', 'state'] 
 # Get the categorical values into a 2D numpy array
df_train_categorical_values = df_train[categorical_columns]
df_test_categorical_values = df_test[categorical_columns]
# df_test_categorical_values.head()

In [10]:
# for train dataset
get_protocol=sorted(df_train.proto.unique())
prefix_protocol = 'protocol_'
protocol_dummies=[prefix_protocol + x for x in get_protocol]

get_service=sorted(df_train.service.unique())
prefix_service = 'service_'
service_dummies=[prefix_service + x for x in get_service]

get_state=sorted(df_train.state.unique())
prefix_state = 'state_'
state_dummies=[prefix_state + x for x in get_state]
dum_cols=protocol_dummies+service_dummies+state_dummies
# print(dum_cols)

# for test dataset
get_protocol_test=sorted(df_test.proto.unique())
protocol_dummies_test=[prefix_protocol + x for x in get_protocol_test]

get_state_test=sorted(df_test.state.unique())
state_dummies_test=[prefix_state + x for x in get_state_test]
dum_cols_test=protocol_dummies_test+service_dummies+state_dummies_test
# print(dum_cols_test)

In [11]:
# train dataset
df_train_categorical_values_enc=df_train_categorical_values.apply(LabelEncoder().fit_transform)
print(df_train_categorical_values_enc.head())

# test dataset
df_test_categorical_values_enc=df_test_categorical_values.apply(LabelEncoder().fit_transform)
print("Train dataset:")
print(df_test_categorical_values_enc.head())

   proto  service  state
0    117        0      4
1    117        0      4
2    117        0      4
3    117        0      4
4    117        0      4
Train dataset:
   proto  service  state
0    113        0      2
1    113        0      2
2    113        0      2
3    113        3      2
4    113        0      2


In [12]:
ohe = OneHotEncoder()
df_train_categorical_values_encenc = ohe.fit_transform(df_train_categorical_values_enc)
df_train_cat_data = pd.DataFrame(df_train_categorical_values_encenc.toarray(),columns=dum_cols)
# test set
df_test_categorical_values_encenc = ohe.fit_transform(df_test_categorical_values_enc)
df_test_cat_data = pd.DataFrame(df_test_categorical_values_encenc.toarray(),columns=dum_cols_test)

df_train_cat_data.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,protocol_3pc,protocol_a/n,protocol_aes-sp3-d,protocol_any,protocol_argus,protocol_aris,protocol_arp,protocol_ax.25,protocol_bbn-rcc,protocol_bna,...,service_snmp,service_ssh,service_ssl,state_ACC,state_CLO,state_CON,state_FIN,state_INT,state_REQ,state_RST
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
train_state=df_train['state'].tolist()
test_state= df_test['state'].tolist()

differences_state=list(set(test_state) - set(train_state))
prefix_state = 'state_'
differences1=[prefix_state + x for x in differences_state]

train_proto=df_train['proto'].tolist()
test_proto=df_test['proto'].tolist()
differences_proto=list(set(test_proto) - set(train_proto))
prefix_protocol = 'protocol_'
differences2=[prefix_protocol + x for x in differences_proto]

differences=differences1+differences2
differences

['state_URN',
 'state_ECO',
 'state_no',
 'state_PAR',
 'protocol_rtp',
 'protocol_icmp']

In [14]:
for difference in differences:
    df_train_cat_data[difference] = 0

df_train_cat_data.shape
# print(df_train_cat_data.head())

(82332, 157)

In [15]:
test_differences_state=list(set(train_state) - set(test_state))
prefix_state = 'state_'
test_differences=[prefix_state + x for x in test_differences_state]

for differ in test_differences:
    df_test_cat_data[differ] = 0

new_df_train=df_train.join(df_train_cat_data)
new_df_train.drop('state', axis=1, inplace=True)
new_df_train.drop('proto', axis=1, inplace=True)
new_df_train.drop('service', axis=1, inplace=True)
new_df_train.drop('id', axis=1, inplace=True)
# test data
new_df_test=df_test.join(df_test_cat_data)
new_df_test.drop('state', axis=1, inplace=True)
new_df_test.drop('proto', axis=1, inplace=True)
new_df_test.drop('service', axis=1, inplace=True)
new_df_test.drop('id', axis=1, inplace=True)

# dif=list(set(new_df_train) - set(new_df_test))

print(new_df_train['attack_cat'])

0        Normal
1        Normal
2        Normal
3        Normal
4        Normal
          ...  
82327    Normal
82328    Normal
82329    Normal
82330    Normal
82331    Normal
Name: attack_cat, Length: 82332, dtype: object


In [16]:
attack_df_train=new_df_train['attack_cat']
attack_df_test=new_df_test['attack_cat']
# change the label column
new_attack_df_train=attack_df_train.replace({ 'Normal' : 0, 'Generic' : 1 ,'Exploits' : 2,'Fuzzers': 3,'DoS': 4,'Reconnaissance': 5,
                            'Analysis' : 6,'Backdoor' : 7,'Shellcode' : 8,'Worms' : 9})
new_attack_df_test=attack_df_test.replace({ 'Normal' : 0, 'Generic' : 1 ,'Exploits' : 2,'Fuzzers': 3,'DoS': 4,'Reconnaissance': 5,
                            'Analysis' : 6,'Backdoor' : 7,'Shellcode' : 8,'Worms' : 9})
# put the new label column back
new_df_train['attack_cat'] = new_attack_df_train
new_df_test['attack_cat'] = new_attack_df_test
print(new_df_train['attack_cat'].head())

0    0
1    0
2    0
3    0
4    0
Name: attack_cat, dtype: int64


In [None]:
normal_mask=new_df_train['attack_cat'] ==0
attack_mask=new_df_train['attack_cat'] !=0

# new_df_train.drop('attack_cat',axis=1,inplace=True)

df_normal=new_df_train[normal_mask]
df_attack=new_df_train[attack_mask]

normal_mask_test=new_df_test['attack_cat'] ==0
attack_mask_test=new_df_test['attack_cat'] !=0

# new_df_train.drop('attack_cat',axis=1,inplace=True)

df_normal_test=new_df_test[normal_mask_test]
df_attack_test=new_df_test[attack_mask_test]


print(df_normal_test.shape)
print(df_attack_test.shape)

# print(df_normal.shape)
# print(df_attack.shape)

In [17]:
drop_Generic = [1]
drop_Exploits = [2]
drop_Fuzzers = [3]
drop_DoS = [4]
drop_Reconnaissance = [5]
drop_Analysis = [6]
drop_Backdoor = [7]
drop_Shellcode = [8]
drop_Worms = [9]

Generic_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Generic)];
Exploits_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Exploits)];
Fuzzers_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Fuzzers)];
DoS_df=new_df_train[~new_df_train['attack_cat'].isin(drop_DoS)];
Reconnaissance_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Reconnaissance)];
Analysis_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Analysis)];
Backdoor_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Backdoor)];
Shellcode_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Shellcode)];
Worms_df=new_df_train[~new_df_train['attack_cat'].isin(drop_Worms)];

#test

Generic_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Generic)];
Exploits_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Exploits)];
Fuzzers_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Fuzzers)];
DoS_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_DoS)];
Reconnaissance_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Reconnaissance)];
Analysis_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Analysis)];
Backdoor_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Backdoor)];
Shellcode_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Shellcode)];
Worms_df_test=new_df_test[~new_df_test['attack_cat'].isin(drop_Worms)];

print('Train:')

print('Dimensions of DoS:' ,Generic_df.shape)
print('Dimensions of Exploits:' ,Exploits_df.shape)
print('Dimensions of Fuzzers:' ,Fuzzers_df.shape)
print('Dimensions of DoS:' ,DoS_df.shape)
print('Dimensions of Reconnaissance:' ,Reconnaissance_df.shape)
print('Dimensions of Analysis:' ,Analysis_df.shape)
print('Dimensions of Backdoor:' ,Backdoor_df.shape)
print('Dimensions of Shellcode:' ,Shellcode_df.shape)
print('Dimensions of Worms:' ,Worms_df.shape)

print('Test:')
print('Dimensions of DoS:' ,Generic_df_test.shape)
print('Dimensions of Exploits:' ,Exploits_df_test.shape)
print('Dimensions of Fuzzers:' ,Fuzzers_df_test.shape)
print('Dimensions of DoS:' ,DoS_df_test.shape)
print('Dimensions of Reconnaissance:' ,Reconnaissance_df_test.shape)
print('Dimensions of Analysis:' ,Analysis_df_test.shape)
print('Dimensions of Backdoor:' ,Backdoor_df_test.shape)
print('Dimensions of Shellcode:' ,Shellcode_df_test.shape)
print('Dimensions of Worms:' ,Worms_df_test.shape)


Train:
Dimensions of DoS: (63461, 198)
Dimensions of Exploits: (71200, 198)
Dimensions of Fuzzers: (76270, 198)
Dimensions of DoS: (78243, 198)
Dimensions of Reconnaissance: (78836, 198)
Dimensions of Analysis: (81655, 198)
Dimensions of Backdoor: (81749, 198)
Dimensions of Shellcode: (81954, 198)
Dimensions of Worms: (82288, 198)
Test:
Dimensions of DoS: (135341, 198)
Dimensions of Exploits: (141948, 198)
Dimensions of Fuzzers: (157157, 198)
Dimensions of DoS: (163077, 198)
Dimensions of Reconnaissance: (164850, 198)
Dimensions of Analysis: (173341, 198)
Dimensions of Backdoor: (173595, 198)
Dimensions of Shellcode: (174208, 198)
Dimensions of Worms: (175211, 198)


In [18]:
X_Generic = Generic_df.drop('attack_cat',1)
Y_Generic = Generic_df.attack_cat

X_Exploits = Exploits_df.drop('attack_cat',1)
Y_Exploits = Exploits_df.attack_cat

X_Fuzzers = Fuzzers_df.drop('attack_cat',1)
Y_Fuzzers = Fuzzers_df.attack_cat

X_DoS = DoS_df.drop('attack_cat',1)
Y_DoS = DoS_df.attack_cat

X_Reconnaissance = Reconnaissance_df.drop('attack_cat',1)
Y_Reconnaissance = Reconnaissance_df.attack_cat

X_Analysis = Analysis_df.drop('attack_cat',1)
Y_Analysis = Analysis_df.attack_cat


X_Backdoor = Backdoor_df.drop('attack_cat',1)
Y_Backdoor = Backdoor_df.attack_cat


X_Shellcode = Shellcode_df.drop('attack_cat',1)
Y_Shellcode = Shellcode_df.attack_cat

X_Worms =Worms_df.drop('attack_cat',1)
Y_Worms =Worms_df.attack_cat

X_Generic_test = Generic_df_test.drop('attack_cat',1)
Y_Generic_test = Generic_df_test.attack_cat

X_Exploits_test = Exploits_df_test.drop('attack_cat',1)
Y_Exploits_test = Exploits_df_test.attack_cat

X_Fuzzers_test = Fuzzers_df_test.drop('attack_cat',1)
Y_Fuzzers_test = Fuzzers_df_test.attack_cat

X_DoS_test = DoS_df_test.drop('attack_cat',1)
Y_DoS_test = DoS_df_test.attack_cat

X_Reconnaissance_test = Reconnaissance_df_test.drop('attack_cat',1)
Y_Reconnaissance_test = Reconnaissance_df_test.attack_cat

X_Analysis_test = Analysis_df_test.drop('attack_cat',1)
Y_Analysis_test = Analysis_df_test.attack_cat


X_Backdoor_test = Backdoor_df_test.drop('attack_cat',1)
Y_Backdoor_test = Backdoor_df_test.attack_cat


X_Shellcode_test = Shellcode_df_test.drop('attack_cat',1)
Y_Shellcode_test = Shellcode_df_test.attack_cat

X_Worms_test =Worms_df_test.drop('attack_cat',1)
Y_Worms_test =Worms_df_test.attack_cat


In [19]:
X_normal=df_normal.drop('attack_cat',1)
Y_normal=df_normal.attack_cat

X_attack=df_attack.drop('attack_cat',1)
Y_attack=df_attack.attack_cat

X_normal_test=df_normal.drop('attack_cat',1)
Y_normal_test=df_normal.attack_cat

X_attack_test=df_attack.drop('attack_cat',1)
Y_attack_test=df_attack.attack_cat


In [20]:
from sklearn import preprocessing

scaler1 = preprocessing.StandardScaler().fit(X_Generic)
X_Generic=scaler1.transform(X_Generic)

scaler2 = preprocessing.StandardScaler().fit(X_Generic_test)
X_Generic_test=scaler2.transform(X_Generic_test) 

In [21]:
# print(X_Generic.std(axis=0))

In [22]:
scaler3 = preprocessing.StandardScaler().fit(X_Exploits)
X_Exploits=scaler3.transform(X_Exploits)

scaler4 = preprocessing.StandardScaler().fit(X_Exploits_test)
X_Exploits_test=scaler4.transform(X_Exploits_test)

In [23]:
scaler5 = preprocessing.StandardScaler().fit(X_Fuzzers)
X_Fuzzers=scaler5.transform(X_Fuzzers)

scaler6= preprocessing.StandardScaler().fit(X_Fuzzers_test)
X_Fuzzers_test=scaler6.transform(X_Fuzzers_test)

In [24]:
scaler7 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS=scaler7.transform(X_DoS)

scaler8 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test=scaler8.transform(X_DoS_test)

In [25]:
scaler9 = preprocessing.StandardScaler().fit(X_Reconnaissance)
X_Reconnaissance=scaler9.transform(X_Reconnaissance)

scaler10 = preprocessing.StandardScaler().fit(X_Reconnaissance_test)
X_Reconnaissance_test=scaler10.transform(X_Reconnaissance_test)

In [26]:
scaler11 = preprocessing.StandardScaler().fit(X_Analysis)
X_Analysis=scaler11.transform(X_Analysis)

scaler12 = preprocessing.StandardScaler().fit(X_Analysis_test)
X_Analysis_test=scaler12.transform(X_Analysis_test)

In [27]:
scaler13 = preprocessing.StandardScaler().fit(X_Backdoor)
X_Backdoor=scaler13.transform(X_Backdoor)

scaler14 = preprocessing.StandardScaler().fit(X_Backdoor_test)
X_Backdoor_test=scaler14.transform(X_Backdoor_test)

In [28]:
scaler15 = preprocessing.StandardScaler().fit(X_Shellcode)
X_Shellcode=scaler15.transform(X_Shellcode)

scaler16 = preprocessing.StandardScaler().fit(X_Shellcode_test)
X_Shellcode_test=scaler16.transform(X_Shellcode_test)

In [29]:
scaler17 = preprocessing.StandardScaler().fit(X_Worms)
X_Worms=scaler17.transform(X_Worms)

scaler18 = preprocessing.StandardScaler().fit(X_Worms_test)
X_Worms_test=scaler18.transform(X_Worms_test)

In [30]:
scaler19 = preprocessing.StandardScaler().fit(X_normal)
X_normal=scaler19.transform(X_normal)

scaler20 = preprocessing.StandardScaler().fit(X_normal_test)
X_normal_test=scaler20.transform(X_normal_test)

In [31]:
scaler21 = preprocessing.StandardScaler().fit(X_attack)
X_attack=scaler21.transform(X_attack)

scaler22 = preprocessing.StandardScaler().fit(X_attack_test)
X_attack_test=scaler22.transform(X_attack_test)

In [32]:
from sklearn import tree
rf_DoS = tree.DecisionTreeClassifier()
rf_DoS.fit(X_DoS, Y_DoS)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [33]:
rf_Generic=tree.DecisionTreeClassifier()
rf_Generic.fit(X_Generic,Y_Generic)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [34]:
rf_Exploits=tree.DecisionTreeClassifier()
rf_Exploits.fit(X_Exploits,Y_Exploits)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [35]:
rf_Fuzzers=tree.DecisionTreeClassifier()
rf_Fuzzers.fit(X_Fuzzers,Y_Fuzzers)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [36]:
rf_Analysis=tree.DecisionTreeClassifier()
rf_Analysis.fit(X_Analysis,Y_Analysis)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [37]:
rf_Backdoor=tree.DecisionTreeClassifier()
rf_Backdoor.fit(X_Backdoor,Y_Backdoor)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [38]:
rf_Shellcode=tree.DecisionTreeClassifier()
rf_Shellcode.fit(X_Shellcode,Y_Shellcode)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [39]:
rf_Worms=tree.DecisionTreeClassifier()
rf_Worms.fit(X_Worms,Y_Worms)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [40]:
rf_Reconnaissance=tree.DecisionTreeClassifier()
rf_Reconnaissance.fit(X_Reconnaissance,Y_Reconnaissance)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [41]:
rf_normal=tree.DecisionTreeClassifier()
rf_normal.fit(X_normal,Y_normal)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [42]:
rf_attack=tree.DecisionTreeClassifier()
rf_attack.fit(X_attack,Y_attack)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [43]:
rf_DoS.predict(X_DoS_test)
rf_Generic.predict(X_Generic_test)
rf_Exploits.predict(X_Exploits_test)
rf_Fuzzers.predict(X_Fuzzers_test)
rf_Analysis.predict(X_Analysis_test)
rf_Backdoor.predict(X_Backdoor_test)
rf_Shellcode.predict(X_Shellcode_test)
rf_Worms.predict(X_Worms_test)
rf_Reconnaissance.predict(X_Reconnaissance_test)
rf_normal.predict(X_normal_test)
rf_attack.predict(X_attack_test)

array([2, 2, 2, ..., 3, 3, 3], dtype=int64)

In [44]:
Y_DoS_pred=rf_DoS.predict(X_DoS_test)
Y_Generic_pred=rf_Generic.predict(X_Generic_test)
Y_Exploits_pred=rf_Exploits.predict(X_Exploits_test)
Y_Fuzzers_pred=rf_Fuzzers.predict(X_Fuzzers_test)
Y_Analysis_pred=rf_Analysis.predict(X_Analysis_test)
Y_Backdoor_pred=rf_Backdoor.predict(X_Backdoor_test)
Y_Shellcode_pred=rf_Shellcode.predict(X_Shellcode_test)
Y_Worms_pred=rf_Worms.predict(X_Worms_test)
Y_Reconnaissance_pred=rf_Reconnaissance.predict(X_Reconnaissance_test)
Y_normal_pred=rf_normal.predict(X_normal_test)
Y_attack_pred=rf_attack.predict(X_attack_test)
# Create confusion matrix
# pd.crosstab(Y_DoS_test, Y_DoS_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

In [45]:
def accuracy_report(classifier,X,y,num_cv):
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import precision_score
    from sklearn.model_selection import cross_val_predict

#   y_pred = cross_val_predict(classifier, X, y, cv=10)
    scores = cross_val_score(classifier, X,y, cv=num_cv)
    recall = cross_val_score(classifier, X, y, cv=num_cv, scoring='recall_weighted')
    precision = cross_val_score(classifier, X, y, cv=num_cv, scoring='precision_weighted')
    f1 = cross_val_score(classifier,X, y, scoring='f1_weighted', cv=num_cv)
    
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print("Recall: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
    print("Precision: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
    print("F1-Score: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std() * 2))

In [46]:
DoS_accuracy=accuracy_report(rf_DoS,X_DoS_test,Y_DoS_test,10)

Accuracy: 0.93 (+/- 0.01)
Recall: 0.93 (+/- 0.01)
Precision: 0.92 (+/- 0.01)
F1-Score: 0.92 (+/- 0.01)


In [47]:
Exploits_accuracy=accuracy_report(rf_Exploits,X_Exploits_test,Y_Exploits_test,10)

Accuracy: 0.93 (+/- 0.02)
Recall: 0.93 (+/- 0.02)
Precision: 0.93 (+/- 0.02)
F1-Score: 0.92 (+/- 0.02)


In [48]:
Fuzzers_accuracy=accuracy_report(rf_Fuzzers,X_Fuzzers_test,Y_Fuzzers_test,10)

Accuracy: 0.85 (+/- 0.01)
Recall: 0.85 (+/- 0.01)
Precision: 0.85 (+/- 0.02)
F1-Score: 0.85 (+/- 0.01)


In [49]:
Analysis_accuracy=accuracy_report(rf_Analysis,X_Analysis_test,Y_Analysis_test,10)

Accuracy: 0.86 (+/- 0.02)
Recall: 0.86 (+/- 0.02)
Precision: 0.86 (+/- 0.01)
F1-Score: 0.85 (+/- 0.01)


In [50]:
Backdoor_accuracy=accuracy_report(rf_Backdoor,X_Backdoor_test,Y_Backdoor_test,10)

Accuracy: 0.86 (+/- 0.01)
Recall: 0.86 (+/- 0.01)
Precision: 0.86 (+/- 0.02)
F1-Score: 0.85 (+/- 0.01)


In [51]:
Worms_accuracy=accuracy_report(rf_Worms,X_Worms_test,Y_Worms_test,10)

Accuracy: 0.85 (+/- 0.01)
Recall: 0.85 (+/- 0.01)
Precision: 0.85 (+/- 0.02)
F1-Score: 0.84 (+/- 0.01)


In [52]:
Reconnaissance_accuracy=accuracy_report(rf_Reconnaissance,X_Reconnaissance_test,Y_Reconnaissance_test,10)

Accuracy: 0.86 (+/- 0.02)
Recall: 0.86 (+/- 0.02)
Precision: 0.86 (+/- 0.02)
F1-Score: 0.85 (+/- 0.01)


In [53]:
Normal_accuracy=accuracy_report(rf_normal,X_normal_test,Y_normal_test,10)

Accuracy: 1.00 (+/- 0.00)
Recall: 1.00 (+/- 0.00)
Precision: 1.00 (+/- 0.00)
F1-Score: 1.00 (+/- 0.00)


In [54]:
Attack_accuracy=accuracy_report(rf_attack,X_attack_test,Y_attack_test,10)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 0.75 (+/- 0.06)
Recall: 0.75 (+/- 0.06)
Precision: 0.76 (+/- 0.06)
F1-Score: 0.74 (+/- 0.06)


In [None]:
Shellcode_accuracy=accuracy_report(rf_Shellcode,X_Shellcode_test,Y_Shellcode_test,10)

In [None]:
Generic_accuracy=accuracy_report(rf_Generic,X_Generic_test,Y_Generic_test,10)