*** STEP 1: NORMALIZE DATASET***


In [19]:
# Import dependencies:

import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

In [20]:
path = '/Datasets/NSL_KDD/data/'


columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot'
,'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations'
,'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate'
,'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count'
,'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate'
,'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','outcome','level'])

train = path + 'KDDTrain+.csv'
test = path + 'KDDTest+.csv'

train_df = pd.read_csv(train, names=columns)
test_df = pd.read_csv(test, names=columns)

train_df.shape, test_df.shape

((125973, 43), (22544, 43))

In [21]:
df = pd.concat([train_df, test_df], ignore_index=True)
df.shape

(148517, 43)

In [22]:
print(df['outcome'].value_counts())

normal             77054
neptune            45871
satan               4368
ipsweep             3740
smurf               3311
portsweep           3088
nmap                1566
back                1315
guess_passwd        1284
mscan                996
warezmaster          964
teardrop             904
warezclient          890
apache2              737
processtable         685
snmpguess            331
saint                319
mailbomb             293
pod                  242
snmpgetattack        178
httptunnel           133
buffer_overflow       50
land                  25
multihop              25
rootkit               23
named                 17
ps                    15
sendmail              14
xterm                 13
imap                  12
loadmodule            11
ftp_write             11
xlock                  9
phf                    6
perl                   5
xsnoop                 4
spy                    2
worm                   2
sqlattack              2
udpstorm               2


In [24]:
# Find columns with non-numerical values and the number of their catigories:

for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
      unique_cat = len(df[col_name].unique())
      print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'outcome' has 40 categories


In [25]:
categorical_columns=['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
df_categorical_values.shape

(148517, 3)

In [26]:
# protocol type
unique_protocol1=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol1]
print(unique_protocol2)

# service
unique_service1=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service1]
print(unique_service2)

# flag
unique_flag1=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag1]
print(unique_flag2)


# put them together
dumcols_train = unique_protocol2 + unique_service2 + unique_flag2

['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'ser

In [27]:
# Convert non_numerical values into numerical:

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)

print("before convert:\n" ,df_categorical_values.head())
print('--------------------')
print("after convert:\n", df_categorical_values_enc.head())


before convert:
   protocol_type   service flag
0           tcp  ftp_data   SF
1           udp     other   SF
2           tcp   private   S0
3           tcp      http   SF
4           tcp      http   SF
--------------------
after convert:
    protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


In [28]:
# Apply one_hot encoding:

enc = OneHotEncoder(categories='auto')

df_categorical_values_encenc = enc.fit_transform(df_categorical_values_enc)
df_cat_data = pd.DataFrame(df_categorical_values_encenc.toarray(),columns=dumcols_train)

df_cat_data.shape

(148517, 84)

In [29]:
df_cat_data.head()

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
# drop 3 columns from the dataset:

new_df = df
new_df.drop('flag', axis=1, inplace=True)
new_df.drop('protocol_type', axis=1, inplace=True)
new_df.drop('service', axis=1, inplace=True)
new_df.drop('level', axis=1, inplace=True)


new_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,491,0,0,0,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,146,0,0,0,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,0,0,0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,232,8153,0,0,0,0,0,1,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,199,420,0,0,0,0,0,1,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [31]:
# add df_cat_data to the dataset:

# to add the df_cat_data between column[0] and rest of the dataset:
new_df_left = new_df.iloc[:, :1]
new_df_right = new_df.iloc[:, 1:]

new_df1 = pd.concat([new_df_left, df_cat_data, new_df_right], axis=1)
new_df1.shape

(148517, 123)

In [32]:
# define each label belongs to one class of : Normal(0), DoS(1), Probe(2), U2R(3), R2L(4) and replace them in dataset

df_label = new_df1['outcome']
new_df_label = df_label.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

new_df1['label'] = new_df_label

In [33]:

# print the distribution of every class:

to_drop_Normal = [0]
to_drop_DoS = [1]
to_drop_Probe = [2]
to_drop_R2L = [3]
to_drop_U2R = [4]


Normal_df = new_df1[new_df1['label'].isin(to_drop_Normal)];
DoS_df = new_df1[new_df1['label'].isin(to_drop_DoS)];
Probe_df = new_df1[new_df1['label'].isin(to_drop_Probe)];
R2L_df = new_df1[new_df1['label'].isin(to_drop_R2L)];
U2R_df = new_df1[new_df1['label'].isin(to_drop_U2R)];

print('Dimensions of Normal:' ,Normal_df.shape)
print('Dimensions of DoS:' ,DoS_df.shape)
print('Dimensions of Probe:' ,Probe_df.shape)
print('Dimensions of R2L:' ,R2L_df.shape)
print('Dimensions of U2R:' ,U2R_df.shape)

Dimensions of Normal: (77054, 124)
Dimensions of DoS: (53387, 124)
Dimensions of Probe: (14077, 124)
Dimensions of R2L: (3880, 124)
Dimensions of U2R: (119, 124)


In [34]:
#new
colNames=list(new_df1)
all_data = pd.concat([Normal_df, DoS_df, Probe_df, R2L_df, U2R_df])
# concatenated_array = np.concatenate((array1, array2), axis=0)
# concatenated_df = pd.DataFrame(concatenated_array, columns=df_train.columns)
# concatenated_df.to_csv('/content/drive/MyDrive/concatenated_data.csv', index=False)
all_data1 = all_data.to_numpy()
np.random.shuffle(all_data1)
final_df = pd.DataFrame(all_data1, columns=colNames)
# final_df.to_csv('/content/drive/MyDrive/NSLKDD-Final.csv', index=False)


In [35]:
# split dataset into labels and rest of it:

X_Normal = Normal_df.drop('label',1)
Y_Normal = Normal_df.label

X_DoS = DoS_df.drop('label',1)
Y_DoS = DoS_df.label

X_Probe = Probe_df.drop('label',1)
Y_Probe = Probe_df.label

X_R2L = R2L_df.drop('label',1)
Y_R2L = R2L_df.label

X_U2R = U2R_df.drop('label',1)
Y_U2R = U2R_df.label

  X_Normal = Normal_df.drop('label',1)
  X_DoS = DoS_df.drop('label',1)
  X_Probe = Probe_df.drop('label',1)
  X_R2L = R2L_df.drop('label',1)
  X_U2R = U2R_df.drop('label',1)


In [36]:
# Since the column names will be deleted at this stage, we save the column names for later use.

colNames=list(new_df1)

In [None]:
# Min_Max normalization:

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

#train set
X_Normal_1 = scaler.fit_transform(X_Normal)
X_DoS_1 = scaler.fit_transform(X_DoS)
X_Probe_1 = scaler.fit_transform(X_Probe)
X_R2L_1 = scaler.fit_transform(X_R2L)
X_U2R_1 = scaler.fit_transform(X_U2R)

In [None]:
Normal = np.hstack((X_Normal_1, Y_Normal[:, np.newaxis]))
DoS = np.hstack((X_DoS_1, Y_DoS[:, np.newaxis]))
Probe = np.hstack((X_Probe_1, Y_Probe[:, np.newaxis]))
R2L = np.hstack((X_R2L_1, Y_R2L[:, np.newaxis]))
U2R = np.hstack((X_U2R_1, Y_U2R[:, np.newaxis]))

In [None]:
np_final = np.concatenate((Normal, DoS, Probe, R2L, U2R))
np.random.shuffle(np_final)

final_df = pd.DataFrame(np_final, columns=colNames)
final_df.shape

(148517, 123)

In [None]:
final_df.head()

Unnamed: 0,duration,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,1.8e-05,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.658824,0.97,0.07,0.03,0.01,0.0,0.0,0.03,0.04,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.031496,0.04,0.08,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007874,0.01,0.06,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.047059,1.0,0.0,1.0,0.17,0.0,0.0,0.0,0.0,0.0


In [None]:
def del_repeatetive(final_df):
    return final_df.loc[(final_df!=final_df.shift()).any(axis=1)]

final_df1 = del_repeatetive(final_df.copy())
final_df1.shape

In [None]:
final_df.to_csv('/Datasets/NSL_KDD/Normalized_NSL_KDD.csv', index=False)