# **Data Preprocessing**

In [None]:
# importing required libraries
import numpy as np
import pandas as pd

In [None]:
# dataset doesn't have column names, so we have to provide it

# names are listed in the same order as the columns of the file;
# str.split() on whitespace turns the block below into a plain list of names
col_names = """
    duration protocol_type service flag src_bytes
    dst_bytes land wrong_fragment urgent hot num_failed_logins
    logged_in num_compromised root_shell su_attempted num_root
    num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate
    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate
    diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate
    dst_host_rerror_rate dst_host_srv_rerror_rate label difficulty_level
""".split()

In [None]:
# importing dataset (the file has no header row, so col_names supplies the names)
# NOTE: the path uses forward slashes; with backslashes in a normal string
# literal the '\r' in 'data\raw' is read as a carriage-return escape and the
# path is corrupted. Forward slashes are also accepted on Windows.
data = pd.read_csv('data/raw/KDDTrain+.txt',header=None, names=col_names)

In [None]:
# display the loaded dataset (a bare expression at the end of a cell is shown as its output)
data

In [None]:
# the 'difficulty_level' column is dropped from data in place,
# then the resulting (rows, columns) shape is displayed
data.drop(columns=['difficulty_level'],inplace=True)
data.shape

In [None]:
# descriptive statistics of dataset
# (describe() with default arguments summarises the numeric columns only)
data.describe()

In [None]:
# number of rows per label value, before the labels are grouped into attack classes
data['label'].value_counts()

In [None]:
# changing attack labels to their respective attack class
def change_label(df):
  """Collapse the individual attack names in ``df['label']`` into attack classes.

  Every attack name listed below is rewritten to the name of its class
  ('Dos', 'R2L', 'Probe' or 'U2R'). Any other value (for example 'normal')
  is left unchanged.

  Args:
    df: DataFrame with a ``label`` column. It is modified in place.

  Returns:
    None.
  """
  attack_classes = {
    'Dos': ['apache2','back','land','neptune','mailbomb','pod','processtable',
            'smurf','teardrop','udpstorm','worm'],
    'R2L': ['ftp_write','guess_passwd','httptunnel','imap','multihop','named',
            'phf','sendmail','snmpgetattack','snmpguess','spy','warezclient',
            'warezmaster','xlock','xsnoop'],
    'Probe': ['ipsweep','mscan','nmap','portsweep','saint','satan'],
    'U2R': ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],
  }
  # invert to {attack name: class} so one replace() pass covers every class
  attack_to_class = {attack: attack_class
                     for attack_class, attacks in attack_classes.items()
                     for attack in attacks}
  # the result is assigned back to the column: replace(inplace=True) on the
  # Series returned by df.label is a chained in-place update, which does not
  # modify df when pandas Copy-on-Write is active
  df['label'] = df['label'].replace(attack_to_class)

In [None]:
# calling change_label() function; it rewrites data['label'] and returns nothing
change_label(data)

In [None]:
# distribution of attack classes: number of rows per value of 'label'
# after the attack names have been grouped
data.label.value_counts()

# **Data Normalization**

In [None]:
# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [None]:
# selecting numeric attributes columns from data
# (numeric_col holds the column names, not the values; these are the columns
# that get normalized below)
numeric_col = data.select_dtypes(include='number').columns

In [None]:
# a single StandardScaler instance is shared by every normalization() call
std_scaler = StandardScaler()
def normalization(df,col):
  """Standardize the columns named in ``col`` and return ``df``.

  Each column is scaled on its own: the shared scaler is refit on every
  column, so after the call it only holds the statistics of the last one.
  ``df`` itself is modified; pass a copy to keep the original values.
  """
  for name in col:
    # the scaler expects a 2-D input, so the column is reshaped to (n_rows, 1)
    column_2d = np.array(df[name]).reshape(-1, 1)
    df[name] = std_scaler.fit_transform(column_2d)
  return df

In [None]:
# first rows of data before normalization, for comparison with the output further down
data.head()

In [None]:
# calling the normalization() function on a copy of data and rebinding
# data to the normalized result
data = normalization(data.copy(),numeric_col)

In [None]:
# first rows of data after normalization (numeric columns are now standardized)
data.head()

# **One-hot-encoding**

In [None]:
# names of the categorical attributes that are one-hot-encoded below
cat_col = [
    'protocol_type',
    'service',
    'flag',
]

In [None]:
# creating a dataframe with only categorical attributes
# (the columns listed in cat_col), then showing its first rows
categorical = data[cat_col]
categorical.head()

In [None]:
# one-hot-encode the categorical attributes with pandas.get_dummies(),
# then cast the indicator columns to integers (0/1)
categorical = pd.get_dummies(categorical,columns=cat_col).astype(int)
categorical.head()

# **Binary Classification**

In [None]:
# binary view of the labels: 'normal' stays 'normal', every other value becomes
# 'abnormal'; the result is a one-column ('label') frame indexed like data
bin_label = pd.DataFrame(
    {'label': ['normal' if name == 'normal' else 'abnormal' for name in data.label]},
    index=data.index,
)

In [None]:
# creating a dataframe with binary labels (normal,abnormal):
# a copy of data whose 'label' column is overwritten with the binary labels
bin_data = data.copy()
bin_data['label'] = bin_label

In [None]:
# label encoding (0,1) binary labels (abnormal,normal)
le1 = preprocessing.LabelEncoder()
# apply() runs fit_transform on the single 'label' column of bin_label
enc_label = bin_label.apply(le1.fit_transform)
# the integer-encoded label is stored in a new 'intrusion' column
bin_data['intrusion'] = enc_label

In [None]:
# class names learned by le1; the position of a name is its encoded integer
le1.classes_

In [None]:
# saving the class names of le1 so the integer encoding can be mapped back later
# NOTE: forward slash instead of backslash; '\l' is an invalid escape sequence
# in a normal string literal and a backslash is only a path separator on Windows.
# np.save() does not create directories, so 'label_classes' must already exist.
np.save("label_classes/le1_classes.npy",le1.classes_,allow_pickle=True)

In [None]:
# dataset with binary labels ('label') and the label encoded column ('intrusion')
bin_data.head()

In [None]:
# one-hot-encoding attack label: get_dummies() replaces 'label' with one
# indicator column per class ('abnormal' and 'normal', named without a prefix)

bin_data = pd.get_dummies(bin_data,columns=['label'],prefix="",prefix_sep="")
# get_dummies() removed the original 'label' column, so it is added back
bin_data['label'] = bin_label
bin_data

In [None]:
# cast the two indicator columns to integers (1 and 0)
for flag_col in ('abnormal', 'normal'):
  bin_data[flag_col] = bin_data[flag_col].astype(int)
bin_data.head()

In [None]:
# importing library for plotting
import matplotlib.pyplot as plt

In [None]:
# pie chart distribution of normal and abnormal labels
# the counts and the wedge labels come from the same Series: value_counts() is
# ordered by frequency while unique() is ordered by first appearance, so
# pairing the two could attach a label to the wrong wedge
label_counts = bin_data.label.value_counts()
plt.figure(figsize=(8,8))
plt.pie(label_counts,labels=label_counts.index,autopct='%0.2f%%')
plt.title("Pie chart distribution of normal and abnormal labels")
plt.legend()
# savefig is called before show() so the figure still has its content when written
plt.savefig('plots/Pie_chart_binary.png')
plt.show()

# **Multi-class Classification**

In [None]:
# creating a dataframe with multi-class labels (Dos,Probe,R2L,U2R,normal)
multi_data = data.copy()
# one-column frame holding the class labels, reused for encoding and for
# restoring 'label' after one-hot-encoding
multi_label = pd.DataFrame(multi_data.label)

In [None]:
# label encoding (0,1,2,3,4) multi-class labels (Dos,Probe,R2L,U2R,normal)
# LabelEncoder numbers the sorted class names, and uppercase letters sort
# before lowercase, so 'normal' gets the last code (see le2.classes_)
le2 = preprocessing.LabelEncoder()
# apply() runs fit_transform on the single 'label' column of multi_label
enc_label = multi_label.apply(le2.fit_transform)
# the integer-encoded label is stored in a new 'intrusion' column
multi_data['intrusion'] = enc_label

In [None]:
# first rows of the multi-class dataset, now including the 'intrusion' column
multi_data.head()

In [None]:
# class names learned by le2; the position of a name is its encoded integer
le2.classes_

In [None]:
# saving the class names of le2 so the integer encoding can be mapped back later
# NOTE: forward slash instead of backslash; '\l' is an invalid escape sequence
# in a normal string literal and a backslash is only a path separator on Windows.
# np.save() does not create directories, so 'label_classes' must already exist.
np.save("label_classes/le2_classes.npy",le2.classes_,allow_pickle=True)

In [None]:
# one-hot-encoding attack label: get_dummies() replaces 'label' with one
# indicator column per class, named after the class without a prefix
multi_data = pd.get_dummies(multi_data,columns=['label'],prefix="",prefix_sep="") 
# get_dummies() removed the original 'label' column, so it is added back
multi_data['label'] = multi_label
multi_data

In [None]:
# cast each class indicator column to integers (1 and 0)
for class_col in ('Dos', 'Probe', 'R2L', 'U2R', 'normal'):
  multi_data[class_col] = multi_data[class_col].astype(int)

In [None]:
# first rows of the multi-class dataset after the indicator columns were cast to int
multi_data.head()   

In [None]:
# pie chart distribution of multi-class labels
# the counts and the wedge labels come from the same Series: value_counts() is
# ordered by frequency while unique() is ordered by first appearance, so
# pairing the two could attach a class name to the wrong wedge
class_counts = multi_data.label.value_counts()
plt.figure(figsize=(8,8))
plt.pie(class_counts,labels=class_counts.index,autopct='%0.2f%%')
plt.title('Pie chart distribution of multi-class labels')
plt.legend()
# savefig is called before show() so the figure still has its content when written
plt.savefig('plots/Pie_chart_multi.png')
plt.show()