# SECTION 1: DECLARE THE MODULES

In [None]:
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn diagnostics (for example
# chained-assignment or convergence warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

: 

# SECTION 2: Data import and preprocess
#Run this section, but don't worry if it does not make sense yet; jump to SECTION 3, which is the part related to your HD task.

In [None]:
# Path to the text file that lists attack names with their categories;
# it is opened and parsed where attack_mapping is built.
DataSet = "Week_5_NSL-KDD-Dataset/training_attack_types.txt"
DataSet

: 

In [None]:
# Column names passed to pd.read_csv for the data files: the feature columns
# followed by the two trailing label columns 'attack_type' and 'success_pred'.
header_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'attack_type', 'success_pred',
]

col_names = np.array(header_names)

# Positions (within header_names) of the nominal and binary feature columns.
nominal_idx = [1, 2, 3]
binary_idx = [6, 11, 13, 14, 20, 21]

# Every remaining feature position is numeric. The last two header entries are
# labels, not features, so they are excluded from the range. sorted() makes the
# column order explicit instead of relying on set iteration order.
feature_count = len(header_names) - 2
numeric_idx = sorted(set(range(feature_count)) - set(nominal_idx) - set(binary_idx))

# Translate the index lists into column-name lists, one per feature type.
nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()

# category maps an attack category to the list of attack names in it.
# 'normal' traffic is registered up front under the 'benign' category.
category = defaultdict(list)
category['benign'].append('normal')

# Each non-empty line of the file is expected to hold exactly two
# whitespace-separated fields: "<attack name> <category>".
with open(DataSet, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        attack, cat = fields  # still raises ValueError on a malformed line
        category[cat].append(attack)

# Invert the grouping: attack name -> category.
attack_mapping = {
    attack: cat
    for cat, attacks in category.items()
    for attack in attacks
}

attack_mapping

: 

In [None]:
# Load the training split and attach the coarse attack category to each row.
train_file = "Week_5_NSL-KDD-Dataset/KDDTrain+.txt"

train_df = pd.read_csv(train_file, names=header_names)
# Look up the category of every attack type; an unknown type raises KeyError.
train_df['attack_category'] = [
    attack_mapping[attack_name] for attack_name in train_df['attack_type']
]
# The 'success_pred' column is not used afterwards, so remove it.
train_df = train_df.drop(columns=['success_pred'])

: 

In [None]:
# Load the test split and attach the coarse attack category to each row.
test_file = "Week_5_NSL-KDD-Dataset/KDDTest+.txt"

test_df = pd.read_csv(test_file, names=header_names)
# Look up the category of every attack type; an unknown type raises KeyError.
test_df['attack_category'] = [
    attack_mapping[attack_name] for attack_name in test_df['attack_type']
]
# The 'success_pred' column is not used afterwards, so remove it.
test_df = test_df.drop(columns=['success_pred'])

: 

In [None]:
# Count how often each attack type and each attack category occurs,
# separately for the training and the test split.
train_attack_types = train_df['attack_type'].value_counts()
train_attack_cats = train_df['attack_category'].value_counts()

test_attack_types = test_df['attack_type'].value_counts()
test_attack_cats = test_df['attack_category'].value_counts()

# Horizontal bar chart of the attack-type counts in the training split.
train_attack_types.plot(kind='barh', figsize=(20,10), fontsize=20)

: 

In [None]:
# Horizontal bar chart of the attack-category counts in the training split.
train_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30)

: 

In [None]:
# Horizontal bar chart of the attack-type counts in the test split.
test_attack_types.plot(kind='barh', figsize=(20,15), fontsize=20)

: 

In [None]:
# Horizontal bar chart of the attack-category counts in the test split.
test_attack_cats.plot(kind='barh', figsize=(20,10), fontsize=30)

: 

In [None]:
# Inspect the binary columns, then recode 'su_attempted' (listed among the
# binary columns) so that any value of 2 becomes 0 in both splits.
train_df[binary_cols].describe().transpose()
train_df.groupby(['su_attempted']).size()
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave the frame unchanged.
train_df['su_attempted'] = train_df['su_attempted'].replace(2, 0)
test_df['su_attempted'] = test_df['su_attempted'].replace(2, 0)
# Value counts after recoding (last expression of the cell).
train_df.groupby(['su_attempted']).size()

: 

In [None]:
# Look at the value distribution of 'num_outbound_cmds', then remove the
# column from both splits and from the list of numeric feature names.
train_df.groupby(['num_outbound_cmds']).size()
for _frame in (train_df, test_df):
    _frame.drop(columns=['num_outbound_cmds'], inplace=True)
numeric_cols.remove('num_outbound_cmds')

: 

In [None]:
#Data Preparation
# Split each frame into the target (attack category) and the raw features.
train_Y = train_df['attack_category']
train_x_raw = train_df.drop(['attack_category', 'attack_type'], axis=1)
test_Y = test_df['attack_category']
test_x_raw = test_df.drop(['attack_category', 'attack_type'], axis=1)

# One-hot encode the nominal columns on the concatenated frame so that the
# training and test features end up with the same dummy columns.
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)

# Split back by position. Take explicit copies: these frames are assigned into
# later (feature scaling), and writing into a slice of combined_df is a
# chained assignment whose effect is not reliable.
n_train = len(train_x_raw)
train_x = combined_df.iloc[:n_train].copy()
test_x = combined_df.iloc[n_train:].copy()

# Store dummy variable feature names
dummy_variables = list(set(train_x)-set(combined_df_raw))

: 

In [None]:
# Summary statistics of the training features. In a notebook only the value of
# the last expression (the 'duration' summary) is displayed.
train_x.describe()
train_x['duration'].describe()

: 

In [None]:
# Try StandardScaler on the 'duration' feature alone and summarise the result.
from sklearn.preprocessing import StandardScaler

# Scalers expect a 2-D input, so reshape the column to (n_samples, 1).
durations = train_x['duration'].to_numpy().reshape(-1, 1)
standard_scaler = StandardScaler()
standard_scaler.fit(durations)
scaled_durations = standard_scaler.transform(durations)
pd.Series(scaled_durations.ravel()).describe()

: 

In [None]:
# Try MinMaxScaler on the 'duration' feature alone and summarise the result.
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(durations)
min_max_scaled_durations = min_max_scaler.transform(durations)
pd.Series(min_max_scaled_durations.ravel()).describe()

: 

In [None]:
# Experimenting with RobustScaler on the single 'duration' feature
from sklearn.preprocessing import RobustScaler

# Use a dedicated name for this scaler so the fitted MinMaxScaler held in
# min_max_scaler is not silently replaced by a RobustScaler.
robust_scaler = RobustScaler().fit(durations)
robust_scaled_durations = robust_scaler.transform(durations)
pd.Series(robust_scaled_durations.flatten()).describe()

: 

In [None]:
# Experimenting with MaxAbsScaler on the single 'duration' feature
from sklearn.preprocessing import MaxAbsScaler

max_Abs_scaler = MaxAbsScaler().fit(durations)
# Store the result under its own name so the RobustScaler output kept in
# robust_scaled_durations is not overwritten by MaxAbsScaler values.
max_abs_scaled_durations = max_Abs_scaler.transform(durations)
pd.Series(max_abs_scaled_durations.flatten()).describe()

: 

In [None]:
# Let's proceed with StandardScaler- Apply to all the numeric columns

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
standard_scaler = StandardScaler().fit(train_x[numeric_cols])
train_x[numeric_cols] = standard_scaler.transform(train_x[numeric_cols])
test_x[numeric_cols] = standard_scaler.transform(test_x[numeric_cols])
train_x.describe()

# Binary labels: 0 for 'benign', 1 for every other category.
# Compare with == (value equality). The previous `is 'benign'` tested object
# identity, which is not a reliable string comparison and mislabels rows.
train_Y_bin = train_Y.apply(lambda x: 0 if x == 'benign' else 1)
test_Y_bin = test_Y.apply(lambda x: 0 if x == 'benign' else 1)

: 

# SECTION 3: Multi class classification
#This is the section where you have to add other algorithms, tune algorithms and visualize to compare and analyze algorithms

In [None]:
import time
from sklearn import metrics

: 

In [None]:
# 5-class classification version
from sklearn.tree import DecisionTreeClassifier

# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike time.time(), which follows the adjustable system clock.
start = time.perf_counter()
tree = DecisionTreeClassifier(random_state=17)
tree.fit(train_x, train_Y)
pred_y = tree.predict(test_x)
end = time.perf_counter()

: 

In [None]:
def output():
    """Print evaluation metrics for the most recent classifier run.

    Takes no arguments. Reads the module-level ``test_Y`` (true labels),
    ``pred_y`` (predicted labels) and ``start`` / ``end`` (timer readings),
    which the calling cell must have set beforehand.

    Prints the confusion matrix, the classification report, the false
    positive rate of each class, the accuracy, the zero-one loss and the
    elapsed time. Returns None.
    """
    names = ('benign', 'dos', 'probe', 'r2l', 'u2r')
    # Pin the label order so row/column i always corresponds to names[i], and
    # the matrix keeps one row/column per name even if a class never occurs.
    conf_matrix = metrics.confusion_matrix(test_Y, pred_y, labels=list(names))

    # Per-class counts derived from the matrix (rows: true, columns: predicted).
    TP = np.diag(conf_matrix)
    FP = conf_matrix.sum(axis=0) - TP
    FN = conf_matrix.sum(axis=1) - TP
    TN = conf_matrix.sum() - (FP + FN + TP)
    FPR = FP / (FP + TN)

    print("Confusion Matrix:\n", conf_matrix)
    print("\nClassification report:\n", metrics.classification_report(test_Y, pred_y, digits=5))
    for name, rate in zip(names, FPR):
        print("FPR of ", name, " is: {:.5f}" .format(rate))
    print("\nAccuracy Score: {:.5f}" .format(metrics.accuracy_score(test_Y, pred_y)))
    print("Zero one loss: {:.5f}" .format(metrics.zero_one_loss(test_Y, pred_y)))
    print("Time taken: {:.5f}" .format(end - start), "seconds")

: 

In [None]:
# Report the decision tree run: metrics via output(), then the one-vs-rest
# ROC AUC computed from the tree's predicted class probabilities.
print("Decision Tree Classifier results are shown below.\n")
output()
print("ROC area: ", metrics.roc_auc_score(test_Y, tree.predict_proba(test_x), multi_class="ovr"))

: 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn.fit(train_x, train_Y)
pred_y = knn.predict(test_x)
end = time.perf_counter()
print("K-Nearest Neighbors classifier results are shown below.")
output()
# One-vs-rest ROC AUC from the predicted class probabilities.
print("\nROC area: ", metrics.roc_auc_score(test_Y, knn.predict_proba(test_x), multi_class="ovr"))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
gnb = GaussianNB()
gnb.fit(train_x, train_Y)
pred_y = gnb.predict(test_x)
end = time.perf_counter()
print("Gaussian Naive Bayes classifier results are shown below.")
output()
# One-vs-rest ROC AUC from the predicted class probabilities.
print("\nROC area: ", metrics.roc_auc_score(test_Y, gnb.predict_proba(test_x), multi_class="ovr"))

In [None]:
from sklearn.linear_model import LogisticRegression

# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
# NOTE(review): warnings are suppressed globally at the top of the file, so a
# solver convergence warning would not be visible here — confirm the fit
# converges with the default iteration limit.
clf = LogisticRegression(C=0.75, random_state=17)
clf.fit(train_x, train_Y)
pred_y = clf.predict(test_x)
end = time.perf_counter()
print("Logistic Regression classifier results are shown below.")
output()
# One-vs-rest ROC AUC from the predicted class probabilities.
print("\nROC area: ", metrics.roc_auc_score(test_Y, clf.predict_proba(test_x), multi_class="ovr"))

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM with the default regularisation strength.
# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
svc = SVC(kernel='rbf')
svc.fit(train_x, train_Y)
pred_y = svc.predict(test_x)
end = time.perf_counter()
print("Support Vector Machine classifier results are shown below.")
output()
# ROC AUC is left disabled for this model.
# NOTE(review): SVC provides predict_proba only when built with probability=True.
# print("\nROC area: ", metrics.roc_auc_score(test_Y, svc.predict_proba(test_x), multi_class="ovr"))

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM again, this time with C=0.75.
# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
svc = SVC(kernel='rbf', C=0.75)
svc.fit(train_x, train_Y)
pred_y = svc.predict(test_x)
end = time.perf_counter()
print("Support Vector Machine classifier results are shown below.")
output()
# ROC AUC is left disabled for this model.
# NOTE(review): SVC provides predict_proba only when built with probability=True.
# print("\nROC area: ", metrics.roc_auc_score(test_Y, svc.predict_proba(test_x), multi_class="ovr"))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# start = time.time()
# rfc = RandomForestClassifier(random_state=17)
# rfc.fit(train_x, train_Y)
# pred_y = rfc.predict(test_x)
# end = time.time()
# output()
# print("\nROC area: ", metrics.roc_auc_score(test_Y, rfc.predict_proba(test_x), multi_class="ovr"))

In [None]:
from sklearn.neural_network import MLPClassifier

# Time fitting and prediction together; output() reports end - start.
# perf_counter() is a monotonic clock meant for measuring intervals.
start = time.perf_counter()
neural = MLPClassifier(random_state=17)
neural.fit(train_x, train_Y)
pred_y = neural.predict(test_x)
end = time.perf_counter()
print("Multi-Layer Perceptron classifier results are shown below.")
output()
# One-vs-rest ROC AUC from the predicted class probabilities.
print("\nROC area: ", metrics.roc_auc_score(test_Y, neural.predict_proba(test_x), multi_class="ovr"))