In [1]:
# import pandas for reading and processing the CSV files
import pandas as pd

In [3]:
# tabulate used to print output in a well structured format
!pip install tabulate
from tabulate import tabulate

Defaulting to user installation because normal site-packages is not writeable
Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [4]:
# import the RandomForestClassifier class from the sklearn library
from sklearn.ensemble import RandomForestClassifier

In [5]:
# import the metrics used to report classifier results
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [6]:
# import the helper for splitting data into training and test sets
from sklearn.model_selection import train_test_split

In [7]:
# import matplotlib for plotting graphs
import matplotlib.pyplot as plt

In [8]:
import warnings
# NOTE(review): with no category or module given, this filter ignores every
# warning raised after this point, so library warnings emitted while fitting
# the models below will not be shown — consider narrowing the filter.
warnings.filterwarnings('ignore')
# time measurement
import time

In [9]:
# classifiers: decision tree, SVM and multi-layer perceptron
from sklearn import tree

from sklearn.svm import SVC

from sklearn.neural_network import MLPClassifier

# result generation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


def result_generation(classifier, classifier_name, x_train, x_test, y_train, y_test):
    """Fit a classifier, evaluate it on the test set and report the results.

    Prints a header line, the accuracy, the confusion matrix and the
    classification report, then returns the measurements.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)`` methods
        Fitted in place on the training data.
    classifier_name : str
        Label used in the printed header.
    x_train, x_test : 2-D feature tables exposing ``.shape``
        (for example pandas DataFrames or NumPy arrays).
    y_train, y_test : array-like
        Training and test labels.

    Returns
    -------
    list
        ``[model_building_time, testing_time, accuracy]`` with both times in
        seconds.
    """
    # .shape[1] equals len(x_train.columns) for a DataFrame and also works for arrays
    attribute_count = x_train.shape[1]
    print("Result for " + classifier_name + " classifier with " + str(attribute_count) + " attributes")

    # training -- perf_counter is a monotonic, high-resolution clock meant for
    # measuring intervals, unlike the wall clock returned by time.time()
    fit_started = time.perf_counter()
    classifier.fit(x_train, y_train)
    building_time = time.perf_counter() - fit_started

    # testing
    predict_started = time.perf_counter()
    y_pred = classifier.predict(x_test)
    testing_time = time.perf_counter() - predict_started

    # show result (accuracy is computed once and reused for the return value)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    return [building_time, testing_time, accuracy]

In [10]:
# Load the normalised training dataset into a data frame.
# If KDDTrain_Norm.csv has not been created yet, run explore_dataset.ipynb first.
train_df = pd.read_csv("KDDTrain_Norm.csv")

In [11]:
# Load the normalised testing dataset into a data frame.
# If KDDTest_Norm.csv has not been created yet, run explore_dataset.ipynb first.
test_df = pd.read_csv("KDDTest_Norm.csv")

In [12]:
# show the first rows of the training and testing datasets
print('Training Data')
print(tabulate(train_df.head(5), train_df.columns, tablefmt = 'psql'))
# the second table shows the test set, so label it as such and take the
# headers from the frame that is actually being printed
print('Testing Data')

print(tabulate(test_df.head(5), test_df.columns, tablefmt = 'psql'))

Training Data
+----+------------+-------------+-------------+------------+------------------+-------------+------------+---------------------+-------------+-------------------+--------------+----------------+------------+----------------------+--------------+--------------------+---------------------+-----------------+------------------+-----------+-------------+---------------+-------------------+---------------+-------------------+-----------------+-----------------+----------------------+------------------+----------------------+--------------------------+--------------------------+-------------------------------+-------------------------------+------------------------+----------------------------+------------------------+----------------------------+-----------------+-----------+--------+---------+
|    |   duration |   src_bytes |   dst_bytes |       land |   wrong_fragment |      urgent |        hot |   num_failed_logins |   logged_in |   num_compromised |   root_shell |   su_att

In [13]:
# training data: every column except the label is a feature.
# The label is removed by name rather than by slicing off the last column, so
# the feature/label split does not depend on where 'class' sits in the file.
x_train = train_df.drop(columns=['class'])
y_train = train_df['class']

In [14]:
# prepare the test data: every column except the label is a feature.
# The label is removed by name rather than by slicing off the last column, so
# the feature/label split does not depend on where 'class' sits in the file.
x_test = test_df.drop(columns=['class'])
y_test = test_df['class']

In [15]:
# top-ranked features (found in the explore_dataset.ipynb notebook), in rank order
top_col = [
    "src_bytes",
    "dst_bytes",
    "flag",
    "same_srv_rate",
    "dst_host_same_srv_rate",
    "diff_srv_rate",
    "dst_host_srv_count",
    "count",
    "protocol_type",
    "logged_in",
    "dst_host_same_src_port_rate",
    "service",
    "dst_host_diff_srv_rate",
]

In [16]:
# empty result tables -- building time, testing time and accuracy -- each with
# one row per classifier and the same three columns
_result_columns = ("Classifier", "All_Attributes", "Top_13_Attributes")
df_btime, df_ttime, df_acc = (
    pd.DataFrame({column: [] for column in _result_columns}) for _ in range(3)
)

In [17]:
# random forest: evaluate on all attributes (r1) and on the top-ranked subset (r2)
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
r1 = result_generation(classifier, "RandomForest", x_train, x_test, y_train, y_test)
r2 = result_generation(classifier, "RandomForest", x_train[top_col], x_test[top_col], y_train, y_test)

# result lists are [building time, testing time, accuracy]; append each entry
# as a new row of the matching table
for position, frame in enumerate((df_btime, df_ttime, df_acc)):
    frame.loc[len(frame.index)] = ['RandomForest', r1[position], r2[position]]

Result for RandomForest classifier with 41 attributes
0.7915631653655074
[[8605 4228]
 [ 471 9240]]
              precision    recall  f1-score   support

     anomaly       0.95      0.67      0.79     12833
      normal       0.69      0.95      0.80      9711

    accuracy                           0.79     22544
   macro avg       0.82      0.81      0.79     22544
weighted avg       0.84      0.79      0.79     22544

Result for RandomForest classifier with 13 attributes
0.7830908445706175
[[9626 3207]
 [1683 8028]]
              precision    recall  f1-score   support

     anomaly       0.85      0.75      0.80     12833
      normal       0.71      0.83      0.77      9711

    accuracy                           0.78     22544
   macro avg       0.78      0.79      0.78     22544
weighted avg       0.79      0.78      0.78     22544



In [18]:
# create decision tree classifier
# random_state is fixed so repeated runs build the same tree and report the
# same figures, matching the seeded RandomForest and MLP classifiers
classifier = tree.DecisionTreeClassifier(random_state=0)
r1 = result_generation(classifier, "DecisionTree", x_train, x_test, y_train, y_test)
r2 = result_generation(classifier, "DecisionTree", x_train[top_col], x_test[top_col], y_train, y_test)

# result lists are [building time, testing time, accuracy]
df_btime.loc[len(df_btime.index)] = ['DecisionTree', r1[0], r2[0]]
df_ttime.loc[len(df_ttime.index)] = ['DecisionTree', r1[1], r2[1]]
df_acc.loc[len(df_acc.index)] = ['DecisionTree', r1[2], r2[2]]

Result for DecisionTree classifier with 41 attributes
0.7950674237047551
[[9762 3071]
 [1549 8162]]
              precision    recall  f1-score   support

     anomaly       0.86      0.76      0.81     12833
      normal       0.73      0.84      0.78      9711

    accuracy                           0.80     22544
   macro avg       0.79      0.80      0.79     22544
weighted avg       0.80      0.80      0.80     22544

Result for DecisionTree classifier with 13 attributes
0.7758161816891412
[[10030  2803]
 [ 2251  7460]]
              precision    recall  f1-score   support

     anomaly       0.82      0.78      0.80     12833
      normal       0.73      0.77      0.75      9711

    accuracy                           0.78     22544
   macro avg       0.77      0.77      0.77     22544
weighted avg       0.78      0.78      0.78     22544



In [None]:
# linear-kernel SVM: evaluate on all attributes (r1) and on the top-ranked subset (r2)
classifier = SVC(kernel='linear', C=1)
r1 = result_generation(classifier, "SVM", x_train, x_test, y_train, y_test)
r2 = result_generation(classifier, "SVM", x_train[top_col], x_test[top_col], y_train, y_test)

# result lists are [building time, testing time, accuracy]; append each entry
# as a new row of the matching table
for position, frame in enumerate((df_btime, df_ttime, df_acc)):
    frame.loc[len(frame.index)] = ['SVM', r1[position], r2[position]]

In [None]:
# MLP with four hidden layers, each as wide as the number of input attributes

# all attributes
ncount = len(x_train.columns)
classifier = MLPClassifier(hidden_layer_sizes=(ncount,) * 4, activation="relu", random_state=1)
r1 = result_generation(classifier, "MLP", x_train, x_test, y_train, y_test)

# top-ranked attributes only (a fresh network sized for the smaller input)
ncount = len(top_col)
classifier = MLPClassifier(hidden_layer_sizes=(ncount,) * 4, activation="relu", random_state=1)
r2 = result_generation(classifier, "MLP", x_train[top_col], x_test[top_col], y_train, y_test)

# result lists are [building time, testing time, accuracy]; append each entry
# as a new row of the matching table
for position, frame in enumerate((df_btime, df_ttime, df_acc)):
    frame.loc[len(frame.index)] = ['MLP', r1[position], r2[position]]

In [None]:
# plot all graphs from output of classifiers
print(df_btime)
print(df_ttime)
print(df_acc)


def _indexed_by_classifier(frame):
    """Return ``frame`` indexed by its 'Classifier' column.

    When 'Classifier' is no longer a column (the frame was already re-indexed
    by an earlier run of this cell) the frame is returned unchanged, so the
    cell can be re-run without raising KeyError.
    """
    if 'Classifier' in frame.columns:
        return frame.set_index('Classifier')
    return frame


def _annotate_bars(ax, value_format):
    """Write every bar's height just above the bar, formatted with ``value_format``."""
    for bar in ax.patches:
        ax.annotate(format(bar.get_height(), value_format),
                    (bar.get_x() + bar.get_width() / 2,
                     bar.get_height()), ha='center', va='center',
                    size=15, xytext=(0, 8),
                    textcoords='offset points')


df_btime = _indexed_by_classifier(df_btime)
df_ttime = _indexed_by_classifier(df_ttime)
df_acc = _indexed_by_classifier(df_acc)

plots1 = df_btime.plot.bar(rot=0, title="Model Building Time")
plots2 = df_ttime.plot.bar(rot=0, title="Model Evaluation Time")
plots3 = df_acc.plot.bar(rot=0, title="Model Accuracy")

# label the value axis so each figure can be read on its own
plots1.set_ylabel("Seconds")
plots2.set_ylabel("Seconds")
plots3.set_ylabel("Accuracy")

# times are shown with two decimals, accuracy with six
_annotate_bars(plots1, '.2f')
_annotate_bars(plots2, '.2f')
_annotate_bars(plots3, '.6f')

plt.show()