# Data Preparation and EDA for IoT23 dataset

In [1]:
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import csv
from numpy import array
from numpy import argmax
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [2]:
data= pd.read_csv('preprocessed_iot23_1M_samples.csv', low_memory=False)

In [69]:
data.head()  

Unnamed: 0.1,Unnamed: 0,ts,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto,conn_state,label
0,0,1536227000.0,17576.0,8081.0,3e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,5,POHS
1,1,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,5,POHS
2,2,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,5,POHS
3,3,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,5,POHS
4,4,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,2.0,80.0,0.0,0.0,1,5,POHS


In [3]:
del data['Unnamed: 0']

In [4]:
data.tail()

Unnamed: 0,ts,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto,conn_state,label
1244126,1532526000.0,41762.0,23.0,0.0,0.0,0.0,0,0,0.0,1.0,40.0,0.0,0.0,1,5,POHS
1244127,1532526000.0,58758.0,23.0,0.0,0.0,0.0,0,0,0.0,1.0,40.0,0.0,0.0,1,5,POHS
1244128,1532526000.0,40400.0,23.0,0.0,0.0,0.0,0,0,0.0,1.0,40.0,0.0,0.0,1,5,POHS
1244129,1532526000.0,27117.0,23.0,0.0,0.0,0.0,0,0,0.0,1.0,40.0,0.0,0.0,1,5,POHS
1244130,1532526000.0,23227.0,23.0,0.0,0.0,0.0,0,0,0.0,1.0,40.0,0.0,0.0,1,5,POHS


In [5]:
data.shape

(1244131, 16)

In [66]:
print(data.columns)

Index(['ts', 'id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes',
       'local_orig', 'local_resp', 'missed_bytes', 'orig_pkts',
       'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto', 'conn_state',
       'label'],
      dtype='object')


In [6]:
#we drop port numbers because they can cause overfitting of the model
df= data.drop(columns=['id.orig_p','id.resp_p'])

In [8]:
#separate the X variables
X=df.iloc[:,:-1]

In [9]:
#separate labels (y)
y=data['label']

In [10]:
X.describe()

Unnamed: 0,ts,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto,conn_state
count,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0,1244131.0
mean,1546090000.0,0.4382011,3162.748,12.28624,0.0,0.0,0.05353777,206.3864,8971.911,0.03341047,13.82442,1.011863,4.577432
std,9480849.0,55.60016,1847804.0,10466.2,0.0,0.0,15.25009,77913.96,3070481.0,7.342897,10839.93,0.1103718,1.502551
min,1532101000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1538493000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,0.0,1.0,5.0
50%,1545459000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,0.0,1.0,5.0
75%,1551379000.0,2e-06,0.0,0.0,0.0,0.0,0.0,2.0,80.0,0.0,0.0,1.0,5.0
max,1569018000.0,48976.82,1744830000.0,11661020.0,0.0,0.0,5792.0,66027350.0,1914793000.0,8055.0,12076990.0,2.0,11.0


In [11]:
X.isnull().sum()

ts               0
duration         0
orig_bytes       0
resp_bytes       0
local_orig       0
local_resp       0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
proto            0
conn_state       0
dtype: int64

In [12]:
X = np.asarray(X).astype(np.float32)

In [13]:
y

0          POHS
1          POHS
2          POHS
3          POHS
4          POHS
           ... 
1244126    POHS
1244127    POHS
1244128    POHS
1244129    POHS
1244130    POHS
Name: label, Length: 1244131, dtype: object

In [14]:
y.value_counts()

POHS      677827
Okiru     262690
Benign    149744
DDoS      138777
C&C        15093
Name: label, dtype: int64

In [15]:
le=LabelEncoder()

In [16]:
y=le.fit_transform(y)

In [17]:
y

array([4, 4, 4, ..., 4, 4, 4])

In [18]:
#the labels are encoded in alphabetic order
#0---->Benign
#1---->Cnc
#2---->DDoS
#3---->Okiru
#4---->POHScan


In [19]:
le.inverse_transform(y)

array(['POHS', 'POHS', 'POHS', ..., 'POHS', 'POHS', 'POHS'], dtype=object)

In [20]:
y.shape

(1244131,)

In [21]:
sc=StandardScaler()

In [22]:
Xs=sc.fit_transform(X)

=======================================================================

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split
import time as timer

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(Xs, y, random_state=124, test_size=0.20, shuffle=True)

# Gussian Naive Bayes 

In [61]:
GaussianNB?

In [25]:
start = timer.time()
gnb_model = GaussianNB()
gnb_model.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 1.07 seconds


In [26]:
# Predicting the results on train set
y_gnbr = gnb_model.predict(X_train)

In [27]:
print("Accuracy score for GNB: {:.4f}".format(accuracy_score(Y_train,y_gnbr)))

Accuracy score for GNB: 0.3828


In [28]:
# Predicting the test set results
y_gnb = gnb_model.predict(X_test)
y_gnb_prob = gnb_model.predict_proba(X_test)

In [29]:
print("Accuracy score for GNB: {:.4f}".format(accuracy_score(Y_train,y_gnbr)))

Accuracy score for GNB: 0.3828


In [63]:
print("Classification report for GNB: \n{}".format(classification_report(Y_test,y_gnb)))
print("Confusion matrix for GNB: \n{}".format(confusion_matrix(Y_test,y_gnb)))
print("Accuracy score for GNB: {:.4f}".format(accuracy_score(Y_test,y_gnb)))
# calculate precision, recall, and f1 scores
prec_gnb = precision_score(Y_test,y_gnb,average='weighted')
rec_gnb = recall_score(Y_test,y_gnb,average='weighted')
f1_gnb = f1_score(Y_test,y_gnb,average='weighted')
print("Precision score for GNB: {:.4f}".format(prec_gnb))
print("Recall score for GNB: {:.4f}".format(rec_gnb))
print("F1 score for GNB: {:.4f}".format(f1_gnb))

Classification report for GNB: 
              precision    recall  f1-score   support

           0       1.00      0.10      0.19     29952
           1       0.55      0.11      0.19      2945
           2       1.00      0.82      0.90     27853
           3       0.27      1.00      0.42     52514
           4       0.60      0.12      0.20    135563

    accuracy                           0.38    248827
   macro avg       0.68      0.43      0.38    248827
weighted avg       0.62      0.38      0.33    248827

Confusion matrix for GNB: 
[[  3108    230      4  16503  10107]
 [     0    334      0   1694    917]
 [     2      2  22704   5144      1]
 [     0     41      0  52472      1]
 [     1      0      1 119036  16525]]
Accuracy score for GNB: 0.3824
Precision score for GNB: 0.6223
Recall score for GNB: 0.3824
F1 score for GNB: 0.3253


In [69]:
cf=confusion_matrix(Y_test,y_gnb)

In [70]:
#accuracy for each class
cf.diagonal()/cf.sum(axis=1)

array([0.10376603, 0.11341256, 0.81513661, 0.99920021, 0.12189904])

# Logistic Regression 

In [205]:
LogisticRegression?

In [31]:
start = timer.time()
logit_model = LogisticRegression(solver='sag', max_iter=300, multi_class='multinomial')
logit_model.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 70.79 seconds


In [33]:
# Predicting the train set results
y_logitr = logit_model.predict(X_train)

In [34]:
print("Accuracy score for LR: {:.4f}".format(accuracy_score(Y_train,y_logitr)))

Accuracy score for LR: 0.6489


In [35]:
# Predicting the test set results
y_logit = logit_model.predict(X_test)
y_logit_prob = logit_model.predict_proba(X_test)

In [36]:
print("Classification report for LR: \n{}".format(classification_report(Y_test,y_logit)))
print("Confusion matrix for LR: \n{}".format(confusion_matrix(Y_test,y_logit)))
print("Accuracy score for LR: {:.4f}".format(accuracy_score(Y_test,y_logit)))
# calculate precision, recall, and f1 scores
prec_logit = precision_score(Y_test,y_logit,average='weighted')
rec_logit = recall_score(Y_test,y_logit,average='weighted')
f1_logit = f1_score(Y_test,y_logit,average='weighted')
print("Precision score for LR: {:.4f}".format(prec_logit))
print("Recall score for LR: {:.4f}".format(rec_logit))
print("F1 score for LR: {:.4f}".format(f1_logit))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report for LR: 
              precision    recall  f1-score   support

           0       1.00      0.10      0.18     29952
           1       0.00      0.00      0.00      2945
           2       1.00      0.82      0.90     27853
           3       0.48      0.38      0.42     52514
           4       0.64      0.85      0.73    135563

    accuracy                           0.65    248827
   macro avg       0.62      0.43      0.45    248827
weighted avg       0.68      0.65      0.61    248827

Confusion matrix for LR: 
[[  3041      0     62     27  26822]
 [     1      0      0   1625   1319]
 [     3      0  22702      0   5148]
 [     0      0     41  19806  32667]
 [     1      0      0  19818 115744]]
Accuracy score for LR: 0.6482


  _warn_prf(average, modifier, msg_start, len(result))


Precision score for LR: 0.6799
Recall score for LR: 0.6482
F1 score for LR: 0.6092


In [65]:
cf=confusion_matrix(Y_test,y_logit)

In [66]:
#accuracy for each class
cf.diagonal()/cf.sum(axis=1)

array([0.10152911, 0.        , 0.8150648 , 0.37715657, 0.85380229])

# RandomForestClassifier

In [37]:
start = timer.time()
Rf_clf=RandomForestClassifier(n_estimators =10, max_depth=20, criterion = 'entropy', random_state = 124)
Rf_clf.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.4f} seconds".format(end-start))

Finished training within 25.8060 seconds


In [38]:
#checking train accuracy
y_RF_train = Rf_clf.predict(X_train)

In [59]:
print("Training Accuracy score for RF: {:.4f}".format(accuracy_score(Y_train,y_RF_train)))

Training Accuracy score for RF: 0.9032


In [40]:
y_RF = Rf_clf.predict(X_test)
y_RF_prob = Rf_clf.predict_proba(X_test)

In [58]:
print("Classification report for RF: \n{}".format(classification_report(Y_test,y_RF)))
print("Confusion matrix for RF: \n{}".format(confusion_matrix(Y_test,y_RF)))
print("Accuracy score for RF: {:.4f}".format(accuracy_score(Y_test,y_RF)))
# calculate precision, recall, and f1 scores
prec_rf = precision_score(Y_test,y_RF, average='weighted')
rec_rf = recall_score(Y_test,y_RF,average='weighted')
f1_rf = f1_score(Y_test,y_RF,average='weighted')
print("Precision score for DT: {:.4f}".format(prec_rf))
print("Recall score for DT: {:.4f}".format(rec_rf))
print("F1 score for DF: {:.4f}".format(f1_rf))


Classification report for RF: 
              precision    recall  f1-score   support

           0       1.00      0.78      0.88     29952
           1       1.00      0.99      1.00      2945
           2       1.00      0.82      0.90     27853
           3       1.00      0.76      0.86     52514
           4       0.85      1.00      0.92    135563

    accuracy                           0.90    248827
   macro avg       0.97      0.87      0.91    248827
weighted avg       0.92      0.90      0.90    248827

Confusion matrix for RF: 
[[ 23483      7      3      0   6459]
 [    18   2927      0      0      0]
 [     1      0  22735      0   5117]
 [     2      0      1  39813  12698]
 [     2      1      0      0 135560]]
Accuracy score for RF: 0.9023
Precision score for DT: 0.9171
Recall score for DT: 0.9023
F1 score for DF: 0.9002


In [71]:
#accuracy for each class
cf=confusion_matrix(Y_test,y_RF)
cf.diagonal()/cf.sum(axis=1)

array([0.7840211 , 0.99388795, 0.8162496 , 0.75814069, 0.99997787])

# Decision Trees

In [42]:
start = timer.time()
DT_clf= DecisionTreeClassifier(max_depth=10)
DT_clf.fit(X_train,Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 5.63 seconds


In [61]:
#checking train accuracy
y_DT_train = DT_clf.predict(X_train)

In [62]:
print("Train Accuracy score for DT:5.63 {:.4f}".format(accuracy_score(Y_train,y_DT_train)))

Train Accuracy score for DT:5.63 0.9031


In [43]:
#test accuracy
y_DT = DT_clf.predict(X_test)
y_DT_prob = DT_clf.predict_proba(X_test)

In [56]:
print("Classification report for DT: \n{}".format(classification_report(Y_test,y_DT)))
print("Confusion matrix for DT: \n{}".format(confusion_matrix(Y_test,y_DT)))
print("Accuracy score for DT: {:.4f}".format(accuracy_score(Y_test,y_DT)))
# calculate precision, recall, and f1 scores
prec_dt = precision_score(Y_test,y_DT, average='weighted')
rec_dt = recall_score(Y_test,y_DT, average='weighted')
f1_dt = f1_score(Y_test,y_DT, average='weighted')
print("Precision score for DT: {:.4f}".format(prec_dt))
print("Recall score for DT: {:.4f}".format(rec_dt))
print("F1 score for DF: {:.4f}".format(f1_dt))


Classification report for DT: 
              precision    recall  f1-score   support

           0       1.00      0.78      0.88     29952
           1       1.00      0.99      1.00      2945
           2       1.00      0.82      0.90     27853
           3       1.00      0.76      0.86     52514
           4       0.85      1.00      0.92    135563

    accuracy                           0.90    248827
   macro avg       0.97      0.87      0.91    248827
weighted avg       0.92      0.90      0.90    248827

Confusion matrix for DT: 
[[ 23485      3      4      0   6460]
 [    20   2924      0      0      1]
 [     3      0  22734      0   5116]
 [     3      0      0  39814  12697]
 [     1      3      0      0 135559]]
Accuracy score for DT: 0.9023
Precision score for DT: 0.9171
Recall score for DT: 0.9023
F1 score for DF: 0.9002


In [72]:
#accuracy for each class
cf=confusion_matrix(Y_test,y_DT)
cf.diagonal()/cf.sum(axis=1)

array([0.78408787, 0.99286927, 0.81621369, 0.75815973, 0.99997049])

# KNN 

In [53]:
start = timer.time()
knn_model = neighbors.KNeighborsClassifier(p=2,leaf_size=20, n_neighbors=5, n_jobs=-1)
knn_model.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 2.95 seconds


In [54]:
start = timer.time()
y_KNN = knn_model.predict(X_test)
y_KNN_prob = knn_model.predict_proba(X_test)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 921.68 seconds


In [52]:
#checking train accuracy
y_KNN_train = xg_clf.predict(X_train)
print("Accuracy score for KNN: {:.4f}".format(accuracy_score(Y_train,y_KNN_train)))

Accuracy score for KNN: 0.9033


In [55]:
# print("Classification report for KNN: \n{}".format(classification_report(Y_test,y_KNN)))
print("Confusion matrix for KNN: \n{}".format(confusion_matrix(Y_test,y_KNN)))
print("Accuracy score for KNN: {:.4f}".format(accuracy_score(Y_test,y_KNN)))
# calculate precision, recall, and f1 scores
prec_knn = precision_score(Y_test,y_KNN,average='weighted')
rec_knn = recall_score(Y_test,y_KNN,average='weighted')
f1_knn = f1_score(Y_test,y_KNN,average='weighted')
print("Precision score for KNN: {:.4f}".format(prec_knn))
print("Recall score for KNN: {:.4f}".format(rec_knn))
print("F1 score for KNN: {:.4f}".format(f1_knn))


Confusion matrix for KNN: 
[[ 23483      4      4    346   6115]
 [    27   2915      0      0      3]
 [     1      1  23046   3242   1563]
 [     3      0    461  46241   5809]
 [     2      1    764  15989 118807]]
Accuracy score for KNN: 0.8620
Precision score for KNN: 0.8758
Recall score for KNN: 0.8620
F1 score for KNN: 0.8647


In [73]:
#accuracy for each class
cf=confusion_matrix(Y_test,y_KNN)
cf.diagonal()/cf.sum(axis=1)

array([0.7840211 , 0.98981324, 0.82741536, 0.88054614, 0.87639695])

# XGBoost Classifier

In [45]:
start = timer.time()
xg_clf=XGBClassifier(use_label_encoder=False)
xg_clf.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 184.57 seconds


In [46]:
#checking train accuracy
y_xg_train = xg_clf.predict(X_train)
print("Accuracy score for XGB: {:.4f}".format(accuracy_score(Y_train,y_xg_train)))

Accuracy score for DT: 0.9033


In [47]:
start = timer.time()
y_xg = xg_clf.predict(X_test)
y_xg_prob = xg_clf.predict_proba(X_test)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 1.15 seconds


In [48]:
print("Classification report for XGBoost: \n{}".format(classification_report(Y_test,y_xg)))
print("Confusion matrix for XGBoost: \n{}".format(confusion_matrix(Y_test,y_xg)))
print("Accuracy score for XGBoost: {:.4f}".format(accuracy_score(Y_test,y_xg)))
# calculate precision, recall, and f1 scores
prec_knn = precision_score(Y_test,y_xg,average='weighted')
rec_knn = recall_score(Y_test,y_xg,average='weighted')
f1_knn = f1_score(Y_test,y_xg,average='weighted')
print("Precision score for XGBoost: {:.4f}".format(prec_knn))
print("Recall score for XGBoost: {:.4f}".format(rec_knn))
print("F1 score for XGBoost: {:.4f}".format(f1_knn))

Classification report for XGBoost: 
              precision    recall  f1-score   support

           0       1.00      0.78      0.88     29952
           1       1.00      0.99      1.00      2945
           2       1.00      0.82      0.90     27853
           3       1.00      0.76      0.86     52514
           4       0.85      1.00      0.92    135563

    accuracy                           0.90    248827
   macro avg       0.97      0.87      0.91    248827
weighted avg       0.92      0.90      0.90    248827

Confusion matrix for XGBoost: 
[[ 23487      3      3      0   6459]
 [    20   2923      0      0      2]
 [     2      0  22774     15   5062]
 [     2      0     14  39857  12641]
 [     2      0     42     27 135492]]
Accuracy score for XGBoost: 0.9024
Precision score for XGBoost: 0.9169
Recall score for XGBoost: 0.9024
F1 score for XGBoost: 0.9003


In [75]:
#accuracy for each class
cf=confusion_matrix(Y_test,y_xg)
cf.diagonal()/cf.sum(axis=1)

array([0.78415465, 0.99252971, 0.8176498 , 0.75897856, 0.99947626])