# Data Preparation and EDA for IoT23 dataset

In [1]:
import os 
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import csv
from numpy import array
from numpy import argmax
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [2]:
data= pd.read_csv('preprocessed_iot23.csv', low_memory=False)

In [3]:
data.head()  

Unnamed: 0.1,Unnamed: 0,ts,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,...,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,label
0,0,1536227000.0,17576.0,8081.0,3e-06,0.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,POHScan
1,1,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,POHScan
2,2,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,POHScan
3,3,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,POHScan
4,4,1536227000.0,17576.0,8081.0,2e-06,0.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,POHScan


In [4]:
del data['Unnamed: 0']

In [5]:
data.tail()

Unnamed: 0,ts,id.orig_p,id.resp_p,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,...,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,label
2802750,1562165000.0,68.0,67.0,90.034713,3300.0,0.0,0,0,0.0,11.0,...,0,0,1,0,0,0,0,0,0,Benign
2802751,1562165000.0,143.0,0.0,90.39997,340.0,0.0,0,0,0.0,9.0,...,0,0,0,0,0,0,0,0,0,Benign
2802752,1562165000.0,135.0,136.0,89.82403,72.0,0.0,0,0,0.0,3.0,...,0,0,0,0,0,0,0,0,0,Benign
2802753,1562165000.0,143.0,0.0,45.215915,200.0,0.0,0,0,0.0,8.0,...,0,0,0,0,0,0,0,0,0,Benign
2802754,1562165000.0,133.0,134.0,44.242223,0.0,0.0,0,0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,Benign


In [6]:
data.shape

(2802755, 30)

In [7]:
print(data.columns)

Index(['ts', 'id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes',
       'local_orig', 'local_resp', 'missed_bytes', 'orig_pkts',
       'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp',
       'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ',
       'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR',
       'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2',
       'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR',
       'label'],
      dtype='object')


In [8]:
#we will now drop the port numbers to see the effect of excluding the port number from the data

In [9]:
df=data.drop(columns=['id.orig_p','id.resp_p'])

In [10]:
print(df.columns)

Index(['ts', 'duration', 'orig_bytes', 'resp_bytes', 'local_orig',
       'local_resp', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts',
       'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp',
       'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO',
       'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH',
       'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3',
       'conn_state_SF', 'conn_state_SH', 'conn_state_SHR', 'label'],
      dtype='object')


In [11]:
#so we dropped the port numbers

In [12]:
#separate the X variables
X=df.iloc[:,:-1]

In [13]:
#separate labels (y)
y=data['label']

In [14]:
X.describe()

Unnamed: 0,ts,duration,orig_bytes,resp_bytes,local_orig,local_resp,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
count,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,...,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0,2802755.0
mean,1539080000.0,0.3764077,1509.65,128.3293,0.0,0.0,0.01349422,181.9718,6626.713,0.1510189,...,7.028798e-05,0.0005444643,2.426184e-05,0.8410621,1.213092e-05,1.462846e-05,9.633379e-06,0.007802323,4.78101e-05,4.638293e-06
std,14401080.0,65.46523,924516.6,201145.0,0.0,0.0,6.924522,73404.39,2470116.0,143.2305,...,0.0083835,0.02332741,0.004925572,0.3656182,0.003482927,0.00382469,0.003103754,0.08798551,0.006914321,0.002153665
min,1525880000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1526150000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1532529000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1545398000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,60.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1569018000.0,78840.33,962347200.0,336516400.0,0.0,0.0,7363.0,66027350.0,1914793000.0,239484.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
X.isnull().sum()

ts                   0
duration             0
orig_bytes           0
resp_bytes           0
local_orig           0
local_resp           0
missed_bytes         0
orig_pkts            0
orig_ip_bytes        0
resp_pkts            0
resp_ip_bytes        0
proto_icmp           0
proto_tcp            0
proto_udp            0
conn_state_OTH       0
conn_state_REJ       0
conn_state_RSTO      0
conn_state_RSTOS0    0
conn_state_RSTR      0
conn_state_RSTRH     0
conn_state_S0        0
conn_state_S1        0
conn_state_S2        0
conn_state_S3        0
conn_state_SF        0
conn_state_SH        0
conn_state_SHR       0
dtype: int64

In [16]:
X = np.asarray(X).astype(np.float32)

In [17]:
y

0          POHScan
1          POHScan
2          POHScan
3          POHScan
4          POHScan
            ...   
2802750     Benign
2802751     Benign
2802752     Benign
2802753     Benign
2802754     Benign
Name: label, Length: 2802755, dtype: object

In [18]:
y.value_counts()

POHScan    1324265
Okiru       549228
Benign      472319
DDoS        448703
C&C           8240
Name: label, dtype: int64

In [19]:
le=LabelEncoder()

In [20]:
y=le.fit_transform(y)

In [21]:
y

array([4, 4, 4, ..., 0, 0, 0])

In [22]:
#the labels are encoded in alphabetic order
#0---->Benign
#1---->Cnc
#2---->DDoS
#3---->Okiru
#4---->POHScan


In [23]:
le.inverse_transform(y)

array(['POHScan', 'POHScan', 'POHScan', ..., 'Benign', 'Benign', 'Benign'],
      dtype=object)

In [24]:
y.shape

(2802755,)

In [25]:
sc=StandardScaler()

In [26]:
Xs=sc.fit_transform(X)

=======================================================================

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split
import time as timer

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(Xs, y, random_state=124, test_size=0.40, shuffle=True)

# Gussian Naive Bayes 

In [77]:
GaussianNB?

In [82]:
start = timer.time()
gnb_model = GaussianNB()
gnb_model.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 1.01 seconds


In [83]:
# Predicting the results on train set
y_gnbr = gnb_model.predict(X_train)

In [84]:
print("Accuracy score for GNB: {:.4f}".format(accuracy_score(Y_train,y_gnbr)))

Accuracy score for GNB: 0.4958


In [85]:
# Predicting the test set results
y_gnb = gnb_model.predict(X_test)
y_gnb_prob = gnb_model.predict_proba(X_test)

In [86]:
print("Accuracy score for GNB: {:.4f}".format(accuracy_score(Y_train,y_gnbr)))

Accuracy score for GNB: 0.4958


In [None]:
print("Classification report for GNB: \n{}".format(classification_report(Y_test,y_gnb)))
print("Confusion matrix for GNB: \n{}".format(confusion_matrix(Y_test,y_gnb)))
print("Accuracy score for GNB: {:.2f}".format(accuracy_score(Y_test,y_gnb)))
# calculate precision, recall, and f1 scores
prec_gnb = precision_score(Y_test,y_gnb,average='weighted')
rec_gnb = recall_score(Y_test,y_gnb,average='weighted')
f1_gnb = f1_score(Y_test,y_gnb,average='weighted')
print("Precision score for GNB: {:.4f}".format(prec_gnb))
print("Recall score for GNB: {:.4f}".format(rec_gnb))
print("F1 score for GNB: {:.4f}".format(f1_gnb))

# Logistic Regression 

In [88]:
LogisticRegression?

In [89]:
start = timer.time()
logit_model = LogisticRegression(solver='lbfgs', max_iter=300,random_state=12, C=0.0001 )
logit_model.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 54.30 seconds


In [90]:
# Predicting the train set results
y_logitr = logit_model.predict(X_train)

In [91]:
print("Accuracy  score for LR: {:.4f}".format(accuracy_score(Y_train,y_logitr)))

Accuracy score for LR: 0.8744


In [92]:
# Predicting the test set results
y_logit = logit_model.predict(X_test)
y_logit_prob = logit_model.predict_proba(X_test)

In [93]:
print("Classification report for LR: \n{}".format(classification_report(Y_test,y_logit)))
print("Confusion matrix for LR: \n{}".format(confusion_matrix(Y_test,y_logit)))
print("Accuracy score for LR: {:.4f}".format(accuracy_score(Y_test,y_logit)))
# calculate precision, recall, and f1 scores
prec_logit = precision_score(Y_test,y_logit,average='weighted')
rec_logit = recall_score(Y_test,y_logit,average='weighted')
f1_logit = f1_score(Y_test,y_logit,average='weighted')
print("Precision score for LR: {:.4f}".format(prec_logit))
print("Recall score for LR: {:.4f}".format(rec_logit))
print("F1 score for LR: {:.4f}".format(f1_logit))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report for LR: 
              precision    recall  f1-score   support

           0       1.00      0.91      0.95    188800
           1       0.00      0.00      0.00      3277
           2       1.00      0.89      0.94    179740
           3       0.71      0.91      0.80    220134
           4       0.88      0.85      0.86    529151

    accuracy                           0.87   1121102
   macro avg       0.72      0.71      0.71   1121102
weighted avg       0.88      0.87      0.88   1121102

Confusion matrix for LR: 
[[171286      0     81    137  17296]
 [     3      0      0      1   3273]
 [     2      0 159773      3  19962]
 [     0      0      0 199962  20172]
 [    20      0      0  79990 449141]]
Accuracy score for LR: 0.8743


  _warn_prf(average, modifier, msg_start, len(result))


Precision score for LR: 0.8846
Recall score for LR: 0.8743
F1 score for LR: 0.8761


# RandomForestClassifier

In [64]:
RandomForestClassifier?

In [31]:
start = timer.time()
Rf_clf=RandomForestClassifier(n_estimators =10, max_depth=20, criterion = 'entropy', random_state = 124)
Rf_clf.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.4f} seconds".format(end-start))

Finished training within 31.7524 seconds


In [32]:
#checking train accuracy
y_RF_train = Rf_clf.predict(X_train)

In [33]:
print("Accuracy score for RF: {:.4f}".format(accuracy_score(Y_train,y_RF_train)))

Accuracy score for RF: 0.9492


In [34]:
y_RF = Rf_clf.predict(X_test)
y_RF_prob = Rf_clf.predict_proba(X_test)

In [36]:
print("Classification report for RF: \n{}".format(classification_report(Y_test,y_RF)))
print("Confusion matrix for RF: \n{}".format(confusion_matrix(Y_test,y_RF)))
print("Accuracy score for RF: {:.4f}".format(accuracy_score(Y_test,y_RF)))
# calculate precision, recall, and f1 scores
prec_rf = precision_score(Y_test,y_RF, average='weighted')
rec_rf = recall_score(Y_test,y_RF,average='weighted')
f1_rf = f1_score(Y_test,y_RF,average='weighted')
print("Precision score for DT: {:.4f}".format(prec_rf))
print("Recall score for DT: {:.4f}".format(rec_rf))
print("F1 score for DF: {:.4f}".format(f1_rf))

Classification report for RF: 
              precision    recall  f1-score   support

           0       1.00      0.91      0.95    188800
           1       1.00      1.00      1.00      3277
           2       1.00      0.89      0.94    179740
           3       1.00      0.91      0.95    220134
           4       0.90      1.00      0.95    529151

    accuracy                           0.95   1121102
   macro avg       0.98      0.94      0.96   1121102
weighted avg       0.95      0.95      0.95   1121102

Confusion matrix for RF: 
[[171801      0     72      0  16927]
 [     4   3273      0      0      0]
 [     5      0 159781      0  19954]
 [     3      0      2 200001  20128]
 [    21      0      0      0 529130]]
Accuracy score for RF: 0.9491
Precision score for DT: 0.9540
Recall score for DT: 0.9491
F1 score for DF: 0.9490


# Decision Trees

In [70]:
DecisionTreeClassifier?

In [71]:
start = timer.time()
DT_clf= DecisionTreeClassifier(max_depth=20)
DT_clf.fit(X_train,Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 8.80 seconds


In [72]:
#checking train accuracy
y_DT_train = DT_clf.predict(X_train)

In [73]:
print("Accuracy score for DT: {:.4f}".format(accuracy_score(Y_train,y_DT_train)))

Accuracy score for DT: 0.9492


In [74]:
#test accuracy
y_DT = DT_clf.predict(X_test)
y_DT_prob = DT_clf.predict_proba(X_test)

In [75]:
print("Classification report for DT: \n{}".format(classification_report(Y_test,y_DT)))
print("Confusion matrix for DT: \n{}".format(confusion_matrix(Y_test,y_DT)))
print("Accuracy score for DT: {:.4f}".format(accuracy_score(Y_test,y_DT)))
# calculate precision, recall, and f1 scores
prec_dt = precision_score(Y_test,y_DT, average='weighted')
rec_dt = recall_score(Y_test,y_DT, average='weighted')
f1_dt = f1_score(Y_test,y_DT, average='weighted')
print("Precision score for DT: {:.4f}".format(prec_dt))
print("Recall score for DT: {:.4f}".format(rec_dt))
print("F1 score for DF: {:.4f}".format(f1_dt))


Classification report for DT: 
              precision    recall  f1-score   support

           0       1.00      0.91      0.95    188800
           1       1.00      1.00      1.00      3277
           2       1.00      0.89      0.94    179740
           3       1.00      0.91      0.95    220134
           4       0.90      1.00      0.95    529151

    accuracy                           0.95   1121102
   macro avg       0.98      0.94      0.96   1121102
weighted avg       0.95      0.95      0.95   1121102

Confusion matrix for DT: 
[[171800      2     71      3  16924]
 [     3   3274      0      0      0]
 [     5      0 159781      0  19954]
 [     3      0      2 200001  20128]
 [    63      0      0      0 529088]]
Accuracy score for DT: 0.9490
Precision score for DT: 0.9490
Recall score for DT: 0.9490
F1 score for DF: 0.9490


# ======KNN======

In [33]:
start = timer.time()
knn_model = neighbors.KNeighborsClassifier(p=2, leaf_size=20, n_neighbors=3, n_jobs=-1)
knn_model.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 0.16 seconds


stopped it because it took a lot time of time around 9 hours

In [34]:
start = timer.time()
y_KNN = knn_ model.predict(X_test)
y_KNN_prob = knn_model.predict_proba(X_test)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

KeyboardInterrupt: 

In [None]:
#checking train accuracy
y_KNN_train = xg_clf.predict(X_train)
print("Accuracy score for DT: {:.4f}".format(accuracy_score(Y_train,y_KNN_train)))

In [None]:
# print("Classification report for KNN: \n{}".format(classification_report(Y_test,y_KNN)))
print("Confusion matrix for KNN: \n{}".format(confusion_matrix(Y_test,y_KNN)))
print("Accuracy score for KNN: {:.4f}".format(accuracy_score(Y_test,y_KNN)))
# calculate precision, recall, and f1 scores
prec_knn = precision_score(Y_test,y_KNN,average='weighted')
rec_knn = recall_score(Y_test,y_KNN,average='weighted')
f1_knn = f1_score(Y_test,y_KNN,average='weighted')
print("Precision score for KNN: {:.4f}".format(prec_knn))
print("Recall score for KNN: {:.4f}".format(rec_knn))
print("F1 score for KNN: {:.4f}".format(f1_knn))


# ==================XGBClassifier============

In [95]:
start = timer.time()
xg_clf=XGBClassifier(use_label_encoder=False)
xg_clf.fit(X_train, Y_train)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))



Finished training within 455.35 seconds


In [96]:
start = timer.time()
y_xg = xg_clf.predict(X_test)
y_xg_prob = xg_clf.predict_proba(X_test)
end = timer.time()
print("Finished training within {:.2f} seconds".format(end-start))

Finished training within 6.84 seconds


In [98]:
#checking train accuracy
y_xg_train = xg_clf.predict(X_train)
print("Accuracy score for DT: {:.4f}".format(accuracy_score(Y_train,y_xg_train)))

Accuracy score for DT: 0.9492


In [97]:
print("Classification report for XGBoost: \n{}".format(classification_report(Y_test,y_xg)))
print("Confusion matrix for XGBoost: \n{}".format(confusion_matrix(Y_test,y_xg)))
print("Accuracy score for XGBoost: {:.4f}".format(accuracy_score(Y_test,y_xg)))
# calculate precision, recall, and f1 scores
prec_knn = precision_score(Y_test,y_xg,average='weighted')
rec_knn = recall_score(Y_test,y_xg,average='weighted')
f1_knn = f1_score(Y_test,y_xg,average='weighted')
print("Precision score for XGBoost: {:.4f}".format(prec_knn))
print("Recall score for XGBoost: {:.4f}".format(rec_knn))
print("F1 score for XGBoost: {:.4f}".format(f1_knn))

Classification report for XGBoost: 
              precision    recall  f1-score   support

           0       1.00      0.91      0.95    188800
           1       1.00      1.00      1.00      3277
           2       1.00      0.89      0.94    179740
           3       1.00      0.91      0.95    220134
           4       0.90      1.00      0.95    529151

    accuracy                           0.95   1121102
   macro avg       0.98      0.94      0.96   1121102
weighted avg       0.95      0.95      0.95   1121102

Confusion matrix for XGBoost: 
[[171802      0     70      1  16927]
 [     3   3274      0      0      0]
 [     5      0 159845     47  19843]
 [     5      0     24 200119  19986]
 [    17      0     78    120 528936]]
Accuracy score for XGBoost: 0.9490
Precision score for XGBoost: 0.9802
Recall score for XGBoost: 0.9414
F1 score for XGBoost: 0.9588
