# A Lightweight Concept Drift Detection and Adaptation Framework for IoT Data Streams
This is the code for the paper entitled "**A Lightweight Concept Drift Detection and Adaptation Framework for IoT Data Streams**" accepted in IEEE Internet of Things Magazine.  
Authors: Li Yang (lyang339@uwo.ca) and Abdallah Shami (Abdallah.Shami@uwo.ca)  
Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University

**Notebook 1: Data pre-processing**  
Aims:  
&nbsp; 1): Assign column names and transform the original 'txt' files into dataframes  
&nbsp; 2): Transform the multi-class dataset into a binary dataset for anomaly detection  
&nbsp; 3): Apply label encoding to pre-process string features  

## Import libraries

In [None]:
 # Mount Google Drive at /content/drive (Google Colab only); the dataset is read
 # from and the result is written to paths under this mount point.
 from google.colab import drive
 drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Read the Edge IIoT dataset by Mohamed Amine Ferrag, Othmane Friha, Djallel Hamouda, Leandros Maglaras, Helge Janicke.
The Edge IIoT dataset is publicly available at: [[1]](https://www.kaggle.com/datasets/mohamedamineferrag/edgeiiotset-cyber-security-dataset-of-iot-iiot)

In [None]:
# Location of the raw Edge-IIoT CSV on the mounted Google Drive.
raw_dataset_path = "/content/drive/MyDrive/Edge-IIoTset_dataset/Selected_dataset_MLandDL/ML-EdgeIIoT-dataset.csv"

# Load the whole file into a single DataFrame with pandas' default dtype inference.
# NOTE(review): some columns appear to come back as mixed-type `object` columns
# (e.g. both "0" and "0.0" in the displayed frame) - confirm whether explicit
# dtypes are wanted here.
df = pd.read_csv(raw_dataset_path)

In [None]:
# Show the dtype pandas inferred for every column of the raw frame.
df.dtypes

frame.time             object
ip.src_host            object
ip.dst_host            object
arp.dst.proto_ipv4     object
arp.opcode            float64
                       ...   
mbtcp.len             float64
mbtcp.trans_id        float64
mbtcp.unit_id         float64
Attack_label            int64
Attack_type            object
Length: 63, dtype: object

In [None]:
# Preview the first five rows. A bare `df.head(5)` is silently discarded here
# because it is not the last expression of the cell, so it is rendered
# explicitly with IPython's built-in `display`.
display(df.head(5))

# Class balance of the raw data: number of rows per attack category.
print(df['Attack_type'].value_counts())

Normal                   24301
DDoS_UDP                 14498
DDoS_ICMP                14090
Ransomware               10925
DDoS_HTTP                10561
SQL_injection            10311
Uploading                10269
DDoS_TCP                 10247
Backdoor                 10195
Vulnerability_scanner    10076
Port_Scanning            10071
XSS                      10052
Password                  9989
MITM                      1214
Fingerprinting            1001
Name: Attack_type, dtype: int64


In [None]:
from sklearn.utils import shuffle

# Arbitrary but fixed seed: without it the row order (and therefore the saved
# dataset) differs on every run of the notebook.
SHUFFLE_SEED = 42

# Columns discarded from the dataset before any further processing.
drop_columns = [
    "frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4", "arp.dst.proto_ipv4",
    "http.file_data", "http.request.full_uri", "icmp.transmit_timestamp",
    "http.request.uri.query", "tcp.options", "tcp.payload", "tcp.srcport",
    "tcp.dstport", "udp.port", "mqtt.msg",
]

# 1) remove the unwanted columns, 2) remove rows containing any missing value,
# 3) remove exact duplicate rows, keeping the first occurrence.
df = (
    df.drop(columns=drop_columns)
      .dropna(axis=0, how='any')
      .drop_duplicates(subset=None, keep="first")
)

# Randomise the row order reproducibly; the original index labels are kept.
df = shuffle(df, random_state=SHUFFLE_SEED)

# Sanity check replacing the former discarded `df.isna().sum()` expression.
assert not df.isna().any().any(), "missing values remain after dropna"

# Class balance after cleaning.
print(df['Attack_type'].value_counts())

Normal                   24125
DDoS_UDP                 14498
DDoS_ICMP                13096
DDoS_HTTP                10495
SQL_injection            10282
DDoS_TCP                 10247
Uploading                10214
Vulnerability_scanner    10062
Password                  9972
Backdoor                  9865
Ransomware                9689
XSS                       9552
Port_Scanning             8924
Fingerprinting             853
MITM                       358
Name: Attack_type, dtype: int64


In [None]:
# Display the cleaned and shuffled frame (pandas abbreviates the rendering of large frames).
df


Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.request.method,http.referer,http.request.version,http.response,...,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label,Attack_type
91556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Backdoor
40161,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DDoS_HTTP
96257,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,XSS
20494,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Uploading
40023,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DDoS_HTTP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41396,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DDoS_HTTP
88079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Backdoor
111771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
6726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Ransomware


In [None]:
# Names of all columns stored with the generic `object` dtype
# (this includes the multi-class label `Attack_type`).
dtypes = df.dtypes
object_columns = dtypes.loc[dtypes == 'object'].index

# Keep a copy of those columns before they are removed from `df`.
object_columns_info = df.loc[:, object_columns]

# Remove every object-typed column in a single call, leaving only numeric
# features and the binary `Attack_label`.
df.drop(columns=object_columns, inplace=True)
df

Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.response,http.tls_port,tcp.ack,tcp.ack_raw,...,mqtt.len,mqtt.msg_decoded_as,mqtt.msgtype,mqtt.proto_len,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label
91556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.213215e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
40161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.634212e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
96257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.871701e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
20494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,3.369417e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
40023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.270381e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
88079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.453307e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
111771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96979.0,2.371722e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.998023e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Split the remaining column names by numeric dtype.
dtypes = df.dtypes
object_columns_int = dtypes.loc[dtypes == 'int64'].index
object_columns_float = dtypes.loc[dtypes == 'float64'].index

# Report each group followed by its size, separated by a blank gap.
print(object_columns_int, object_columns_int.size, sep="\n")
print("\n")
print(object_columns_float, object_columns_float.size, sep="\n")

Index(['Attack_label'], dtype='object')
1


Index(['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le',
       'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port',
       'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type',
       'dns.retransmission', 'dns.retransmit_request',
       'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags',
       'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype',
       'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len',
       'mbtcp.trans_id', 'mbtcp.unit_id'],
      dtype='object')
39


## Save the pre-processed dataset

df: training & test set  

In [None]:
# Destination of the pre-processed dataset on the mounted Google Drive.
output_csv_path = '/content/drive/MyDrive/IoT_Dataset_Edge_ML.csv'

# Write the frame without the (shuffled) row index column.
df.to_csv(output_csv_path, index=False)

In [None]:
# Display the frame that was written to the CSV file.
df

Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.response,http.tls_port,tcp.ack,tcp.ack_raw,...,mqtt.len,mqtt.msg_decoded_as,mqtt.msgtype,mqtt.proto_len,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label
91556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.213215e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
40161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.634212e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
96257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.871701e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
20494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,3.369417e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
40023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.270381e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
88079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.453307e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
111771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96979.0,2.371722e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.998023e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
