# A Lightweight Concept Drift Detection and Adaptation Framework for IoT Data Streams
This is the code for the paper entitled "**A Lightweight Concept Drift Detection and Adaptation Framework for IoT Data Streams**" accepted in IEEE Internet of Things Magazine.  
Authors: Li Yang (lyang339@uwo.ca) and Abdallah Shami (Abdallah.Shami@uwo.ca)  
Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University

**Notebook 1: Data pre-processing**  
Aims:  
&nbsp; 1): Load the IoTID20 CSV file into a dataframe  
&nbsp; 2): Encode the binary `Label` column (Normal = 0, Anomaly = 1) for anomaly detection  
&nbsp; 3): Down-sample the dataset and drop the string (object-typed) features  

## Import libraries

In [1]:
 # Colab only: mount Google Drive at /content/drive so that the dataset path
 # read later in this notebook (/content/drive/MyDrive/...) resolves.
 from google.colab import drive
 drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import warnings
# Silences every warning for the rest of the session. NOTE(review): this also
# hides pandas diagnostics such as SettingWithCopyWarning, so indexing
# mistakes in later cells will not be reported.
warnings.filterwarnings("ignore")

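## Read the IoTID20 dataset
The IoTID20 dataset is publicly available. **Note:** the two links below point to the NSL-KDD dataset rather than IoTID20 (TODO: replace them with the IoTID20 source): [[1]](https://www.unb.ca/cic/datasets/nsl.html) [[2]](https://github.com/jmnwong/NSL-KDD-Dataset)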

In [3]:
# Load the raw dataset from the mounted Google Drive into `df`; every later
# cell derives its data from this frame.
df = pd.read_csv("/content/drive/MyDrive/IoT_Network_Intrusion_Dataset.csv")

In [4]:
# Inspect the dtype of each column of the raw frame before any conversion.
df.dtypes

Flow_ID      object
Src_IP       object
Src_Port      int64
Dst_IP       object
Dst_Port      int64
             ...   
Idle_Max    float64
Idle_Min    float64
Label        object
Cat          object
Sub_Cat      object
Length: 86, dtype: object

In [5]:
# Encode the binary target: 'Normal' -> 0, 'Anomaly' -> 1.
# A single .loc[row_mask, column] assignment is used instead of chained
# indexing (df['Label'][mask] = value): the chained form may write to a
# temporary copy and is a silent no-op under pandas Copy-on-Write.
# The column still has object dtype afterwards; the numeric cast happens in
# the next cell.
df.loc[df['Label'] == 'Normal', 'Label'] = 0
df.loc[df['Label'] == 'Anomaly', 'Label'] = 1

In [6]:
# Convert the 0/1 target column from object dtype to a numeric dtype, then
# re-display the schema to confirm the change.
df["Label"] = pd.to_numeric(df["Label"])
df.dtypes

Flow_ID      object
Src_IP       object
Src_Port      int64
Dst_IP       object
Dst_Port      int64
             ...   
Idle_Max    float64
Idle_Min    float64
Label         int64
Cat          object
Sub_Cat      object
Length: 86, dtype: object

In [8]:
# df.sort_values(by='Timestamp', inplace=True)
# df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
# total_rows = len(df)
# step_size = 10
# mask = list(range(0, total_rows, step_size))
# new_df = df.iloc[mask]

In [10]:
# Reduce the dataset to roughly one tenth of its rows with a fixed class mix
# of 15% Label == 0 and 85% Label == 1.
SUBSAMPLE_DIVISOR = 10   # keep about 1 / SUBSAMPLE_DIVISOR of the rows
LABEL_0_SHARE = 0.15     # share of the reduced set drawn from Label == 0
LABEL_1_SHARE = 0.85     # share of the reduced set drawn from Label == 1
SUBSAMPLE_SEED = 42      # fixed seed so a fresh run reproduces the same subset

total_rows = len(df)
desired_rows = max(total_rows // SUBSAMPLE_DIVISOR, 1)

# Sampling is without replacement, so pandas raises ValueError if a class has
# fewer rows than requested. Both draws are seeded; previously only the final
# shuffle was, which made the saved dataset differ between runs.
label_0_rows = df[df['Label'] == 0].sample(
    n=int(desired_rows * LABEL_0_SHARE), replace=False, random_state=SUBSAMPLE_SEED
)
label_1_rows = df[df['Label'] == 1].sample(
    n=int(desired_rows * LABEL_1_SHARE), replace=False, random_state=SUBSAMPLE_SEED
)

# Stack the two class samples into one frame.
new_df = pd.concat([label_0_rows, label_1_rows])

# Shuffle so the classes are interleaved, and rebuild a clean 0..n-1 index.
new_df = new_df.sample(frac=1, random_state=SUBSAMPLE_SEED).reset_index(drop=True)

In [11]:
# Display the down-sampled, shuffled frame (pandas truncates the rendering).
new_df

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label,Cat,Sub_Cat
0,192.168.0.13-192.168.0.16-9020-56255-6,192.168.0.16,56255,192.168.0.13,9020,6,10/09/2019 01:43:57 AM,149,0,3,...,0.0,0.0,0.0,74.5,0.707107,75.0,74.0,1,Mirai,Mirai-Hostbruteforceg
1,192.168.0.24-104.118.134.215-43238-443-6,104.118.134.215,443,192.168.0.24,43238,6,25/07/2019 03:25:42 AM,179,1,1,...,0.0,0.0,0.0,179.0,0.000000,179.0,179.0,1,Mirai,Mirai-Ackflooding
2,192.168.0.13-192.168.0.16-9020-52727-6,192.168.0.16,52727,192.168.0.13,9020,6,11/07/2019 01:24:24 AM,119,1,1,...,0.0,0.0,0.0,119.0,0.000000,119.0,119.0,1,Scan,Scan Port OS
3,192.168.0.13-192.168.0.16-9020-52717-6,192.168.0.13,9020,192.168.0.16,52717,6,11/07/2019 01:23:29 AM,74,0,2,...,0.0,0.0,0.0,74.0,0.000000,74.0,74.0,1,Scan,Scan Port OS
4,192.168.0.13-222.209.147.42-554-5305-6,222.209.147.42,5305,192.168.0.13,554,6,26/05/2019 10:11:15 PM,2933,0,2,...,0.0,0.0,0.0,2933.0,0.000000,2933.0,2933.0,1,DoS,DoS-Synflooding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62572,192.168.0.24-104.74.213.186-51875-443-6,104.74.213.186,443,192.168.0.24,51875,6,20/08/2019 03:03:40 AM,113,1,1,...,0.0,0.0,0.0,113.0,0.000000,113.0,113.0,1,Mirai,Mirai-Hostbruteforceg
62573,192.168.0.13-47.74.196.243-48920-443-6,192.168.0.13,48920,47.74.196.243,443,6,10/09/2019 01:49:31 AM,79,0,2,...,0.0,0.0,0.0,79.0,0.000000,79.0,79.0,1,Mirai,Mirai-Hostbruteforceg
62574,192.168.0.13-192.168.0.16-9020-49784-6,192.168.0.13,9020,192.168.0.16,49784,6,20/05/2019 04:56:23 AM,321,3,1,...,0.0,0.0,0.0,107.0,31.575307,130.0,71.0,0,Normal,Normal
62575,192.168.0.24-223.39.123.194-40962-8281-6,223.39.123.194,8281,192.168.0.24,40962,6,04/06/2019 11:47:44 PM,874,0,2,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1,DoS,DoS-Synflooding


In [12]:
# Collect the names of all object-typed (string) columns of the reduced frame.
dtypes = new_df.dtypes
is_object_dtype = (dtypes == 'object').to_numpy()
object_columns = dtypes.index[is_object_dtype]

# Sub-frame holding only those string columns, kept for inspection.
object_columns_info = new_df.loc[:, object_columns]

print(object_columns)


Index(['Flow_ID', 'Src_IP', 'Dst_IP', 'Timestamp', 'Cat', 'Sub_Cat'], dtype='object')


In [13]:
# Remove all string (object-typed) columns in one vectorized call instead of
# one in-place drop per column. errors="ignore" makes the cell safe to re-run:
# once the columns are gone a second execution is a no-op rather than a
# KeyError.
new_df = new_df.drop(columns=object_columns, errors="ignore")
new_df

Unnamed: 0,Src_Port,Dst_Port,Protocol,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,TotLen_Fwd_Pkts,TotLen_Bwd_Pkts,Fwd_Pkt_Len_Max,Fwd_Pkt_Len_Min,...,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,56255,9020,6,149,0,3,0.0,2806.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,74.5,0.707107,75.0,74.0,1
1,443,43238,6,179,1,1,1441.0,1441.0,1441.0,1441.0,...,0,0.0,0.0,0.0,0.0,179.0,0.000000,179.0,179.0,1
2,52727,9020,6,119,1,1,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,119.0,0.000000,119.0,119.0,1
3,9020,52717,6,74,0,2,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,74.0,0.000000,74.0,74.0,1
4,5305,554,6,2933,0,2,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,2933.0,0.000000,2933.0,2933.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62572,443,51875,6,113,1,1,1448.0,1448.0,1448.0,1448.0,...,0,0.0,0.0,0.0,0.0,113.0,0.000000,113.0,113.0,1
62573,48920,443,6,79,0,2,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,79.0,0.000000,79.0,79.0,1
62574,9020,49784,6,321,3,1,2806.0,1388.0,1388.0,30.0,...,0,0.0,0.0,0.0,0.0,107.0,31.575307,130.0,71.0,0
62575,8281,40962,6,874,0,2,0.0,346.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1


In [14]:
#new_df.drop(['Timestamp'], axis=1, inplace=True)

In [14]:
# Summarise the integer and floating-point columns and how many of each exist.
# NOTE(review): this inspects the full frame `df`, not the reduced `new_df`
# that is saved below; confirm this is intended.
dtypes = df.dtypes
object_columns_int = dtypes.index[(dtypes == 'int64').to_numpy()]
object_columns_float = dtypes.index[(dtypes == 'float64').to_numpy()]

# Each group is printed as its column index followed by its size.
print(object_columns_int, object_columns_int.size, sep="\n")
print("\n")
print(object_columns_float, object_columns_float.size, sep="\n")

Index(['Src_Port', 'Dst_Port', 'Protocol', 'Flow_Duration', 'Tot_Fwd_Pkts',
       'Tot_Bwd_Pkts', 'Fwd_PSH_Flags', 'Bwd_PSH_Flags', 'Fwd_URG_Flags',
       'Bwd_URG_Flags', 'Fwd_Header_Len', 'Bwd_Header_Len', 'FIN_Flag_Cnt',
       'SYN_Flag_Cnt', 'RST_Flag_Cnt', 'PSH_Flag_Cnt', 'ACK_Flag_Cnt',
       'URG_Flag_Cnt', 'CWE_Flag_Count', 'ECE_Flag_Cnt', 'Fwd_Byts/b_Avg',
       'Fwd_Pkts/b_Avg', 'Fwd_Blk_Rate_Avg', 'Bwd_Byts/b_Avg',
       'Bwd_Pkts/b_Avg', 'Bwd_Blk_Rate_Avg', 'Subflow_Fwd_Pkts',
       'Subflow_Fwd_Byts', 'Subflow_Bwd_Pkts', 'Subflow_Bwd_Byts',
       'Init_Fwd_Win_Byts', 'Init_Bwd_Win_Byts', 'Fwd_Act_Data_Pkts',
       'Fwd_Seg_Size_Min', 'Label'],
      dtype='object')
35


Index(['TotLen_Fwd_Pkts', 'TotLen_Bwd_Pkts', 'Fwd_Pkt_Len_Max',
       'Fwd_Pkt_Len_Min', 'Fwd_Pkt_Len_Mean', 'Fwd_Pkt_Len_Std',
       'Bwd_Pkt_Len_Max', 'Bwd_Pkt_Len_Min', 'Bwd_Pkt_Len_Mean',
       'Bwd_Pkt_Len_Std', 'Flow_Byts/s', 'Flow_Pkts/s', 'Flow_IAT_Mean',
       'Flow_IAT_Std', 'Flow_IAT

## Save the pre-processed dataset

`new_df`: the down-sampled, numeric-only dataset saved below and used as the training & test set  

In [15]:
# Write the pre-processed frame to Google Drive without the row index column.
new_df.to_csv('/content/drive/MyDrive/IoT_Dataset_ID20.csv', index=False)