In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Table display settings: show every column and every row, and render floats
# with thousands separators and two decimals.
# NOTE(review): with display.max_rows=None a bare frame expression prints all
# of its rows; the sensor tables hold millions of rows, so inspect them only
# through .head() / .shape.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.precision', 2),
    ('display.float_format', '{:,.2f}'.format),
):
    pd.set_option(option_name, option_value)

In [3]:
# Folder that holds the input CSVs and receives the encoded outputs; the path
# is relative, so it resolves against the notebook's working directory.
dataset_dir = '../Dataset/'

In [4]:
# List the folder contents to check which input/output files are present.
os.listdir(dataset_dir)

['all_sensor_clean.csv',
 'Dataset.csv',
 'encoded_all_sensor.csv',
 'NCC-2 Dataset Simultaneous Botnet Dataset',
 'sensor1_clean.csv',
 'sensor2_clean.csv',
 'sensor3_clean.csv',
 'test_data.csv',
 'train_dataset.csv']

# Encode Categorical

In [5]:
# Frequency Encoding
def frequency_encoding(df):
    """Replace each value of a column with the number of times it occurs.

    Parameters
    ----------
    df : pandas.Series
        The column to encode (the notebook always passes a single column).

    Returns
    -------
    pandas.Series
        A new Series on the same index holding the occurrence counts. Missing
        values are not counted by ``value_counts`` and therefore stay NaN.
    """
    occurrences = df.value_counts().to_dict()  # value -> number of rows holding it
    return df.map(occurrences)

In [6]:
from sklearn.preprocessing import LabelEncoder
def encode(df):
    """Encode the timestamp, address/port and categorical columns of a flow table.

    Steps, applied to a de-duplicated copy (the caller's frame is not modified):

    * ``StartTime`` is parsed as a datetime (unparsable values become NaT) and
      replaced by ``StartTimeHour`` / ``StartTimeMinute`` / ``StartTimeSecond``.
    * ``SrcAddr``, ``DstAddr``, ``Sport`` and ``Dport`` are frequency-encoded.
    * ``Dir``, ``State``, ``sTos``, ``dTos``, ``BotnetName`` and ``Proto`` are
      cast to ``str`` and label-encoded.

    Every encoding is fitted on the frame passed in, so the resulting integers
    are only comparable between rows of the same call, not between frames that
    were encoded separately.

    NOTE(review): ``drop_duplicates`` compares all columns, so a unique row-id
    column (e.g. the ``Unnamed: 0`` column present in the sensor CSVs) prevents
    any row from ever being dropped -- confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above.

    Returns
    -------
    pandas.DataFrame
        New frame without ``StartTime`` and with the three time-part columns
        appended at the end.
    """
    # drop_duplicates() hands back a filtered slice of the input; copy it so the
    # column assignments below act on an independent frame and do not raise
    # SettingWithCopyWarning when rows were actually removed.
    df = df.drop_duplicates().copy()

    # Start Time -> hour / minute / second parts (NaN where parsing failed)
    start_time = pd.to_datetime(df['StartTime'], errors='coerce')
    df['StartTimeHour'] = start_time.dt.hour
    df['StartTimeMinute'] = start_time.dt.minute
    df['StartTimeSecond'] = start_time.dt.second
    df = df.drop(columns=['StartTime'])

    # SrcAddr & DstAddr & Sport & Dport: frequency encoding
    for column in ('SrcAddr', 'DstAddr', 'Sport', 'Dport'):
        df[column] = frequency_encoding(df[column])

    # Dir & State & sTos & dTos & BotnetName & Proto: label encoding.
    # A fresh encoder per column makes it explicit that no mapping is shared.
    for column in ('Dir', 'State', 'sTos', 'dTos', 'BotnetName', 'Proto'):
        df[column] = LabelEncoder().fit_transform(df[column].astype(str))

    return df

In [7]:
def add_target(df):
    """Add the binary ``isBotnet`` and ``isSpam`` target columns derived from ``Label``.

    ``isBotnet`` is 1 where ``Label`` is 1 or 2, and ``isSpam`` is 1 where
    ``Label`` is 2; every other value (including missing ones) gives 0.

    The columns are added to ``df`` in place and the same frame is returned.
    Vectorised comparisons replace a per-row ``apply`` because the sensor
    tables hold millions of rows; the result is always ``int64``, also for an
    empty frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Label`` column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, now carrying ``isBotnet`` and ``isSpam``.
    """
    label = df['Label']
    df['isBotnet'] = label.isin([1, 2]).astype('int64')
    df['isSpam'] = label.eq(2).astype('int64')
    return df

# Sensor 1

In [8]:
# Load the cleaned sensor-1 flows.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in the head()
# output) -- looks like a row index saved by an earlier to_csv; confirm whether
# it should be kept as a column.
df_1 = pd.read_csv(os.path.join(dataset_dir, 'sensor1_clean.csv'))

In [9]:
df_1.shape

(4520851, 18)

In [10]:
df_1.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 09:00:00,0.0,icmp,147.32.84.165,0x0303,->,202.103.52.147,0xc413,URP,0.0,0.0,1,190,190,1,1,rbot,1
1,2022-07-07 09:00:00,0.0,udp,147.32.84.138,48579,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0,0,-,1
2,2022-07-07 09:00:00,0.0,udp,147.32.84.138,54107,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0,0,-,1
3,2022-07-07 09:00:00,3075.34,udp,93.97.178.23,14899,<->,147.32.84.229,13363,CON,0.0,0.0,8,956,708,0,0,-,1
4,2022-07-07 09:00:00,0.0,udp,147.32.84.138,38233,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0,0,-,1


In [11]:
# Encode the raw columns, then derive the isBotnet / isSpam targets from Label.
df_1 = add_target(encode(df_1))

In [12]:
df_1.shape

(4520851, 22)

In [13]:
df_1.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,0,82656,1987,0,47,664,322,0,0,1,190,190,1,1,3,1,9,0,0,1,0
1,0.0,4,827329,81,3,1823076,1852324,16,0,0,2,214,81,0,0,0,1,9,0,0,0,0
2,0.0,4,827329,108,3,1823076,1852324,16,0,0,2,214,81,0,0,0,1,9,0,0,0,0
3,3075.34,4,2,18,3,1326350,1274027,16,0,0,8,956,708,0,0,0,1,9,0,0,0,0
4,0.0,4,827329,66,3,1823076,1852324,16,0,0,2,214,81,0,0,0,1,9,0,0,0,0


In [14]:
# Drop the annotation columns before saving.
# NOTE(review): presumably ActivityLabel / BotnetName are removed because they
# reveal the label and SensorId because it is constant per file -- confirm.
df_1 = df_1.drop(['ActivityLabel', 'BotnetName', 'SensorId'], axis='columns')

In [15]:
# Persist the encoded sensor-1 table; index=False keeps the row index out of the file.
df_1.to_csv(os.path.join(dataset_dir, 'sensor1_encoded.csv'), index=False)

# Sensor 2

In [16]:
# Load the cleaned sensor-2 flows.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in the head()
# output) -- looks like a row index saved by an earlier to_csv; confirm whether
# it should be kept as a column.
df_2 = pd.read_csv(os.path.join(dataset_dir, 'sensor2_clean.csv'))

In [17]:
df_2.shape

(5474937, 18)

In [18]:
df_2.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 09:00:00,0.0,icmp,147.32.84.193,0x5303,->,147.32.96.69,80,UR,0.0,0.0,1,1066,1066,1,1,rbot,2
1,2022-07-07 09:00:00,2987.3,udp,121.94.23.18,60729,<->,147.32.84.229,13363,CON,0.0,0.0,15,1455,308,0,0,-,2
2,2022-07-07 09:00:00,3582.86,tcp,147.32.84.229,443,<?>,212.103.28.2,41132,PA_PA,0.0,0.0,1021,96853,40471,0,0,-,2
3,2022-07-07 09:00:00,3454.74,udp,110.67.100.220,54196,<->,147.32.84.229,13363,CON,0.0,0.0,19,3370,1544,0,0,-,2
4,2022-07-07 09:00:00,3279.28,udp,119.31.154.162,56109,<->,147.32.84.229,13363,CON,0.0,0.0,9,2209,1939,0,0,-,2


In [21]:
# Encode the raw columns, then derive the isBotnet / isSpam targets from Label.
df_2 = add_target(encode(df_2))

In [22]:
df_2.shape

(5474937, 22)

In [23]:
df_2.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,0,27893,1,0,19372,775304,320,0,0,1,1066,1066,1,1,3,2,9,0,0,1,0
1,2987.3,4,1,143,3,1615327,1554370,16,0,0,15,1455,308,0,0,0,2,9,0,0,0,0
2,3582.86,3,228416,1698,5,3,4,178,0,0,1021,96853,40471,0,0,0,2,9,0,0,0,0
3,3454.74,4,4,151,3,1615327,1554370,16,0,0,19,3370,1544,0,0,0,2,9,0,0,0,0
4,3279.28,4,2,139,3,1615327,1554370,16,0,0,9,2209,1939,0,0,0,2,9,0,0,0,0


In [24]:
# Drop the annotation columns before saving.
# NOTE(review): presumably ActivityLabel / BotnetName are removed because they
# reveal the label and SensorId because it is constant per file -- confirm.
df_2 = df_2.drop(['ActivityLabel', 'BotnetName', 'SensorId'], axis='columns')

In [25]:
# Persist the encoded sensor-2 table; index=False keeps the row index out of the file.
df_2.to_csv(os.path.join(dataset_dir, 'sensor2_encoded.csv'), index=False)

# Sensor 3

In [26]:
# Load the cleaned sensor-3 flows.
# NOTE(review): the file carries an 'Unnamed: 0' column (visible in the head()
# output) -- looks like a row index saved by an earlier to_csv; confirm whether
# it should be kept as a column.
df_3 = pd.read_csv(os.path.join(dataset_dir, 'sensor3_clean.csv'))

In [27]:
df_3.shape

(3596299, 18)

In [28]:
df_3.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 09:00:00,0.0,icmp,147.32.84.165,0x0303,->,202.103.52.147,0xc413,URP,0.0,0.0,1,190,190,1,1,rbot,3
1,2022-07-07 09:00:00,0.13,udp,147.32.84.59,54866,<->,216.121.135.141,56313,CON,0.0,0.0,2,133,72,0,0,-,3
2,2022-07-07 09:00:00,3570.6,tcp,147.32.84.59,44213,<?>,205.188.10.230,443,PA_PA,0.0,0.0,260,19571,7824,0,0,-,3
3,2022-07-07 09:00:00,3467.74,tcp,69.63.180.46,80,<?>,147.32.85.124,53493,FPA_FPA,0.0,0.0,345,147540,37595,0,0,-,3
4,2022-07-07 09:00:00,115.1,tcp,147.32.86.182,51749,->,74.125.232.199,80,FSPA_FSPA,0.0,0.0,10,2840,1709,0,0,-,3


In [29]:
# Encode the raw columns, then derive the isBotnet / isSpam targets from Label.
df_3 = add_target(encode(df_3))

In [30]:
df_3.shape

(3596299, 22)

In [31]:
df_3.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,0,86658,6907,0,47,300,250,0,0,1,190,190,1,1,4,3,9,0,0,1,0
1,0.13,4,587582,667,3,6,10,9,0,0,2,133,72,0,0,0,3,9,0,0,0,0
2,3570.6,3,587582,48,5,19,156707,124,0,0,260,19571,7824,0,0,0,3,9,0,0,0,0
3,3467.74,3,7,4169,5,69,2,25,0,0,345,147540,37595,0,0,0,3,9,0,0,0,0
4,115.1,3,9327,102,0,1996,625148,78,0,0,10,2840,1709,0,0,0,3,9,0,0,0,0


In [32]:
# Drop the annotation columns before saving.
# NOTE(review): presumably ActivityLabel / BotnetName are removed because they
# reveal the label and SensorId because it is constant per file -- confirm.
df_3 = df_3.drop(['ActivityLabel', 'BotnetName', 'SensorId'], axis='columns')

In [33]:
# Persist the encoded sensor-3 table; index=False keeps the row index out of the file.
df_3.to_csv(os.path.join(dataset_dir, 'sensor3_encoded.csv'), index=False)