In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# tabel setting
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.precision', 2)
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
dataset_dir = '../../Dataset/'

In [4]:
os.listdir(dataset_dir)

['NCC-2 Dataset Simultaneous Botnet Dataset',
 'Outdated',
 'sensor1_test.csv',
 'sensor1_train.csv',
 'sensor2_test.csv',
 'sensor2_train.csv',
 'sensor3_test.csv',
 'sensor3_train.csv',
 'test1_clean.csv',
 'test2_clean.csv',
 'test3_clean.csv',
 'train1_clean.csv',
 'train2_clean.csv',
 'train3_clean.csv']

# Encode Categorical

In [12]:
# Frequency Encoding
def frequency_mapping(df):
    freq_map = df.value_counts().to_dict()  # Create frequency map
    
    return freq_map

In [28]:
from sklearn.preprocessing import LabelEncoder
def encode(df, srcaddr_map, dstaddr_map, sport_map, dport_map):
    # Start Time
    df['StartTime'] = pd.to_datetime(df['StartTime'], errors='coerce')
    df['StartTimeHour'] = df['StartTime'].dt.hour
    df['StartTimeMinute'] = df['StartTime'].dt.minute
    df['StartTimeSecond'] = df['StartTime'].dt.second
    df = df.drop(columns=['StartTime'])

    # SrcAddr & DstAddr & Sport & Dport Freq Encoding
    df['SrcAddr'] = df['SrcAddr'].map(lambda x: srcaddr_map.get(x, 0))
    df['DstAddr'] = df['DstAddr'].map(lambda x: dstaddr_map.get(x, 0))
    df['Sport'] = df['Sport'].map(lambda x: sport_map.get(x, 0))
    df['Dport'] = df['Dport'].map(lambda x: dport_map.get(x, 0))

    # Dir & State & dTos & sTos & Proto Label Encoding
    le = LabelEncoder()
    df['Dir'] = le.fit_transform(df['Dir'].astype(str))
    df['State'] = le.fit_transform(df['State'].astype(str))
    df['sTos'] = le.fit_transform(df['sTos'].astype(str))
    df['dTos'] = le.fit_transform(df['dTos'].astype(str))
    df['Proto'] = le.fit_transform(df['Proto'].astype(str))

    return df

In [22]:
def add_target(df):
    df['isBotnet'] = df['Label'].apply(lambda x: 1 if x == 1 or x == 2 else 0)
    df['isSpam'] = df['Label'].apply(lambda x: 1 if x == 2 else 0)
    return df

# Sensor 1

In [29]:
sensor1_train = pd.read_csv(dataset_dir + 'train1_clean.csv')
sensor1_test = pd.read_csv(dataset_dir + 'test1_clean.csv')

In [30]:
sensor1_train.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2022-07-07 11:44:44,0.0,udp,82.133.110.37,55045,<->,147.32.84.229,13363,CON,0.0,0.0,2,135,75,0
1,2022-07-07 09:05:32,123.08,tcp,147.32.84.123,46276,->,74.125.232.220,80,FSPA_FSPA,0.0,0.0,10,1302,896,0
2,2022-07-07 12:38:18,3084.69,udp,173.76.254.89,35323,<->,147.32.84.229,13363,CON,0.0,0.0,4,270,150,0
3,2022-07-07 13:42:35,0.07,tcp,81.19.46.234,52359,->,147.32.85.56,44076,FSPA_FSPA,0.0,0.0,9,651,369,0
4,2022-07-07 09:15:24,577.48,udp,89.215.75.36,57650,<->,147.32.84.229,13363,CON,0.0,0.0,8,1062,307,0


In [31]:
sensor1_test.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2022-07-07 13:54:05,0.0,udp,109.115.16.166,57512,<->,147.32.84.229,13363,CON,0.0,0.0,2,131,71,0
1,2022-07-07 12:35:37,0.0,udp,147.32.85.34,61930,<->,147.32.80.9,53,CON,0.0,0.0,2,327,85,0
2,2022-07-07 14:31:09,0.0,udp,90.178.42.24,18948,<->,147.32.84.229,13363,CON,0.0,0.0,2,137,77,0
3,2022-07-07 14:44:12,0.0,udp,147.32.84.19,35404,<->,147.32.80.9,53,CON,0.0,0.0,2,345,69,0
4,2022-07-07 09:37:22,0.06,tcp,147.32.84.59,52156,->,77.75.72.2,80,FSRPA_FSPA,0.0,0.0,10,1435,795,0


In [32]:
sensor1_srcaddr_map = frequency_mapping(sensor1_train['SrcAddr'])
sensor1_dstaddr_map = frequency_mapping(sensor1_train['DstAddr'])
sensor1_sport_map = frequency_mapping(sensor1_train['Sport'])
sensor1_dport_map = frequency_mapping(sensor1_train['Dport'])

In [33]:
sensor1_train = encode(sensor1_train, sensor1_srcaddr_map, sensor1_dstaddr_map, sensor1_sport_map, sensor1_dport_map)
sensor1_train = add_target(sensor1_train)

sensor1_test = encode(sensor1_test, sensor1_srcaddr_map, sensor1_dstaddr_map, sensor1_sport_map, sensor1_dport_map)
sensor1_test = add_target(sensor1_test)

In [34]:
sensor1_train.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,4,1,105,3,994426,955188,15,0,0,2,135,75,0,11,44,44,0,0
1,123.08,3,9998,60,0,1913,489667,103,0,0,10,1302,896,0,9,5,32,0,0
2,3084.69,4,2,62,3,994426,955188,15,0,0,4,270,150,0,12,38,18,0,0
3,0.07,3,46,98,0,3386,2870,103,0,0,9,651,369,0,13,42,35,0,0
4,577.48,4,1,86,3,994426,955188,15,0,0,8,1062,307,0,9,15,24,0,0


In [36]:
sensor1_test.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,4,0,102,3,994426,955188,15,0,0,2,131,71,0,13,54,5,0,0
1,0.0,4,74229,49,3,1367792,1389751,15,0,0,2,327,85,0,12,35,37,0,0
2,0.0,4,8,13,3,994426,955188,15,0,0,2,137,77,0,14,31,9,0,0
3,0.0,4,5610,45,3,1367792,1389751,15,0,0,2,345,69,0,14,44,12,0,0
4,0.06,3,395832,107,0,299,489667,112,0,0,10,1435,795,0,9,37,22,0,0


In [37]:
sensor1_train.to_csv(dataset_dir + 'train1_encoded.csv', index=False)
sensor1_test.to_csv(dataset_dir + 'test1_encoded.csv', index=False)

# Sensor 2

In [38]:
sensor2_train = pd.read_csv(dataset_dir + 'train2_clean.csv')
sensor2_test = pd.read_csv(dataset_dir + 'test2_clean.csv')

In [39]:
sensor2_train.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2022-07-07 11:43:01,20.0,tcp,147.32.86.135,3010,->,77.75.77.139,80,FSA_FSA,0.0,0.0,7,424,242,0
1,2022-07-07 09:25:52,0.0,udp,147.32.84.59,57416,<->,147.32.80.9,53,CON,0.0,0.0,2,240,95,0
2,2022-07-07 10:29:47,0.0,udp,113.161.77.173,37738,<->,147.32.84.229,13363,CON,0.0,0.0,2,131,71,0
3,2022-07-07 09:28:36,0.07,tcp,147.32.84.59,38205,->,78.138.112.61,80,FSPA_FSPA,0.0,0.0,10,2401,1000,0
4,2022-07-07 09:42:20,0.0,udp,147.32.84.138,36646,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0


In [40]:
sensor2_test.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2022-07-07 09:15:18,0.0,udp,147.32.84.138,33015,<->,147.32.80.9,53,CON,0.0,0.0,2,216,83,0
1,2022-07-07 12:47:24,0.0,udp,147.32.84.59,56638,<->,147.32.80.9,53,CON,0.0,0.0,2,573,80,0
2,2022-07-07 11:31:33,0.0,udp,147.32.84.138,59325,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
3,2022-07-07 09:21:04,0.0,rtcp,94.19.250.72,15909,<->,147.32.84.229,13363,CON,0.0,0.0,2,425,83,0
4,2022-07-07 14:31:02,0.0,udp,147.32.84.138,53846,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0


In [41]:
sensor2_srcaddr_map = frequency_mapping(sensor2_train['SrcAddr'])
sensor2_dstaddr_map = frequency_mapping(sensor2_train['DstAddr'])
sensor2_sport_map = frequency_mapping(sensor2_train['Sport'])
sensor2_dport_map = frequency_mapping(sensor2_train['Dport'])

In [42]:
sensor2_train = encode(sensor2_train, sensor2_srcaddr_map, sensor2_dstaddr_map, sensor2_sport_map, sensor2_dport_map)
sensor2_train = add_target(sensor2_train)

sensor2_test = encode(sensor2_test, sensor2_srcaddr_map, sensor2_dstaddr_map, sensor2_sport_map, sensor2_dport_map)
sensor2_test = add_target(sensor2_test)

In [43]:
sensor2_train.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,20.0,3,14610,75,0,141,581761,82,0,0,7,424,242,0,11,43,1,0,0
1,0.0,4,462414,109,3,1621520,1645689,16,0,0,2,240,95,0,9,25,52,0,0
2,0.0,4,1,63,3,1212234,1166534,16,0,0,2,131,71,0,10,29,47,0,0
3,0.07,3,462414,83,0,100,581761,104,0,0,10,2401,1000,0,9,28,36,0,0
4,0.0,4,811174,68,3,1621520,1645689,16,0,0,2,214,81,0,9,42,20,0,0


In [44]:
sensor2_test.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,4,811174,72,3,1621520,1645689,12,0,0,2,216,83,0,9,15,18,0,0
1,0.0,4,462414,84,3,1621520,1645689,12,0,0,2,573,80,0,12,47,24,0,0
2,0.0,4,811174,194,3,1621520,1645689,12,0,0,2,214,81,0,11,31,33,0,0
3,0.0,1,0,29,3,1212234,1166534,12,0,0,2,425,83,0,9,21,4,0,0
4,0.0,4,811174,105,3,1621520,1645689,12,0,0,2,214,81,0,14,31,2,0,0


In [45]:
sensor2_train.to_csv(dataset_dir + 'train2_encoded.csv', index=False)
sensor2_test.to_csv(dataset_dir + 'test2_encoded.csv', index=False)

# Sensor 3

In [46]:
sensor3_train = pd.read_csv(dataset_dir + 'train3_clean.csv')
sensor3_test = pd.read_csv(dataset_dir + 'test3_clean.csv')

In [47]:
sensor3_train.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2022-07-07 12:06:15,0.0,udp,95.52.89.144,1867,<->,147.32.84.229,13363,CON,0.0,0.0,2,548,488,0
1,2022-07-07 12:27:40,0.0,udp,147.32.84.138,42444,<->,147.32.80.9,53,CON,0.0,0.0,2,214,81,0
2,2022-07-07 11:22:43,1.24,tcp,147.32.85.76,1113,->,77.75.73.170,993,FSPA_FSRPA,0.0,0.0,53,17709,2206,0
3,2022-07-07 09:20:12,3251.45,udp,178.74.193.49,12852,<->,147.32.84.229,13363,CON,0.0,0.0,6,1270,232,0
4,2022-07-07 10:24:28,0.0,udp,147.32.84.59,49599,<->,147.32.80.9,53,CON,0.0,0.0,2,200,75,0


In [48]:
sensor3_test.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label
0,2022-07-07 10:37:22,0.02,udp,147.32.84.118,65502,<->,147.32.80.9,53,CON,0.0,0.0,2,335,73,0
1,2022-07-07 09:41:09,0.0,udp,212.104.119.129,23512,<->,147.32.86.165,12114,CON,0.0,0.0,2,138,77,0
2,2022-07-07 09:54:04,0.17,udp,69.104.66.134,59705,<->,147.32.84.192,31037,CON,0.0,0.0,2,485,145,1
3,2022-07-07 13:16:26,104.19,udp,85.240.58.228,53275,<->,147.32.84.229,13363,CON,0.0,0.0,4,499,364,0
4,2022-07-07 10:09:27,0.33,tcp,147.32.84.59,63491,->,76.13.114.90,80,FSPA_FSPA,0.0,0.0,9,1535,699,0


In [49]:
sensor3_srcaddr_map = frequency_mapping(sensor3_train['SrcAddr'])
sensor3_dstaddr_map = frequency_mapping(sensor3_train['DstAddr'])
sensor3_sport_map = frequency_mapping(sensor3_train['Sport'])
sensor3_dport_map = frequency_mapping(sensor3_train['Dport'])

In [50]:
sensor3_train = encode(sensor3_train, sensor3_srcaddr_map, sensor3_dstaddr_map, sensor3_sport_map, sensor3_dport_map)
sensor3_train = add_target(sensor3_train)

sensor3_test = encode(sensor3_test, sensor3_srcaddr_map, sensor3_dstaddr_map, sensor3_sport_map, sensor3_dport_map)
sensor3_test = add_target(sensor3_test)

In [51]:
sensor3_train.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.0,4,1,106,3,338987,318377,7,0,0,2,548,488,0,12,6,15,0,0
1,0.0,4,492764,50,3,1211113,1220778,7,0,0,2,214,81,0,12,27,40,0,0
2,1.24,3,6457,202,0,248,3257,76,0,0,53,17709,2206,0,11,22,43,0,0
3,3251.45,4,1,6,3,338987,318377,7,0,0,6,1270,232,0,9,20,12,0,0
4,0.0,4,440828,86,3,1211113,1220778,7,0,0,2,200,75,0,10,24,28,0,0


In [52]:
sensor3_test.head()

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
0,0.02,4,32986,36,3,1211113,1220778,8,0,0,2,335,73,0,10,37,22,0,0
1,0.0,4,1,12,3,197730,185254,8,0,0,2,138,77,0,9,41,9,0,0
2,0.17,4,731,246,3,791,732,8,0,0,2,485,145,1,9,54,4,1,0
3,104.19,4,1,85,3,338987,318377,8,0,0,4,499,364,0,13,16,26,0,0
4,0.33,3,440828,32,0,11057,469208,57,0,0,9,1535,699,0,10,9,27,0,0


In [53]:
sensor3_train.to_csv(dataset_dir + 'train3_encoded.csv', index=False)
sensor3_test.to_csv(dataset_dir + 'test3_encoded.csv', index=False)