In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
%matplotlib inline

# Import Dataset

In [2]:
# Root folder of the NCC-2 dataset; every sub-folder path keeps its trailing
# slash so file names can be appended directly.
dataset_dir = './NCC-2 Dataset Simultaneous Botnet Dataset/'
sensor1_path, sensor2_path, sensor3_path = (
    f'{dataset_dir}sensor{sensor_no}/' for sensor_no in (1, 2, 3)
)
allsensor_path = f'{dataset_dir}all-sensors/'

In [5]:
os.listdir(sensor3_path)

['sensor3-acumulatedonHour-botnetOnly.png',
 'sensor3-activityAnalysisPerMinutes-botnetOnly.png',
 'sensor3-activityAnalysisPerMinutes.png',
 'sensor3-description.txt',
 'sensor3.binetflow',
 'sensor3-acumulatedonHour.png',
 'sensor3_botnet-only.binetflow',
 'sensor3_normal-only.binetflow']

# Sensor 3 .binetflow

In [6]:
# Load the full sensor-3 flow file into `df`. The botnet-only and normal-only
# files that sit in the same folder are not read in this notebook.
sensor3_flow_file = f'{sensor3_path}sensor3.binetflow'
df = pd.read_csv(sensor3_flow_file)

In [7]:
df.shape

(3885792, 18)

In [8]:
df.columns

Index(['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr',
       'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes',
       'Label', 'ActivityLabel', 'BotnetName', 'SensorId'],
      dtype='object')

In [9]:
df.dtypes

StartTime         object
Dur              float64
Proto             object
SrcAddr           object
Sport             object
Dir               object
DstAddr           object
Dport             object
State             object
sTos             float64
dTos             float64
TotPkts            int64
TotBytes           int64
SrcBytes           int64
Label             object
ActivityLabel      int64
BotnetName        object
SensorId           int64
dtype: object

In [10]:
df.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 09:00:00,0.0,icmp,147.32.84.165,0x0303,->,202.103.52.147,0xc413,URP,0.0,,1,190,190,flow=From-Botnet-V44-ICMP,1,rbot,3
1,2022-07-07 09:00:00,0.13229,udp,147.32.84.59,54866,<->,216.121.135.141,56313,CON,0.0,0.0,2,133,72,flow=Background-Established-cmpgw-CVUT,0,-,3
2,2022-07-07 09:00:00,3570.602295,tcp,147.32.84.59,44213,<?>,205.188.10.230,443,PA_PA,0.0,0.0,260,19571,7824,flow=Background-Established-cmpgw-CVUT,0,-,3
3,2022-07-07 09:00:00,3595.741455,ipx/spx,00:15:17:2c:e5:2d,,->,ff:ff:ff:ff:ff:ff,,INT,,,568,52954,52954,flow=Background,0,-,3
4,2022-07-07 09:00:00,3467.739746,tcp,69.63.180.46,80,<?>,147.32.85.124,53493,FPA_FPA,0.0,0.0,345,147540,37595,flow=Background,0,-,3


In [11]:
# Convert the StartTime strings to datetime64 values. errors='coerce' turns
# any entry that cannot be parsed into NaT instead of raising, so a later
# missing-value check on StartTime is what reveals bad timestamps.
df['StartTime'] = pd.to_datetime(df['StartTime'], errors='coerce')

In [12]:
# Split the columns of `df` by dtype family: text (object), numeric
# (int64/float64) and datetime.
df_cat, df_num, df_date = (
    df.select_dtypes(include=dtype_family)
    for dtype_family in ('object', ['int64', 'float64'], 'datetime64')
)

In [13]:
# Report how many columns fall into each dtype family. len(.columns) counts
# the columns directly instead of counting distinct labels via value_counts().
print('Kolom Categorical :', len(df_cat.columns))
print('Kolom Numerical :', len(df_num.columns))
print('Kolom Datetime :', len(df_date.columns))

Kolom Categorical : 9
Kolom Numerical : 8
Kolom Datetime : 1


In [14]:
del df_cat, df_date, df_num

## Check Missing Value

In [15]:
def missing_value_summary(frame):
    """Summarise missing values per column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        The table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` with the columns ``Column``,
        ``Total Missing`` (number of null cells) and ``%`` (share of rows
        that are null, rounded to one decimal), ordered by
        ``Total Missing`` descending.
    """
    # Compute the null mask once; it is the expensive part on a large frame.
    null_counts = frame.isnull().sum()
    total_missing = null_counts.sort_values(ascending=False)
    percent_missing = (null_counts / len(frame) * 100).round(1).sort_values(ascending=False)
    # concat aligns both series on the column label, so the independent
    # sorting of the two series cannot mismatch values.
    summary = pd.concat([total_missing, percent_missing], axis=1, keys=['Total Missing', '%'])
    return summary.reset_index().rename(columns={'index': 'Column'})


# Check missing data in the raw frame
missing_data = missing_value_summary(df)
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,301131,7.7
1,sTos,30137,0.8
2,Sport,27890,0.7
3,Dport,13553,0.3
4,State,26,0.0
5,StartTime,0,0.0
6,Dur,0,0.0
7,Dir,0,0.0
8,Proto,0,0.0
9,SrcAddr,0,0.0


## Check Columns

### Dir

In [16]:
# Frequency table of the flow-direction markers.
df['Dir'].value_counts().reset_index()

Unnamed: 0,Dir,count
0,<->,2590345
1,->,1253544
2,<-,20430
3,<?>,12659
4,?>,7372
5,who,1436
6,<?,6


### State

In [17]:
# Frequency table of the State values.
df['State'].value_counts().reset_index()

Unnamed: 0,State,count
0,CON,2585301
1,FSPA_FSPA,552923
2,INT,161125
3,SRPA_FSPA,72090
4,S_,70696
...,...,...
294,SRPAEC_SAE,1
295,SRPAC_SRPA,1
296,FRPA_A,1
297,FPU_,1


### BotnetName

In [18]:
# Frequency table of the BotnetName values.
df['BotnetName'].value_counts().reset_index()

Unnamed: 0,BotnetName,count
0,-,3591792
1,neris,220000
2,virut,38000
3,murlo,14000
4,rbot,13000
5,nsis.ay,9000


### SrcAddr

In [19]:
# Frequency table of source addresses.
df['SrcAddr'].value_counts().reset_index()

Unnamed: 0,SrcAddr,count
0,147.32.84.138,657515
1,147.32.84.59,600006
2,147.32.85.25,136842
3,147.32.84.229,130008
4,147.32.86.20,93943
...,...,...
448304,109.122.133.254,1
448305,178.110.127.175,1
448306,78.39.237.43,1
448307,87.97.145.248,1


### DstAddr

In [20]:
# Frequency table of destination addresses.
df['DstAddr'].value_counts().reset_index()

Unnamed: 0,DstAddr,count
0,147.32.80.9,1618579
1,147.32.84.229,456705
2,147.32.86.165,309233
3,147.32.80.13,32222
4,147.32.84.118,28649
...,...,...
160057,94.181.195.49,1
160058,178.234.197.26,1
160059,178.46.206.218,1
160060,66.90.206.14,1


In [21]:
# Frequency table of the protocols seen in the raw frame.
df['Proto'].value_counts().reset_index()

Unnamed: 0,Proto,count
0,udp,2744470
1,tcp,1054886
2,icmp,75155
3,igmp,5221
4,rtp,2389
5,rtcp,1944
6,arp,1432
7,ipv6-icmp,165
8,ipv6,42
9,ipx/spx,36


## Feature Selection

In [22]:
# Keep only the flow-level features used downstream. The address, port,
# direction, state, ToS, BotnetName and SensorId columns are left out.
selected_columns = [
    'StartTime',
    'Dur',
    'Proto',
    'TotPkts',
    'TotBytes',
    'SrcBytes',
    'Label',
    'ActivityLabel',
]

# An explicit .copy() makes `dataset` an independent frame, so the columns
# added to it later can never write through to (or warn about) `df`.
dataset = df[selected_columns].copy()

In [23]:
dataset.shape

(3885792, 8)

In [24]:
dataset.columns

Index(['StartTime', 'Dur', 'Proto', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label',
       'ActivityLabel'],
      dtype='object')

In [25]:
# Check missing data in the selected features: null count and null
# percentage (one decimal) per column, highest count first.
null_counts = dataset.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(dataset) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = (
    pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
    .reset_index()
    .rename(columns={'index': 'Column'})
)
missing_data

Unnamed: 0,Column,Total Missing,%
0,StartTime,0,0.0
1,Dur,0,0.0
2,Proto,0,0.0
3,TotPkts,0,0.0
4,TotBytes,0,0.0
5,SrcBytes,0,0.0
6,Label,0,0.0
7,ActivityLabel,0,0.0


In [19]:
# state_missing = dataset[dataset['State'].isnull()]
# state_missing

In [20]:
# del state_missing

In [21]:
# dataset = dataset.dropna(axis=0, subset=['State'])

In [22]:
# dataset.head()

## Feature Extraction

### StartTime

In [26]:
# Replace the StartTime timestamp with its hour, minute and second parts.
# The date itself is not carried over.
start_time = dataset['StartTime'].dt
dataset = (
    dataset
    .assign(
        StartTimeHour=start_time.hour,
        StartTimeMinute=start_time.minute,
        StartTimeSecond=start_time.second,
    )
    .drop(columns=['StartTime'])
)

In [27]:
dataset.head()

Unnamed: 0,Dur,Proto,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,StartTimeHour,StartTimeMinute,StartTimeSecond
0,0.0,icmp,1,190,190,flow=From-Botnet-V44-ICMP,1,9,0,0
1,0.13229,udp,2,133,72,flow=Background-Established-cmpgw-CVUT,0,9,0,0
2,3570.602295,tcp,260,19571,7824,flow=Background-Established-cmpgw-CVUT,0,9,0,0
3,3595.741455,ipx/spx,568,52954,52954,flow=Background,0,9,0,0
4,3467.739746,tcp,345,147540,37595,flow=Background,0,9,0,0


### Proto

In [28]:
# Frequency table of the protocols about to be one-hot encoded.
dataset['Proto'].value_counts().reset_index()

Unnamed: 0,Proto,count
0,udp,2744470
1,tcp,1054886
2,icmp,75155
3,igmp,5221
4,rtp,2389
5,rtcp,1944
6,arp,1432
7,ipv6-icmp,165
8,ipv6,42
9,ipx/spx,36


In [29]:
# One-hot encode Proto into 0/1 integer columns named Proto_<value>, and
# append them in place of the original text column.
one_hot_proto = pd.get_dummies(dataset['Proto'], prefix='Proto').astype(int)
dataset = pd.concat(
    [dataset.drop(columns=['Proto']), one_hot_proto],
    axis=1,
)

In [30]:
del one_hot_proto

In [31]:
dataset.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,StartTimeHour,StartTimeMinute,StartTimeSecond,Proto_arp,...,Proto_llc,Proto_pim,Proto_rarp,Proto_rsvp,Proto_rtcp,Proto_rtp,Proto_tcp,Proto_udp,Proto_udt,Proto_unas
0,0.0,1,190,190,flow=From-Botnet-V44-ICMP,1,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.13229,2,133,72,flow=Background-Established-cmpgw-CVUT,0,9,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3570.602295,260,19571,7824,flow=Background-Established-cmpgw-CVUT,0,9,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3595.741455,568,52954,52954,flow=Background,0,9,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3467.739746,345,147540,37595,flow=Background,0,9,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Dir

In [29]:
# pd.DataFrame(dataset['Dir'].value_counts()).reset_index()

In [31]:
# one_hot_dir = pd.get_dummies(dataset['Dir'], prefix='Dir').astype(int)
# dataset = pd.concat([dataset, one_hot_dir], axis=1)
# dataset = dataset.drop(columns=['Dir'])

In [32]:
# del one_hot_dir

In [34]:
# dataset.head()

### State

In [35]:
# pd.DataFrame(dataset['State'].value_counts()).reset_index()

In [36]:
# def sortState(state_str):
#     # Split on '_' and strip surrounding whitespace from each part
#     states = [proc.strip() for proc in state_str.split('_')]
#     # Sort the parts (plain string ordering)
#     sorted_states = sorted(states)
#     # Join the parts back together with '_'
#     return '_'.join(sorted_states)

In [37]:
# dataset['State'] = dataset['State'].apply(sortState)

In [38]:
# pd.DataFrame(dataset['State'].value_counts()).reset_index()

In [39]:
# one_hot_state = pd.get_dummies(dataset['State'], prefix='State').astype(int)
# dataset = pd.concat([dataset, one_hot_state], axis=1)
# dataset = dataset.drop(columns=['State'])

In [40]:
# del one_hot_state

In [41]:
# dataset.head()

## Labeling

### Target

In [32]:
# Frequency table of the raw flow labels.
dataset['Label'].value_counts().reset_index()

Unnamed: 0,Label,count
0,flow=To-Background-UDP-CVUT-DNS-Server,1553211
1,flow=Background-UDP-Established,908173
2,flow=Background-TCP-Established,408535
3,flow=Background-Established-cmpgw-CVUT,268040
4,flow=Background-UDP-Attempt,134922
...,...,...
157,flow=From-Normal-V44-UDP-CVUT-DNS-Server,8
158,flow=From-Normal-V50-MatLab-Server,6
159,flow=Normal-V50-HTTP-windowsupdate,6
160,flow=Normal-V44-HTTP-windowsupdate,5


In [33]:
def categorize_label(label):
    """Return 1 when a flow label mentions spam, otherwise 0.

    The match is a case-insensitive substring test for ``'spam'``.

    Parameters
    ----------
    label : str
        Raw flow label text, e.g. ``'flow=Background'``.

    Returns
    -------
    int
        1 if ``'spam'`` occurs anywhere in ``label`` (ignoring case), else 0.
        Non-string input such as ``None`` or NaN (a missing label) yields 0
        instead of raising ``AttributeError``.
    """
    # A missing label reaches this function as None/NaN, which has no .lower().
    if not isinstance(label, str):
        return 0
    return int('spam' in label.lower())

In [34]:
# Derive the binary isSpam target from the raw label text, then discard the
# text column.
dataset = (
    dataset
    .assign(isSpam=dataset['Label'].apply(categorize_label))
    .drop(columns=['Label'])
)


In [35]:
# ActivityLabel becomes isBotnet so both target columns share the is<Class> naming.
dataset = dataset.rename(columns={"ActivityLabel": "isBotnet"})

In [36]:
# Count the flows for every (isBotnet, isSpam) combination that occurs.
pair_sizes = dataset.groupby(['isBotnet', 'isSpam']).size()
grouped_df = pair_sizes.rename('count').reset_index()
grouped_df

Unnamed: 0,isBotnet,isSpam,count
0,0,0,3591792
1,1,0,271000
2,1,1,23000


In [37]:
del grouped_df

In [38]:
dataset.dtypes

Dur                float64
TotPkts              int64
TotBytes             int64
SrcBytes             int64
isBotnet             int64
StartTimeHour        int32
StartTimeMinute      int32
StartTimeSecond      int32
Proto_arp            int64
Proto_esp            int64
Proto_gre            int64
Proto_icmp           int64
Proto_igmp           int64
Proto_ipv6           int64
Proto_ipv6-icmp      int64
Proto_ipx/spx        int64
Proto_llc            int64
Proto_pim            int64
Proto_rarp           int64
Proto_rsvp           int64
Proto_rtcp           int64
Proto_rtp            int64
Proto_tcp            int64
Proto_udp            int64
Proto_udt            int64
Proto_unas           int64
isSpam               int64
dtype: object

In [39]:
dataset.shape

(3885792, 27)

In [40]:
dataset.head()

Unnamed: 0,Dur,TotPkts,TotBytes,SrcBytes,isBotnet,StartTimeHour,StartTimeMinute,StartTimeSecond,Proto_arp,Proto_esp,...,Proto_pim,Proto_rarp,Proto_rsvp,Proto_rtcp,Proto_rtp,Proto_tcp,Proto_udp,Proto_udt,Proto_unas,isSpam
0,0.0,1,190,190,1,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.13229,2,133,72,0,9,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3570.602295,260,19571,7824,0,9,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3595.741455,568,52954,52954,0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3467.739746,345,147540,37595,0,9,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [41]:
del df

In [42]:
dataset.to_csv('./Dataset.csv', index=False)