In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
%matplotlib inline

In [3]:
# Table display settings (all currently disabled)
# pd.set_option('display.max_columns',None)
# pd.set_option('display.max_rows',None)
# pd.set_option('display.precision', 2)
# pd.options.display.float_format = '{:,.2f}'.format

# Import Dataset

In [2]:
# Root folder of the dataset, then one sub-folder per sensor capture.
dataset_dir = './Dataset/NCC-2 Dataset Simultaneous Botnet Dataset/'
sensor1_path = f'{dataset_dir}sensor1/'
sensor2_path = f'{dataset_dir}sensor2/'
sensor3_path = f'{dataset_dir}sensor3/'
allsensor_path = f'{dataset_dir}all-sensors/'

In [3]:
os.listdir(sensor3_path)

['sensor3-acumulatedonHour-botnetOnly.png',
 'sensor3-activityAnalysisPerMinutes-botnetOnly.png',
 'sensor3-activityAnalysisPerMinutes.png',
 'sensor3-description.txt',
 'sensor3.binetflow',
 'sensor3-acumulatedonHour.png',
 'sensor3_botnet-only.binetflow',
 'sensor3_normal-only.binetflow']

# Sensor 3 .binetflow

In [4]:
# Load the complete sensor-3 capture into `df`.
# Sport/Dport hold decimal ports next to hex text such as '0x0303'. Reading
# them as str keeps one consistent type per column; with chunked type
# inference the same port could otherwise appear as both a number and a
# string and be counted as two different values.
df = pd.read_csv(
    sensor3_path + 'sensor3.binetflow',
    dtype={'Sport': str, 'Dport': str},
)

In [39]:
df.shape

(3885792, 18)

In [40]:
df.columns

Index(['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr',
       'Dport', 'State', 'sTos', 'dTos', 'TotPkts', 'TotBytes', 'SrcBytes',
       'Label', 'ActivityLabel', 'BotnetName', 'SensorId'],
      dtype='object')

In [41]:
df.dtypes

StartTime         object
Dur              float64
Proto             object
SrcAddr           object
Sport             object
Dir               object
DstAddr           object
Dport             object
State             object
sTos             float64
dTos             float64
TotPkts            int64
TotBytes           int64
SrcBytes           int64
Label             object
ActivityLabel      int64
BotnetName        object
SensorId           int64
dtype: object

In [42]:
df.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 09:00:00,0.0,icmp,147.32.84.165,0x0303,->,202.103.52.147,0xc413,URP,0.0,,1,190,190,flow=From-Botnet-V44-ICMP,1,rbot,3
1,2022-07-07 09:00:00,0.13229,udp,147.32.84.59,54866,<->,216.121.135.141,56313,CON,0.0,0.0,2,133,72,flow=Background-Established-cmpgw-CVUT,0,-,3
2,2022-07-07 09:00:00,3570.602295,tcp,147.32.84.59,44213,<?>,205.188.10.230,443,PA_PA,0.0,0.0,260,19571,7824,flow=Background-Established-cmpgw-CVUT,0,-,3
3,2022-07-07 09:00:00,3595.741455,ipx/spx,00:15:17:2c:e5:2d,,->,ff:ff:ff:ff:ff:ff,,INT,,,568,52954,52954,flow=Background,0,-,3
4,2022-07-07 09:00:00,3467.739746,tcp,69.63.180.46,80,<?>,147.32.85.124,53493,FPA_FPA,0.0,0.0,345,147540,37595,flow=Background,0,-,3


In [11]:
# Split the frame by dtype: object (text) columns vs. int64/float64 columns.
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(include=['int64', 'float64'])

In [12]:
# Number of columns in each dtype group ("Kolom" = column).
print('Kolom Categorical :', len(df_cat.columns))
print('Kolom Numerical :', len(df_num.columns))

Kolom Categorical : 10
Kolom Numerical : 8


In [13]:
# Release the two dtype-split frames; they were only used for the counts.
del df_cat, df_num

## Check Missing Values

In [14]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,301131,7.7
1,sTos,30137,0.8
2,Sport,27890,0.7
3,Dport,13553,0.3
4,State,26,0.0
5,StartTime,0,0.0
6,Dur,0,0.0
7,Dir,0,0.0
8,Proto,0,0.0
9,SrcAddr,0,0.0


## Check Columns

### Start Time

In [16]:
pd.DataFrame(df['StartTime'].value_counts()).reset_index()

Unnamed: 0,StartTime,count
0,2022-07-07 12:31:20,1353
1,2022-07-07 11:46:46,1178
2,2022-07-07 10:48:46,1165
3,2022-07-07 12:01:48,1144
4,2022-07-07 09:54:08,1140
...,...,...
28796,2022-07-07 15:57:24,4
28797,2022-07-07 15:57:57,3
28798,2022-07-07 15:59:20,3
28799,2022-07-07 15:08:10,2


### Dir

In [13]:
pd.DataFrame(df['Dir'].value_counts()).reset_index()

Unnamed: 0,Dir,count
0,<->,2590345
1,->,1253544
2,<-,20430
3,<?>,12659
4,?>,7372
5,who,1436
6,<?,6


### State

In [14]:
pd.DataFrame(df['State'].value_counts()).reset_index()

Unnamed: 0,State,count
0,CON,2585301
1,FSPA_FSPA,552923
2,INT,161125
3,SRPA_FSPA,72090
4,S_,70696
...,...,...
294,SRPAEC_SAE,1
295,SRPAC_SRPA,1
296,FRPA_A,1
297,FPU_,1


### BotnetName

In [18]:
pd.DataFrame(df['BotnetName'].value_counts()).reset_index()

Unnamed: 0,BotnetName,count
0,-,3591792
1,neris,220000
2,virut,38000
3,murlo,14000
4,rbot,13000
5,nsis.ay,9000


### SrcAddr

In [19]:
pd.DataFrame(df['SrcAddr'].value_counts()).reset_index()

Unnamed: 0,SrcAddr,count
0,147.32.84.138,657515
1,147.32.84.59,600006
2,147.32.85.25,136842
3,147.32.84.229,130008
4,147.32.86.20,93943
...,...,...
448304,109.122.133.254,1
448305,178.110.127.175,1
448306,78.39.237.43,1
448307,87.97.145.248,1


### DstAddr

In [20]:
pd.DataFrame(df['DstAddr'].value_counts()).reset_index()

Unnamed: 0,DstAddr,count
0,147.32.80.9,1618579
1,147.32.84.229,456705
2,147.32.86.165,309233
3,147.32.80.13,32222
4,147.32.84.118,28649
...,...,...
160057,94.181.195.49,1
160058,178.234.197.26,1
160059,178.46.206.218,1
160060,66.90.206.14,1


### Proto

In [21]:
pd.DataFrame(df['Proto'].value_counts()).reset_index()

Unnamed: 0,Proto,count
0,udp,2744470
1,tcp,1054886
2,icmp,75155
3,igmp,5221
4,rtp,2389
5,rtcp,1944
6,arp,1432
7,ipv6-icmp,165
8,ipv6,42
9,ipx/spx,36


### Sport

In [17]:
pd.DataFrame(df['Sport'].value_counts()).reset_index()

Unnamed: 0,Sport,count
0,13363,129446
1,12114,45115
2,0x0303,35460
3,7103,14264
4,80,13020
...,...,...
65304,0x3e32,1
65305,0xf9c5,1
65306,0xf8b4,1
65307,0x5ff5,1


### Dport

In [18]:
pd.DataFrame(df['Dport'].value_counts()).reset_index()

Unnamed: 0,Dport,count
0,53,1628436
1,80,631906
2,13363,424617
3,12114,288197
4,443,162614
...,...,...
68576,26524,1
68577,58717,1
68578,56115,1
68579,57074,1


### dTos

In [15]:
pd.DataFrame(df['dTos'].value_counts()).reset_index()

Unnamed: 0,dTos,count
0,0.0,3584032
1,3.0,341
2,2.0,247
3,1.0,41


### sTos

In [16]:
pd.DataFrame(df['sTos'].value_counts()).reset_index()

Unnamed: 0,sTos,count
0,0.0,3852195
1,3.0,1302
2,192.0,1113
3,2.0,667
4,1.0,378


## Labeling

In [5]:
pd.DataFrame(df['Label'].value_counts()).reset_index()

Unnamed: 0,Label,count
0,flow=To-Background-UDP-CVUT-DNS-Server,1553211
1,flow=Background-UDP-Established,908173
2,flow=Background-TCP-Established,408535
3,flow=Background-Established-cmpgw-CVUT,268040
4,flow=Background-UDP-Attempt,134922
...,...,...
157,flow=From-Normal-V44-UDP-CVUT-DNS-Server,8
158,flow=From-Normal-V50-MatLab-Server,6
159,flow=Normal-V50-HTTP-windowsupdate,6
160,flow=Normal-V44-HTTP-windowsupdate,5


In [6]:
def categorize_label(label):
    """Map a flow label to its numeric traffic class.

    Args:
        label: Raw label text (e.g. ``'flow=From-Botnet-V44-ICMP'``), or a
            value that is already encoded as 0, 1 or 2.

    Returns:
        int: 2 when the text contains both 'botnet' and 'spam', 1 when it
        contains 'botnet' but not 'spam', 0 otherwise. Matching ignores
        case. An already-encoded 0/1/2 is returned unchanged.

    Raises:
        TypeError: If ``label`` is neither a string nor one of 0, 1, 2.
    """
    if not isinstance(label, str):
        # The notebook writes this function's output back into df['Label'],
        # so running that cell a second time passes integers in. Let them
        # through instead of failing on the missing ``.lower()``.
        if label in (0, 1, 2):
            return int(label)
        raise TypeError(f'Unsupported label value: {label!r}')

    text = label.lower()
    if 'botnet' not in text:
        return 0
    return 2 if 'spam' in text else 1

In [7]:
# Overwrite the textual Label column with its numeric class, element by element.
df['Label'] = df['Label'].map(categorize_label)

In [None]:
df.head()

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,ActivityLabel,BotnetName,SensorId
0,2022-07-07 09:00:00,0.0,icmp,147.32.84.165,0x0303,->,202.103.52.147,0xc413,URP,0.0,,1,190,190,1,1,rbot,3
1,2022-07-07 09:00:00,0.13229,udp,147.32.84.59,54866,<->,216.121.135.141,56313,CON,0.0,0.0,2,133,72,0,0,-,3
2,2022-07-07 09:00:00,3570.602295,tcp,147.32.84.59,44213,<?>,205.188.10.230,443,PA_PA,0.0,0.0,260,19571,7824,0,0,-,3
3,2022-07-07 09:00:00,3595.741455,ipx/spx,00:15:17:2c:e5:2d,,->,ff:ff:ff:ff:ff:ff,,INT,,,568,52954,52954,0,0,-,3
4,2022-07-07 09:00:00,3467.739746,tcp,69.63.180.46,80,<?>,147.32.85.124,53493,FPA_FPA,0.0,0.0,345,147540,37595,0,0,-,3


In [9]:
df['Label'].value_counts()

Label
0    3591792
1     271000
2      23000
Name: count, dtype: int64

## Cleaning Data

### Drop Duplicates

In [10]:
# Discard rows that exactly repeat an earlier row (all columns compared).
df = df.drop_duplicates()

In [11]:
df.shape

(3884201, 18)

### Drop Rows with Missing dTos (Label 0 only)

In [12]:
# Rows whose dTos is missing ("cek" = check).
cek = df.loc[df['dTos'].isna()]

In [13]:
cek['Label'].value_counts()

Label
0    259212
1     29562
2     11999
Name: count, dtype: int64

In [14]:
# Narrow the selection to Label 0 rows only.
cek = cek.loc[cek['Label'].eq(0)]

In [15]:
cek['Label'].value_counts()

Label
0    259212
Name: count, dtype: int64

In [16]:
# Remove the rows held in `cek` from df, matched by index label.
df = df.drop(index=cek.index)

In [17]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,41561,1.1
1,sTos,28660,0.8
2,Sport,21109,0.6
3,Dport,1756,0.0
4,State,10,0.0
5,StartTime,0,0.0
6,Dur,0,0.0
7,Dir,0,0.0
8,Proto,0,0.0
9,SrcAddr,0,0.0


### Drop Rows with Missing sTos

In [18]:
# Rows whose sTos is missing.
cek = df.loc[df['sTos'].isna()]

In [19]:
cek['Label'].value_counts()

Label
0    28660
Name: count, dtype: int64

In [20]:
# Remove every row held in `cek` from df, matched by index label.
df = df.drop(index=cek.index)

In [21]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,41561,1.2
1,Dport,1752,0.0
2,Sport,998,0.0
3,State,10,0.0
4,Dur,0,0.0
5,StartTime,0,0.0
6,Dir,0,0.0
7,SrcAddr,0,0.0
8,DstAddr,0,0.0
9,Proto,0,0.0


### Drop Rows with Missing Dport (Label 0 only)

In [22]:
# Rows whose Dport is missing.
cek = df.loc[df['Dport'].isna()]

In [23]:
cek['Label'].value_counts()

Label
1    1728
0      24
Name: count, dtype: int64

In [24]:
# Narrow the selection to Label 0 rows only.
cek = cek.loc[cek['Label'].eq(0)]

In [25]:
cek['Label'].value_counts()

Label
0    24
Name: count, dtype: int64

In [26]:
# Remove the rows held in `cek` from df, matched by index label.
df = df.drop(index=cek.index)

In [27]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,41561,1.2
1,Dport,1728,0.0
2,Sport,979,0.0
3,State,10,0.0
4,Dur,0,0.0
5,StartTime,0,0.0
6,Dir,0,0.0
7,SrcAddr,0,0.0
8,DstAddr,0,0.0
9,Proto,0,0.0


### Drop Rows with Missing Sport (Label 0 only)

In [28]:
# Rows whose Sport is missing.
cek = df.loc[df['Sport'].isna()]

In [29]:
cek['Label'].value_counts()

Label
1    973
0      6
Name: count, dtype: int64

In [30]:
# Narrow the selection to Label 0 rows only.
cek = cek.loc[cek['Label'].eq(0)]

In [31]:
cek['Label'].value_counts()

Label
0    6
Name: count, dtype: int64

In [32]:
# Remove the rows held in `cek` from df, matched by index label.
df = df.drop(index=cek.index)

In [33]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,41561,1.2
1,Dport,1728,0.0
2,Sport,973,0.0
3,State,10,0.0
4,Dur,0,0.0
5,StartTime,0,0.0
6,Dir,0,0.0
7,SrcAddr,0,0.0
8,DstAddr,0,0.0
9,Proto,0,0.0


### Check Missing State

In [34]:
# Rows whose State is missing.
cek = df.loc[df['State'].isna()]

In [35]:
cek['Label'].value_counts()

Label
1    10
Name: count, dtype: int64

In [36]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,dTos,41561,1.2
1,Dport,1728,0.0
2,Sport,973,0.0
3,State,10,0.0
4,Dur,0,0.0
5,StartTime,0,0.0
6,Dir,0,0.0
7,SrcAddr,0,0.0
8,DstAddr,0,0.0
9,Proto,0,0.0


## Imputing

### dTos

In [37]:
df['dTos'].value_counts()

dTos
0.0    3554109
3.0        341
2.0        247
1.0         41
Name: count, dtype: int64

In [38]:
# Rows whose dTos is still missing.
cek = df.loc[df['dTos'].isna()]

In [39]:
cek['Label'].value_counts()

Label
1    29562
2    11999
Name: count, dtype: int64

In [40]:
# Fill each missing dTos with the most frequent dTos of the row's own Label.
# Both modes are taken before any value is written back.
label_is_1 = df['Label'].eq(1)
label_is_2 = df['Label'].eq(2)
mode_value1 = df.loc[label_is_1, 'dTos'].mode()[0]
mode_value2 = df.loc[label_is_2, 'dTos'].mode()[0]

df.loc[df['dTos'].isna() & label_is_1, 'dTos'] = mode_value1
df.loc[df['dTos'].isna() & label_is_2, 'dTos'] = mode_value2

In [41]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,Dport,1728,0.0
1,Sport,973,0.0
2,State,10,0.0
3,StartTime,0,0.0
4,Dur,0,0.0
5,Proto,0,0.0
6,Dir,0,0.0
7,SrcAddr,0,0.0
8,DstAddr,0,0.0
9,sTos,0,0.0


### Dport

In [42]:
df['Dport'].value_counts()

Dport
53        1628176
80         623420
13363      424232
12114      246831
443        156707
           ...   
0x004d          1
0x52d3          1
58367           1
0x71c4          1
16687           1
Name: count, Length: 50627, dtype: int64

In [43]:
# Rows whose Dport is still missing.
cek = df.loc[df['Dport'].isna()]

In [44]:
cek['Label'].value_counts()

Label
1    1728
Name: count, dtype: int64

In [45]:
# The most frequent Dport among Label 1 rows becomes the fill value.
# NOTE: fillna writes it into every remaining null Dport, whatever the row's
# Label is; the check just before this cell lists only Label 1 among them.
mode_value = df.loc[df['Label'] == 1, 'Dport'].mode()[0]
df['Dport'] = df['Dport'].fillna(value=mode_value)

### Sport

In [46]:
df['Sport'].value_counts()

Sport
13363     113234
12114      39990
13815       6982
123         6680
0x0303      5934
           ...  
0x0581         1
0x7733         1
0xb69d         1
0xde02         1
0x316d         1
Name: count, Length: 65060, dtype: int64

In [47]:
# Rows whose Sport is still missing.
cek = df.loc[df['Sport'].isna()]

In [48]:
cek['Label'].value_counts()

Label
1    973
Name: count, dtype: int64

In [49]:
# The most frequent Sport among Label 1 rows becomes the fill value.
# NOTE: fillna writes it into every remaining null Sport, whatever the row's
# Label is; the check just before this cell lists only Label 1 among them.
mode_value = df.loc[df['Label'] == 1, 'Sport'].mode()[0]
df['Sport'] = df['Sport'].fillna(value=mode_value)

### State

In [50]:
df['State'].value_counts()

State
CON           2584422
FSPA_FSPA      552722
SRPA_FSPA       72085
SRPA_SPA        66905
FSA_FSA         49890
               ...   
SRC                 1
FSA_FRPA            1
SR_FSPA             1
IRQ                 1
SRPAEC_SAE          1
Name: count, Length: 259, dtype: int64

In [51]:
# Rows whose State is still missing.
cek = df.loc[df['State'].isna()]

In [52]:
cek['Label'].value_counts()

Label
1    10
Name: count, dtype: int64

In [53]:
# The most frequent State among Label 1 rows becomes the fill value.
# NOTE: fillna writes it into every remaining null State, whatever the row's
# Label is; the check just before this cell lists only Label 1 among them.
mode_value = df.loc[df['Label'] == 1, 'State'].mode()[0]
df['State'] = df['State'].fillna(value=mode_value)

In [54]:
# Missing-value report: null count and percentage per column.
# The null counts are computed once and reused for both columns of the report.
null_counts = df.isnull().sum()
total_missing = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total_missing, percent_2], axis=1, keys=['Total Missing', '%'])
missing_data = missing_data.reset_index().rename(columns={'index': 'Column'})
missing_data

Unnamed: 0,Column,Total Missing,%
0,StartTime,0,0.0
1,Dur,0,0.0
2,Proto,0,0.0
3,SrcAddr,0,0.0
4,Sport,0,0.0
5,Dir,0,0.0
6,DstAddr,0,0.0
7,Dport,0,0.0
8,State,0,0.0
9,sTos,0,0.0


## Categorical Encoding

In [55]:
# df.to_csv('./Dataset/df_clean.csv')

In [55]:
# Columns of df that make up `dataset`, in the order they should appear.
selected_columns = [
    'StartTime',
    'SrcAddr', 'DstAddr',
    'Sport', 'Dport',
    'sTos', 'dTos',
    'Dir', 'State', 'Proto',
    'Dur', 'TotPkts', 'TotBytes', 'SrcBytes',
    'BotnetName', 'Label',
]
# Independent copy, so later column assignments never touch df.
dataset = df.loc[:, selected_columns].copy()

### Start Time

In [56]:
# Parse StartTime (values that cannot be parsed become NaT) and keep only its
# hour / minute / second parts as separate columns; StartTime itself is dropped.
start_time = pd.to_datetime(dataset['StartTime'], errors='coerce')
dataset = dataset.assign(
    StartTimeHour=start_time.dt.hour,
    StartTimeMinute=start_time.dt.minute,
    StartTimeSecond=start_time.dt.second,
).drop(columns=['StartTime'])

### SrcAddr & DstAddr & Sport & Dport

In [57]:
# Frequency Encoding
def frequency_encoding(df):
    """Replace every value of a Series by its number of occurrences.

    Args:
        df: The pandas Series to encode. Despite the name, the calls in this
            notebook pass a single column, not a DataFrame.

    Returns:
        A Series aligned with the input whose entries are the occurrence
        counts reported by ``value_counts()``. Values that ``value_counts()``
        does not count (missing values) come back as NaN.
    """
    occurrences = df.value_counts()
    return df.map(occurrences)

In [58]:
# Swap each address / port value for how often it occurs in its own column.
for column in ['SrcAddr', 'DstAddr', 'Sport', 'Dport']:
    dataset[column] = frequency_encoding(dataset[column])

### Dir & State & sTos & dTos & BotnetName

In [59]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original values (label_encoders[col].inverse_transform).
# Refitting a single shared encoder would keep only the last column's mapping.
label_encoders = {}
for column in ['Dir', 'State', 'sTos', 'dTos', 'BotnetName']:
    le = LabelEncoder()
    # Values are encoded as strings, so the codes follow string sort order
    # (for sTos, '192.0' sorts before '2.0').
    dataset[column] = le.fit_transform(dataset[column].astype(str))
    label_encoders[column] = le
# After the loop `le` is the encoder fitted on BotnetName.

### Proto

In [60]:
pd.DataFrame(dataset['Proto'].value_counts()).reset_index()

Unnamed: 0,Proto,count
0,udp,2602758
1,tcp,977074
2,icmp,12367
3,rtp,2331
4,rtcp,1762
5,udt,7


In [61]:
# One Hot Encoding
# One 0/1 integer column per protocol, appended after the remaining columns;
# the original Proto column is removed.
one_hot_proto = pd.get_dummies(dataset['Proto'], prefix='Proto', dtype=int)
dataset = pd.concat([dataset.drop(columns=['Proto']), one_hot_proto], axis=1)

In [62]:
dataset.shape

(3596299, 23)

In [63]:
dataset.head()

Unnamed: 0,SrcAddr,DstAddr,Sport,Dport,sTos,dTos,Dir,State,Dur,TotPkts,...,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,Proto_icmp,Proto_rtcp,Proto_rtp,Proto_tcp,Proto_udp,Proto_udt
0,86658,47,6907,300,0,0,0,250,0.0,1,...,1,9,0,0,1,0,0,0,0,0
1,587582,6,667,10,0,0,3,9,0.13229,2,...,0,9,0,0,0,0,0,0,1,0
2,587582,19,48,156707,0,0,5,124,3570.602295,260,...,0,9,0,0,0,0,0,1,0,0
4,7,69,4169,2,0,0,5,25,3467.739746,345,...,0,9,0,0,0,0,0,1,0,0
5,9327,1996,102,625148,0,0,0,78,115.104431,10,...,0,9,0,0,0,0,0,1,0,0


In [64]:
# Binary flags derived from the 3-class Label, computed with vectorised
# comparisons rather than a per-row Python lambda:
#   isBotnet = 1 for Label 1 or 2, isSpam = 1 for Label 2 only.
dataset['isBotnet'] = dataset['Label'].isin([1, 2]).astype('int64')
dataset['isSpam'] = dataset['Label'].eq(2).astype('int64')

In [65]:
dataset.columns

Index(['SrcAddr', 'DstAddr', 'Sport', 'Dport', 'sTos', 'dTos', 'Dir', 'State',
       'Dur', 'TotPkts', 'TotBytes', 'SrcBytes', 'BotnetName', 'Label',
       'StartTimeHour', 'StartTimeMinute', 'StartTimeSecond', 'Proto_icmp',
       'Proto_rtcp', 'Proto_rtp', 'Proto_tcp', 'Proto_udp', 'Proto_udt',
       'isBotnet', 'isSpam'],
      dtype='object')

In [66]:
# dataset.to_csv('./Dataset/Dataset.csv', index=False)