In [1]:
# import library
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
from datetime import datetime
import os
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Table display settings: no column/row truncation, and floats rendered with
# two decimals plus thousands separators.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.precision = 2
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Relative path of the directory that holds the dataset CSV files. The
# trailing slash is required: file names are appended to this string with `+`
# in the cells below.
dataset_dir = '../Dataset/'

In [4]:
# Show what currently exists in the dataset directory.
os.listdir(path=dataset_dir)

['Dataset.csv',
 'NCC-2 Dataset Simultaneous Botnet Dataset',
 'Outdated',
 'sensor1_clean.csv',
 'sensor1_encoded.csv',
 'sensor1_isbotnet.csv',
 'sensor1_isspam.csv',
 'sensor1_test.csv',
 'sensor1_train.csv',
 'sensor2_clean.csv',
 'sensor2_encoded.csv',
 'sensor2_isbotnet.csv',
 'sensor2_isspam.csv',
 'sensor2_test.csv',
 'sensor2_train.csv',
 'sensor3_clean.csv',
 'sensor3_encoded.csv',
 'sensor3_isbotnet.csv',
 'sensor3_test.csv',
 'sensor3_train.csv']

# Split Data into Train and Test Sets

In [5]:
def split_data(df, test_size=0.25, random_state=42, stratify=None):
    """Split a dataset into a train partition and a test partition.

    Thin wrapper around ``sklearn.model_selection.train_test_split``. The
    defaults reproduce the previously hard-coded behaviour (25% test rows,
    seed 42, no stratification), so existing ``split_data(df)`` calls give
    the same result as before.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` as ``random_state``; keep it fixed
        for a reproducible split.
    stratify : str or array-like, optional
        If a string, it is treated as a column name and ``df[stratify]`` is
        used as the stratification labels. Any other non-None value is
        forwarded unchanged. ``None`` (default) disables stratification.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    # Allow callers to name the label column instead of passing the labels.
    if isinstance(stratify, str):
        stratify = df[stratify]

    train, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return train, test

# Sensor 1

In [6]:
# Load the encoded sensor 1 dataset.
df_1 = pd.read_csv(f'{dataset_dir}sensor1_encoded.csv')

In [7]:
# Partition sensor 1 into its train and test frames.
sensor1_train, sensor1_test = split_data(df=df_1)

In [10]:
# Persist both sensor 1 partitions; the row index is not written.
sensor1_train.to_csv(f'{dataset_dir}sensor1_train.csv', index=False)
sensor1_test.to_csv(f'{dataset_dir}sensor1_test.csv', index=False)

## isBotnet Training Data

In [8]:
# Start the isBotnet frame from an independent copy of the sensor 1 training
# split, then preview its first five rows.
isbotnet_1 = sensor1_train.copy(deep=True)
isbotnet_1.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
1928308,0.0,4,827329,80,3,1823076,1852324,16,0,0,2,214,81,0,11,33,47,0,0
3303216,173.58,4,1,25,3,1326350,1274027,16,0,0,4,1152,1032,0,13,21,20,0,0
5390,0.13,4,527737,963,3,1,3,16,0,0,2,470,143,0,9,0,21,0,0
4147372,0.31,3,103205,90,0,6173,653182,108,0,0,34,29101,798,0,14,40,27,0,0
876341,0.05,4,527737,167,3,1823076,1852324,16,0,0,2,454,78,0,10,4,49,0,0


In [9]:
# Build the isBotnet training frame without the 'Label' and 'isSpam' columns.
# It is derived from sensor1_train instead of being re-assigned from itself,
# which keeps this cell idempotent: re-running it no longer raises KeyError
# because the columns were already dropped.
isbotnet_1 = sensor1_train.drop(columns=['Label', 'isSpam'])

In [10]:
# Preview the first five rows of the sensor 1 isBotnet frame.
isbotnet_1.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet
1928308,0.0,4,827329,80,3,1823076,1852324,16,0,0,2,214,81,11,33,47,0
3303216,173.58,4,1,25,3,1326350,1274027,16,0,0,4,1152,1032,13,21,20,0
5390,0.13,4,527737,963,3,1,3,16,0,0,2,470,143,9,0,21,0
4147372,0.31,3,103205,90,0,6173,653182,108,0,0,34,29101,798,14,40,27,0
876341,0.05,4,527737,167,3,1823076,1852324,16,0,0,2,454,78,10,4,49,0


In [11]:
# Write the sensor 1 isBotnet frame to CSV; the row index is not written.
isbotnet_1.to_csv(f'{dataset_dir}sensor1_isbotnet.csv', index=False)

## isSpam Training Data

In [12]:
# From a copy of the sensor 1 training split, keep only the rows where
# isBotnet equals 1, then preview the first five of them.
isspam_1 = sensor1_train.copy(deep=True)
isspam_1 = isspam_1.loc[isspam_1['isBotnet'].eq(1)]
isspam_1.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
2010835,422.44,3,6948,7198,0,1822,20499,255,0,0,78,18613,2246,1,11,41,6,1,0
2966278,0.22,3,82656,73,0,925,653182,108,0,0,14,6619,3145,1,12,58,6,1,0
4146348,133.11,3,82656,348,0,3144,505,257,0,0,72,60352,1981,1,14,40,21,1,0
2099598,1.99,3,82656,115,0,298,141542,108,0,0,29,13130,2656,1,11,48,15,1,0
2055068,500.0,3,82656,1114,0,3133,653182,248,0,0,14,5498,531,1,11,44,37,1,0


In [13]:
# Build the isSpam training frame: rows of sensor1_train where isBotnet == 1,
# without the 'Label' and 'isBotnet' columns. It is rebuilt from
# sensor1_train instead of being re-assigned from itself, which keeps this
# cell idempotent: re-running it no longer raises KeyError because the
# columns were already dropped.
isspam_1 = (
    sensor1_train
    .loc[sensor1_train['isBotnet'] == 1]
    .drop(columns=['Label', 'isBotnet'])
)

In [15]:
# Preview the first five rows of the sensor 1 isSpam frame.
isspam_1.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,StartTimeHour,StartTimeMinute,StartTimeSecond,isSpam
2010835,422.44,3,6948,7198,0,1822,20499,255,0,0,78,18613,2246,11,41,6,0
2966278,0.22,3,82656,73,0,925,653182,108,0,0,14,6619,3145,12,58,6,0
4146348,133.11,3,82656,348,0,3144,505,257,0,0,72,60352,1981,14,40,21,0
2099598,1.99,3,82656,115,0,298,141542,108,0,0,29,13130,2656,11,48,15,0
2055068,500.0,3,82656,1114,0,3133,653182,248,0,0,14,5498,531,11,44,37,0


In [16]:
# Write the sensor 1 isSpam frame to CSV; the row index is not written.
isspam_1.to_csv(f'{dataset_dir}sensor1_isspam.csv', index=False)

# Sensor 2

In [6]:
# Load the encoded sensor 2 dataset.
df_2 = pd.read_csv(f'{dataset_dir}sensor2_encoded.csv')

In [7]:
# Partition sensor 2 into its train and test frames.
sensor2_train, sensor2_test = split_data(df=df_2)

In [9]:
# Persist both sensor 2 partitions; the row index is not written.
sensor2_train.to_csv(f'{dataset_dir}sensor2_train.csv', index=False)
sensor2_test.to_csv(f'{dataset_dir}sensor2_test.csv', index=False)

## isBotnet Training Data

In [10]:
# Start the isBotnet frame from an independent copy of the sensor 2 training
# split, then preview its first five rows.
isbotnet_2 = sensor2_train.copy(deep=True)
isbotnet_2.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
995993,0.0,4,1,19,3,1615327,1554370,16,0,0,2,136,76,0,9,48,10,0,0
3715901,0.0,4,1082471,109,3,2162514,2194973,16,0,0,2,214,81,0,12,35,33,0,0
4823423,0.58,4,3,34,3,1615327,1554370,16,0,0,4,452,141,0,14,34,8,0,0
4970915,0.0,4,3,72,3,1615327,198224,16,0,0,2,128,60,0,15,16,14,0,0
1198505,4.91,3,49,36,0,1615327,775304,108,0,0,14,1052,504,0,9,59,46,0,0


In [11]:
# Build the isBotnet training frame without the 'Label' and 'isSpam' columns.
# It is derived from sensor2_train instead of being re-assigned from itself,
# which keeps this cell idempotent: re-running it no longer raises KeyError
# because the columns were already dropped.
isbotnet_2 = sensor2_train.drop(columns=['Label', 'isSpam'])

In [12]:
# Preview the first five rows of the sensor 2 isBotnet frame.
isbotnet_2.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet
995993,0.0,4,1,19,3,1615327,1554370,16,0,0,2,136,76,9,48,10,0
3715901,0.0,4,1082471,109,3,2162514,2194973,16,0,0,2,214,81,12,35,33,0
4823423,0.58,4,3,34,3,1615327,1554370,16,0,0,4,452,141,14,34,8,0
4970915,0.0,4,3,72,3,1615327,198224,16,0,0,2,128,60,15,16,14,0
1198505,4.91,3,49,36,0,1615327,775304,108,0,0,14,1052,504,9,59,46,0


In [13]:
# Write the sensor 2 isBotnet frame to CSV; the row index is not written.
isbotnet_2.to_csv(f'{dataset_dir}sensor2_isbotnet.csv', index=False)

## isSpam Training Data

In [14]:
# From a copy of the sensor 2 training split, keep only the rows where
# isBotnet equals 1, then preview the first five of them.
isspam_2 = sensor2_train.copy(deep=True)
isspam_2 = isspam_2.loc[isspam_2['isBotnet'].eq(1)]
isspam_2.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
4777902,3596.97,4,27893,1788,3,14888,14889,16,0,0,901,73882,73800,1,14,27,9,1,0
2792028,9.01,3,103634,129,0,14,15935,302,0,0,3,186,186,2,11,25,18,1,1
4339715,15.63,3,103634,222,0,4000,13013,108,0,0,34,10028,5312,1,13,28,17,1,0
2132895,139.38,3,31786,1153,0,1028,198224,108,0,0,150,116103,4692,1,10,45,56,1,0
1226965,0.32,3,27879,199,0,1237,775304,108,0,0,10,1308,766,1,10,1,29,1,0


In [15]:
# Build the isSpam training frame: rows of sensor2_train where isBotnet == 1,
# without the 'Label' and 'isBotnet' columns. It is rebuilt from
# sensor2_train instead of being re-assigned from itself, which keeps this
# cell idempotent: re-running it no longer raises KeyError because the
# columns were already dropped.
isspam_2 = (
    sensor2_train
    .loc[sensor2_train['isBotnet'] == 1]
    .drop(columns=['Label', 'isBotnet'])
)

In [16]:
# Preview the first five rows of the sensor 2 isSpam frame.
isspam_2.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,StartTimeHour,StartTimeMinute,StartTimeSecond,isSpam
4777902,3596.97,4,27893,1788,3,14888,14889,16,0,0,901,73882,73800,14,27,9,0
2792028,9.01,3,103634,129,0,14,15935,302,0,0,3,186,186,11,25,18,1
4339715,15.63,3,103634,222,0,4000,13013,108,0,0,34,10028,5312,13,28,17,0
2132895,139.38,3,31786,1153,0,1028,198224,108,0,0,150,116103,4692,10,45,56,0
1226965,0.32,3,27879,199,0,1237,775304,108,0,0,10,1308,766,10,1,29,0


In [17]:
# Write the sensor 2 isSpam frame to CSV; the row index is not written.
isspam_2.to_csv(f'{dataset_dir}sensor2_isspam.csv', index=False)

# Sensor 3

In [6]:
# Load the encoded sensor 3 dataset.
df_3 = pd.read_csv(f'{dataset_dir}sensor3_encoded.csv')

In [7]:
# Partition sensor 3 into its train and test frames.
sensor3_train, sensor3_test = split_data(df=df_3)

In [8]:
# Persist both sensor 3 partitions; the row index is not written.
sensor3_train.to_csv(f'{dataset_dir}sensor3_train.csv', index=False)
sensor3_test.to_csv(f'{dataset_dir}sensor3_test.csv', index=False)

## isBotnet Training Data

In [9]:
# Start the isBotnet frame from an independent copy of the sensor 3 training
# split, then preview its first five rows.
isbotnet_3 = sensor3_train.copy(deep=True)
isbotnet_3.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
804734,0.0,4,4001,100,3,1615276,1628176,9,0,0,2,358,85,0,10,27,1,0,0
51629,0.0,4,9686,56,3,1615276,1628176,9,0,0,2,364,75,0,9,18,4,0,0
3042823,0.0,4,657086,59,3,1615276,1628176,9,0,0,2,304,83,0,13,46,33,0,0
216667,0.0,4,587582,94,3,1615276,1628176,9,0,0,2,196,75,0,9,34,20,0,0
825556,325.13,3,86658,636,0,987,4206,225,0,0,2022,1811163,54424,1,10,28,44,1,0


In [10]:
# Build the isBotnet training frame without the 'Label' and 'isSpam' columns.
# It is derived from sensor3_train instead of being re-assigned from itself,
# which keeps this cell idempotent: re-running it no longer raises KeyError
# because the columns were already dropped.
isbotnet_3 = sensor3_train.drop(columns=['Label', 'isSpam'])

In [11]:
# Preview the first five rows of the sensor 3 isBotnet frame.
isbotnet_3.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet
804734,0.0,4,4001,100,3,1615276,1628176,9,0,0,2,358,85,10,27,1,0
51629,0.0,4,9686,56,3,1615276,1628176,9,0,0,2,364,75,9,18,4,0
3042823,0.0,4,657086,59,3,1615276,1628176,9,0,0,2,304,83,13,46,33,0
216667,0.0,4,587582,94,3,1615276,1628176,9,0,0,2,196,75,9,34,20,0
825556,325.13,3,86658,636,0,987,4206,225,0,0,2022,1811163,54424,10,28,44,1


In [12]:
# Write the sensor 3 isBotnet frame to CSV; the row index is not written.
isbotnet_3.to_csv(f'{dataset_dir}sensor3_isbotnet.csv', index=False)

## isSpam Training Data

In [13]:
# From a copy of the sensor 3 training split, keep only the rows where
# isBotnet equals 1, then preview the first five of them.
isspam_3 = sensor3_train.copy(deep=True)
isspam_3 = isspam_3.loc[isspam_3['isBotnet'].eq(1)]
isspam_3.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,Label,StartTimeHour,StartTimeMinute,StartTimeSecond,isBotnet,isSpam
825556,325.13,3,86658,636,0,987,4206,225,0,0,2022,1811163,54424,1,10,28,44,1,0
1462385,3596.84,4,22897,1766,3,14888,14888,9,0,0,901,73882,73800,1,11,24,4,1,0
3529736,12.87,3,21901,87,0,150,625148,81,0,0,11,1613,624,1,16,11,47,1,0
743362,9.01,3,86658,148,0,4,14386,234,0,0,3,186,186,2,10,22,2,1,1
2078897,149.77,3,21927,94,0,6416,625148,219,0,0,10,1040,543,1,12,17,45,1,0


In [14]:
# Build the isSpam training frame: rows of sensor3_train where isBotnet == 1,
# without the 'Label' and 'isBotnet' columns. It is rebuilt from
# sensor3_train instead of being re-assigned from itself, which keeps this
# cell idempotent: re-running it no longer raises KeyError because the
# columns were already dropped.
isspam_3 = (
    sensor3_train
    .loc[sensor3_train['isBotnet'] == 1]
    .drop(columns=['Label', 'isBotnet'])
)

In [15]:
# Preview the first five rows of the sensor 3 isSpam frame.
isspam_3.head(n=5)

Unnamed: 0,Dur,Proto,SrcAddr,Sport,Dir,DstAddr,Dport,State,sTos,dTos,TotPkts,TotBytes,SrcBytes,StartTimeHour,StartTimeMinute,StartTimeSecond,isSpam
825556,325.13,3,86658,636,0,987,4206,225,0,0,2022,1811163,54424,10,28,44,0
1462385,3596.84,4,22897,1766,3,14888,14888,9,0,0,901,73882,73800,11,24,4,0
3529736,12.87,3,21901,87,0,150,625148,81,0,0,11,1613,624,16,11,47,0
743362,9.01,3,86658,148,0,4,14386,234,0,0,3,186,186,10,22,2,1
2078897,149.77,3,21927,94,0,6416,625148,219,0,0,10,1040,543,12,17,45,0


In [16]:
# Write the sensor 3 isSpam frame to CSV; the row index is not written.
isspam_3.to_csv(f'{dataset_dir}sensor3_isspam.csv', index=False)