In [1]:
import pandas as pd
import numpy as np

### Train data

In [38]:
# Load the raw training flows. Column names are supplied explicitly, so
# pandas treats every line of the file (including the first) as data.
df_train = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/training_data',
    sep=',',
    header=None,
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes',
    ],
)

In [3]:
# Per column: does at least one missing value exist?
df_train.isnull().any()

Timestamp          False
Duration           False
Protocol           False
SrcIPAddress       False
SrcPort             True
Direction          False
DestIPAddress      False
DestPort            True
State               True
SrcTypeService      True
DestTypeService     True
TotalPackets       False
TotalBytes         False
SourceBytes        False
dtype: bool

In [4]:
# Number of distinct non-null values in each column.
df_train.nunique(axis=0, dropna=True)

Timestamp          13881753
Duration            3437518
Protocol                 14
SrcIPAddress          10811
SrcPort               97555
Direction                10
DestIPAddress        589070
DestPort              95865
State                   373
SrcTypeService            5
DestTypeService           4
TotalPackets          12327
TotalBytes           224601
SourceBytes           82174
dtype: int64

In [5]:
# Clean-up of the 'Direction' field: start by listing the raw spellings present.
df_train['Direction'].unique()

array(['->', '<->', '   ->', '  <?>', '  <->', '   ?>', '  <-', '  who',
       '  <?', '<?>'], dtype=object)

In [6]:
# Mapping from the whitespace-padded / partial spellings to canonical labels.
# The '?>', '<?' and 'who' variants are all folded into '<?>'.
dictionary = {
    '   ->': '->',
    '  <?>': '<?>',
    '  <->': '<->',
    '   ?>': '<?>',
    '  <-': '<-',
    '  who': '<?>',
    '  <?': '<?>',
}
# Apply the mapping to the 'Direction' column only; other columns are untouched.
df_train = df_train.replace(to_replace={"Direction": dictionary})
# Show how many flows carry each label after normalisation.
df_train['Direction'].value_counts()

<->    9055197
->     4605238
<-      146606
<?>      74994
Name: Direction, dtype: int64

In [7]:
# The '<?>' values of 'Direction' are filled in by the following cells.
# A temporary 'ratio' column (SourceBytes / TotalBytes) supports that step.
df_train['ratio'] = df_train['SourceBytes'].div(df_train['TotalBytes'])

In [8]:
# Mean SourceBytes/TotalBytes ratio for each direction label, printed in the
# order '->', '<->', '<?>', '<-'.
for direction in ('->', '<->', '<?>', '<-'):
    print(df_train.loc[df_train['Direction'] == direction, 'ratio'].mean())

0.5846801085715531
0.36214280955332806
0.6985354617227908
0.0


In [9]:
# Flows labelled '<-' usually have 'SourceBytes' equal to 0; inspect the first few.
df_train[df_train['Direction'] == '<-'].head(5)

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,ratio
3507,2012/12/01 13:11:29.310824,0.000396,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,RED,,0.0,3,194,0,0.0
3639,2012/12/01 13:11:30.311807,0.00039,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,2,196,0,0.0
3749,2012/12/01 13:11:31.315594,1e-05,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0,0.0
3916,2012/12/01 13:11:32.315600,0.000108,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,196,0,0.0
4000,2012/12/01 13:11:33.325606,5e-06,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0,0.0


In [10]:
# Rule derived from the 'ratio' column to fill in the '<?>' directions:
#   no source bytes          -> '<-'
#   otherwise ratio  > 0.5   -> '->'
#   otherwise ratio <= 0.5   -> '<->'
# The three masks are mutually exclusive, so the outcome matches applying the
# rules one after another on the rows that are still '<?>'.
is_unknown = df_train['Direction'] == '<?>'
no_source_bytes = df_train['SourceBytes'] == 0
mostly_source = df_train['ratio'] > 0.5
mostly_other = df_train['ratio'] <= 0.5

df_train.loc[is_unknown & no_source_bytes, 'Direction'] = '<-'
df_train.loc[is_unknown & ~no_source_bytes & mostly_source, 'Direction'] = '->'
df_train.loc[is_unknown & ~no_source_bytes & mostly_other, 'Direction'] = '<->'

In [11]:
# Remove the temporary 'ratio' helper column.
# `columns=` is used instead of a positional axis: passing `axis` positionally
# to DataFrame.drop is deprecated since pandas 1.3 and rejected by pandas 2.0.
df_train = df_train.drop(columns='ratio')
# Check which direction labels remain after the imputation step.
df_train['Direction'].value_counts()

<->    9074226
->     4659678
<-      148131
Name: Direction, dtype: int64

In [39]:
# Derive per-second rates by dividing each volume column by the flow duration.
# A zero 'Duration' with a non-zero volume yields inf rather than raising.
for volume_col, rate_col in (
    ('TotalPackets', 'PacketsSec'),
    ('TotalBytes', 'TotalBytesSec'),
    ('SourceBytes', 'SourceBytesSec'),
):
    df_train[rate_col] = df_train[volume_col] / df_train['Duration']

In [17]:
# Persist the processed training frame as CSV, without the index column.
df_train.to_csv(
    path_or_buf='../../ml-data/cyberattack_detection/1_raw_data/A2_1/clean_training_data',
    sep=',',
    index=False,
)

### Test data

In [19]:
# Load the raw test flows. Column names are supplied explicitly, so pandas
# treats every line of the file (including the first) as data.
df_test = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/test_data',
    sep=',',
    header=None,
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes',
    ],
)

In [20]:
# (number of rows, number of columns) of the test frame.
(len(df_test.index), len(df_test.columns))

(1053845, 14)

In [21]:
# Normalise the 'Direction' labels of the test flows with the same
# `dictionary` mapping that was applied to the training data.
df_test = df_test.replace(to_replace={"Direction": dictionary})
# Show how many flows carry each label after normalisation.
df_test['Direction'].value_counts()

->     844228
<->    207315
<?>      2298
<-          4
Name: Direction, dtype: int64

In [22]:
# The '<?>' values of 'Direction' are filled in by the next cell.
# A temporary 'ratio' column (SourceBytes / TotalBytes) supports that step.
df_test['ratio'] = df_test['SourceBytes'].div(df_test['TotalBytes'])

In [23]:
# Rule based on the 'ratio' column to fill in the '<?>' directions.
# The statements run in order and each one only touches rows still labelled '<?>':
#   1. no source bytes        -> '<-'
#   2. ratio  > 0.5           -> '->'
#   3. ratio <= 0.5           -> '<->'
df_test.loc[(df_test['SourceBytes'] == 0) & (df_test['Direction'] == '<?>'), 'Direction'] = '<-'
df_test.loc[(df_test['ratio'] > 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '->'
df_test.loc[(df_test['ratio'] <= 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '<->'
# Remove the temporary helper column. `columns=` replaces the positional axis
# argument, which is deprecated since pandas 1.3 and rejected by pandas 2.0.
df_test = df_test.drop(columns='ratio')

In [24]:
# Per-second rates: each volume column divided by the flow duration.
# A zero 'Duration' with a non-zero volume yields inf rather than raising.
flow_duration = df_test['Duration']
df_test['PacketsSec'] = df_test['TotalPackets'].div(flow_duration)
df_test['TotalBytesSec'] = df_test['TotalBytes'].div(flow_duration)
df_test['SourceBytesSec'] = df_test['SourceBytes'].div(flow_duration)

In [25]:
# Persist the processed test frame as CSV, without the index column.
df_test.to_csv(
    path_or_buf='../../ml-data/cyberattack_detection/1_raw_data/A2_1/clean_test_data',
    sep=',',
    index=False,
)

### Validation data

In [34]:
# Load the labelled validation flows: the same columns as the train/test
# files plus a trailing 'Label' column. Names are supplied explicitly, so
# every line of the file is read as data.
# NOTE(review): this cell emitted a warning when it was run; it is presumably
# pandas' mixed-dtype warning for the port columns — confirm, and consider
# passing explicit dtypes.
# NOTE(review): unlike the train and test sections, no 'Direction' clean-up is
# applied to this frame in the visible cells — confirm that this is intended.
df_valid = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/valid_data_with_labels',
    sep=',',
    header=None,
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
    ],
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [35]:
# Derive per-second rates by dividing each volume column by the flow duration.
# A zero 'Duration' with a non-zero volume yields inf rather than raising.
for volume_col, rate_col in (
    ('TotalPackets', 'PacketsSec'),
    ('TotalBytes', 'TotalBytesSec'),
    ('SourceBytes', 'SourceBytesSec'),
):
    df_valid[rate_col] = df_valid[volume_col] / df_valid['Duration']

In [37]:
# Summary statistics of the numeric columns, leaving out rows whose
# 'PacketsSec' is +inf; every statistic is rendered with the 'g' format
# so large and small values stay readable.
(
    df_valid[df_valid['PacketsSec'] != np.inf]
    .describe()
    .apply(lambda column: column.map(lambda value: format(value, 'g')))
)

Unnamed: 0,Duration,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,PacketsSec,TotalBytesSec,SourceBytesSec
count,658488.0,652952.0,423478.0,658488.0,658488.0,658488.0,658488.0,658488.0,658488.0
mean,88.1395,7.81068e-05,0.0001346,26.2775,18965.2,11610.9,759.433,74148.9,20869.9
std,373.845,0.00883747,0.0166213,2300.06,2260240.0,886891.0,5444.34,389132.0,150658.0
min,4e-06,0.0,0.0,1.0,57.0,0.0,0.000591361,0.0372557,0.0
25%,0.117642,0.0,0.0,3.0,187.0,78.0,0.4486,20.9855,20.8273
50%,1.35517,0.0,0.0,4.0,282.0,182.0,4.99625,272.759,137.58
75%,9.04036,0.0,0.0,7.0,387.0,193.0,31.9613,3198.16,1028.58
max,3946.07,1.0,3.0,1552900.0,1620170000.0,150341000.0,1000000.0,68888900.0,27166700.0


In [32]:
# Frequency of each label in the validation set, most common first.
df_valid['Label'].value_counts(sort=True, ascending=False)

flow=From-Botnet-V54-TCP-Attempt                            134557
flow=From-Botnet-V42-UDP-DNS                                125788
flow=From-Botnet-V44-TCP-Attempt                             71833
flow=From-Botnet-V54-UDP-DNS                                 45138
flow=From-Botnet-V51-3-ICMP                                  41452
                                                             ...  
flow=From-Botnet-V52-1-UDP-Attempt                               1
flow=From-Botnet-V52-1-TCP-Established                           1
flow=To-Normal-V46-UDP-NTP-server                                1
flow=From-Botnet-V52-1-TCP-HTTP-Google-Net-Established-6         1
flow=From-Botnet-V51-10-TCP-WEB-Established                      1
Name: Label, Length: 459, dtype: int64


In [40]:
# Summary statistics of the numeric columns, leaving out rows whose
# 'PacketsSec' is +inf; every statistic is rendered with the 'g' format
# so large and small values stay readable.
(
    df_train[df_train['PacketsSec'] != np.inf]
    .describe()
    .apply(lambda column: column.map(lambda value: format(value, 'g')))
)

Unnamed: 0,Duration,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,PacketsSec,TotalBytesSec,SourceBytesSec
count,13019400.0,12816000.0,12264600.0,13019400.0,13019400.0,13019400.0,13019400.0,13019400.0,13019400.0
mean,164.597,0.00420351,0.000516447,44.5322,36225.1,7375.1,7318.26,673527.0,219097.0
std,639.67,0.865268,0.0364768,3972.94,3930840.0,1715070.0,23848.5,2107500.0,1503020.0
min,1e-06,0.0,0.0,1.0,57.0,0.0,0.000529761,0.0306477,0.0
25%,0.000256,0.0,0.0,2.0,211.0,78.0,16.0199,2669.18,483.242
50%,0.000402,0.0,0.0,3.0,252.0,83.0,5649.72,598240.0,196931.0
75%,0.460705,0.0,0.0,6.0,599.0,288.0,9661.84,933921.0,310757.0
max,3937.87,192.0,3.0,4236940.0,4274630000.0,3346760000.0,3000000.0,1534500000.0,1558000000.0


End