In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Train data

In [7]:
# Load the raw training flows. The column names are supplied explicitly
# through `names`, in the order the fields appear in the file.
df_train = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_2/training_data_with_labels',
    sep=',',
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
    ],
)

In [8]:
# Per-column flag: True when the column holds at least one missing value.
df_train.isnull().any()

Timestamp          False
Duration           False
Protocol           False
SrcIPAddress       False
SrcPort             True
Direction          False
DestIPAddress      False
DestPort            True
State              False
SrcTypeService      True
DestTypeService     True
TotalPackets       False
TotalBytes         False
SourceBytes        False
Label              False
dtype: bool

In [9]:
# Number of distinct values in every column of the training frame.
df_train.nunique()

Timestamp          1978037
Duration            709791
Protocol                11
SrcIPAddress           118
SrcPort              43482
Direction                8
DestIPAddress       140273
DestPort             37611
State                  164
SrcTypeService           2
DestTypeService          4
TotalPackets          4670
TotalBytes           72745
SourceBytes          31733
Label                  903
dtype: int64

In [10]:
# Raw Direction values; some carry leading whitespace and need normalising.
df_train['Direction'].unique()

array(['<->', '->', '  <->', '  <?>', '   ->', '   ?>', '  <-', '  who'],
      dtype=object)

In [11]:
# Mapping from the raw, whitespace-padded Direction spellings to canonical
# symbols. Every spelling that is not a plain arrow maps to '<?>'.
dictionary = {
    '   ->': '->',
    '  <?>': '<?>',
    '  <->': '<->',
    '   ?>': '<?>',
    '  <-': '<-',
    '  who': '<?>',
    '  <?': '<?>',
}

In [12]:
# Apply the canonical spellings to the Direction column only.
df_train = df_train.replace(to_replace={"Direction": dictionary})

In [13]:
# Row count per Direction value after the spelling normalisation.
df_train['Direction'].value_counts()

<->    1370635
->      601333
<-        3250
<?>       2949
Name: Direction, dtype: int64

In [14]:
# Distinct Direction values that remain after the replacement.
df_train['Direction'].unique()

array(['<->', '->', '<?>', '<-'], dtype=object)

In [15]:
# Helper column: SourceBytes expressed as a fraction of TotalBytes.
df_train['ratio'] = df_train['SourceBytes'].div(df_train['TotalBytes'])

In [16]:
# Mean SourceBytes/TotalBytes ratio for each direction, printed one value per
# line in a fixed order: '->', '<->', '<?>', '<-'.
for direction in ('->', '<->', '<?>', '<-'):
    in_direction = df_train['Direction'] == direction
    print(df_train.loc[in_direction, 'ratio'].mean())

0.6685450445142166
0.3173837371275387
0.5828182718491834
0.0


In [17]:
# Peek at the first rows whose Direction is '<-'.
df_train[df_train['Direction'].eq('<-')].head()

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,Label,ratio
999,2011/08/17 12:01:12.365011,0.000419,icmp,147.32.87.249,,<-,147.32.87.11,0x0002,RED,,0.0,2,148,0,flow=To-Background-MatLab-Server,0.0
1078,2011/08/17 12:01:13.353704,0.000413,icmp,147.32.87.249,,<-,147.32.87.11,0x0002,RED,,0.0,2,148,0,flow=To-Background-MatLab-Server,0.0
1174,2011/08/17 12:01:14.353679,0.000411,icmp,147.32.87.249,,<-,147.32.87.11,0x0002,RED,,0.0,2,148,0,flow=To-Background-MatLab-Server,0.0
1246,2011/08/17 12:01:15.353681,0.000459,icmp,147.32.87.249,,<-,147.32.87.11,0x0002,RED,,0.0,2,148,0,flow=To-Background-MatLab-Server,0.0
1335,2011/08/17 12:01:16.353634,2.3e-05,icmp,147.32.87.249,,<-,147.32.87.11,0x0002,ECR,,0.0,2,148,0,flow=To-Background-MatLab-Server,0.0


In [18]:
# Re-label the rows whose Direction is '<?>':
#   * SourceBytes == 0      -> '<-'
#   * otherwise ratio > 0.5 -> '->'
#   * otherwise ratio <= 0.5 -> '<->'
# The masks are computed once and made mutually exclusive, so the order of
# the three assignments does not matter.
direction_unknown = df_train['Direction'].eq('<?>')
zero_source_bytes = df_train['SourceBytes'].eq(0)
needs_ratio_rule = direction_unknown & ~zero_source_bytes

df_train.loc[direction_unknown & zero_source_bytes, 'Direction'] = '<-'
df_train.loc[needs_ratio_rule & df_train['ratio'].gt(0.5), 'Direction'] = '->'
df_train.loc[needs_ratio_rule & df_train['ratio'].le(0.5), 'Direction'] = '<->'

In [19]:
# Row count per Direction value after the '<?>' rows have been re-labelled.
df_train['Direction'].value_counts()

<->    1372078
->      602839
<-        3250
Name: Direction, dtype: int64

In [20]:
# Remove the helper 'ratio' column for good. `DataFrame.drop` returns a new
# frame, so the result has to be assigned back; otherwise 'ratio' stays in
# `df_train` and ends up in every file written from it.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts, and `errors='ignore'` keeps the cell safe to re-run.
df_train = df_train.drop(columns='ratio', errors='ignore')
df_train

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,Label
0,2011/08/17 11:48:29.395153,0.000318,udp,147.124.93.156,1025,<->,147.32.89.100,53,CON,0.0,0.0,3,195,62,flow=From-Botnet-V50-1-UDP-DNS
1,2011/08/17 11:48:31.525252,0.039559,tcp,147.124.93.156,1027,->,74.125.241.185,80,SRPA_SPA,0.0,0.0,7,858,607,flow=From-Botnet-V50-1-TCP-HTTP-Google-Net-Est...
2,2011/08/17 11:48:31.877057,0.000423,udp,147.124.93.156,1025,<->,147.32.89.100,53,CON,0.0,0.0,2,560,90,flow=From-Botnet-V50-1-UDP-DNS
3,2011/08/17 11:48:34.224708,2.990280,udp,147.124.93.156,123,->,65.55.65.31,123,INT,0.0,,3,175,183,flow=From-Botnet-V50-1-UDP-Attempt
4,2011/08/17 11:48:34.767425,0.000541,udp,147.124.93.156,1025,<->,147.32.89.100,53,CON,0.0,0.0,2,472,77,flow=From-Botnet-V50-1-UDP-DNS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978162,2011/08/17 17:23:16.042445,0.316720,udp,147.72.101.192,1568,<->,147.32.97.8,53,CON,0.0,0.0,3,233,80,flow=From-Botnet-V50-10-UDP-DNS
1978163,2011/08/17 17:23:16.322385,0.000000,tcp,147.72.101.192,1170,->,117.53.131.2,25,S_,0.0,,1,60,60,flow=From-Botnet-V50-10-TCP-Attempt-SPAM
1978164,2011/08/17 17:23:16.325767,0.954443,tcp,147.72.101.192,1171,->,46.4.53.103,443,S_RA,0.0,0.0,6,364,181,flow=From-Botnet-V50-10-TCP-Attempt
1978165,2011/08/17 17:23:17.214008,0.000000,tcp,147.72.101.192,1172,->,211.43.214.125,25,S_,0.0,,2,59,60,flow=From-Botnet-V50-10-TCP-Attempt-SPAM


In [21]:
# Write the cleaned training frame to disk as CSV, without the row index.
df_train.to_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_2/clean_training_data_with_labels.csv',
    sep=',',
    index=False,
)

In [22]:
# Numeric feature columns that will be standardised.
df_train_num = df_train.loc[:, ['Duration', 'TotalPackets', 'TotalBytes', 'SourceBytes']]

In [23]:
# First rows of the numeric features before scaling.
df_train_num.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,0.000318,3,195,62
1,0.039559,7,858,607
2,0.000423,2,560,90
3,2.99028,3,175,183
4,0.000541,2,472,77


In [24]:
# Fit the scaler on the training features only. The fitted `scaler` is kept
# so that the same parameters can be applied to other data.
scaler = StandardScaler().fit(df_train_num)

# `transform` returns a plain array, so rebuild a DataFrame with the original
# row index and column names.
scaled_train_values = scaler.transform(df_train_num)
df_train_num = pd.DataFrame(
    scaled_train_values,
    index=df_train_num.index,
    columns=df_train_num.columns,
)

In [25]:
# First rows of the numeric features after standardisation.
df_train_num.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,-0.191842,-0.009924,-0.00823,-0.019033
1,-0.19176,-0.008454,-0.008001,-0.016594
2,-0.191842,-0.010291,-0.008104,-0.018907
3,-0.185597,-0.009924,-0.008237,-0.018491
4,-0.191842,-0.010291,-0.008134,-0.018965


In [26]:
# Largest Duration value in the scaled training features.
print(df_train_num['Duration'].max())

7.692548999124274


In [27]:
# Categorical feature columns that will be one-hot encoded.
df_train_cat = df_train.loc[:, ['Protocol', 'Direction', 'SrcTypeService', 'DestTypeService']]

In [28]:
# Use -1 as an explicit "missing" category so the encoder sees no NaN.
df_train_cat = df_train_cat.replace(to_replace=[np.nan], value=-1)

In [29]:
# First rows of the categorical features after missing values became -1.
df_train_cat.head()

Unnamed: 0,Protocol,Direction,SrcTypeService,DestTypeService
0,udp,<->,0.0,0.0
1,tcp,->,0.0,0.0
2,udp,<->,0.0,0.0
3,udp,->,0.0,-1.0
4,udp,<->,0.0,0.0


In [30]:
# Fit the one-hot encoder on the training categories. With
# handle_unknown='ignore', a category not seen here is encoded as all zeros
# at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(df_train_cat)

# Build the output column names ("<column>_<category>").
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is its replacement. Fall back to the old method so
# the notebook still runs on older installations.
categorical_columns = ['Protocol', 'Direction', 'SrcTypeService', 'DestTypeService']
if hasattr(encoder, 'get_feature_names_out'):
    features_names = encoder.get_feature_names_out(categorical_columns)
else:
    features_names = encoder.get_feature_names(categorical_columns)

In [31]:
# One-hot encode the training categories. The sparse result is densified and
# wrapped in a DataFrame that keeps the original row index.
encoded_train = encoder.transform(df_train_cat).toarray()
df_train_cat = pd.DataFrame(
    encoded_train,
    index=df_train_cat.index,
    columns=features_names,
)

In [32]:
# Column names of the one-hot encoded frame (taken from `features_names`).
df_train_cat.columns

Index(['Protocol_arp', 'Protocol_gre', 'Protocol_icmp', 'Protocol_igmp',
       'Protocol_ipv6', 'Protocol_pim', 'Protocol_rtcp', 'Protocol_rtp',
       'Protocol_tcp', 'Protocol_udp', 'Protocol_udt', 'Direction_->',
       'Direction_<-', 'Direction_<->', 'SrcTypeService_-1.0',
       'SrcTypeService_0.0', 'SrcTypeService_192.0', 'DestTypeService_-1.0',
       'DestTypeService_0.0', 'DestTypeService_1.0', 'DestTypeService_2.0',
       'DestTypeService_3.0'],
      dtype='object')

In [33]:
# First rows of the one-hot encoded categorical features.
df_train_cat.head()

Unnamed: 0,Protocol_arp,Protocol_gre,Protocol_icmp,Protocol_igmp,Protocol_ipv6,Protocol_pim,Protocol_rtcp,Protocol_rtp,Protocol_tcp,Protocol_udp,...,Direction_<-,Direction_<->,SrcTypeService_-1.0,SrcTypeService_0.0,SrcTypeService_192.0,DestTypeService_-1.0,DestTypeService_0.0,DestTypeService_1.0,DestTypeService_2.0,DestTypeService_3.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [58]:
# Row count per raw Label value, most frequent first.
df_train['Label'].value_counts()

flow=To-Background-UDP-CVUT-DNS-Server                   723449
flow=Background-Established-cmpgw-CVUT                   124394
flow=From-Botnet-V50-1-UDP-DNS                            90690
flow=Background-UDP-Established                           80322
flow=Background-TCP-Established                           74769
                                                          ...  
flow=From-Botnet-V50-6-TCP-HTTP-Not-Encrypted-Down-2          3
flow=From-Botnet-V50-10-UDP-Custom-Encryption-1               3
flow=From-Botnet-V50-10-TCP-HTTP-Not-Encrypted-Down-3         3
flow=From-Botnet-V50-6-TCP-Established-HTTP-Ad-59             3
flow=Normal-V50-HTTP-windowsupdate                            1
Name: Label, Length: 903, dtype: int64

In [57]:
# Binary target column "Normal": 0 when the raw Label contains "Botnet"
# (case-insensitive), 1 for every other label.
# The frame reuses df_train's index explicitly. `pd.concat(..., axis=1)`
# aligns rows on index, so a fresh 0..n-1 index would only line up with the
# feature frames as long as df_train itself still has a default index.
new_label = pd.DataFrame(
    data=np.where(df_train['Label'].str.contains("Botnet", case=False), 0, 1),
    columns=["Normal"],
    index=df_train.index,
)

In [59]:
# Assemble the final training table: scaled numeric features, one-hot
# categorical features and the binary target, side by side.
train_parts = [df_train_num, df_train_cat, new_label]
train_data = pd.concat(train_parts, axis='columns')

In [60]:
# First rows of the assembled training table.
train_data.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes,Protocol_arp,Protocol_gre,Protocol_icmp,Protocol_igmp,Protocol_ipv6,Protocol_pim,...,Direction_<->,SrcTypeService_-1.0,SrcTypeService_0.0,SrcTypeService_192.0,DestTypeService_-1.0,DestTypeService_0.0,DestTypeService_1.0,DestTypeService_2.0,DestTypeService_3.0,Normal
0,-0.191842,-0.009924,-0.00823,-0.019033,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,-0.19176,-0.008454,-0.008001,-0.016594,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,-0.191842,-0.010291,-0.008104,-0.018907,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
3,-0.185597,-0.009924,-0.008237,-0.018491,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
4,-0.191842,-0.010291,-0.008134,-0.018965,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0


In [61]:
# Write the processed training table to disk as CSV, without the row index.
train_data.to_csv(
    '../../ml-data/cyberattack_detection/2_processed_data/A2_training_data.csv',
    sep=',',
    index=False,
)

### Test data

In [62]:
# Load the raw test flows, using the same explicit column names as for the
# training file.
df_test = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_2/test_data_with_labels',
    sep=',',
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
    ],
)

In [63]:
# Normalise the Direction spellings with the same mapping used for training.
df_test = df_test.replace(to_replace={"Direction": dictionary})

In [64]:
# Row count per Direction value in the test set after normalisation.
df_test['Direction'].value_counts()

<->    313456
->     258225
<-       2468
<?>       234
Name: Direction, dtype: int64

In [65]:
# Helper column: SourceBytes expressed as a fraction of TotalBytes.
df_test['ratio'] = df_test['SourceBytes'].div(df_test['TotalBytes'])

In [66]:
# Re-label the test rows whose Direction is '<?>':
#   * SourceBytes == 0      -> '<-'
#   * otherwise ratio > 0.5 -> '->'
#   * otherwise ratio <= 0.5 -> '<->'
# The masks are computed once and made mutually exclusive, so the order of
# the three assignments does not matter.
direction_unknown = df_test['Direction'].eq('<?>')
zero_source_bytes = df_test['SourceBytes'].eq(0)
needs_ratio_rule = direction_unknown & ~zero_source_bytes

df_test.loc[direction_unknown & zero_source_bytes, 'Direction'] = '<-'
df_test.loc[needs_ratio_rule & df_test['ratio'].gt(0.5), 'Direction'] = '->'
df_test.loc[needs_ratio_rule & df_test['ratio'].le(0.5), 'Direction'] = '<->'

In [67]:
# Numeric feature columns of the test set, in the order the scaler was fit on.
df_test_num = df_test.loc[:, ['Duration', 'TotalPackets', 'TotalBytes', 'SourceBytes']]

In [68]:
# Scale the test features with the already-fitted `scaler`; it is only
# applied here, not re-fitted.
scaled_test_values = scaler.transform(df_test_num)
df_test_num = pd.DataFrame(
    scaled_test_values,
    index=df_test_num.index,
    columns=df_test_num.columns,
)

In [69]:
# First rows of the scaled numeric test features.
df_test_num.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,-0.191842,-0.010291,-0.008227,-0.019037
1,-0.191738,-0.008454,-0.008,-0.016361
2,-0.191842,-0.009924,-0.008098,-0.018921
3,-0.191842,-0.009924,-0.00813,-0.018974
4,-0.185474,-0.009924,-0.008232,-0.018491


In [70]:
# Largest Duration value in the scaled test features.
df_test_num['Duration'].max()

7.679651172043547

In [71]:
# Categorical feature columns of the test set, in the order the encoder was fit on.
df_test_cat = df_test.loc[:, ['Protocol', 'Direction', 'SrcTypeService', 'DestTypeService']]

In [72]:
# Use -1 as the "missing" category, as was done for the training features.
df_test_cat = df_test_cat.replace(to_replace=[np.nan], value=-1)

In [73]:
# One-hot encode the test categories with the already-fitted `encoder`, and
# give the result the same column names as the training frame.
encoded_test = encoder.transform(df_test_cat).toarray()
df_test_cat = pd.DataFrame(
    encoded_test,
    index=df_test_cat.index,
    columns=features_names,
)

In [74]:
# Row count per raw Label value in the test set, most frequent first.
df_test['Label'].value_counts()

flow=From-Botnet-V50-5-UDP-DNS                                               52060
flow=From-Botnet-V50-9-UDP-DNS                                               49825
flow=From-Botnet-V50-3-UDP-DNS                                               45716
flow=From-Botnet-V50-2-UDP-DNS                                               43844
flow=From-Botnet-V50-8-UDP-DNS                                               28569
                                                                             ...  
flow=From-Botnet-V50-4-TCP-Established-HTTP-Binary-Download-1                    1
flow=From-Botnet-V50-7-UDP-Custom-Encryption-1                                   1
flow=From-Botnet-V50-7-TCP-Established-HTTP-Ad-63                                1
flow=From-Botnet-V50-7-TCP-Established-HTTP-Binary-Download-Custom-Port-5        1
flow=From-Botnet-V50-1-TCP-Established-HTTP-Binary-Download-Custom-Port-7        1
Name: Label, Length: 900, dtype: int64

In [75]:
# Binary target column "Normal" for the test set: 0 when the raw Label
# contains "Botnet" (case-insensitive), 1 for every other label.
# The frame reuses df_test's index explicitly. `pd.concat(..., axis=1)`
# aligns rows on index, so a fresh 0..n-1 index would only line up with the
# feature frames as long as df_test itself still has a default index.
new_label_test = pd.DataFrame(
    data=np.where(df_test['Label'].str.contains("Botnet", case=False), 0, 1),
    columns=["Normal"],
    index=df_test.index,
)

In [78]:
# Assemble the final test table: scaled numeric features, one-hot
# categorical features and the binary target, side by side.
test_parts = [df_test_num, df_test_cat, new_label_test]
test_data = pd.concat(test_parts, axis='columns')

In [79]:
# Write the processed test table to disk as CSV, without the row index.
test_data.to_csv(
    '../../ml-data/cyberattack_detection/2_processed_data/A2_test_data.csv',
    sep=',',
    index=False,
)