In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Train data

In [2]:
# Column names are passed explicitly through `names`, so the first line of
# the file is read as data rather than as a header.
train_columns = [
    'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort', 'Direction',
    'DestIPAddress', 'DestPort', 'State', 'SrcTypeService', 'DestTypeService',
    'TotalPackets', 'TotalBytes', 'SourceBytes',
]
df_train = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/training_data',
    sep=',',
    names=train_columns,
)

In [3]:
# Flag every column that holds at least one missing value
df_train.isnull().any()

Timestamp          False
Duration           False
Protocol           False
SrcIPAddress       False
SrcPort             True
Direction          False
DestIPAddress      False
DestPort            True
State               True
SrcTypeService      True
DestTypeService     True
TotalPackets       False
TotalBytes         False
SourceBytes        False
dtype: bool

In [4]:
# Number of distinct (non-null) values in each column
df_train.nunique(axis=0)

Timestamp          13881753
Duration            3437518
Protocol                 14
SrcIPAddress          10811
SrcPort               97555
Direction                10
DestIPAddress        589070
DestPort              95865
State                   373
SrcTypeService            5
DestTypeService           4
TotalPackets          12327
TotalBytes           224601
SourceBytes           82174
dtype: int64

In [5]:
# Inspect the raw spellings of the 'Direction' field before cleaning it
df_train['Direction'].unique()

array(['->', '<->', '   ->', '  <?>', '  <->', '   ?>', '  <-', '  who',
       '  <?', '<?>'], dtype=object)

In [6]:
# Map the space-padded spellings onto their canonical symbols; the partial or
# unknown forms ('?>', 'who', '<?') are all folded into '<?>'.
dictionary = {
    '   ->': '->',
    '  <?>': '<?>',
    '  <->': '<->',
    '   ?>': '<?>',
    '  <-': '<-',
    '  who': '<?>',
    '  <?': '<?>',
}
df_train = df_train.replace(to_replace={"Direction": dictionary})
df_train['Direction'].value_counts()

<->    9055197
->     4605238
<-      146606
<?>      74994
Name: Direction, dtype: int64

In [7]:
# Temporary helper column: SourceBytes divided by TotalBytes.
# It is only used to fill in the '<?>' entries of the 'Direction' field.
df_train['ratio'] = df_train['SourceBytes'].div(df_train['TotalBytes'])

In [8]:
# Mean 'ratio' for each direction symbol, printed in the order ->, <->, <?>, <-
for symbol in ('->', '<->', '<?>', '<-'):
    print(df_train.loc[df_train['Direction'] == symbol, 'ratio'].mean())

0.5846801085715531
0.36214280955332806
0.6985354617227908
0.0


In [9]:
# Rows whose 'Direction' is '<-' usually have 'SourceBytes' equal to 0
df_train[df_train['Direction'].eq('<-')].head()

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,ratio
3507,2012/12/01 13:11:29.310824,0.000396,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,RED,,0.0,3,194,0,0.0
3639,2012/12/01 13:11:30.311807,0.00039,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,2,196,0,0.0
3749,2012/12/01 13:11:31.315594,1e-05,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0,0.0
3916,2012/12/01 13:11:32.315600,0.000108,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,196,0,0.0
4000,2012/12/01 13:11:33.325606,5e-06,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0,0.0


In [10]:
# Complete the '<?>' entries of 'Direction' using the 'ratio' column.
# The rules are applied in priority order and each row is rewritten at most once:
#   1. SourceBytes == 0        -> '<-'
#   2. ratio > 0.5             -> '->'
#   3. ratio <= 0.5            -> '<->'
unresolved = df_train['Direction'] == '<?>'
no_source_bytes = unresolved & (df_train['SourceBytes'] == 0)
mostly_source = unresolved & ~no_source_bytes & (df_train['ratio'] > 0.5)
mostly_dest = unresolved & ~no_source_bytes & (df_train['ratio'] <= 0.5)

df_train.loc[no_source_bytes, 'Direction'] = '<-'
df_train.loc[mostly_source, 'Direction'] = '->'
df_train.loc[mostly_dest, 'Direction'] = '<->'

In [11]:
# The helper column is no longer needed once 'Direction' has been completed.
# `columns=` replaces the positional axis argument, which newer pandas
# versions reject (only `labels` may be passed positionally).
df_train = df_train.drop(columns='ratio')
df_train['Direction'].value_counts()

<->    9074226
->     4659678
<-      148131
Name: Direction, dtype: int64

In [12]:
# Persist the frame with the completed 'Direction' field (index not written).
# NOTE(review): the cleaned file is written under the 1_raw_data directory — confirm this is intended.
df_train.to_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/clean_training_data',
    index=False,
    sep=',',
)

In [13]:
# Select the numerical features that will be scaled
df_train_num = df_train.loc[:, ['Duration', 'TotalPackets', 'TotalBytes', 'SourceBytes']]

In [14]:
# Fit the scaler on the raw training columns and keep the standardised values.
# The raw values are read back from df_train (using df_train_num's column
# names) so that re-running this cell does not refit the scaler on data that
# has already been scaled.
train_num_raw = df_train[list(df_train_num.columns)]
scaler = StandardScaler().fit(train_num_raw)
df_train_num = pd.DataFrame(
    scaler.transform(train_num_raw),
    columns=train_num_raw.columns,
    index=train_num_raw.index,
)

In [15]:
# Preview the first rows of the scaled numerical features
df_train_num.head(5)

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,-0.140416,-0.008024,-0.008615,-0.003897
1,-0.248682,-0.010103,-0.008919,-0.004151
2,-0.248682,-0.010363,-0.008918,-0.004149
3,-0.248682,-0.010363,-0.008917,-0.004151
4,-0.248682,-0.010623,-0.008919,-0.004149


In [16]:
# Categorical features with fewer than 20 distinct values are one-hot encoded
df_train_cat = df_train.loc[:, ['Protocol', 'Direction', 'SrcTypeService', 'DestTypeService']]

In [17]:
# Replace missing categories with an explicit -1 placeholder before encoding
df_train_cat = df_train_cat.replace(to_replace=[np.nan], value=-1)

In [18]:
# Fit the one-hot encoder on the training categories. With
# handle_unknown='ignore', a category not seen during fit is encoded as
# all zeros for that feature at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(df_train_cat)
# `get_feature_names` was removed from scikit-learn in favour of
# `get_feature_names_out`; fall back to the old name only where the new
# method does not exist.
_feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
features_names = _feature_namer(['Protocol', 'Direction', 'SrcTypeService', 'DestTypeService'])

In [19]:
# One-hot encode the training categories into a dense frame that keeps the original row index
encoded_train = encoder.transform(df_train_cat).toarray()
df_train_cat = pd.DataFrame(
    encoded_train,
    columns=features_names,
    index=df_train_cat.index,
)

In [20]:
# Finally, join the scaled numerical columns and the one-hot columns side by side
train_data = pd.concat([df_train_num, df_train_cat], axis='columns')
train_data.head(5)

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes,Protocol_arp,Protocol_esp,Protocol_gre,Protocol_icmp,Protocol_igmp,Protocol_ipv6,...,SrcTypeService_0.0,SrcTypeService_1.0,SrcTypeService_2.0,SrcTypeService_3.0,SrcTypeService_192.0,DestTypeService_-1.0,DestTypeService_0.0,DestTypeService_1.0,DestTypeService_2.0,DestTypeService_3.0
0,-0.140416,-0.008024,-0.008615,-0.003897,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.248682,-0.010103,-0.008919,-0.004151,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.248682,-0.010363,-0.008918,-0.004149,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.248682,-0.010363,-0.008917,-0.004151,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.248682,-0.010623,-0.008919,-0.004149,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [36]:
# Write the processed training features to disk (index not written)
train_data.to_csv(
    '../../ml-data/cyberattack_detection/2_processed_data/A1_train_data.csv',
    index=False,
    sep=',',
)

### Test data

In [53]:
# Column names are passed explicitly through `names`, so the first line of
# the file is read as data rather than as a header.
test_columns = [
    'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort', 'Direction',
    'DestIPAddress', 'DestPort', 'State', 'SrcTypeService', 'DestTypeService',
    'TotalPackets', 'TotalBytes', 'SourceBytes',
]
df_test = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/test_data',
    sep=',',
    names=test_columns,
)

In [39]:
# (rows, columns) of the test frame as loaded
df_test.shape

(1053845, 14)

In [40]:
# Clean the 'Direction' field with the same `dictionary` mapping applied to the training data
df_test = df_test.replace(to_replace={"Direction": dictionary})
df_test['Direction'].value_counts()

->     844228
<->    207315
<?>      2298
<-          4
Name: Direction, dtype: int64

In [41]:
# Temporary helper column: SourceBytes divided by TotalBytes.
# It is only used to fill in the '<?>' entries of the 'Direction' field.
df_test['ratio'] = df_test['SourceBytes'].div(df_test['TotalBytes'])

In [42]:
# Complete the '<?>' entries of 'Direction' using the 'ratio' column:
#   SourceBytes == 0 -> '<-', ratio > 0.5 -> '->', ratio <= 0.5 -> '<->'.
# Each statement re-tests for '<?>', so a row is rewritten only by the first
# rule that matches it.
df_test.loc[(df_test['SourceBytes'] == 0) & (df_test['Direction'] == '<?>'), 'Direction'] = '<-'
df_test.loc[(df_test['ratio'] > 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '->'
df_test.loc[(df_test['ratio'] <= 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '<->'
# Drop the helper column by keyword: newer pandas versions reject the
# positional axis argument (only `labels` may be passed positionally).
df_test = df_test.drop(columns='ratio')

In [43]:
# Persist the frame with the completed 'Direction' field (index not written).
# NOTE(review): the cleaned file is written under the 1_raw_data directory — confirm this is intended.
df_test.to_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/clean_test_data',
    index=False,
    sep=',',
)

In [44]:
# Scale the numerical features with the already-fitted `scaler`
test_num_raw = df_test.loc[:, ['Duration', 'TotalPackets', 'TotalBytes', 'SourceBytes']]
df_test_num = pd.DataFrame(
    scaler.transform(test_num_raw),
    columns=test_num_raw.columns,
    index=test_num_raw.index,
)

In [45]:
# Categorical features with fewer than 20 distinct values are one-hot encoded
df_test_cat = df_test.loc[:, ['Protocol', 'Direction', 'SrcTypeService', 'DestTypeService']]

In [46]:
# Replace missing categories with an explicit -1 placeholder before encoding
df_test_cat = df_test_cat.replace(to_replace=[np.nan], value=-1)

In [47]:
# One-hot encode the test categories with the already-fitted `encoder`,
# keeping the original row index
encoded_test = encoder.transform(df_test_cat).toarray()
df_test_cat = pd.DataFrame(
    encoded_test,
    columns=features_names,
    index=df_test_cat.index,
)

In [48]:
# Finally, join the scaled numerical columns and the one-hot columns side by side
test_data = pd.concat([df_test_num, df_test_cat], axis='columns')

In [49]:
# Write the processed test features to disk (index not written)
test_data.to_csv(
    '../../ml-data/cyberattack_detection/2_processed_data/A1_test_data.csv',
    index=False,
    sep=',',
)

In [51]:
# Validation data: same columns as the other files plus a trailing 'Label'.
# Column names are passed explicitly through `names`, so the first line of
# the file is read as data rather than as a header.
valid_columns = [
    'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort', 'Direction',
    'DestIPAddress', 'DestPort', 'State', 'SrcTypeService', 'DestTypeService',
    'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
]
df_valid = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/valid_data_with_labels',
    sep=',',
    names=valid_columns,
)

In [52]:
# (rows, columns) of the validation frame as loaded
df_valid.shape

(940062, 15)

End