In [None]:
import pandas as pd
import numpy as np

### Train data

In [None]:
# Load the raw training flows. Column names are supplied explicitly, so every
# line of the file (including the first) is read as data.
df_train = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/training_data',
    sep=',',
    header=None,
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes',
    ],
)

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df_train.isna().any(axis=0)

In [None]:
# Number of distinct (non-missing) values in each column.
df_train.nunique(axis=0, dropna=True)

In [None]:
# Inspect the raw values of the 'Direction' field before cleaning it.
df_train['Direction'].unique()

In [None]:
# Mapping from the raw 'Direction' strings to their normalized form.
# Every variant that is not a plain '->', '<->' or '<-' is folded into '<?>'.
dictionary = {
    '   ->': '->',
    '  <?>': '<?>',
    '  <->': '<->',
    '   ?>': '<?>',
    '  <-': '<-',
    '  who': '<?>',
    '  <?': '<?>',
}
# Apply the mapping to the 'Direction' column only; other columns are untouched.
df_train = df_train.replace(to_replace={"Direction": dictionary})
# Show how many rows fall into each normalized direction.
df_train['Direction'].value_counts()

In [None]:
# The '<?>' values of 'Direction' are filled in further down. For that purpose a
# temporary 'ratio' column holds the share of the total bytes sent by the source.
df_train['ratio'] = df_train['SourceBytes'].div(df_train['TotalBytes'])

In [None]:
# Mean 'ratio' for each direction, printed one per line in the order
# '->', '<->', '<?>', '<-'.
print(
    *(
        df_train.loc[df_train['Direction'] == arrow, 'ratio'].mean()
        for arrow in ('->', '<->', '<?>', '<-')
    ),
    sep='\n',
)

In [None]:
# Rows whose 'Direction' is '<-' usually have 'SourceBytes' equal to 0;
# show the first few of them.
df_train[df_train['Direction'].eq('<-')].head(5)

In [None]:
# Rule derived from the 'ratio' column to fill in the unknown ('<?>') directions.
# The three assignments run in order and each one re-checks for '<?>', so a row
# rewritten by an earlier step is not touched by a later one:
#   1. source sent no bytes          -> '<-'
#   2. source sent more than half    -> '->'
#   3. source sent half or less      -> '<->'
df_train.loc[df_train['Direction'].eq('<?>') & df_train['SourceBytes'].eq(0), 'Direction'] = '<-'
df_train.loc[df_train['Direction'].eq('<?>') & df_train['ratio'].gt(0.5), 'Direction'] = '->'
df_train.loc[df_train['Direction'].eq('<?>') & df_train['ratio'].le(0.5), 'Direction'] = '<->'

In [None]:
# The temporary 'ratio' column is no longer needed once 'Direction' is filled in.
# The column is named via `columns=` because passing the axis positionally
# (drop('ratio', 1)) raises TypeError on pandas >= 2.0.
df_train = df_train.drop(columns='ratio')
# Distribution of 'Direction' after the '<?>' values have been resolved.
df_train['Direction'].value_counts()

In [None]:
# Per-second rates: each counter divided by the flow duration.
# NOTE: rows with Duration == 0 produce inf (or NaN for 0/0) in these columns.
df_train['PacketsSec'] = df_train['TotalPackets'].div(df_train['Duration'])
df_train['TotalBytesSec'] = df_train['TotalBytes'].div(df_train['Duration'])
df_train['SourceBytesSec'] = df_train['SourceBytes'].div(df_train['Duration'])

In [None]:
# Persist the cleaned training set as CSV, without the DataFrame index column.
df_train.to_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/clean_training_data',
    sep=',',
    index=False,
)

### Test data

In [None]:
# Load the raw test flows. Column names are supplied explicitly, so every
# line of the file (including the first) is read as data.
df_test = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/test_data',
    sep=',',
    header=None,
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes',
    ],
)

In [None]:
# Size of the test set as (number of rows, number of columns).
df_test.shape

In [None]:
# Normalize the 'Direction' field of the test set with the same mapping
# (`dictionary`) that was used for the training set.
df_test = df_test.replace(to_replace={"Direction": dictionary})
# Show how many rows fall into each normalized direction.
df_test['Direction'].value_counts()

In [None]:
# The '<?>' values of 'Direction' are filled in next. For that purpose a
# temporary 'ratio' column holds the share of the total bytes sent by the source.
df_test['ratio'] = df_test['SourceBytes'].div(df_test['TotalBytes'])

In [None]:
# Fill in the unknown ('<?>') directions of the test set using the 'ratio' column.
# The three assignments run in order and each one re-checks for '<?>', so a row
# rewritten by an earlier step is not touched by a later one:
#   1. source sent no bytes          -> '<-'
#   2. source sent more than half    -> '->'
#   3. source sent half or less      -> '<->'
df_test.loc[(df_test['SourceBytes'] == 0) & (df_test['Direction'] == '<?>'), 'Direction'] = '<-'
df_test.loc[(df_test['ratio'] > 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '->'
df_test.loc[(df_test['ratio'] <= 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '<->'
# Remove the temporary column. It is named via `columns=` because passing the
# axis positionally (drop('ratio', 1)) raises TypeError on pandas >= 2.0.
df_test = df_test.drop(columns='ratio')

In [None]:
# Per-second rates: each counter divided by the flow duration.
# NOTE: rows with Duration == 0 produce inf (or NaN for 0/0) in these columns.
df_test['PacketsSec'] = df_test['TotalPackets'].div(df_test['Duration'])
df_test['TotalBytesSec'] = df_test['TotalBytes'].div(df_test['Duration'])
df_test['SourceBytesSec'] = df_test['SourceBytes'].div(df_test['Duration'])

In [None]:
# Persist the cleaned test set as CSV, without the DataFrame index column.
df_test.to_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/clean_test_data',
    sep=',',
    index=False,
)

### Validation data

In [None]:
# Load the labelled validation flows. Same columns as the training/test files
# plus a trailing 'Label' column; names are supplied explicitly, so every line
# of the file (including the first) is read as data.
df_valid = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_1/valid_data_with_labels',
    sep=',',
    header=None,
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
    ],
)

In [None]:
# Per-second rates: each counter divided by the flow duration.
# NOTE: rows with Duration == 0 produce inf (or NaN for 0/0) in these columns.
# NOTE(review): unlike the train/test sections, 'Direction' is not normalized
# for the validation set in this notebook — confirm whether that is intended.
df_valid['PacketsSec'] = df_valid['TotalPackets'].div(df_valid['Duration'])
df_valid['TotalBytesSec'] = df_valid['TotalBytes'].div(df_valid['Duration'])
df_valid['SourceBytesSec'] = df_valid['SourceBytes'].div(df_valid['Duration'])

In [None]:
# Summary statistics of the validation set, leaving out rows whose 'PacketsSec'
# is +inf. Every statistic is rendered as text with the general ('g') format.
df_valid[df_valid['PacketsSec'].ne(np.inf)].describe().apply(
    lambda column: column.map(lambda value: f'{value:g}')
)

In [None]:
# Number of validation rows per label value (missing labels are not counted).
df_valid['Label'].value_counts(dropna=True)

In [None]:
# Summary statistics of the training set, leaving out rows whose 'PacketsSec'
# is +inf. Every statistic is rendered as text with the general ('g') format.
df_train[df_train['PacketsSec'].ne(np.inf)].describe().apply(
    lambda column: column.map(lambda value: f'{value:g}')
)

End