In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Train data

In [2]:
# Column labels applied to the raw flow file, in file order. Because `names`
# is passed explicitly, pandas treats the first line of the file as data.
train_columns = ['Timestamp','Duration','Protocol','SrcIPAddress','SrcPort','Direction',
                 'DestIPAddress','DestPort','State','SrcTypeService','DestTypeService',
                 'TotalPackets','TotalBytes','SourceBytes','Label']
train_path = '../../ml-data/cyberattack_detection/1_raw_data/A2_2/training_data_with_labels'
# Load the labelled training flows into a single frame.
df_train = pd.read_csv(train_path, sep=',', names=train_columns)

In [27]:
# Per-column flag: True when the column holds at least one missing value.
# NOTE(review): the recorded output lists `ratio`, a column that is only
# created further down, so this cell was last executed out of order.
df_train.isna().any()

Timestamp          False
Duration           False
Protocol           False
SrcIPAddress       False
SrcPort             True
Direction          False
DestIPAddress      False
DestPort            True
State              False
SrcTypeService      True
DestTypeService     True
TotalPackets       False
TotalBytes         False
SourceBytes        False
Label              False
ratio              False
dtype: bool

In [None]:
# Number of distinct values in each column.
df_train.nunique()

In [None]:
# List the distinct Direction tokens exactly as they were read from the file.
df_train.Direction.unique()

In [3]:
# Raw (space-padded) Direction token -> canonical token.
# The partial or unrecognised tokens ('?>', 'who', '<?') are all folded into
# the '<?>' marker.
_direction_pairs = (
    ('   ->', '->'),
    ('  <?>', '<?>'),
    ('  <->', '<->'),
    ('   ?>', '<?>'),
    ('  <-', '<-'),
    ('  who', '<?>'),
    ('  <?', '<?>'),
)
dictionary = dict(_direction_pairs)

In [4]:
# Rewrite the Direction column (and only that column) through the token mapping.
direction_fixes = {"Direction": dictionary}
df_train = df_train.replace(to_replace=direction_fixes)

In [5]:
# Row count per Direction token after applying the mapping.
df_train['Direction'].value_counts()

<->    1370635
->      601333
<-        3250
<?>       2949
Name: Direction, dtype: int64

In [None]:
# Distinct Direction tokens currently present in the frame.
df_train.Direction.unique()

In [6]:
# Share of each flow's bytes that came from the source side
# (SourceBytes divided element-wise by TotalBytes).
df_train['ratio'] = df_train['SourceBytes'].div(df_train['TotalBytes'])

In [None]:
# Mean source-byte ratio for each Direction token, printed one per line
# in the order '->', '<->', '<?>', '<-'.
for token in ('->', '<->', '<?>', '<-'):
    rows_for_token = df_train.loc[df_train['Direction'] == token]
    print(rows_for_token['ratio'].mean())

In [None]:
# Preview the first rows whose Direction is '<-'.
df_train.loc[df_train['Direction'] == '<-'].head()

In [7]:
# Re-label rows whose Direction is '<?>' from their byte counts.
# The rules run in order, and each rule only touches rows that are *still*
# '<?>' at that point, so an earlier rule wins over a later one:
#   1. SourceBytes == 0  -> '<-'
#   2. ratio > 0.5       -> '->'
#   3. ratio <= 0.5      -> '<->'
source_is_zero = df_train['SourceBytes'] == 0
ratio_above_half = df_train['ratio'] > 0.5
ratio_at_most_half = df_train['ratio'] <= 0.5
for rule_mask, resolved_token in (
    (source_is_zero, '<-'),
    (ratio_above_half, '->'),
    (ratio_at_most_half, '<->'),
):
    # Recomputed every pass: rows resolved by a previous rule must be excluded.
    still_unknown = df_train['Direction'] == '<?>'
    df_train.loc[rule_mask & still_unknown, 'Direction'] = resolved_token

In [8]:
# Row count per Direction token after the '<?>' rows have been re-labelled.
df_train['Direction'].value_counts()

<->    1372078
->      602839
<-        3250
Name: Direction, dtype: int64

In [9]:
# Remove the helper `ratio` column now that Direction has been resolved.
# `drop` returns a new frame, so the result must be assigned back; otherwise
# the column silently stays in `df_train`.
# `columns=` replaces the positional axis argument, which pandas 2.x rejects.
# `errors='ignore'` keeps the cell re-runnable once the column is gone.
df_train = df_train.drop(columns='ratio', errors='ignore')
df_train

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,Label
0,2011/08/17 11:48:29.395153,0.000318,udp,147.124.93.156,1025,<->,147.32.89.100,53,CON,0.0,0.0,3,195,62,flow=From-Botnet-V50-1-UDP-DNS
1,2011/08/17 11:48:31.525252,0.039559,tcp,147.124.93.156,1027,->,74.125.241.185,80,SRPA_SPA,0.0,0.0,7,858,607,flow=From-Botnet-V50-1-TCP-HTTP-Google-Net-Est...
2,2011/08/17 11:48:31.877057,0.000423,udp,147.124.93.156,1025,<->,147.32.89.100,53,CON,0.0,0.0,2,560,90,flow=From-Botnet-V50-1-UDP-DNS
3,2011/08/17 11:48:34.224708,2.990280,udp,147.124.93.156,123,->,65.55.65.31,123,INT,0.0,,3,175,183,flow=From-Botnet-V50-1-UDP-Attempt
4,2011/08/17 11:48:34.767425,0.000541,udp,147.124.93.156,1025,<->,147.32.89.100,53,CON,0.0,0.0,2,472,77,flow=From-Botnet-V50-1-UDP-DNS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1978162,2011/08/17 17:23:16.042445,0.316720,udp,147.72.101.192,1568,<->,147.32.97.8,53,CON,0.0,0.0,3,233,80,flow=From-Botnet-V50-10-UDP-DNS
1978163,2011/08/17 17:23:16.322385,0.000000,tcp,147.72.101.192,1170,->,117.53.131.2,25,S_,0.0,,1,60,60,flow=From-Botnet-V50-10-TCP-Attempt-SPAM
1978164,2011/08/17 17:23:16.325767,0.954443,tcp,147.72.101.192,1171,->,46.4.53.103,443,S_RA,0.0,0.0,6,364,181,flow=From-Botnet-V50-10-TCP-Attempt
1978165,2011/08/17 17:23:17.214008,0.000000,tcp,147.72.101.192,1172,->,211.43.214.125,25,S_,0.0,,2,59,60,flow=From-Botnet-V50-10-TCP-Attempt-SPAM


In [11]:
# Binary target column "Normal": 0 when the Label text contains "Botnet"
# (case-insensitive), 1 otherwise.
label_mentions_botnet = df_train['Label'].str.contains("Botnet", case=False)
normal_flag = np.where(label_mentions_botnet, 0, 1)
new_label = pd.DataFrame(data=normal_flag, columns=["Normal"])

In [12]:
# Keep only the endpoint addresses, the cleaned Direction and the raw
# duration / packet / byte counters.
selected_columns = ['SrcIPAddress', 'DestIPAddress', 'Direction', 'Duration',
                    'TotalPackets', 'TotalBytes', 'SourceBytes']
df_train_sel = df_train[selected_columns]


In [13]:
# Append the "Normal" target as an extra column (aligned on the row index).
df_train_sel = pd.concat(
    [df_train_sel, new_label],
    axis='columns',
)

In [23]:
# Preview the selected features together with the target.
# NOTE(review): the recorded output already shows PacketsBySec,
# SourceBytesBySec and TotalBytesBySec, which are only created in the
# following cell; a top-to-bottom run will not display them here.
df_train_sel.head()

Unnamed: 0,SrcIPAddress,DestIPAddress,Direction,Duration,TotalPackets,TotalBytes,SourceBytes,Normal,PacketsBySec,SourceBytesBySec,TotalBytesBySec
0,147.124.93.156,147.32.89.100,<->,0.000318,3,195,62,0,9433.962264,194968.553459,613207.5
1,147.124.93.156,74.125.241.185,->,0.039559,7,858,607,0,176.950883,15344.169468,21689.12
2,147.124.93.156,147.32.89.100,<->,0.000423,2,560,90,0,4728.132388,212765.957447,1323877.0
3,147.124.93.156,65.55.65.31,->,2.99028,3,175,183,0,1.003251,61.198282,58.52295
4,147.124.93.156,147.32.89.100,<->,0.000541,2,472,77,0,3696.857671,142329.020333,872458.4


In [19]:
# Derive per-second rates by dividing each counter by the flow Duration.
# A Duration of 0 produces inf (or NaN for 0/0) in the new columns.
rate_sources = (
    ('PacketsBySec', 'TotalPackets'),
    ('SourceBytesBySec', 'SourceBytes'),
    ('TotalBytesBySec', 'TotalBytes'),
)
for rate_name, counter_name in rate_sources:
    df_train_sel[rate_name] = df_train_sel[counter_name] / df_train_sel['Duration']

In [34]:
# Discard every flow that has a NaN or +/-inf in any column (these come from
# the per-second rates when Duration is 0).
# `axis` is passed by keyword: pandas 2.x made it keyword-only for
# DataFrame.any, so the positional form `.any(1)` raises TypeError there.
has_invalid_value = df_train_sel.isin([np.nan, np.inf, -np.inf]).any(axis=1)
df_train_fil = df_train_sel[~has_invalid_value]

In [35]:
# Aggregate to one row per source IP by averaging the numeric columns.
# `numeric_only=True` is required on pandas 2.x, where `mean()` no longer
# drops string columns (DestIPAddress, Direction) silently and raises instead.
# The resulting columns match the ones in the recorded output.
df_group = df_train_fil.groupby(['SrcIPAddress']).mean(numeric_only=True)

In [45]:
# Distribution of the per-IP "Normal" value.
# NOTE(review): the recorded counts were produced after the thresholding cell
# that follows (execution count 45 vs 44), so they show the thresholded values.
df_group['Normal'].value_counts()

1.0    70
0.0    48
Name: Normal, dtype: int64

In [44]:
# Turn the per-IP mean of the 0/1 flow flag into a hard binary label:
# below 0.5 -> 0.0, otherwise -> 1.0.
# The previous version only floored the "< 0.5" side, so an IP whose mean fell
# in [0.5, 1) kept a fractional label. Both sides are now set explicitly,
# and the cell is safe to re-run.
df_group['Normal'] = np.where(df_group['Normal'] < 0.5, 0.0, 1.0)

In [69]:
# Preview the per-source-IP aggregate (index: SrcIPAddress).
df_group.head()

Unnamed: 0_level_0,Duration,TotalPackets,TotalBytes,SourceBytes,Normal,PacketsBySec,SourceBytesBySec,TotalBytesBySec
SrcIPAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
124.232.153.174,0.104518,2.286232,149.264493,78.702899,1.0,13882.919749,431491.252983,833081.520768
147.102.96.196,35.829232,13.564209,9173.177942,1398.44855,0.0,1662.120009,48544.183819,237982.640615
147.115.98.191,15.056139,16.217323,11788.204134,1437.70748,0.0,1773.958664,52080.610326,257499.951242
147.119.98.190,36.46852,7.01024,2880.259186,917.122696,0.0,1635.948241,47796.257082,237380.888041
147.124.93.156,14.920804,5.567289,2039.830527,593.554574,0.0,1751.074834,50859.49735,255220.597559


In [70]:
# Persist the per-IP training table; the SrcIPAddress index is written as the
# first CSV column.
train_by_ip_path = '../../ml-data/cyberattack_detection/2_processed_data/A2_training_data_by_ip.csv'
df_group.to_csv(train_by_ip_path, sep=',')

### Test data

In [47]:
# Column labels applied to the raw test flow file, in file order. Because
# `names` is passed explicitly, pandas treats the first line of the file as data.
test_columns = ['Timestamp','Duration','Protocol','SrcIPAddress','SrcPort','Direction',
                'DestIPAddress','DestPort','State','SrcTypeService','DestTypeService',
                'TotalPackets','TotalBytes','SourceBytes','Label']
test_path = '../../ml-data/cyberattack_detection/1_raw_data/A2_2/test_data_with_labels'
# Load the labelled test flows into a single frame.
df_test = pd.read_csv(test_path, sep=',', names=test_columns)

In [48]:
# Rewrite the Direction column of the test frame through the same token
# mapping used for the training data.
test_direction_fixes = {"Direction": dictionary}
df_test = df_test.replace(to_replace=test_direction_fixes)

In [49]:
# Row count per Direction token in the test frame after applying the mapping.
df_test['Direction'].value_counts()

<->    313456
->     258225
<-       2468
<?>       234
Name: Direction, dtype: int64

In [50]:
# Share of each test flow's bytes that came from the source side
# (SourceBytes divided element-wise by TotalBytes).
df_test['ratio'] = df_test['SourceBytes'].div(df_test['TotalBytes'])

In [51]:
# Re-label test rows whose Direction is '<?>' from their byte counts.
# The rules run in order, and each rule only touches rows that are *still*
# '<?>' at that point, so an earlier rule wins over a later one:
#   1. SourceBytes == 0  -> '<-'
#   2. ratio > 0.5       -> '->'
#   3. ratio <= 0.5      -> '<->'
test_source_is_zero = df_test['SourceBytes'] == 0
test_ratio_above_half = df_test['ratio'] > 0.5
test_ratio_at_most_half = df_test['ratio'] <= 0.5
for rule_mask, resolved_token in (
    (test_source_is_zero, '<-'),
    (test_ratio_above_half, '->'),
    (test_ratio_at_most_half, '<->'),
):
    # Recomputed every pass: rows resolved by a previous rule must be excluded.
    test_still_unknown = df_test['Direction'] == '<?>'
    df_test.loc[rule_mask & test_still_unknown, 'Direction'] = resolved_token

In [52]:
# Binary target column "Normal" for the test flows: 0 when the Label text
# contains "Botnet" (case-insensitive), 1 otherwise.
test_label_mentions_botnet = df_test['Label'].str.contains("Botnet", case=False)
test_normal_flag = np.where(test_label_mentions_botnet, 0, 1)
new_label_test = pd.DataFrame(data=test_normal_flag, columns=["Normal"])

In [53]:
# Keep only the endpoint addresses, the cleaned Direction and the raw
# duration / packet / byte counters.
test_selected_columns = ['SrcIPAddress', 'DestIPAddress', 'Direction', 'Duration',
                         'TotalPackets', 'TotalBytes', 'SourceBytes']
df_test_sel = df_test[test_selected_columns]


In [54]:
# Append the "Normal" target as an extra column (aligned on the row index).
df_test_sel = pd.concat(
    [df_test_sel, new_label_test],
    axis='columns',
)

In [55]:
# Preview the selected test features together with the target.
df_test_sel.head()

Unnamed: 0,SrcIPAddress,DestIPAddress,Direction,Duration,TotalPackets,TotalBytes,SourceBytes,Normal
0,147.143.92.183,147.32.88.101,<->,0.000281,2,203,61,0
1,147.143.92.183,74.125.240.193,->,0.050073,7,861,659,0
2,147.143.92.183,147.32.88.101,<->,0.000455,3,575,87,0
3,147.143.92.183,147.32.88.101,<->,0.000402,3,484,75,0
4,147.143.92.183,65.55.64.32,->,3.049152,3,188,183,0


In [56]:
# Derive per-second rates by dividing each counter by the flow Duration.
# A Duration of 0 produces inf (or NaN for 0/0) in the new columns.
test_rate_sources = (
    ('PacketsBySec', 'TotalPackets'),
    ('SourceBytesBySec', 'SourceBytes'),
    ('TotalBytesBySec', 'TotalBytes'),
)
for rate_name, counter_name in test_rate_sources:
    df_test_sel[rate_name] = df_test_sel[counter_name] / df_test_sel['Duration']

In [59]:
# Discard every test flow that has a NaN or +/-inf in any column (these come
# from the per-second rates when Duration is 0).
# `axis` is passed by keyword: pandas 2.x made it keyword-only for
# DataFrame.any, so the positional form `.any(1)` raises TypeError there.
test_has_invalid_value = df_test_sel.isin([np.nan, np.inf, -np.inf]).any(axis=1)
df_test_fil = df_test_sel[~test_has_invalid_value]

In [61]:
# Aggregate to one row per source IP by averaging the numeric columns.
# `numeric_only=True` is required on pandas 2.x, where `mean()` no longer
# drops string columns (DestIPAddress, Direction) silently and raises instead.
df_group_test = df_test_fil.groupby(['SrcIPAddress']).mean(numeric_only=True)

In [66]:
# Distribution of the per-IP "Normal" value in the test aggregate.
# NOTE(review): the recorded counts were produced after the thresholding cell
# that follows (execution count 66 vs 65), so they show the thresholded values.
df_group_test['Normal'].value_counts()

1.0    29
0.0    28
Name: Normal, dtype: int64

In [65]:
# Turn the per-IP mean of the 0/1 flow flag into a hard binary label:
# below 0.5 -> 0.0, otherwise -> 1.0.
# The previous version only floored the "< 0.5" side, so an IP whose mean fell
# in [0.5, 1) kept a fractional label. Both sides are now set explicitly,
# and the cell is safe to re-run.
df_group_test['Normal'] = np.where(df_group_test['Normal'] < 0.5, 0.0, 1.0)

In [72]:
# Persist the per-IP test table; the SrcIPAddress index is written as the
# first CSV column.
test_by_ip_path = '../../ml-data/cyberattack_detection/2_processed_data/A2_test_data_by_ip.csv'
df_group_test.to_csv(test_by_ip_path, sep=',')