In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Train data

In [2]:
# Load the raw training flows, labelling the columns explicitly via `names=`.
# NOTE(review): with `names=` and no `header=`, the first line of the file is
# read as data - confirm the file has no header row.
df_train = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_2/training_data_with_labels',
    sep=',',
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
    ],
)

In [25]:
# Binary target: 1 when the flow's Label mentions "Botnet" (case-insensitive), else 0.
# `na=False` makes a missing Label count as non-botnet. Without it `str.contains`
# returns NaN for such rows, and NaN is truthy inside `np.where`, so those rows
# would silently be labelled 1.
new_label = pd.DataFrame(
    data=np.where(
        df_train['Label'].str.contains("Botnet", case=False, na=False), 1, 0
    ),
    columns=["Botnet"],
)

In [26]:
# Keep only the endpoint, direction, duration and volume columns.
df_train_sel = df_train.loc[
    :,
    ['SrcIPAddress', 'DestIPAddress', 'Direction', 'Duration',
     'TotalPackets', 'TotalBytes', 'SourceBytes'],
]


In [27]:
# Attach the Botnet indicator as an extra column; concat lines the two frames
# up on their row index.
df_train_sel = pd.concat(objs=[df_train_sel, new_label], axis='columns')

In [28]:
# Preview the first five rows of the selected training columns.
df_train_sel.head(n=5)

Unnamed: 0,SrcIPAddress,DestIPAddress,Direction,Duration,TotalPackets,TotalBytes,SourceBytes,Botnet
0,147.124.93.156,147.32.89.100,<->,0.000318,3,195,62,1
1,147.124.93.156,74.125.241.185,->,0.039559,7,858,607,1
2,147.124.93.156,147.32.89.100,<->,0.000423,2,560,90,1
3,147.124.93.156,65.55.65.31,->,2.99028,3,175,183,1
4,147.124.93.156,147.32.89.100,<->,0.000541,2,472,77,1


In [29]:
# Per-flow rates: packet and byte counts divided by the flow Duration.
df_train_sel = df_train_sel.assign(
    PacketsBySec=lambda flows: flows['TotalPackets'] / flows['Duration'],
    SourceBytesBySec=lambda flows: flows['SourceBytes'] / flows['Duration'],
    TotalBytesBySec=lambda flows: flows['TotalBytes'] / flows['Duration'],
)

In [31]:
# Drop every row that holds NaN or +/-inf in any column.
# `axis=1` is passed by keyword because pandas 2.0 made the arguments of
# DataFrame.any keyword-only; the positional form `.any(1)` raises TypeError there.
df_train_filtered = df_train_sel[
    ~df_train_sel.isin([np.nan, np.inf, -np.inf]).any(axis=1)
]

In [36]:
# Average the numeric columns per source IP. The mean of the 0/1 Botnet column
# is the fraction of that IP's flows labelled botnet.
# `numeric_only=True` is explicit because pandas 2.0 raises TypeError when asked
# to average string columns, where older versions silently dropped them.
df_group = df_train_filtered.groupby(['SrcIPAddress']).mean(numeric_only=True)

In [37]:
# Count how many source IPs share each per-IP Botnet value.
df_group.loc[:, 'Botnet'].value_counts()

0.000000    70
1.000000    46
0.999937     1
0.999934     1
Name: Botnet, dtype: int64

In [38]:
# Collapse the per-IP botnet fraction into a strict 0.0/1.0 label: an IP is
# botnet when more than half of its flows are labelled botnet.
# Assigning the whole column also maps fractions in (0, 0.5] to 0.0; updating
# only the rows above the threshold would leave those as non-binary labels.
df_group['Botnet'] = (df_group['Botnet'] > 0.5).astype(float)

In [39]:
# Preview the first five per-IP aggregate rows.
df_group.head(n=5)

Unnamed: 0_level_0,Duration,TotalPackets,TotalBytes,SourceBytes,Botnet,PacketsBySec,SourceBytesBySec,TotalBytesBySec
SrcIPAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
124.232.153.174,0.104518,2.286232,149.264493,78.702899,0.0,13882.919749,431491.252983,833081.520768
147.102.96.196,35.829232,13.564209,9173.177942,1398.44855,1.0,1662.120009,48544.183819,237982.640615
147.115.98.191,15.056139,16.217323,11788.204134,1437.70748,1.0,1773.958664,52080.610326,257499.951242
147.119.98.190,36.46852,7.01024,2880.259186,917.122696,1.0,1635.948241,47796.257082,237380.888041
147.124.93.156,14.920804,5.567289,2039.830527,593.554574,1.0,1751.074834,50859.49735,255220.597559


In [40]:
# Persist the per-IP training aggregates as CSV.
df_group.to_csv(
    path_or_buf='../../ml-data/cyberattack_detection/2_processed_data/A2_training_data_by_ip.csv',
    sep=',',
)

### Test data

In [41]:
# Load the raw test flows, labelling the columns explicitly via `names=`.
# NOTE(review): with `names=` and no `header=`, the first line of the file is
# read as data - confirm the file has no header row.
df_test = pd.read_csv(
    '../../ml-data/cyberattack_detection/1_raw_data/A2_2/test_data_with_labels',
    sep=',',
    names=[
        'Timestamp', 'Duration', 'Protocol', 'SrcIPAddress', 'SrcPort',
        'Direction', 'DestIPAddress', 'DestPort', 'State',
        'SrcTypeService', 'DestTypeService',
        'TotalPackets', 'TotalBytes', 'SourceBytes', 'Label',
    ],
)

In [46]:
# Binary target: 1 when the flow's Label mentions "Botnet" (case-insensitive), else 0.
# `na=False` makes a missing Label count as non-botnet. Without it `str.contains`
# returns NaN for such rows, and NaN is truthy inside `np.where`, so those rows
# would silently be labelled 1.
new_label_test = pd.DataFrame(
    data=np.where(
        df_test['Label'].str.contains("Botnet", case=False, na=False), 1, 0
    ),
    columns=["Botnet"],
)

In [47]:
# Keep only the endpoint, direction, duration and volume columns.
df_test_sel = df_test.loc[
    :,
    ['SrcIPAddress', 'DestIPAddress', 'Direction', 'Duration',
     'TotalPackets', 'TotalBytes', 'SourceBytes'],
]


In [48]:
# Attach the Botnet indicator as an extra column; concat lines the two frames
# up on their row index.
df_test_sel = pd.concat(objs=[df_test_sel, new_label_test], axis='columns')

In [49]:
# Preview the first five rows of the selected test columns.
df_test_sel.head(n=5)

Unnamed: 0,SrcIPAddress,DestIPAddress,Direction,Duration,TotalPackets,TotalBytes,SourceBytes,Botnet
0,147.143.92.183,147.32.88.101,<->,0.000281,2,203,61,1
1,147.143.92.183,74.125.240.193,->,0.050073,7,861,659,1
2,147.143.92.183,147.32.88.101,<->,0.000455,3,575,87,1
3,147.143.92.183,147.32.88.101,<->,0.000402,3,484,75,1
4,147.143.92.183,65.55.64.32,->,3.049152,3,188,183,1


In [50]:
# Per-flow rates: packet and byte counts divided by the flow Duration.
df_test_sel = df_test_sel.assign(
    PacketsBySec=lambda flows: flows['TotalPackets'] / flows['Duration'],
    SourceBytesBySec=lambda flows: flows['SourceBytes'] / flows['Duration'],
    TotalBytesBySec=lambda flows: flows['TotalBytes'] / flows['Duration'],
)

In [51]:
# Drop every row that holds NaN or +/-inf in any column.
# `axis=1` is passed by keyword because pandas 2.0 made the arguments of
# DataFrame.any keyword-only; the positional form `.any(1)` raises TypeError there.
df_test_filtered = df_test_sel[
    ~df_test_sel.isin([np.nan, np.inf, -np.inf]).any(axis=1)
]

In [52]:
# Average the numeric columns per source IP. The mean of the 0/1 Botnet column
# is the fraction of that IP's flows labelled botnet.
# `numeric_only=True` is explicit because pandas 2.0 raises TypeError when asked
# to average string columns, where older versions silently dropped them.
df_group_test = df_test_filtered.groupby(['SrcIPAddress']).mean(numeric_only=True)

In [55]:
# Count how many source IPs share each per-IP Botnet value.
df_group_test.loc[:, 'Botnet'].value_counts()

0.0    29
1.0    28
Name: Botnet, dtype: int64

In [54]:
# Collapse the per-IP botnet fraction into a strict 0.0/1.0 label: an IP is
# botnet when more than half of its flows are labelled botnet.
# Assigning the whole column also maps fractions in (0, 0.5] to 0.0; updating
# only the rows above the threshold would leave those as non-binary labels.
df_group_test['Botnet'] = (df_group_test['Botnet'] > 0.5).astype(float)

In [56]:
# Persist the per-IP test aggregates as CSV.
df_group_test.to_csv(
    path_or_buf='../../ml-data/cyberattack_detection/2_processed_data/A2_test_data_by_ip.csv',
    sep=',',
)

End.