In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Train data

In [2]:
df_train = pd.read_csv('../ml-data/cyberattack_detection/A2_1/training_data', sep=','
                       ,names=['Timestamp','Duration','Protocol','SrcIPAddress','SrcPort','Direction',
                               'DestIPAddress','DestPort','State','SrcTypeService','DestTypeService',
                               'TotalPackets','TotalBytes','SourceBytes'])

In [3]:
df_train.isna().any()

Timestamp          False
Duration           False
Protocol           False
SrcIPAddress       False
SrcPort             True
Direction          False
DestIPAddress      False
DestPort            True
State               True
SrcTypeService      True
DestTypeService     True
TotalPackets       False
TotalBytes         False
SourceBytes        False
dtype: bool

In [4]:
df_train.nunique()

Timestamp          13881753
Duration            3437518
Protocol                 14
SrcIPAddress          10811
SrcPort               97555
Direction                10
DestIPAddress        589070
DestPort              95865
State                   373
SrcTypeService            5
DestTypeService           4
TotalPackets          12327
TotalBytes           224601
SourceBytes           82174
dtype: int64

In [5]:
df_train.Direction.unique()

array(['->', '<->', '   ->', '  <?>', '  <->', '   ?>', '  <-', '  who',
       '  <?', '<?>'], dtype=object)

In [3]:
dictionary = {'   ->':'->','  <?>':'<?>','  <->':'<->','   ?>':'<?>','  <-':'<-','  who':'<?>','  <?':'<?>'}

In [4]:
df_train = df_train.replace({"Direction": dictionary})

In [5]:
df_train['Direction'].value_counts()

<->    9055197
->     4605238
<-      146606
<?>      74994
Name: Direction, dtype: int64

In [9]:
df_train.Direction.unique()

array(['->', '<->', '<?>', '<-'], dtype=object)

In [6]:
df_train['ratio'] = df_train['SourceBytes']/df_train['TotalBytes']

In [11]:
print(df_train.loc[df_train['Direction'] == '->']['ratio'].mean())
print(df_train.loc[df_train['Direction'] == '<->']['ratio'].mean())
print(df_train.loc[df_train['Direction'] == '<?>']['ratio'].mean())
print(df_train.loc[df_train['Direction'] == '<-']['ratio'].mean())

0.5846801085715531
0.36214280955332806
0.6985354617227908
0.0


In [12]:
df_train.loc[df_train['Direction'] == '<-'].head()

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes,ratio
3507,2012/12/01 13:11:29.310824,0.000396,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,RED,,0.0,3,194,0,0.0
3639,2012/12/01 13:11:30.311807,0.00039,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,2,196,0,0.0
3749,2012/12/01 13:11:31.315594,1e-05,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0,0.0
3916,2012/12/01 13:11:32.315600,0.000108,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,196,0,0.0
4000,2012/12/01 13:11:33.325606,5e-06,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0,0.0


In [7]:
df_train.loc[(df_train['SourceBytes'] == 0) & (df_train['Direction'] == '<?>'), 'Direction'] = '<-'
df_train.loc[(df_train['ratio'] > 0.5) & (df_train['Direction'] == '<?>'), 'Direction'] = '->'
df_train.loc[(df_train['ratio'] <= 0.5) & (df_train['Direction'] == '<?>'), 'Direction'] = '<->'

In [8]:
df_train['Direction'].value_counts()

<->    9074226
->     4659678
<-      148131
Name: Direction, dtype: int64

In [11]:
df_train.drop('ratio',1)

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes
0,2012/12/01 03:25:11.124065,67.205656,tcp,181.74.92.157,4832,->,203.187.236.103,3389,FSPA_FSPA,0.0,0.0,11,1216,480
1,2012/12/01 03:25:23.962955,0.000000,tcp,181.74.92.157,4842,->,128.129.8.3,135,S_,0.0,,3,59,58
2,2012/12/01 03:25:23.989079,0.000000,tcp,181.74.92.157,4836,->,133.185.92.5,135,S_,0.0,,2,61,61
3,2012/12/01 03:25:23.994181,0.000000,tcp,181.74.92.157,4839,->,199.184.92.4,135,S_,0.0,,2,64,58
4,2012/12/01 03:25:24.028562,0.000000,tcp,181.74.92.157,4833,->,133.83.93.101,135,S_,0.0,,1,58,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13882030,2012/12/10 21:49:09.891002,3570.245907,tcp,138.190.92.2,6667,->,157.24.106.143,1027,PA_PA,0.0,0.0,101,7970,4375
13882031,2012/12/10 22:03:55.915950,3581.845639,tcp,138.190.92.2,6667,->,157.24.106.143,1027,PA_PA,0.0,0.0,98,7739,4196
13882032,2012/12/10 22:36:29.925603,3350.561915,tcp,138.190.92.2,6667,<->,157.24.106.143,1027,PA_PA,0.0,0.0,107,8253,3940
13882033,2012/12/10 22:51:15.908994,3433.603673,tcp,138.190.92.2,6667,->,157.24.106.143,1027,PA_PA,0.0,0.0,89,7479,4297


In [12]:
df_train.to_csv('../ml-data/cyberattack_detection/A2_1/clean_training_data', sep=',', index=False)

In [15]:
df_train_num = df_train[['Duration','TotalPackets','TotalBytes','SourceBytes']]

In [16]:
df_train_num.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,67.205656,11,1216,480
1,0.0,3,59,58
2,0.0,2,61,61
3,0.0,2,64,58
4,0.0,1,58,61


In [17]:
scaler = StandardScaler()
scaler.fit(df_train_num)
df_train_num = pd.DataFrame(scaler.transform(df_train_num), columns=df_train_num.columns, index=df_train_num.index)

In [18]:
df_train_num.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,-0.140416,-0.008024,-0.008615,-0.003897
1,-0.248682,-0.010103,-0.008919,-0.004151
2,-0.248682,-0.010363,-0.008918,-0.004149
3,-0.248682,-0.010363,-0.008917,-0.004151
4,-0.248682,-0.010623,-0.008919,-0.004149


In [20]:
print(df_train_num['Duration'].max())

6.095065256229643


In [21]:
df_train_cat = df_train[['Protocol','Direction','SrcTypeService','DestTypeService']]

In [22]:
df_train_cat = df_train_cat.replace([np.nan],-1)

In [23]:
df_train_cat.head()

Unnamed: 0,Protocol,Direction,SrcTypeService,DestTypeService
0,tcp,->,0.0,0.0
1,tcp,->,0.0,-1.0
2,tcp,->,0.0,-1.0
3,tcp,->,0.0,-1.0
4,tcp,->,0.0,-1.0


In [24]:
encoder = OneHotEncoder(handle_unknown = 'ignore')
encoder.fit(df_train_cat)
features_names = encoder.get_feature_names(['Protocol','Direction','SrcTypeService','DestTypeService'])

In [25]:
df_train_cat = pd.DataFrame(encoder.transform(df_train_cat).toarray(),columns=features_names,index=df_train_cat.index)

In [26]:
df_train_cat.columns

Index(['Protocol_arp', 'Protocol_esp', 'Protocol_gre', 'Protocol_icmp',
       'Protocol_igmp', 'Protocol_ipv6', 'Protocol_ipv6-icmp', 'Protocol_pim',
       'Protocol_rsvp', 'Protocol_rtcp', 'Protocol_rtp', 'Protocol_tcp',
       'Protocol_udp', 'Protocol_udt', 'Direction_->', 'Direction_<-',
       'Direction_<->', 'SrcTypeService_-1.0', 'SrcTypeService_0.0',
       'SrcTypeService_1.0', 'SrcTypeService_2.0', 'SrcTypeService_3.0',
       'SrcTypeService_192.0', 'DestTypeService_-1.0', 'DestTypeService_0.0',
       'DestTypeService_1.0', 'DestTypeService_2.0', 'DestTypeService_3.0'],
      dtype='object')

In [27]:
df_train_cat.head()

Unnamed: 0,Protocol_arp,Protocol_esp,Protocol_gre,Protocol_icmp,Protocol_igmp,Protocol_ipv6,Protocol_ipv6-icmp,Protocol_pim,Protocol_rsvp,Protocol_rtcp,...,SrcTypeService_0.0,SrcTypeService_1.0,SrcTypeService_2.0,SrcTypeService_3.0,SrcTypeService_192.0,DestTypeService_-1.0,DestTypeService_0.0,DestTypeService_1.0,DestTypeService_2.0,DestTypeService_3.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [29]:
train_data = pd.concat([df_train_num, df_train_cat], axis=1)

In [30]:
train_data.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes,Protocol_arp,Protocol_esp,Protocol_gre,Protocol_icmp,Protocol_igmp,Protocol_ipv6,...,SrcTypeService_0.0,SrcTypeService_1.0,SrcTypeService_2.0,SrcTypeService_3.0,SrcTypeService_192.0,DestTypeService_-1.0,DestTypeService_0.0,DestTypeService_1.0,DestTypeService_2.0,DestTypeService_3.0
0,-0.140416,-0.008024,-0.008615,-0.003897,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.248682,-0.010103,-0.008919,-0.004151,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.248682,-0.010363,-0.008918,-0.004149,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.248682,-0.010363,-0.008917,-0.004151,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.248682,-0.010623,-0.008919,-0.004149,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [46]:
train_data.to_csv('/Applications/Splunk/etc/apps/Splunk_ML_Toolkit/lookups/A1_train_data.csv', sep=',', index=False)

### Test data

In [32]:
df_test = pd.read_csv('../ml-data/cyberattack_detection/A2_1/test_data', sep=','
                       ,names=['Timestamp','Duration','Protocol','SrcIPAddress','SrcPort','Direction',
                               'DestIPAddress','DestPort','State','SrcTypeService','DestTypeService',
                               'TotalPackets','TotalBytes','SourceBytes'])
#dtype = {'SrcPort': str,'DestPort': str}

In [33]:
df_test = df_test.replace({"Direction": dictionary})

In [34]:
df_test['Direction'].value_counts()

->     844228
<->    207315
<?>      2298
<-          4
Name: Direction, dtype: int64

In [35]:
df_test['ratio'] = df_test['SourceBytes']/df_test['TotalBytes']

In [36]:
df_test.loc[(df_test['SourceBytes'] == 0) & (df_test['Direction'] == '<?>'), 'Direction'] = '<-'
df_test.loc[(df_test['ratio'] > 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '->'
df_test.loc[(df_test['ratio'] <= 0.5) & (df_test['Direction'] == '<?>'), 'Direction'] = '<->'

In [37]:
df_test_num = df_test[['Duration','TotalPackets','TotalBytes','SourceBytes']]

In [38]:
df_test_num = pd.DataFrame(scaler.transform(df_test_num), columns=df_test_num.columns, index=df_test_num.index)

In [39]:
df_test_num.head()

Unnamed: 0,Duration,TotalPackets,TotalBytes,SourceBytes
0,5.727052,0.010949,-0.007308,-0.001807
1,5.08317,0.00913,-0.007336,-0.001802
2,5.281135,0.011209,-0.007299,-0.00189
3,5.502734,0.013548,-0.006943,-0.001699
4,5.224429,0.00965,-0.00731,-0.002012


In [40]:
df_test_num['Duration'].max()

6.074527847619242

In [41]:
df_test_cat = df_test[['Protocol','Direction','SrcTypeService','DestTypeService']]

In [42]:
df_test_cat = df_test_cat.replace([np.nan],-1)

In [43]:
df_test_cat = pd.DataFrame(encoder.transform(df_test_cat).toarray(),columns=features_names,index=df_test_cat.index)

In [44]:
test_data = pd.concat([df_test_num, df_test_cat], axis=1)

In [45]:
test_data.to_csv('/Applications/Splunk/etc/apps/Splunk_ML_Toolkit/lookups/A1_test_data.csv', sep=',', index=False)

### Validation

In [48]:
df_val = pd.read_csv('../ml-data/cyberattack_detection/A2_1/valid_data', sep=','
                       ,names=['Timestamp','Duration','Protocol','SrcIPAddress','SrcPort','Direction',
                               'DestIPAddress','DestPort','State','SrcTypeService','DestTypeService',
                               'TotalPackets','TotalBytes','SourceBytes'])

In [49]:
df_val = df_val.replace({"Direction": dictionary})

In [50]:
df_val['Direction'].value_counts()

->     732810
<->    200970
<-       5585
<?>       697
Name: Direction, dtype: int64

In [51]:
df_val['ratio'] = df_val['SourceBytes']/df_val['TotalBytes']

In [52]:
df_val.loc[(df_val['SourceBytes'] == 0) & (df_val['Direction'] == '<?>'), 'Direction'] = '<-'
df_val.loc[(df_val['ratio'] > 0.5) & (df_val['Direction'] == '<?>'), 'Direction'] = '->'
df_val.loc[(df_val['ratio'] <= 0.5) & (df_val['Direction'] == '<?>'), 'Direction'] = '<->'

In [53]:
df_val_num = df_val[['Duration','TotalPackets','TotalBytes','SourceBytes']]

In [54]:
df_val_num = pd.DataFrame(scaler.transform(df_val_num), columns=df_val_num.columns, index=df_val_num.index)

In [58]:
df_val_num['Duration'].max()

6.108265538666062

In [55]:
df_val_cat = df_val[['Protocol','Direction','SrcTypeService','DestTypeService']]

In [56]:
df_val_cat = df_val_cat.replace([np.nan],-1)

In [57]:
df_val_cat = pd.DataFrame(encoder.transform(df_val_cat).toarray(),columns=features_names,index=df_val_cat.index)

In [59]:
df_val_cat.head()

Unnamed: 0,Protocol_arp,Protocol_esp,Protocol_gre,Protocol_icmp,Protocol_igmp,Protocol_ipv6,Protocol_ipv6-icmp,Protocol_pim,Protocol_rsvp,Protocol_rtcp,...,SrcTypeService_0.0,SrcTypeService_1.0,SrcTypeService_2.0,SrcTypeService_3.0,SrcTypeService_192.0,DestTypeService_-1.0,DestTypeService_0.0,DestTypeService_1.0,DestTypeService_2.0,DestTypeService_3.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [60]:
validation_data = pd.concat([df_val_num, df_val_cat], axis=1)

In [61]:
validation_data.to_csv('/Applications/Splunk/etc/apps/Splunk_ML_Toolkit/lookups/A1_validation_data.csv', sep=',', index=False)

In [11]:
training_data[training_data['SrcPort'].isnull()]

Unnamed: 0,Timestamp,Duration,Protocol,SrcIPAddress,SrcPort,Direction,DestIPAddress,DestPort,State,SrcTypeService,DestTypeService,TotalPackets,TotalBytes,SourceBytes
3064,2012/12/01 13:11:26.252055,3488.878144,pim,162.217.94.6,,->,150.184.7.6,,INT,0.0,,191,14239,13359
3507,2012/12/01 13:11:29.310824,0.000396,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,RED,,0.0,3,194,0
3639,2012/12/01 13:11:30.311807,0.000390,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,2,196,0
3749,2012/12/01 13:11:31.315594,0.000010,icmp,152.205.93.89,,<-,146.204.94.31,0xcb07,ECR,,0.0,3,193,0
3859,2012/12/01 13:11:31.736798,3473.586043,arp,169.177.94.33,,who,229.173.94.6,,CON,,,110,6921,3316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13878759,2012/12/10 15:09:47.345212,0.000000,igmp,162.217.94.6,,->,155.89.7.6,,INT,192.0,,1,60,61
13878767,2012/12/10 15:09:47.672333,0.000000,igmp,157.196.94.31,,->,202.190.9.243,,INT,0.0,,1,62,60
13878801,2012/12/10 15:09:48.279993,0.000000,igmp,174.114.94.3,,->,145.162.9.246,,INT,0.0,,2,60,57
13878802,2012/12/10 15:09:48.280647,0.000000,igmp,174.114.94.3,,->,192.67.8.15,,INT,0.0,,1,57,62
