# **Cyber Defense Attacks**

### *Libraries and functions*

In [16]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### *Data Origin*
<p>https://research.unsw.edu.au/projects/unsw-nb15-dataset</p>
<p>UNSW_NB15_training-set.csv</p>

In [17]:
training_set_df = pd.read_csv('data/UNSW_NB15_testing-set.csv');
print('Original Training Set', training_set_df.head);

Original Training Set <bound method NDFrame.head of           id       dur proto service state  spkts  dpkts  sbytes  dbytes  \
0          1  0.000011   udp       -   INT      2      0     496       0   
1          2  0.000008   udp       -   INT      2      0    1762       0   
2          3  0.000005   udp       -   INT      2      0    1068       0   
3          4  0.000006   udp       -   INT      2      0     900       0   
4          5  0.000010   udp       -   INT      2      0    2126       0   
...      ...       ...   ...     ...   ...    ...    ...     ...     ...   
82327  82328  0.000005   udp       -   INT      2      0     104       0   
82328  82329  1.106101   tcp       -   FIN     20      8   18062     354   
82329  82330  0.000000   arp       -   INT      1      0      46       0   
82330  82331  0.000000   arp       -   INT      1      0      46       0   
82331  82332  0.000009   udp       -   INT      2      0     104       0   

                rate  ...  ct_dst_s

<p>UNSW_NB15_testing-set.csv</p>

In [18]:
testing_set_df = pd.read_csv('data/UNSW_NB15_training-set.csv');
print('Original Testing Set', testing_set_df.head);

Original Testing Set <bound method NDFrame.head of             id       dur proto service state  spkts  dpkts  sbytes  dbytes  \
0            1  0.121478   tcp       -   FIN      6      4     258     172   
1            2  0.649902   tcp       -   FIN     14     38     734   42014   
2            3  1.623129   tcp       -   FIN      8     16     364   13186   
3            4  1.681642   tcp     ftp   FIN     12     12     628     770   
4            5  0.449454   tcp       -   FIN     10      6     534     268   
...        ...       ...   ...     ...   ...    ...    ...     ...     ...   
175336  175337  0.000009   udp     dns   INT      2      0     114       0   
175337  175338  0.505762   tcp       -   FIN     10      8     620     354   
175338  175339  0.000009   udp     dns   INT      2      0     114       0   
175339  175340  0.000009   udp     dns   INT      2      0     114       0   
175340  175341  0.000009   udp     dns   INT      2      0     114       0   

            

### *Compare column headings in each set*

In [19]:
unique_columns_training = training_set_df.columns.unique();
unique_columns_testing = testing_set_df.columns.unique();
print('training columns:', unique_columns_training, 'length:', len(unique_columns_training));
print('testing columns:', unique_columns_testing, 'length:', len(unique_columns_testing));
print('just in training:', [x for x in unique_columns_training if x not in unique_columns_testing], 'length:', len([x for x in unique_columns_training if x not in unique_columns_testing]));
print('just in testing:', [x for x in unique_columns_testing if x not in unique_columns_training], 'length:', len([x for x in unique_columns_testing if x not in unique_columns_training]));

training columns: Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object') length: 45
testing columns: Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 

### *Both files share the same categorical values.*

### *Remove unnecessary columns*

In [20]:
training_set_df.drop(['id', 'attack_cat'], axis=1, inplace=True);
testing_set_df.drop(['id', 'attack_cat'], axis=1, inplace=True);

### *Remove rows with missing values*

In [21]:
training_set_df.dropna(inplace=True);
print('Training Set after dropna', training_set_df.head);
print('Training set shape:', training_set_df.shape);


Training Set after dropna <bound method NDFrame.head of             dur proto service state  spkts  dpkts  sbytes  dbytes  \
0      0.000011   udp       -   INT      2      0     496       0   
1      0.000008   udp       -   INT      2      0    1762       0   
2      0.000005   udp       -   INT      2      0    1068       0   
3      0.000006   udp       -   INT      2      0     900       0   
4      0.000010   udp       -   INT      2      0    2126       0   
...         ...   ...     ...   ...    ...    ...     ...     ...   
82327  0.000005   udp       -   INT      2      0     104       0   
82328  1.106101   tcp       -   FIN     20      8   18062     354   
82329  0.000000   arp       -   INT      1      0      46       0   
82330  0.000000   arp       -   INT      1      0      46       0   
82331  0.000009   udp       -   INT      2      0     104       0   

                rate  sttl  ...  ct_src_dport_ltm  ct_dst_sport_ltm  \
0       90909.090200   254  ...             

In [22]:
testing_set_df.dropna(inplace=True);
print('Testing Set after dropna', testing_set_df.head);
print('Testing set shape:', testing_set_df.shape);

Testing Set after dropna <bound method NDFrame.head of              dur proto service state  spkts  dpkts  sbytes  dbytes  \
0       0.121478   tcp       -   FIN      6      4     258     172   
1       0.649902   tcp       -   FIN     14     38     734   42014   
2       1.623129   tcp       -   FIN      8     16     364   13186   
3       1.681642   tcp     ftp   FIN     12     12     628     770   
4       0.449454   tcp       -   FIN     10      6     534     268   
...          ...   ...     ...   ...    ...    ...     ...     ...   
175336  0.000009   udp     dns   INT      2      0     114       0   
175337  0.505762   tcp       -   FIN     10      8     620     354   
175338  0.000009   udp     dns   INT      2      0     114       0   
175339  0.000009   udp     dns   INT      2      0     114       0   
175340  0.000009   udp     dns   INT      2      0     114       0   

                 rate  sttl  ...  ct_src_dport_ltm  ct_dst_sport_ltm  \
0           74.087490   252  ...

### *Encode categorical features one-hot*

In [23]:
training_one_hot = training_set_df.copy();
testing_one_hot = testing_set_df.copy();
for column in training_one_hot.select_dtypes(include=['object']).columns: 
    encode_text_dummy(training_one_hot, column);
    encode_text_dummy(testing_one_hot, column);

  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy

### *Encode categorical features index*

In [24]:
training_index = training_set_df.copy();
testing_index = testing_set_df.copy();
for column in training_index.select_dtypes(include=['object']).columns: 
    encode_text_index(training_index, column);
    encode_text_index(testing_index, column);

### *Normalize numeric features*

In [25]:
for column in training_set_df.select_dtypes(include=['float64', 'int64']).columns:
    encode_numeric_zscore(training_one_hot, column);
    encode_numeric_zscore(testing_one_hot, column);
    encode_numeric_zscore(training_index, column);
    encode_numeric_zscore(testing_index, column);

### *Training one-hot set after encoding*

In [26]:
print(training_one_hot.head);

<bound method NDFrame.head of             dur     spkts     dpkts    sbytes    dbytes      rate      sttl  \
0     -0.213726 -0.124454 -0.151815 -0.043683 -0.087368  0.057181  0.719436   
1     -0.213727 -0.124454 -0.151815 -0.036308 -0.087368  0.286563  0.719436   
2     -0.213727 -0.124454 -0.151815 -0.040351 -0.087368  0.791205  0.719436   
3     -0.213727 -0.124454 -0.151815 -0.041330 -0.087368  0.566919  0.719436   
4     -0.213726 -0.124454 -0.151815 -0.034187 -0.087368  0.118349  0.719436   
...         ...       ...       ...       ...       ...       ...       ...   
82327 -0.213727 -0.124454 -0.151815 -0.045967 -0.087368  0.791205  0.719436   
82328  0.021090  0.009958 -0.082596  0.058657 -0.085031 -0.554342  0.719436   
82329 -0.213728 -0.131922 -0.151815 -0.046305 -0.087368 -0.554506 -1.782698   
82330 -0.213728 -0.131922 -0.151815 -0.046305 -0.087368 -0.554506 -1.782698   
82331 -0.213727 -0.124454 -0.151815 -0.045967 -0.087368  0.193111  0.719436   

           dttl     s

### *Testing one-hot set after encoding*

In [27]:
print(testing_one_hot.head);

<bound method NDFrame.head of              dur     spkts     dpkts    sbytes    dbytes      rate      sttl  \
0      -0.191028 -0.104456 -0.135768 -0.049133 -0.102725 -0.576370  0.703837   
1      -0.109484 -0.046013  0.172598 -0.046410  0.188544 -0.576343 -1.141898   
2       0.040699 -0.089845 -0.026933 -0.048527 -0.012133 -0.576732 -1.141898   
3       0.049729 -0.060624 -0.063212 -0.047016 -0.098562 -0.576735 -1.141898   
4      -0.140417 -0.075234 -0.117629 -0.047554 -0.102057 -0.576616  0.723266   
...          ...       ...       ...       ...       ...       ...       ...   
175336 -0.209773 -0.133677 -0.172047 -0.049957 -0.103923  0.094951  0.723266   
175337 -0.131727 -0.075234 -0.099490 -0.047062 -0.101458 -0.576614  0.723266   
175338 -0.209773 -0.133677 -0.172047 -0.049957 -0.103923  0.094951  0.723266   
175339 -0.209773 -0.133677 -0.172047 -0.049957 -0.103923  0.094951  0.723266   
175340 -0.209773 -0.133677 -0.172047 -0.049957 -0.103923  0.094951  0.723266   

         

### *Training index set after encoding*

In [28]:
print(training_index.head);

<bound method NDFrame.head of             dur  proto  service  state     spkts     dpkts    sbytes  \
0     -0.213726    117        0      4 -0.124454 -0.151815 -0.043683   
1     -0.213727    117        0      4 -0.124454 -0.151815 -0.036308   
2     -0.213727    117        0      4 -0.124454 -0.151815 -0.040351   
3     -0.213727    117        0      4 -0.124454 -0.151815 -0.041330   
4     -0.213726    117        0      4 -0.124454 -0.151815 -0.034187   
...         ...    ...      ...    ...       ...       ...       ...   
82327 -0.213727    117        0      4 -0.124454 -0.151815 -0.045967   
82328  0.021090    111        0      3  0.009958 -0.082596  0.058657   
82329 -0.213728      6        0      4 -0.131922 -0.151815 -0.046305   
82330 -0.213728      6        0      4 -0.131922 -0.151815 -0.046305   
82331 -0.213727    117        0      4 -0.124454 -0.151815 -0.045967   

         dbytes      rate      sttl  ...  ct_src_dport_ltm  ct_dst_sport_ltm  \
0     -0.087368  0.057181

### *Testing index set after encoding*

In [29]:
print(testing_index.head);

<bound method NDFrame.head of              dur  proto  service  state     spkts     dpkts    sbytes  \
0      -0.191028    113        0      2 -0.104456 -0.135768 -0.049133   
1      -0.109484    113        0      2 -0.046013  0.172598 -0.046410   
2       0.040699    113        0      2 -0.089845 -0.026933 -0.048527   
3       0.049729    113        3      2 -0.060624 -0.063212 -0.047016   
4      -0.140417    113        0      2 -0.075234 -0.117629 -0.047554   
...          ...    ...      ...    ...       ...       ...       ...   
175336 -0.209773    119        2      3 -0.133677 -0.172047 -0.049957   
175337 -0.131727    113        0      2 -0.075234 -0.099490 -0.047062   
175338 -0.209773    119        2      3 -0.133677 -0.172047 -0.049957   
175339 -0.209773    119        2      3 -0.133677 -0.172047 -0.049957   
175340 -0.209773    119        2      3 -0.133677 -0.172047 -0.049957   

          dbytes      rate      sttl  ...  ct_src_dport_ltm  ct_dst_sport_ltm  \
0      -0.10

### *Save dataframes*

In [30]:
training_one_hot.to_pickle('data/training_one_hot.pkl');
testing_one_hot.to_pickle('data/testing_one_hot.pkl');
training_index.to_pickle('data/training_index.pkl');
testing_index.to_pickle('data/testing_index.pkl');