# Anomaly Detection for UNSW NB-15 cyberattack dataset
Identifying cyberattacks can be considered both a classification problem and an anomaly detection problem. In this notebook, I treat it as an anomaly detection problem. Given that [the training and test csv's provided by the researchers](https://cloudstor.aarnet.edu.au/plus/index.php/s/2DhnLGDdEECo4ys?path=%2FUNSW-NB15%20-%20CSV%20Files%2Fa%20part%20of%20training%20and%20testing%20set) are balanced between the normal and anomaly classes, I'm going to down-sample to reduce the available number of anomalies.

In [20]:
# Custom modules
import data_prep as dp 
import model_abstraction as moda

# Data Structures
import pandas as pd
import numpy as np
import pickle as pkl

# Preprocessing or data manipulation methods
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Modeling methods and selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, StratifiedKFold

# Anomaly detection
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

# Model assessment
from sklearn.metrics import confusion_matrix, roc_auc_score

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Modeling with more developed features
The model selection process is similar to that of the previous notebook, however in this attempt the dataset is different. I've added more contextual features as one-hot-encoded variables, I've removed some features that were not useful in differentiating between attack and normal data, and have incorporated standard scaling for the numeric features that remain. 

In [12]:
# Load train and test data
X_train, y_train = dp.load_agg_Xy(path='./data/', sample_size=0.4, strat_cat='label')

# Reformat for anomaly detection labeling
y_train = y_train.apply(dp.y_anomaly_format)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
# Create holdout set to verify generalizability of results
X_train, X_hold, y_train, y_hold  = train_test_split(X_train,y_train, test_size = 0.25,
                                                     random_state = 42, stratify=y_train)

The type of anomaly detection methods used below start by training on normal data and building a profile. From there, they can be used to predict on unseen data.

In [14]:
## Create 'masks' to filter the dataframe by whether or not the observation belongs to normal or attack class.
train_normal = y_train==1

In [21]:
## Bring in column transformations to process contextual features and standardize numerical values
#with open('ct_ohe_ssc_xyagg.pkl', 'rb') as f:
#    col_trans = pkl.load(f)

# Define the individual steps
ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
ssc_step = ('std_sclr', StandardScaler())

# Make the step part of a pipeline
ohe_pipe = Pipeline([ohe_step])
ssc_pipe = Pipeline([ssc_step])

# Columns to transform: categorical columns for encoding, numeric feature columns for standardizing
ohe_cols = ['proto', 'state', 'service']
binary_cols = ['is_sm_ips_ports', 'is_ftp_login']
non_ssc_cols = ohe_cols+binary_cols
ssc_cols = [col for col in X_train.columns if col not in non_ssc_cols]

# Transformer input: tuple w/ contents ('name', SomeTransformer(Parameters), columns)
transformer = [
    ('one_hot_encoding', ohe_pipe, ohe_cols),
    ('standard_scaling', ssc_pipe, ssc_cols)
]
col_trans = ColumnTransformer(transformers=transformer, remainder='passthrough')

In [22]:
ilf = IsolationForest()
ilf.fit(col_trans.fit_transform(X_train[train_normal]))


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


IsolationForest(behaviour='old', bootstrap=False, contamination='legacy',
        max_features=1.0, max_samples='auto', n_estimators=100,
        n_jobs=None, random_state=None, verbose=0)

In [23]:
y_pred_nm = ilf.predict(col_trans.transform(X_train[train_normal]))
y_pred_out = ilf.predict(col_trans.transform(X_train[~train_normal]))
y_pred_train = ilf.predict(col_trans.transform(X_train))
y_pred_test = ilf.predict(col_trans.transform(X_hold))


  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


In [24]:
confusion_matrix(y_train[train_normal], y_pred_nm)

array([[     0,      0],
       [ 66555, 599074]])

In [25]:
confusion_matrix(y_train[~train_normal], y_pred_out)

array([[94344,  2041],
       [    0,     0]])

In [26]:
confusion_matrix(y_train, y_pred_train)

array([[ 94344,   2041],
       [ 66555, 599074]])

In [28]:
print(roc_auc_score(y_hold, y_pred_test))
confusion_matrix(y_hold, y_pred_test)

0.9395126924236064


array([[ 31484,    644],
       [ 22394, 199483]])

## Basic One Class SVM

In [None]:
ocsvm = OneClassSVM(kernel='rbf')
ocsvm.fit(col_trans.transform(X_train[train_normal]))
y_ocsvm_nm = ocsvm.fit_predict(col_trans.transform(X_train[train_normal]))
y_ocsvm_out = ocsvm.predict(col_trans.transform(X_train[~train_normal]))
y_ocsvm_train = ocsvm.predict(col_trans.transform(X_train))
y_ocsvm_hold = ocsvm.predict(col_trans.transform(X_hold))

  Xt = transform.transform(Xt)


In [None]:
confusion_matrix(y_train[train_normal], y_ocsvm_nm)

In [None]:
confusion_matrix(y_train[~train_normal], y_ocsvm_out)

In [None]:
confusion_matrix(y_train, y_ocsvm_train)

In [None]:
roc_auc_score(y_hold, y_ocsvm_hold)

## Eliptical Envelope

In [None]:
eenv = EllipticEnvelope(contamination=0.2, random_state=0)
eenv.fit(col_trans.transform(X_train[train_normal]))

In [None]:
roc_auc_score(y_test,eenv.predict(col_trans.transform(X_hold)))

# Local Outlier Factor

In [59]:
lof = LocalOutlierFactor()
y_pred = lof.fit_predict(X_train)



In [43]:
neg_scores = lof.negative_outlier_factor_
out_scores = (neg_scores.max()-neg_scores)/(neg_scores.max()-neg_scores.min())

In [53]:
pd.Series(neg_scores).describe()

count    1.315050e+05
mean    -7.703878e+12
std      2.633116e+14
min     -2.320057e+16
25%     -1.078605e+00
50%     -1.004495e+00
75%     -1.000000e+00
max     -8.769621e-01
dtype: float64

In [54]:
for lim in np.linspace(1e-5,1e1,10):
    print(roc_auc_score(y_train, list(map(y_anomaly_format, out_scores > lim))))

0.5005816494689044
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5


In [61]:
roc_auc_score(y_train, y_pred)

0.5393576376419513