In [14]:
#| echo: false

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Ignore warnings.
import warnings
warnings.filterwarnings("ignore")

plt.style.use('ggplot')
# figure size in inches
rcParams['figure.figsize'] = 8, 4

sns.set(font_scale=1.5)

import joblib

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
#| echo: false

path = '../data/all-data/'
df = pd.read_csv(f'{path}intrusion_detection_data_v1.csv')

In [None]:
#| echo: false

# Create train, test split
train, test = train_test_split(df, train_size=0.7, random_state=123, stratify=df.label.values)
# print(f'Train size: {train.shape[0]}')
# print(f'Test size: {test.shape[0]}')

In [17]:
#| echo: false

# Split train and test into features and target.
X_train = train.iloc[:, :-2]
y_train = train['label']

X_test = test.iloc[:, :-2]
y_test = test['label']

In [18]:
#| echo: false

# Standarize numaric features.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
x_test_std = scaler.fit_transform(X_test)

# One-hot encode labels.
ohe = OneHotEncoder()
ohe.fit(y_train.values.reshape(-1, 1))
y_train_ohe = ohe.transform(y_train.values.reshape(-1, 1)).toarray()
y_test_ohe = ohe.transform(y_test.values.reshape(-1, 1)).toarray()

#### Logistic Regression

In [11]:
#| echo: false

# Define Logistic Regression model.
lr_model = LogisticRegression(solver='sag', dual=False, max_iter=1000)

# Train the model.
lr_model = lr_model.fit(X_train_std, y_train)

# Test the model.
y_pred = lr_model.predict(x_test_std)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

      benign       0.97      0.96      0.97    150796
         bot       1.00      0.02      0.04       584
 brute_force       0.91      0.45      0.60      2745
        ddos       0.87      1.00      0.93     38404
         dos       0.96      0.92      0.94     58124
    portscan       0.97      0.99      0.98     27208
  web_attack       0.71      0.03      0.07       643

    accuracy                           0.95    278504
   macro avg       0.91      0.63      0.65    278504
weighted avg       0.95      0.95      0.95    278504



#### Linear SVC

In [8]:
#| echo: false

# Define Linear SVC model object.
l_svm_model = LinearSVC(dual=False)

# Train the model.
l_svm_model.fit(X_train_std, y_train)

# Test the model.
y_pred = l_svm_model.predict(x_test_std)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

      benign       0.98      0.99      0.98    150796
         bot       0.25      0.02      0.04       584
 brute_force       0.95      0.96      0.96      2745
        ddos       0.97      1.00      0.98     38404
         dos       0.99      0.97      0.98     58124
    portscan       0.98      1.00      0.99     27208
  web_attack       0.17      0.04      0.07       643

    accuracy                           0.98    278504
   macro avg       0.76      0.71      0.71    278504
weighted avg       0.98      0.98      0.98    278504



#### KNN

In [6]:
#| echo: false

# Define KNN model object.
knn_model = KNeighborsClassifier(n_neighbors=50)

# Train the model.
knn_model.fit(X_train_std, y_train_ohe)

# Test the model.
y_pred = knn_model.predict(x_test_std)
print(classification_report(y_test_ohe, y_pred, target_names=ohe.categories_[0]))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    150796
           1       0.95      0.70      0.81       584
           2       0.99      0.97      0.98      2745
           3       1.00      0.99      0.99     38404
           4       0.99      1.00      0.99     58124
           5       1.00      1.00      1.00     27208
           6       0.94      0.89      0.91       643

   micro avg       0.99      0.99      0.99    278504
   macro avg       0.98      0.94      0.95    278504
weighted avg       0.99      0.99      0.99    278504
 samples avg       0.99      0.99      0.99    278504



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#| echo: false

# Save KNN trained model.
# file_name = '../../assets/model/knn_model.sav'
# joblib.dump(knn_model, file_name)