In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.style.use('ggplot')
# figure size in inches
rcParams['figure.figsize'] = 8, 4

sns.set(font_scale=1.5)

import joblib

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [12]:
path = '../data/all-data/'
df = pd.read_csv(f'{path}intrusion_detection_data_v1.csv')
df.shape

(928344, 70)

In [13]:
# Create train, test split
train, test = train_test_split(df, train_size=0.7, random_state=123, stratify=df.label.values)
print(f'Train size: {train.shape[0]}')
print(f'Test size: {test.shape[0]}')

# Split train and test into features and target.
X_train = train.iloc[:, :-2]
y_train = train['label']

X_test = test.iloc[:, :-2]
y_test = test['label']

Train size: 649840
Test size: 278504


#### KNN

In [14]:
# Define KNN model object.
knn_model = KNeighborsClassifier()

# Define scoring metrics.
score = 'accuracy'

# Define 10 fold CV object.
kfold = KFold(n_splits=2)

# Define grid of hyperparameter values.
# hyper_grid = {'knn__n_neighbors': range(1, 55, 10)}
hyper_grid = {'knn__n_neighbors': [50]}

# Remove near-zero variance features that are categorical.
nzv_encoder = VarianceThreshold(threshold=0.1)

# Center and scale (i.e., standardize) all numeric features.
scaler = StandardScaler()

# Perform dimension reduction by applying PCA to all numeric features.
pca = PCA(n_components=40)

# One-hot encode remaining categorical features.
encoder = OneHotEncoder(handle_unknown="ignore")

In [15]:
# Combine all steps into a preprocessing pipeline
preprocessor = ColumnTransformer(
  remainder="passthrough",
  transformers=[
  ("nzv_encode", nzv_encoder, selector(dtype_include="number")),
  ("std_encode", scaler, selector(dtype_include="number")),
  ("pca_encode", pca, selector(dtype_include="number")),
  ("one-hot", encoder, selector(dtype_include="object")),
  ])
preprocessor

ColumnTransformer(remainder='passthrough',
                  transformers=[('nzv_encode', VarianceThreshold(threshold=0.1),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B3AF0>),
                                ('std_encode', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B3FA0>),
                                ('pca_encode', PCA(n_components=40),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B3A00>),
                                ('one-hot',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B31C0>)])

In [20]:
model_pipeline = Pipeline(steps=[
  ("preprocessor", preprocessor),
  ("knn", KNeighborsClassifier(n_neighbors=50)),
])
model_pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nzv_encode',
                                                  VarianceThreshold(threshold=0.1),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B3AF0>),
                                                 ('std_encode',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B3FA0>),
                                                 ('pca_encode',
                                                  PCA(n_components=40),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001C5487B3A00>),
                                                 ('one-hot',
          

In [53]:
# Tune a knn model using grid search
grid_search = GridSearchCV(model_pipeline, hyper_grid, cv=kfold, scoring=score)
results = grid_search.fit(X_train, y_train)

In [55]:
results.best_score_

0.9813877262095285

In [56]:
# Save KNN trained model.
file_name = '../../app/assets/knn_model.sav'
joblib.dump(results, file_name)

['../../app/assets/knn_model.sav']

In [5]:
# Load trained model.
model_path = '../../app/assets/knn_model.sav'
loaded_model = joblib.load(model_path)

In [7]:
# Test the model.
y_pred = loaded_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      benign       0.99      0.99      0.99    150796
         bot       0.93      0.61      0.73       584
 brute_force       0.96      0.96      0.96      2745
        ddos       0.98      0.99      0.98     38404
         dos       0.98      0.98      0.98     58124
    portscan       0.99      0.99      0.99     27208
  web_attack       0.96      0.87      0.91       643

    accuracy                           0.99    278504
   macro avg       0.97      0.91      0.94    278504
weighted avg       0.99      0.99      0.99    278504



In [57]:
# Test model with new data.
test_data = '../data/all-data/test_data.csv'
test_data = pd.read_csv(test_data)

y_pred = results.predict(test_data)
y_pred

array(['benign', 'benign', 'benign', 'benign', 'benign', 'dos', 'benign',
       'benign', 'dos', 'benign'], dtype=object)