In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.figure import Figure
from matplotlib.axes._axes import Axes

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from typing import List, Dict

# Ask scikit-learn to display estimators as diagrams when a cell evaluates to one (e.g. `clf`)
set_config(display="diagram")

# 1. Load Data

In [None]:
# Column names applied to the header-less KDD files: the feature columns
# followed by "label" and "difficulty".
colnames: List[str] = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "lnum_compromised", "lroot_shell", "lsu_attempted", "lnum_root",
    "lnum_file_creations", "lnum_shells", "lnum_access_files", "lnum_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

# The file has no header row, so the names above are supplied explicitly.
data: pd.DataFrame = pd.read_csv(
    "../databases/KDDTrain+.txt",
    header=None,
    names=colnames,
)

In [None]:
# Check column names, dataset size (number of rows and memory usage), column dtypes
# (number of columns per dtype) and non-null counts
data.info()

In [None]:
# Number of rows for each distinct value of the raw "label" column
data["label"].value_counts()

In [None]:
# Collapse the label into two classes: "normal" is kept, every other value becomes "attack"
data["label"] = np.where(data["label"] == "normal", "normal", "attack")

In [None]:
# Relative frequency (fractions summing to 1) of each value in the "label" column
data["label"].value_counts(normalize=True)

# 2. Exploratory Data Analysis

## 2.1. Categorical

In [None]:
# Row count per "protocol_type" value (transport layer protocols, as expected)
data["protocol_type"].value_counts()

In [None]:
def show_bar_plot(
    labels: np.ndarray,
    count: np.ndarray,
    title: str = "",
    xlabel: str = "",
    ylabel: str = "Count",
) -> None:
    """Draw a bar chart with one bar per label and show it.

    Args:
        labels: Category names, used as the x tick labels (one per bar).
        count: Bar heights, in the same order as ``labels``.
        title: Optional figure title; omitted when empty.
        xlabel: Optional x-axis label; omitted when empty.
        ylabel: Optional y-axis label; omitted when empty.

    Raises:
        ValueError: If ``labels`` and ``count`` have different lengths.
    """
    if len(labels) != len(count):
        raise ValueError(
            f"labels and count must have the same length, got {len(labels)} and {len(count)}"
        )

    positions: np.ndarray = np.arange(len(labels))

    # Draw on an explicit Axes so the chart never lands on a figure left open elsewhere
    _, ax = plt.subplots()
    ax.bar(positions, count)

    # One tick per bar, named after its category
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)

    if title:
        ax.set_title(title)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    plt.show()

In [None]:
# Count each protocol once, then split the result into category names and frequencies
protocol_counts: pd.Series = data["protocol_type"].value_counts()
protocols: np.ndarray = protocol_counts.index.values
protocol_frequency: np.ndarray = protocol_counts.values

show_bar_plot(protocols, protocol_frequency)

In [None]:
# Row count per "service" value
data["service"].value_counts()

In [None]:
# Row count per "flag" value
data["flag"].value_counts()

In [None]:
# Bar chart of the "flag" column. The counts are computed once and stored under
# flag-specific names so that the protocol variables are not silently overwritten
# with flag values.
flag_counts: pd.Series = data["flag"].value_counts()
flags: np.ndarray = flag_counts.index.values
flag_frequency: np.ndarray = flag_counts.values

show_bar_plot(flags, flag_frequency)

## 2.2. Numerical

In [None]:
# Summary statistics of the frame's numeric columns
data.describe()

In [None]:
# Histogram of dst_host_srv_count. plt.hist's `density` defaults to False, so the
# bar heights are raw counts.
dst_host_srv_values = data["dst_host_srv_count"].values
plt.hist(dst_host_srv_values)
plt.xlabel("Bins")
plt.ylabel("Count")
plt.show()

In [None]:
# Same histogram through the pandas plotting accessor
data["dst_host_srv_count"].plot.hist()

In [None]:
# Box plot of dst_host_srv_count. The y-axis carries the feature's values (not a
# count) and a single box needs no x tick, so the axes are labelled accordingly.
plt.boxplot(data["dst_host_srv_count"].values)
plt.ylabel("dst_host_srv_count")
plt.xticks([])
plt.title("Distribution of dst_host_srv_count")
plt.show()

## 2.3. Multivariate

In [None]:
def get_count(protocol: str, label: str) -> int:
    """Count the rows of ``data`` whose protocol_type and label match the arguments.

    Args:
        protocol: Value to match against the "protocol_type" column.
        label: Value to match against the "label" column.

    Returns:
        Number of matching rows.
    """
    return len(data[(data["protocol_type"] == protocol) & (data["label"] == label)])


protocols: np.ndarray = data["protocol_type"].value_counts().index.values
labels: np.ndarray = data["label"].value_counts().index.values

# At this point the label column holds the strings "normal" / "attack"
# (all attack types were collapsed into "attack" when the data was loaded).
normalCount: List[int] = [get_count(protocol, "normal") for protocol in protocols]
# Attack rows must be counted with the "attack" label; counting "normal" here would
# only stack a copy of the normal bars on top of themselves.
attackCount: List[int] = [get_count(protocol, "attack") for protocol in protocols]

ind: np.ndarray = np.arange(len(protocols))
width: float = 0.35

# Stacked bars: normal counts at the bottom, attack counts on top of them
fig: Figure = plt.figure()
ax: Axes = fig.add_axes([0,0,1,1])
ax.bar(ind, normalCount, width, color="r", label="Normal")
ax.bar(ind, attackCount, width, bottom=normalCount, color="b", label="Attack")
ax.set_ylabel("Count")
ax.set_title("Label by protocol")
ax.set_xticks(ind)
ax.set_xticklabels(protocols)
# Legend entries come from the `label=` of each bar container, so they cannot get out of order
ax.legend()

plt.show()

In [None]:
# Box plots of srv_count and dst_host_srv_count drawn on the same axes
temp: pd.DataFrame = data[["srv_count", "dst_host_srv_count"]].copy()

temp.plot.box()

In [None]:
# Column names, dtypes, non-null counts and memory usage of the current frame
data.info()

In [None]:
# Encode the label as 0 (normal) / 1 (attack) so it takes part in the correlation.
# The encoding is applied only while the column is still non-numeric: on an already
# encoded column `0 != "normal"` is True, so re-running the cell would otherwise
# turn every row into 1.
if not pd.api.types.is_numeric_dtype(data["label"]):
    data["label"] = (data["label"] != "normal").astype("int64")

plt.figure(figsize=(16, 16))

# Pairwise correlation between the numeric columns (the encoded label included)
corr: pd.DataFrame = data.select_dtypes(include=[float, int]).corr()

# The matrix is symmetric: mask the upper triangle (diagonal included) to avoid duplicates
mask: np.ndarray = np.triu(np.ones_like(corr, dtype=bool))
heatmap: Axes = sns.heatmap(corr, vmin=-1, mask=mask, vmax=1, cmap="BrBG")
heatmap.set_title("Correlation Heatmap", fontdict={"fontsize":12}, pad=12)

# 3. Preprocessing/Modeling Pipeline

In [None]:
# Separate the feature columns from the target column
X: pd.DataFrame = data.drop("label", axis=1)
y: np.ndarray = data["label"].values

In [None]:
# Seed for the stochastic parts of the pipeline, so that repeated runs give the same results
RANDOM_STATE: int = 42

# Columns scaled with MinMaxScaler
numerical_features: List[str] = [
    "duration","src_bytes","dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins","logged_in","lnum_compromised",
    "lroot_shell","lsu_attempted","lnum_root","lnum_file_creations","lnum_shells","lnum_access_files","lnum_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate","dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate",
    "dst_host_srv_rerror_rate"
]
# Columns one-hot encoded
categorical_features: List[str] = ["protocol_type", "service", "flag"]

# "difficulty" appears in neither list, so the ColumnTransformer below discards it
# through remainder="drop"; no explicit drop step is needed.

numerical_transformer: Pipeline = Pipeline(
    steps=[
        ("scaler", MinMaxScaler())
    ]
)

# handle_unknown="ignore": categories unseen during fit are encoded as all zeros
# instead of raising at prediction time
categorical_transformer: Pipeline = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# Apply numeric transformer (MinMaxScaler) to numeric features and categorical transformer (OneHotEncoder) to categorical features
# Drop remaining features, in this case, difficulty column
# sparse_threshold=0 forces a dense output: the one-hot block is sparse, and with the
# default threshold the stacked result may come out sparse depending on its density,
# which PCA may not accept.
preprocessor: ColumnTransformer = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
    sparse_threshold=0,
)

# Apply column preprocessing, reduce feature space with PCA and then apply KNN classifier
# PCA may select a randomized SVD solver; seeding it keeps the projection reproducible.
clf: Pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("dimreduction", PCA(random_state=RANDOM_STATE)),
        ("classifier", KNeighborsClassifier())
    ]
)

In [None]:
# Display the assembled pipeline (preprocessor -> dimreduction -> classifier)
clf

# 4. Hyperparameter tuning and Validation

In [None]:
# Hyperparameter search space; each key is "<pipeline step name>__<parameter name>"
param_grid: Dict = dict(
    dimreduction__n_components=[8, 16, 32],
    classifier__n_neighbors=[4, 8, 16, 32, 64],
    classifier__weights=["uniform", "distance"],
)

# Exhaustive grid search over the space above: 5-fold cross-validation,
# scored by accuracy, running 5 jobs in parallel
grid: GridSearchCV = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=5,
    verbose=2,
)

In [None]:
# Run the cross-validated grid search on the training features and labels
grid.fit(X, y)

In [None]:
# Hyperparameter combination selected by the grid search
grid.best_params_

In [None]:
# Validation accuracy of the selected combination (averaged over the cross-validation folds)
grid.best_score_

# 5. Evaluation

In [None]:
# Load the test file with the same column names; from here on `data` refers to the
# test set and no longer to the training set.
data: pd.DataFrame = pd.read_csv(
    "../databases/KDDTest+.txt",
    header=None,
    names=colnames,
)

In [None]:
# Encode the test labels as 0 (normal) / 1 (any attack type).
# The encoding is applied only while the column is still non-numeric: on an already
# encoded column `0 != "normal"` is True, so re-running the cell would otherwise
# turn every row into 1.
if not pd.api.types.is_numeric_dtype(data["label"]):
    data["label"] = (data["label"] != "normal").astype("int64")
y: np.ndarray = data["label"].values
X: pd.DataFrame = data.drop(columns=["label"])

In [None]:
# Predict the test labels with the fitted grid search object
y_pred: np.ndarray = grid.predict(X)

In [None]:
# Final evaluation: text report comparing the true test labels with the predictions.
# Class names are listed in label order (0 -> "Normal", 1 -> "Attack").
class_names: List[str] = ["Normal", "Attack"]
report: str = classification_report(y, y_pred, target_names=class_names)
print(report)