## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

font = {'weight' : 'bold',
        'size'   : 14}



%matplotlib inline
# SET PATH
PATH = 'Dataset/NSL-KDD/'

In [7]:
import random
random.seed(1)

In [8]:
import time
import pickle as pkl
from collections import Counter
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, train_test_split

In [9]:
import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [10]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report

In [14]:
from sklearn.utils import shuffle

## Get dataframe and get it ready for training models

In [5]:
with open("ids_dataframe_nor.pkl", 'rb') as pkld:
    normal_df = pkl.load(pkld)
with open("ids_dataframe.pkl", 'rb') as pkld:
    main_df = pkl.load(pkld)

In [12]:
# 0=normal, 1=DoS, 2=Probe, 3=R2L and 4=U2R.
normal_df = main_df[main_df['label'] == 0]
dos_df = main_df[main_df['label'] == 1]
probe_df = main_df[main_df['label'] == 2]
r2l_df = main_df[main_df['label'] == 3]
u2r_df = main_df[main_df['label'] == 4]

In [23]:
n_dos_df = shuffle(pd.concat([normal_df, dos_df], axis=0))
n_dos_df.reset_index(drop=True, inplace=True)
n_probe_df = shuffle(pd.concat([normal_df, probe_df], axis=0))
n_probe_df.reset_index(drop=True, inplace=True)
n_r2l_df = shuffle(pd.concat([normal_df, r2l_df], axis=0))
n_r2l_df.reset_index(drop=True, inplace=True)
n_u2r_df = shuffle(pd.concat([normal_df, u2r_df], axis=0))
n_u2r_df.reset_index(drop=True, inplace=True)

In [28]:
targets_dos = n_dos_df["label"]
targets_n_probe = n_probe_df["label"]
targets_n_r2l = n_r2l_df["label"]
targets_n_u2r = n_u2r_df["label"]

In [29]:
n_dos_df.drop(columns=["label"], inplace=True)
n_probe_df.drop(columns=["label"], inplace=True)
n_r2l_df.drop(columns=["label"], inplace=True)
n_u2r_df.drop(columns=["label"], inplace=True)

## Integrate grid search with the model training to check for multiple algorithms

In [None]:
def skfold(X, y, model):
    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X, y)
    results = cross_val_score(model, X, y, cv=skf)
    return results


xgb = skfold(X, y, xgboost.XGBClassifier())
gnb = skfold(X, y, GaussianNB())
rf = skfold(X, y,RandomForestClassifier())
svc = skfold(X, y,SVC())
ada = skfold(X, y,AdaBoostClassifier())
knn = skfold(X, y,KNeighborsClassifier())
MLPc = skfold(X, y,MLPClassifier())
dic = {'XGB':xgb, 'RandomForest':rf,'SVC':svc,'GaussianNB':gnb,'adaBoost':ada,'KNN':knn, 'MLPc':MLPc}

## train model by grid search with hyperparameter tuning

In [30]:
def cv_hparam_tune(data, targets):
    # A parameter grid for XGBoost

    params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6, 7]
    }


    X_train, X_test, Y_train, Y_test = train_test_split(data, targets, test_size=0.20)


    xgb_ = xgboost.XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                                silent=True, nthread=1)


    folds = 5
    param_comb = 5

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)

    random_search = RandomizedSearchCV(xgb_, param_distributions=params, n_iter=param_comb,
                                    scoring='roc_auc', n_jobs=8, cv=skf.split(X_train, Y_train),
                                    verbose=3, random_state=1001
                                    )

    # Here we go
    random_search.fit(X_train, Y_train)
    print('\n Best estimator:')
    print(random_search.best_estimator_)
    print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' %
        (folds, param_comb))
    print(random_search.best_score_ * 2 - 1)
    print('\n Best hyperparameters:')
    print(random_search.best_params_)

"""    grid = GridSearchCV(estimator=xgb_, param_grid=params, scoring='roc_auc',
                        n_jobs=8, cv=skf.split(X_train, Y_train), verbose=3)
    grid.fit(X_train, Y_train)
    print('\n Best estimator:')
    print(grid.best_estimator_)
    print('\n Best score:')
    print(grid.best_score_ * 2 - 1)
    print('\n Best parameters:')
    print(grid.best_params_)
 """  

"    grid = GridSearchCV(estimator=xgb_, param_grid=params, scoring='roc_auc',\n                        n_jobs=8, cv=skf.split(X_train, Y_train), verbose=3)\n    grid.fit(X_train, Y_train)\n    print('\n Best estimator:')\n    print(grid.best_estimator_)\n    print('\n Best score:')\n    print(grid.best_score_ * 2 - 1)\n    print('\n Best parameters:')\n    print(grid.best_params_)\n "

In [32]:
cv_hparam_tune(n_dos_df, targets_dos)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  19 out of  25 | elapsed: 10.8min remaining:  3.4min
[Parallel(n_jobs=8)]: Done  25 out of  25 | elapsed: 13.3min finished



 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.5, learning_rate=0.02,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=600, n_jobs=1, nthread=1, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1.0)

 Best normalized gini score for 5-fold search with 5 parameter combinations:
0.9999986899617652

 Best hyperparameters:
{'subsample': 1.0, 'min_child_weight': 1, 'max_depth': 3, 'gamma': 0.5, 'colsample_bytree': 0.8}
