# Import Common Package

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from jcopml.utils import save_model, load_model

# Import Data

In [2]:
import sklearn
print('sklearn = {}',sklearn.__version__)
np.__version__
pd.__version__
sns.__version__

sklearn = {} 1.0.2


'0.11.2'

In [3]:
df = pd.read_csv('./nsl-kdd/KDDTrain+.txt')
df.head()

Unnamed: 0,0,tcp,ftp_data,SF,491,0.1,0.2,0.3,0.4,0.5,...,0.17,0.03,0.17.1,0.00.6,0.00.7,0.00.8,0.05,0.00.9,normal,20
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


# Set Column for the Dataset

In [4]:
columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot'
,'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations'
,'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate'
,'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count'
,'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate'
,'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','outcome','level'])

## Normalization for Outcome

In [5]:
df.columns = columns
df.drop(columns=['flag','land','wrong_fragment','urgent','hot'
,'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations'
,'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate'
,'srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count'
,'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate'
,'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate', 'level'], inplace=True)
df.loc[df['outcome'] == "normal", "outcome"] = 0
df.loc[df['outcome'] != 0, "outcome"] = 1

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
df.head(100)

Unnamed: 0,duration,protocol_type,service,src_bytes,dst_bytes,outcome
0,0,udp,other,146,0,0
1,0,tcp,private,0,0,1
2,0,tcp,http,232,8153,0
3,0,tcp,http,199,420,0
4,0,tcp,private,0,0,1
5,0,tcp,private,0,0,1
6,0,tcp,private,0,0,1
7,0,tcp,remote_job,0,0,1
8,0,tcp,private,0,0,1
9,0,tcp,private,0,0,1


# Dataset Splitting

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df.drop(columns="outcome")
y = df.outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

y_train = y_train.astype('int')
y_test = y_test.astype('int')
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((100777, 5), (25195, 5), (100777,), (25195,))

# Data Preprocessing 

In [9]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, PolynomialFeatures

## Create Pipeline for Scaler and Imputer

In [10]:
num_pip = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
    ("poly", PolynomialFeatures())
])

cat_pip = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown='ignore'))
])

## Add column into pipeline

In [11]:
from sklearn.compose import ColumnTransformer

In [12]:
cat_cols = ['protocol_type','service']
num_cols = ['duration', 'src_bytes', 'dst_bytes']

preprocessor = ColumnTransformer([
        ("numeric", num_pip, num_cols),
        ("categoric", cat_pip, cat_cols)
    ])


# Training

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [14]:
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', RandomForestClassifier(n_jobs=-1, random_state=42))
])

model = RandomizedSearchCV(pipeline, rsp.rf_poly_params, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)

In [15]:
rsp.rf_poly_params

{'prep__numeric__poly__degree': Integer(low=1, high=3),
 'prep__numeric__poly__interaction_only': [True, False],
 'algo__n_estimators': Integer(low=100, high=200),
 'algo__max_depth': Integer(low=20, high=80),
 'algo__max_features': Real(low=0.1, high=1, prior='uniform'),
 'algo__min_samples_leaf': Integer(low=1, high=20)}

In [16]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('prep',
                                              ColumnTransformer(transformers=[('numeric',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer()),
                                                                                               ('scaler',
                                                                                                MinMaxScaler()),
                                                                                               ('poly',
                                                                                                PolynomialFeatures())]),
                                                                               ['duration',
                                                                          

In [17]:
# print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_ ,model.score(X_test, y_test))

0.9536005239290711 0.9582741645841576 0.9521333597936098


# Model without Hyperparameter Tuning

In [18]:
model2 = pipeline

In [19]:
model2.fit(X_train, y_train)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   MinMaxScaler()),
                                                                  ('poly',
                                                                   PolynomialFeatures())]),
                                                  ['duration', 'src_bytes',
                                                   'dst_bytes']),
                                                 ('categoric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                               

In [20]:
  # print(model.best_params_)
print(model2.score(X_train, y_train), model2.score(X_test, y_test))

0.952925766792026 0.9517364556459615


In [21]:
save_model(model.best_estimator_, "rf_nsl_with_hyperparams_estimator.pkl")
save_model(model, "rf_nsl_with_hyperparams_model.pkl")
save_model(model2, "rf_nsl_without_hyperparams.pkl")

Model is pickled as model/rf_nsl_with_hyperparams_estimator.pkl
Model is pickled as model/rf_nsl_with_hyperparams_model.pkl
Model is pickled as model/rf_nsl_without_hyperparams.pkl
