In [1]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Load the merged flow dataset. Free-text / categorical columns are read with
# the pandas "string" dtype; identifier-like and raw per-packet columns are
# then discarded before modelling.
df = pd.read_csv(
    "data/merged_and_edited_dataset.csv",
    dtype=dict.fromkeys(
        [
            "src_ip",
            "dst_ip",
            "client_fingerprint",
            "application_name",
            "application_category_name",
            "requested_server_name",
            "atk_type",
            "traffic_type",
        ],
        "string",
    ),
).drop(
    columns=[
        "Unnamed: 0",
        "server_fingerprint",
        "user_agent",
        "content_type",
        "src_ip",
        "dst_ip",
        "splt_direction",
        "splt_ps",
        "splt_piat_ms",
        "application_name",
        "application_category_name",
        "requested_server_name",
        "client_fingerprint",
    ]
)

# Feature matrix: everything except the row id and the two label columns.
X = df.drop(columns=['id', 'traffic_type', 'atk_type'])

# Target: the attack type, encoded as integer class indices.
y = LabelEncoder().fit_transform(df['atk_type'])

In [3]:
# Grid-search the number of PCA components fed to an XGBoost classifier.
#
# A single seed is shared by every component that accepts one, so that a
# fresh "Restart & Run All" reproduces the reported cross-validation scores
# (PCA may pick a randomized SVD solver depending on data shape / version).
SEED = 42

classifier = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.1,
    # The estimator-level logging knob is `verbosity`; `verbose` is not a
    # constructor parameter of XGBClassifier. None keeps the library default.
    verbosity=None,
    eval_metric="logloss",
    # NOTE(review): "gpu_hist" is deprecated from XGBoost 2.0 in favour of
    # tree_method="hist" with device="cuda" - confirm the installed version
    # before switching.
    tree_method="gpu_hist",
    random_state=SEED,
)
scaler = MinMaxScaler()

# "fs" is a placeholder step; the parameter grid below swaps in the actual
# feature-reduction estimator. `memory` lets the pipeline cache fitted
# transformers on disk under cache/ between fits.
pipe = Pipeline(
    [
        ("scaling", scaler),
        ("fs", "passthrough"),
        ("classify", classifier),
    ],
    memory="cache/",
)

# Candidate component counts: 5, 15, 25, ... up to (not including) the number
# of input features.
param_grid = [
    {
        "fs": [PCA(random_state=SEED)],
        "fs__n_components": np.arange(5, len(X.columns), 10),
    }
]

search = GridSearchCV(pipe, param_grid, cv=5, verbose=4)
search.fit(X, y)
print(f"Best parameter (CV score={search.best_score_:0.3f}):")
print(search.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5] END ......fs=PCA(), fs__n_components=5;, score=0.957 total time=   5.4s
[CV 2/5] END ......fs=PCA(), fs__n_components=5;, score=0.984 total time=   5.2s
[CV 3/5] END ......fs=PCA(), fs__n_components=5;, score=0.988 total time=   5.1s
[CV 4/5] END ......fs=PCA(), fs__n_components=5;, score=0.986 total time=   4.8s
[CV 5/5] END ......fs=PCA(), fs__n_components=5;, score=0.974 total time=   5.0s
[CV 1/5] END .....fs=PCA(), fs__n_components=15;, score=0.958 total time=   5.9s
[CV 2/5] END .....fs=PCA(), fs__n_components=15;, score=0.988 total time=   5.7s
[CV 3/5] END .....fs=PCA(), fs__n_components=15;, score=0.990 total time=   6.3s
[CV 4/5] END .....fs=PCA(), fs__n_components=15;, score=0.988 total time=   5.7s
[CV 5/5] END .....fs=PCA(), fs__n_components=15;, score=0.977 total time=   5.8s
[CV 1/5] END .....fs=PCA(), fs__n_components=25;, score=0.957 total time=   7.3s
[CV 2/5] END .....fs=PCA(), fs__n_components=25;,