In [1]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin, clone

from sklearn.naive_bayes import GaussianNB

from sklearn.feature_selection import SelectFromModel
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import set_config

import xgboost as xgb
from xgboost import XGBClassifier

from catboost import CatBoostClassifier

import random
from deap import base, creator, tools, algorithms
from operator import attrgetter, itemgetter
import functools
import itertools

import pyswarms as ps

import multiprocessing
from joblib import Parallel, delayed

In [2]:
class PSOTransformer(BaseEstimator, TransformerMixin):
    """Feature selector that searches for a binary column mask with pyswarms' BinaryPSO.

    Parameters
    ----------
    model : estimator
        Classifier cloned and fitted on every candidate feature subset to
        score that subset.
    n_particles : int, default=10
        Number of particles in the swarm.
    max_iter : int, default=180
        Maximum number of optimisation iterations.

    Attributes set by ``fit``
    -------------------------
    feature_names_in_ : list
        Column names of the frame passed to ``fit``.
    best_position : numpy.ndarray
        Best binary mask found by the optimiser (1 = keep the column).

    Notes
    -----
    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator`` so that
    ``clone`` (used by ``GridSearchCV`` and ``Pipeline``) carries over
    ``n_particles`` and ``max_iter`` as well as ``model``.
    """

    def __init__(self, model, n_particles=10, max_iter=180):
        self.model = model
        self.n_particles = n_particles
        self.max_iter = max_iter

    def fit(self, X, y=None):
        """Run the swarm search on DataFrame ``X`` / labels ``y`` and store the best mask."""
        self.feature_names_in_ = list(X.columns)
        total_features = len(self.feature_names_in_)

        # Define objective function
        def f_per_particle(m, alpha):
            """Computes the objective function for one particle.

            Inputs
            ------
            m : numpy.ndarray
                Binary mask that can be obtained from BinaryPSO, will
                be used to mask features.
            alpha: float
                Constant weight for trading-off classifier performance
                and number of features (no default; supplied by the caller).

            Returns
            -------
            float
                Computed objective value (lower is better).
            """
            # Get the subset of the features from the binary mask
            features = [name for name, selected in zip(self.feature_names_in_, m) if selected == 1]
            if not features:
                # An all-zero mask cannot be fitted. For any non-empty mask the
                # objective below is strictly less than 1, so 1.0 marks this
                # particle as worse than every usable one.
                return 1.0
            X_subset = X[features]
            # Perform classification and store performance in P
            # (accuracy is measured on the same data the clone was fitted on).
            classifier = clone(self.model)
            classifier.fit(X_subset, y)
            P = accuracy_score(y, classifier.predict(X_subset))

            # NOTE(review): the second term shrinks as MORE features are kept,
            # so minimising it favours larger subsets - confirm this is intended.
            j = (alpha * (1.0 - P)
                + (1.0 - alpha) * (1 - (len(features) / total_features)))

            return j

        def f(x, alpha):
            """Higher-level method to do classification in the
            whole swarm.

            Inputs
            ------
            x: numpy.ndarray of shape (n_particles, dimensions)
                The swarm that will perform the search
            alpha: float
                Trade-off weight forwarded to ``f_per_particle``.

            Returns
            -------
            numpy.ndarray of shape (n_particles, )
                The computed loss for each particle
            """
            j = [f_per_particle(x[i], alpha) for i in range(self.n_particles)]
            return np.array(j)

        # Initialize swarm, arbitrary
        options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': self.n_particles, 'p':2}

        # Call instance of PSO
        optimizer = ps.discrete.BinaryPSO(n_particles=self.n_particles,
                                          dimensions=total_features,
                                          options=options,
                                          ftol=0.00005,
                                          ftol_iter=35)

        # Perform optimization
        cost, pos = optimizer.optimize(functools.partial(f, alpha=0.88),
                                       iters=self.max_iter,
                                       verbose=2)

        self.best_position = pos
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns whose mask entry is 1."""
        features = [name for name, selected in zip(list(X.columns), self.best_position) if selected == 1]
        return X[features]

    def get_feature_names_out(self, input_features=None):
        """Names of the selected columns.

        Uses ``input_features`` when given, otherwise the column names seen in ``fit``.
        """
        if input_features is not None:
            return [name for name, selected in zip(input_features, self.best_position) if selected == 1]
        else:
            return [name for name, selected in zip(self.feature_names_in_, self.best_position) if selected == 1]

In [3]:
class FeatureSetChromosome(object):
    """Chromosome whose genes are stored in a NumPy array.

    Parameters
    ----------
    genes : sequence
        Allowed gene values; each position is drawn from it with
        ``np.random.choice``.
    size : int
        Number of genes in the chromosome.
    """

    def __init__(self, genes, size):
        self.genes = np.random.choice(genes, size)

    def get_genes(self):
        """Return the underlying gene array (not a copy)."""
        return self.genes

    def __len__(self):
        return len(self.genes)

    def __iter__(self):
        yield from self.genes

    def __getitem__(self, key):
        """Return the gene(s) at ``key``; array results are copies, not views.

        Slice-swapping crossovers such as ``a[i:j], b[i:j] = b[i:j], a[i:j]``
        read both slices before writing. With NumPy views the first write
        would alter the data the second write reads, so the second parent
        would be left unchanged. Returning a copy keeps the swap symmetric.
        """
        value = self.genes[key]
        if isinstance(value, np.ndarray):
            return value.copy()
        return value

    def __setitem__(self, key, data):
        self.genes[key] = data

# Register the DEAP types used by the genetic search at module level.
# `FitnessMax` is a single-objective fitness with weight 1; the positive weight
# presumably makes DEAP treat larger values as better - confirm against DEAP docs.
# `Individual` is FeatureSetChromosome with an added `fitness` attribute of that type.
creator.create('FitnessMax', base.Fitness, weights=(1,))
creator.create('Individual', FeatureSetChromosome, fitness=creator.FitnessMax)

def mutate(individual, pb=0):
    """Flip a random selection of binary genes in place.

    Between 1 and ``max(1, int(len(individual) * pb))`` distinct positions are
    chosen at random and each chosen gene is toggled (truthy -> 0, falsy -> 1).

    Returns a one-element tuple holding the same (mutated) individual.
    """
    gene_count = len(individual)
    # upper bound on how many genes may change; never below one
    flip_limit = max(1, int(gene_count * pb))
    # how many genes actually change this time
    flip_count = random.randint(1, flip_limit)
    # distinct positions to toggle
    for position in random.sample(list(range(gene_count)), flip_count):
        if individual[position]:
            individual[position] = 0
        else:
            individual[position] = 1
    return (individual,)

def select_best(individuals, k, fit_attr='fitness'):
    """Return the ``k`` distinct individuals with the highest fitness.

    Parameters
    ----------
    individuals : iterable
        Candidate individuals (must be hashable); duplicates are dropped.
    k : int
        Maximum number of individuals to return.
    fit_attr : str, default='fitness'
        Name of the attribute used as the sort key.

    Returns
    -------
    list
        At most ``k`` individuals, ordered from highest to lowest fitness.
        Ties keep their first-seen input order.
    """
    # dict.fromkeys de-duplicates like set() but keeps input order, so the
    # result is deterministic for equal fitness values.
    unique = list(dict.fromkeys(individuals))
    # reverse=True: an ascending sort would put the lowest fitness first and
    # the slice would then keep the worst individuals.
    return sorted(unique, key=attrgetter(fit_attr), reverse=True)[:k]

def evaluate(individual, model, X, y, n_splits=3):
    """Fitness of an individual: mean cross-validated accuracy on its features.

    Parameters
    ----------
    individual : object
        Must expose ``get_genes()`` returning one 0/1 flag per column of ``X``.
    model : estimator
        Classifier passed to ``cross_val_score``.
    X : pandas.DataFrame
        Full feature frame; columns flagged with 1 are kept.
    y : array-like
        Target labels.
    n_splits : int, default=3
        Number of cross-validation folds.

    Returns
    -------
    tuple
        One-element tuple with the mean accuracy, or ``(0.0,)`` when the
        individual selects no feature at all.
    """
    features = [name for name, selected in zip(list(X.columns), individual.get_genes()) if selected == 1]
    if not features:
        # Nothing to train on: give the lowest possible accuracy instead of
        # handing a zero-column frame to the classifier.
        return 0.0,
    acc_folds = cross_val_score(
        model,
        X[features],
        y,
        cv=n_splits,
        scoring='accuracy')
    return acc_folds.mean(),

class GATransformer(BaseEstimator, TransformerMixin):
    """Feature selector driven by a DEAP (mu + lambda) genetic algorithm.

    Parameters
    ----------
    model : estimator
        Classifier used by ``evaluate`` to score candidate feature subsets.

    Attributes set by ``fit``
    -------------------------
    feature_names_in_ : list
        Column names of the frame passed to ``fit``.
    pop : list
        Initial population, passed to ``algorithms.eaMuPlusLambda``.
    best_features : numpy.ndarray
        Gene array of the top hall-of-fame individual (1 = keep the column).

    Notes
    -----
    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Evolve feature masks on DataFrame ``X`` / labels ``y`` and keep the best one."""
        self.feature_names_in_ = list(X.columns)

        def init_individual(ind_class, genes=None, size=None):
            return ind_class(genes, size)

        toolbox = base.Toolbox()

        n_features = len(X.columns)
        toolbox.register(
            'individual', init_individual, creator.Individual,
            genes=[0, 1], size=n_features)
        toolbox.register(
            'population', tools.initRepeat, list, toolbox.individual)

        # raise population
        self.pop = toolbox.population(10)

        toolbox.register('mate', tools.cxTwoPoint)
        toolbox.register('mutate', mutate, pb=0.2)
        toolbox.register('evaluate', evaluate, model=self.model, X=X, y=y, n_splits=3)
        toolbox.register('select', select_best)

        hof = tools.HallOfFame(3)
        # The pool is scoped to this fit call: leaving the `with` block shuts
        # the workers down, so repeated fits (e.g. inside a grid search) do
        # not pile up idle processes.
        with multiprocessing.Pool() as pool:
            toolbox.register("map", pool.map)
            algorithms.eaMuPlusLambda(
                self.pop, toolbox,
                mu=10, lambda_=30, cxpb=0.2, mutpb=0.8,
                ngen=25, halloffame=hof, verbose=True)

        self.best_features = hof[0].get_genes()
        return self

    def get_feature_names_out(self, input_features=None):
        """Names of the selected columns.

        Uses ``input_features`` when given, otherwise the column names seen in ``fit``.
        """
        if input_features is not None:
            return [name for name, selected in zip(input_features, self.best_features) if selected == 1]
        else:
            return [name for name, selected in zip(self.feature_names_in_, self.best_features) if selected == 1]

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns whose gene is 1."""
        features = [name for name, selected in zip(list(X.columns), self.best_features) if selected == 1]
        return X[features]

In [4]:
# Columns read with pandas' "string" dtype.
string_columns = (
    "src_ip",
    "dst_ip",
    "client_fingerprint",
    "application_name",
    "application_category_name",
    "requested_server_name",
    "atk_type",
    "traffic_type",
)

# Columns discarded immediately after loading.
unused_columns = [
    "Unnamed: 0",
    "server_fingerprint",
    "user_agent",
    "content_type",
    "src_ip",
    "dst_ip",
    "splt_direction",
    "splt_ps",
    "splt_piat_ms",
    "application_name",
    "application_category_name",
    "requested_server_name",
    "client_fingerprint",
]

df = pd.read_csv(
    "data/merged_and_edited_dataset.csv",
    dtype={column: "string" for column in string_columns},
).drop(columns=unused_columns)

# Features: everything except the identifier and the two label columns.
X = df.drop(columns=['id', 'traffic_type', 'atk_type'])
# Target: integer-encoded attack type.
y = LabelEncoder().fit_transform(df['atk_type'])

In [5]:
def custom_scorer(clf, X, y):
    """Multi-metric scorer for a fitted pipeline that has an ``'fs'`` step.

    Returns a dict with accuracy, macro-averaged precision and recall of
    ``clf.predict(X)`` against ``y``, plus ``n_features_in``: the number of
    columns of ``X`` when the ``'fs'`` step is ``"passthrough"``, otherwise
    the number of names reported by that step's ``get_feature_names_out()``.
    """
    predictions = clf.predict(X)
    scores = {
        "accuracy": accuracy_score(y, predictions),
        "precision": precision_score(y, predictions, average="macro"),
        "recall": recall_score(y, predictions, average="macro"),
    }
    if clf['fs'] == "passthrough":
        scores["n_features_in"] = len(X.columns)
    else:
        scores["n_features_in"] = len(clf['fs'].get_feature_names_out())
    return scores

In [6]:
set_config(transform_output="pandas")

In [7]:
# Settings shared by every XGBoost model in this cell.
xgb_common = dict(learning_rate=0.1,
                  verbose=None,
                  tree_method='gpu_hist',
                  eval_metric='logloss')
# Final-classifier and feature-selector variants differ only in size.
xgb_final = dict(n_estimators=500, max_depth=7, **xgb_common)
xgb_selector = dict(n_estimators=150, max_depth=3, **xgb_common)

classifier = XGBClassifier(**xgb_final)
scaler = StandardScaler()
pipe = Pipeline(
    [
        ("scaling", scaler),
        ("fs", "passthrough"),
        ("classify", classifier)
    ]
)

param_grid = [
    {
        # No selection vs. the two wrapper-based selectors.
        "fs": ["passthrough",
               GATransformer(model=GaussianNB()),
               PSOTransformer(model=GaussianNB())],
        "classify": [XGBClassifier(**xgb_final)]
    },
    {
        # Model-based selectors, swept over the number of features kept.
        "fs": [SelectFromModel(estimator=LinearDiscriminantAnalysis()),
               SelectFromModel(estimator=XGBClassifier(**xgb_selector))],
        "fs__max_features": np.arange(5, 60, 10),
        "classify": [XGBClassifier(**xgb_final)]
    }
]

# NOTE(review): with the default error_score a failing fit is recorded as NaN
# rather than raised; pass error_score="raise" while debugging NaN scores.
search = GridSearchCV(pipe, param_grid, scoring=custom_scorer, refit=False, cv=5, verbose=4)
search.fit(X, y)

# With a multi-metric scorer and refit=False, GridSearchCV does not define
# best_score_ / best_params_, so pick the top candidate by accuracy rank.
best_index = int(np.argmin(search.cv_results_['rank_test_accuracy']))
print("Best parameter (CV score=%0.3f):" % search.cv_results_['mean_test_accuracy'][best_index])
print(search.cv_results_['params'][best_index])

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END classify=XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None, ...), fs=passthrough;, score=nan total time=   0.2s
[CV 2/5] END classify=XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=No

Process ForkPoolWorker-12:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Process ForkPoolWorker-3:
Process ForkPoolWorker-11:
Process ForkPoolWorker-9:
Process ForkPoolWorker-5:
Process ForkPoolWorker-8:
Process ForkPoolWorker-10:
Traceback (most recent call last):
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/ari/mambaforge/envs/ccd/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ari/mambaforge/envs/ccd/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ari/mambaforge/envs/ccd/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args,

KeyboardInterrupt: 

Process ForkPoolWorker-22:
Process ForkPoolWorker-18:
Process ForkPoolWorker-24:
Process ForkPoolWorker-21:
Process ForkPoolWorker-23:
Process ForkPoolWorker-16:
Process ForkPoolWorker-20:
Process ForkPoolWorker-19:
Process ForkPoolWorker-17:
Process ForkPoolWorker-13:
Process ForkPoolWorker-15:
Process ForkPoolWorker-14:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ari/mambaforge/envs/ccd/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/ari/mambaforge/envs/ccd/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ari/mambaforge/envs/ccd/lib/python3

In [None]:
# Print one summary line per grid-search candidate.
search_scores = search.cv_results_
report_columns = ('params',
                  'mean_test_accuracy',
                  'rank_test_accuracy',
                  'mean_test_n_features_in')
for params, acc, rank, features_in in zip(*(search_scores[column] for column in report_columns)):
    print("params:", params,
          "accuracy:", acc,
          "rank:", rank,
          "features selected:", features_in)