In [1]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score
from sklearn import set_config

import xgboost as xgb
from xgboost import XGBClassifier

import random
from deap import base, creator, tools, algorithms
from operator import attrgetter, itemgetter
import functools
import itertools

import pyswarms as ps

import multiprocessing
from joblib import Parallel, delayed

In [2]:
class PSOTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model, n_particles=10, max_iter=180):
        self.model = model
        self.n_particles = n_particles
        self.max_iter = max_iter
    
    def fit(self, X, y=None):
        self.feature_names_in_ = list(X.columns)
        total_features = len(X.columns)
        
        # Define objective function
        def f_per_particle(m, alpha):
            """Computes for the objective function per particle

            Inputs
            ------
            m : numpy.ndarray
                Binary mask that can be obtained from BinaryPSO, will
                be used to mask features.
            alpha: float (default is 0.5)
                Constant weight for trading-off classifier performance
                and number of features

            Returns
            -------
            numpy.ndarray
                Computed objective function
            """
            # Get the subset of the features from the binary mask
            if np.count_nonzero(m) == 0:
                X_subset = X
            else:
                features = [name for name, selected in zip(list(X.columns), m) if selected == 1]
                X_subset = X[features]
            # Perform classification and store performance in P
            classifier = clone(self.model)
            classifier.fit(X_subset, y)
            P = accuracy_score(y, classifier.predict(X_subset))
            # Compute for the objective function
            j = (alpha * (1.0 - P)
                + (1.0 - alpha) * (1 - (len(X_subset.columns) / total_features)))

            return j

        def f(x, alpha):
            """Higher-level method to do classification in the
            whole swarm.

            Inputs
            ------
            x: numpy.ndarray of shape (n_particles, dimensions)
                The swarm that will perform the search

            Returns
            -------
            numpy.ndarray of shape (n_particles, )
                The computed loss for each particle
            """
            j = [f_per_particle(x[i], alpha) for i in range(self.n_particles)]
            return np.array(j)
        
        # Initialize swarm, arbitrary
        options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': self.n_particles, 'p':2}

        # Call instance of PSO
        optimizer = ps.discrete.BinaryPSO(n_particles=self.n_particles, dimensions=total_features, options=options)

        # Perform optimization
        cost, pos = optimizer.optimize(functools.partial(f, alpha=0.88), iters=self.max_iter, verbose=2)
        
        self.best_position = pos
        return self
    
    def transform(self, X, y=None):
        features = [name for name, selected in zip(list(X.columns), self.best_position) if selected == 1]
        print(len(features), features)
        return X[features]
    
    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            return [name for name, selected in zip(input_features, self.best_features) if selected == 1]
        else:
            return [name for name, selected in zip(self.feature_names_in_, self.best_features) if selected == 1]
    
    def get_params(self, deep=True):
        return {"model": self.model}
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [3]:
df = pd.read_csv("data/merged_and_edited_dataset.csv",
                dtype={
                        "src_ip": "string",
                        "dst_ip": "string",
                        "client_fingerprint": "string",
                        "application_name": "string",
                        "application_category_name": "string",
                        "requested_server_name": "string",
                        "atk_type": "string",
                        "traffic_type": "string"
                    }).drop([
                        "Unnamed: 0",
                        "server_fingerprint",
                        "user_agent",
                        "content_type", 
                        "src_ip", 
                        "dst_ip", 
                        "splt_direction", 
                        "splt_ps", 
                        "splt_piat_ms", 
                        "application_name", 
                        "application_category_name", 
                        "requested_server_name", 
                        "client_fingerprint"
                    ], axis=1)
X = df.drop(['id', 'traffic_type', 'atk_type'], axis=1)
y = LabelEncoder().fit_transform(df['atk_type'])

In [4]:
class FeatureSetChromosome(object):
    def __init__(self, genes, size):
        self.genes = np.random.choice(genes, size)
    
    def get_genes(self):
        return self.genes
    
    def __len__(self):
        return len(self.genes)
    
    def __iter__(self):
        yield from self.genes
    
    def __getitem__(self, key):
        return self.genes[key]
    
    def __setitem__(self, key, data):
        self.genes[key] = data

# setting individual creator
creator.create('FitnessMax', base.Fitness, weights=(1,))
creator.create('Individual', FeatureSetChromosome, fitness=creator.FitnessMax)

def mutate(individual, pb=0):
    # maximal amount of mutated genes
    n_mutated_max = max(1, int(len(individual) * pb))
    # generate the random amount of mutated genes
    n_mutated = random.randint(1, n_mutated_max)
    # select random genes which need to be mutated
    mutated_indexes = random.sample(list(range(len(individual))), n_mutated)
    # mutation
    for index in mutated_indexes:
        individual[index] = 0 if individual[index] else 1 # flip between 0s and 1s
    return individual,

def select_best(individuals, k, fit_attr='fitness'):
    return sorted(set(individuals), key=attrgetter(fit_attr))[:k]

def evaluate(individual, model, X, y, n_splits=3):
    features = [name for name, selected in zip(list(X.columns), individual.get_genes()) if selected == 1]
    acc_folds = cross_val_score(
        model, 
        X[features], 
        y, 
        cv=n_splits,
        scoring='accuracy')
    return acc_folds.mean(),

class GATransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model):   
        self.model = model
    
    def fit(self, X, y=None):
        self.feature_names_in_ = list(X.columns)
        
        def init_individual(ind_class, genes=None, size=None):
            return ind_class(genes, size)
        
        toolbox = base.Toolbox()
        
        n_features = len(X.columns)
        toolbox.register(
            'individual', init_individual, creator.Individual,
            genes=[0, 1], size=n_features)
        toolbox.register(
            'population', tools.initRepeat, list, toolbox.individual)

        # raise population
        self.pop = toolbox.population(10)

        toolbox.register('mate', tools.cxTwoPoint)
        toolbox.register('mutate', mutate, pb=0.2)
        toolbox.register('evaluate', evaluate, model=self.model, X=X, y=y, n_splits=3)
        toolbox.register('select', select_best)
        
        pool = multiprocessing.Pool()
        toolbox.register("map", pool.map)
        
        hof = tools.HallOfFame(3)
        algorithms.eaMuPlusLambda(
            self.pop, toolbox,
            mu=10, lambda_=20, cxpb=0.2, mutpb=0.8,
            ngen=15, halloffame=hof, verbose=True)
        
        self.best_features = hof[0].get_genes()
        return self
    
    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            return [name for name, selected in zip(input_features, self.best_features) if selected == 1]
        else:
            return [name for name, selected in zip(self.feature_names_in_, self.best_features) if selected == 1]
        
    def transform(self, X, y=None):
        features = [name for name, selected in zip(list(X.columns), self.best_features) if selected == 1]
        print(len(features), features)
        return X[features]
    
    def get_params(self, deep=True):
        return {"model": self.model}
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [5]:
set_config(transform_output="pandas")
classifier = XGBClassifier(n_estimators = 500,
                           max_depth = 7,
                           learning_rate = 0.1,
                           verbose=None,
                           tree_method='gpu_hist',
                           eval_metric='logloss')
scaler = StandardScaler()
pipe = Pipeline(
    [
        ("scaling", scaler),
        ("fs", "passthrough"),
        ("classify", classifier)
    ]
)

param_grid = [
    {
        "fs": ["passthrough",
               GATransformer(model=GaussianNB()), 
               PSOTransformer(model=GaussianNB())]
    }
]

search = GridSearchCV(pipe, param_grid, cv=3, verbose=4)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 1/3] END ....................fs=passthrough;, score=0.727 total time=   3.6s
[CV 2/3] END ....................fs=passthrough;, score=1.000 total time=   4.3s
[CV 3/3] END ....................fs=passthrough;, score=0.769 total time=   4.8s
gen	nevals
0  	10    
1  	20    
2  	20    
3  	20    
4  	20    
5  	20    
6  	20    
7  	20    
8  	20    
9  	20    
10 	20    
11 	20    
12 	20    
13 	20    
14 	20    
15 	20    
30 ['expiration_id', 'src_port', 'dst_port', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'src2dst_last_seen_ms', 'src2dst_packets', 'bidirectional_min_ps', 'bidirectional_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'src2dst_max_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_max_piat_ms', 'src2dst_max_piat_ms', 'dst2src_stddev_piat_ms', 'dst2src_max_piat_ms', 'bidirectional_cwr_packets', 'bidirectional_fin_packets', 'src2dst_cwr_pac

2023-04-24 23:04:24,017 - pyswarms.discrete.binary - INFO - Optimize for 180 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 10, 'p': 2}


32 ['src_port', 'dst_ip_is_private', 'ip_version', 'vlan_id', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'src2dst_bytes', 'dst2src_duration_ms', 'dst2src_bytes', 'bidirectional_mean_ps', 'bidirectional_max_ps', 'src2dst_mean_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'src2dst_min_piat_ms', 'dst2src_stddev_piat_ms', 'bidirectional_cwr_packets', 'bidirectional_ece_packets', 'bidirectional_urg_packets', 'bidirectional_psh_packets', 'bidirectional_fin_packets', 'src2dst_ece_packets', 'src2dst_urg_packets', 'src2dst_psh_packets', 'src2dst_rst_packets', 'src2dst_fin_packets', 'dst2src_syn_packets', 'dst2src_cwr_packets', 'dst2src_urg_packets', 'dst2src_rst_packets']
[CV 3/3] END fs=GATransformer(model=GaussianNB());, score=0.748 total time=  32.7s


pyswarms.discrete.binary: 100%|███████████████████████████████████████|180/180, best_cost=0.0258
2023-04-24 23:06:57,804 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.02576321387661594, best pos: [1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 1 0
 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 0]


57 ['expiration_id', 'src_ip_is_private', 'src_port', 'dst_ip_is_private', 'protocol', 'ip_version', 'vlan_id', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_max_piat_ms', 'src2dst_min_piat_ms', 'src2dst_mean_piat_ms', 'src2dst_max_piat_ms', 'dst2src_min_piat_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms', 'dst2src_max_piat_ms', 'bidirectional_syn_packets', 'bidirectional_ece_packets', 'bidirectional_ack_packets', 'bidirectional_rst_packets', 'bidirectional_fin_packets', 'src2ds

2023-04-24 23:07:01,122 - pyswarms.discrete.binary - INFO - Optimize for 180 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 10, 'p': 2}


57 ['expiration_id', 'src_ip_is_private', 'src_port', 'dst_ip_is_private', 'protocol', 'ip_version', 'vlan_id', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_min_ps', 'src2dst_mean_ps', 'dst2src_min_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_max_piat_ms', 'src2dst_min_piat_ms', 'src2dst_mean_piat_ms', 'src2dst_max_piat_ms', 'dst2src_min_piat_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms', 'dst2src_max_piat_ms', 'bidirectional_syn_packets', 'bidirectional_ece_packets', 'bidirectional_ack_packets', 'bidirectional_rst_packets', 'bidirectional_fin_packets', 'src2ds

pyswarms.discrete.binary: 100%|███████████████████████████████████████|180/180, best_cost=0.0324
2023-04-24 23:09:34,857 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.03242677139584354, best pos: [1 1 0 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1]


56 ['expiration_id', 'src_ip_is_private', 'protocol', 'ip_version', 'vlan_id', 'bidirectional_first_seen_ms', 'bidirectional_duration_ms', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps', 'src2dst_max_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', 'bidirectional_stddev_piat_ms', 'bidirectional_max_piat_ms', 'src2dst_min_piat_ms', 'src2dst_mean_piat_ms', 'src2dst_stddev_piat_ms', 'src2dst_max_piat_ms', 'dst2src_min_piat_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms', 'dst2src_max_piat_ms', 'bidirectional_syn_packets', 'bidirectional_ece_packets', 'bidirectional_urg_packets', 'bidirectional_ack_packets', 'bidirectional_psh_packets', 'bidirectiona

2023-04-24 23:09:39,005 - pyswarms.discrete.binary - INFO - Optimize for 180 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 10, 'p': 2}


56 ['expiration_id', 'src_ip_is_private', 'protocol', 'ip_version', 'vlan_id', 'bidirectional_first_seen_ms', 'bidirectional_duration_ms', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_duration_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'src2dst_mean_ps', 'src2dst_stddev_ps', 'src2dst_max_ps', 'dst2src_mean_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', 'bidirectional_stddev_piat_ms', 'bidirectional_max_piat_ms', 'src2dst_min_piat_ms', 'src2dst_mean_piat_ms', 'src2dst_stddev_piat_ms', 'src2dst_max_piat_ms', 'dst2src_min_piat_ms', 'dst2src_mean_piat_ms', 'dst2src_stddev_piat_ms', 'dst2src_max_piat_ms', 'bidirectional_syn_packets', 'bidirectional_ece_packets', 'bidirectional_urg_packets', 'bidirectional_ack_packets', 'bidirectional_psh_packets', 'bidirectiona

pyswarms.discrete.binary: 100%|███████████████████████████████████████|180/180, best_cost=0.0294
2023-04-24 23:12:12,465 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.02940271641302568, best pos: [1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1
 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 0 1 0 1 1]


56 ['expiration_id', 'src_ip_is_private', 'src_port', 'dst_port', 'ip_version', 'vlan_id', 'bidirectional_first_seen_ms', 'bidirectional_last_seen_ms', 'bidirectional_duration_ms', 'bidirectional_packets', 'bidirectional_bytes', 'src2dst_first_seen_ms', 'src2dst_last_seen_ms', 'src2dst_duration_ms', 'src2dst_packets', 'src2dst_bytes', 'dst2src_first_seen_ms', 'dst2src_last_seen_ms', 'dst2src_packets', 'dst2src_bytes', 'bidirectional_min_ps', 'bidirectional_mean_ps', 'bidirectional_stddev_ps', 'bidirectional_max_ps', 'src2dst_mean_ps', 'src2dst_max_ps', 'dst2src_min_ps', 'dst2src_stddev_ps', 'dst2src_max_ps', 'bidirectional_min_piat_ms', 'bidirectional_mean_piat_ms', 'bidirectional_max_piat_ms', 'src2dst_min_piat_ms', 'src2dst_max_piat_ms', 'dst2src_min_piat_ms', 'dst2src_stddev_piat_ms', 'bidirectional_syn_packets', 'bidirectional_cwr_packets', 'bidirectional_ece_packets', 'bidirectional_urg_packets', 'bidirectional_ack_packets', 'bidirectional_psh_packets', 'bidirectional_rst_packets'

In [6]:
with open("fs_results.score", "wb") as handle:
    pickle.dump(search.cv_results_, handle)

In [7]:
no_fs_scores = cross_validate(pipe, X, y, cv=3)
print(no_fs_scores)
with open("no_fs_results.score", "wb") as handle:
    pickle.dump(no_fs_scores, handle)

{'fit_time': array([3.4447794 , 4.15493917, 4.49056602]), 'score_time': array([0.08846116, 0.13968325, 0.14218163]), 'test_score': array([0.72721322, 1.        , 0.76851579])}


In [8]:
np.mean(no_fs_scores['test_score'])

0.8319096710849289