In [None]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin, clone
import pyswarms as ps
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

import xgboost as xgb
from xgboost import XGBClassifier

import random
from deap import base, creator, tools, algorithms
from operator import attrgetter

import multiprocessing
from joblib import Parallel, delayed

In [None]:
class PSOTransformer(BaseEstimator, TransformerMixin):
    """Feature selector that searches for a binary column mask with binary PSO.

    ``fit`` runs ``pyswarms.discrete.BinaryPSO`` over 0/1 masks of the input
    columns and scores every mask with ``model``; ``transform`` keeps the
    columns whose entry in the best mask equals 1.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``. It is never fitted
        itself: every particle evaluation trains a fresh clone.

    Attributes
    ----------
    positions : numpy.ndarray
        Best mask found by ``fit`` (1 = keep the column). Never all zeros.
    """

    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y=None):
        """Run the swarm search and store the best mask in ``self.positions``.

        ``X`` must support ``X[:, mask]`` indexing (e.g. a 2-D numpy array)
        and ``y`` holds the class labels used to score each mask.

        Returns
        -------
        self
        """
        total_features = X.shape[1]

        # Define objective function
        def f_per_particle(m, alpha):
            """Computes for the objective function per particle

            Inputs
            ------
            m : numpy.ndarray
                Binary mask that can be obtained from BinaryPSO, will
                be used to mask features.
            alpha: float
                Constant weight for trading-off classifier performance
                and number of features (no default here; ``f`` supplies it)

            Returns
            -------
            float
                Computed objective function
            """
            # Get the subset of the features from the binary mask; an empty
            # mask is scored as if every feature had been selected.
            if np.count_nonzero(m) == 0:
                X_subset = X
            else:
                X_subset = X[:, m == 1]
            # Train a clone so that evaluations never share (or overwrite)
            # the fitted state of the estimator handed to the constructor.
            estimator = clone(self.model)
            estimator.fit(X_subset, y)
            # Performance P is the accuracy on the data the clone was fit on.
            P = (estimator.predict(X_subset) == y).mean()
            # Compute for the objective function
            j = (alpha * (1.0 - P)
                + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

            return j
        
        def f(x, alpha=0.88):
            """Higher-level method to do classification in the
            whole swarm.

            Inputs
            ------
            x: numpy.ndarray of shape (n_particles, dimensions)
                The swarm that will perform the search
            alpha: float (default is 0.88)
                Trade-off weight forwarded to ``f_per_particle``

            Returns
            -------
            numpy.ndarray of shape (n_particles, )
                The computed loss for each particle
            """
            n_particles = x.shape[0]
            # One joblib task per particle, spread over 4 workers.
            j = Parallel(n_jobs=4)(
                delayed(f_per_particle)(x[i], alpha) for i in range(n_particles))
            return np.array(j)
        
        # Initialize swarm, arbitrary
        options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

        # Call instance of PSO
        dimensions = total_features # dimensions should be the number of features
        optimizer = ps.discrete.BinaryPSO(n_particles=30, 
                                          dimensions=dimensions, 
                                          options=options)

        # Perform optimization
        cost, pos = optimizer.optimize(f, 
                                       iters=1000, 
                                       verbose=2)
        pos = np.asarray(pos)
        # The objective scores an all-zero mask as "all features"; store it
        # that way too, otherwise transform() would return zero columns.
        if np.count_nonzero(pos) == 0:
            pos = np.ones_like(pos)
        self.positions = pos
        
        return self
    
    def transform(self, X, y=None):
        """Return the columns of ``X`` whose mask entry equals 1."""
        return X[:, self.positions==1]
    
    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected features as a list.

        When ``input_features`` is None, names ``x0``, ``x1``, ... are
        generated, one per mask entry.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(len(self.positions))]
        return [name for name, selected in zip(input_features, self.positions) if selected == 1]
    
    def get_params(self, deep=True):
        """Return the constructor parameters (only ``model``)."""
        return {"model": self.model}
    
    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [None]:
# Load the merged dataset, reading the textual columns with the pandas
# "string" dtype, then discard the columns that are not used as features.
df = pd.read_csv(
    "data/merged_and_edited_dataset.csv",
    dtype=dict.fromkeys(
        [
            "src_ip",
            "dst_ip",
            "client_fingerprint",
            "application_name",
            "application_category_name",
            "requested_server_name",
            "atk_type",
            "traffic_type",
        ],
        "string",
    ),
).drop(
    columns=[
        "Unnamed: 0",
        "server_fingerprint",
        "user_agent",
        "content_type",
        "src_ip",
        "dst_ip",
        "splt_direction",
        "splt_ps",
        "splt_piat_ms",
        "application_name",
        "application_category_name",
        "requested_server_name",
        "client_fingerprint",
    ]
)

# Feature matrix: everything except the identifier and the two label columns.
X = df.drop(columns=['id', 'traffic_type', 'atk_type'])
# Target: attack type encoded as integer class ids.
y = LabelEncoder().fit_transform(df['atk_type'])

In [None]:
class FeatureSetChromosome(object):
    """Chromosome for feature selection, backed by a numpy array of genes.

    Parameters
    ----------
    genes : sequence
        Allowed gene values; each position is drawn from it at random.
    size : int
        Number of genes in the chromosome.
    """

    def __init__(self, genes, size):
        self.genes = np.random.choice(genes, size)
    
    def get_genes(self):
        """Return the underlying gene array (not a copy)."""
        return self.genes
    
    def __len__(self):
        return len(self.genes)
    
    def __iter__(self):
        yield from self.genes
    
    def __getitem__(self, key):
        """Return the gene at ``key``, or a copy of the genes for a slice."""
        value = self.genes[key]
        # Slicing a numpy array yields a view, not a copy. A tuple swap such as
        # ``a[i:j], b[i:j] = b[i:j], a[i:j]`` (the form of a two-point
        # crossover) would then read ``a``'s segment after it has already been
        # overwritten, leaving ``b`` unchanged. Returning a copy keeps the
        # swap correct.
        if isinstance(value, np.ndarray):
            return value.copy()
        return value
    
    def __setitem__(self, key, data):
        self.genes[key] = data

# Register the DEAP types used by the genetic search: a fitness with a single
# weight of 1, and an individual class built on FeatureSetChromosome that
# carries that fitness.
creator.create(
    'FitnessMax',
    base.Fitness,
    weights=(1,),
)
creator.create(
    'Individual',
    FeatureSetChromosome,
    fitness=creator.FitnessMax,
)

def mutate(individual, pb=0):
    """Flip a random selection of binary genes of ``individual`` in place.

    Between 1 and ``max(1, int(len(individual) * pb))`` distinct positions are
    chosen at random and each chosen gene is toggled (truthy -> 0, falsy -> 1).

    Returns
    -------
    tuple
        One-element tuple holding the same (mutated) individual.
    """
    genome_length = len(individual)
    # Upper bound on the number of flipped genes; never below one.
    flip_cap = max(1, int(genome_length * pb))
    # Draw how many genes to flip, then which ones.
    flip_count = random.randint(1, flip_cap)
    chosen_positions = random.sample(list(range(genome_length)), flip_count)
    for position in chosen_positions:
        if individual[position]:
            individual[position] = 0
        else:
            individual[position] = 1
    return (individual,)

def select_best(individuals, k, fit_attr='fitness'):
    """Return the ``k`` individuals with the highest fitness, best first.

    Parameters
    ----------
    individuals : iterable
        Candidates; repeated references to the same object count once.
    k : int
        Maximum number of individuals to return.
    fit_attr : str
        Name of the attribute to rank by (default ``'fitness'``).

    Returns
    -------
    list
        At most ``k`` individuals in descending order of ``fit_attr``.
    """
    # dict.fromkeys drops repeated objects like set() would, but keeps the
    # input order so that ties are broken deterministically.
    unique = list(dict.fromkeys(individuals))
    # reverse=True puts the highest fitness first; an ascending sort would
    # keep the k worst individuals instead.
    return sorted(unique, key=attrgetter(fit_attr), reverse=True)[:k]

def evaluate(individual, model, X, y, n_splits=3):
    """Score a chromosome by cross-validated accuracy on its selected columns.

    Parameters
    ----------
    individual : object
        Chromosome whose ``get_genes()`` returns the 0/1 feature mask.
    model : estimator
        Classifier passed to ``cross_val_score``.
    X : array supporting ``X[:, mask]``
        Feature matrix.
    y : array-like
        Class labels.
    n_splits : int
        Number of cross-validation folds (default 3).

    Returns
    -------
    tuple
        One-element tuple with the mean accuracy over the folds, or
        ``(0.0,)`` when the chromosome selects no feature.
    """
    mask = np.asarray(individual.get_genes()) == 1
    # A chromosome with no selected feature would hand a zero-column matrix
    # to the classifier; give it the worst possible accuracy instead.
    if not mask.any():
        return 0.0,
    acc_folds = cross_val_score(
        model, 
        X[:, mask], 
        y, 
        cv=n_splits,
        scoring='accuracy')
    return acc_folds.mean(),

class GATransformer(BaseEstimator, TransformerMixin):
    """Feature selector driven by a DEAP (mu + lambda) genetic algorithm.

    ``fit`` evolves binary chromosomes (one gene per input column) using the
    module-level ``mutate``, ``evaluate`` and ``select_best`` operators;
    ``transform`` keeps the columns whose gene equals 1 in the best
    chromosome recorded by the hall of fame.

    Parameters
    ----------
    model : estimator
        Classifier handed to ``evaluate`` to score each chromosome.

    Attributes
    ----------
    pop : list
        Population list created in ``fit`` and passed to the algorithm.
    feature_list : numpy.ndarray
        Genes of the best chromosome found (1 = keep the column).
    """

    def __init__(self, model):   
        self.model = model
    
    def fit(self, X, y=None):
        """Run the genetic search and store the best mask in ``feature_list``.

        ``X`` must support ``X[:, mask]`` indexing (e.g. a 2-D numpy array)
        and ``y`` holds the class labels used by the fitness evaluation.

        Returns
        -------
        self
        """
        toolbox = base.Toolbox()
        
        n_features = X.shape[1]
        # An individual is a random 0/1 chromosome with one gene per feature.
        toolbox.register(
            'individual', creator.Individual,
            genes=[0, 1], size=n_features)
        toolbox.register(
            'population', tools.initRepeat, list, toolbox.individual)

        # raise population
        self.pop = toolbox.population(50)

        toolbox.register('mate', tools.cxTwoPoint)
        toolbox.register('mutate', mutate, pb=0.2)
        toolbox.register('evaluate', evaluate, model=self.model, X=X, y=y, n_splits=3)
        toolbox.register('select', select_best)
        
        hof = tools.HallOfFame(5)
        # Fitness evaluations are mapped over a process pool. The context
        # manager shuts the workers down once the evolution has finished,
        # so repeated fits do not accumulate worker processes.
        with multiprocessing.Pool() as pool:
            toolbox.register("map", pool.map)
            algorithms.eaMuPlusLambda(
                self.pop, toolbox,
                mu=10, lambda_=30, cxpb=0.2, mutpb=0.8,
                ngen=20, halloffame=hof, verbose=True)
        
        # hof[0] is the best individual seen during the whole run.
        self.feature_list = hof[0].get_genes()
        print(self.feature_list)
        
        return self
        
    def transform(self, X, y=None):
        """Return the columns of ``X`` whose gene equals 1."""
        return X[:, self.feature_list==1]
    
    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected features as a list.

        When ``input_features`` is None, names ``x0``, ``x1``, ... are
        generated, one per gene.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(len(self.feature_list))]
        return [name for name, selected in zip(input_features, self.feature_list) if selected == 1]
    
    def get_params(self, deep=True):
        """Return the constructor parameters (only ``model``)."""
        return {"model": self.model}
    
    def set_params(self, **parameters):
        """Set each given parameter as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [None]:
# Seed the random sources used in this process so that a fresh
# "Restart & Run All" starts the stochastic feature searches from the same
# state. NOTE(review): worker processes started by the selectors may keep
# their own RNG state - confirm if bit-for-bit reproducibility is required.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# `verbose` is not an XGBClassifier parameter (the verbosity knob is called
# `verbosity`), so the former `verbose=None` had no effect and is omitted.
classifier = XGBClassifier(n_estimators = 500,
                           max_depth = 7,
                           learning_rate = 0.1,
                           eval_metric='logloss',
                           random_state=SEED)
scaler = StandardScaler()
# Scale -> feature selection (chosen by the grid search) -> classification.
# Fitted transformers are cached on disk under "cache/".
pipe = Pipeline(
    [
        ("scaling", scaler),
        ("fs", "passthrough"),
        ("classify", classifier)
    ],
    memory = "cache/"
)

# The only thing being searched is which feature selector fills the "fs" step.
param_grid = [
    {
        "fs": [GATransformer(model=SGDClassifier(loss="log_loss", random_state=SEED)), 
               PSOTransformer(model=SGDClassifier(loss="log_loss", random_state=SEED))]
    }
]

search = GridSearchCV(pipe, param_grid, cv=3, verbose=4)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)