In [11]:
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

import random
from deap import base, creator, tools, algorithms
from operator import attrgetter, itemgetter
import functools
import itertools

import pyswarms as ps

import multiprocessing
from joblib import Parallel, delayed

In [2]:
class PSOTransformer(BaseEstimator, TransformerMixin):
    """Feature selector that searches for a binary column mask with BinaryPSO.

    Every particle of the swarm is a 0/1 vector with one entry per column of
    ``X``. A particle is scored by cloning ``model``, fitting it on the
    selected columns and combining its accuracy with the fraction of selected
    features (see ``fit``).

    Parameters
    ----------
    model : estimator
        Classifier cloned and fitted for every evaluated mask.
    n_particles : int, default=15
        Number of particles in the swarm.
    max_iter : int, default=200
        Number of optimisation iterations.
    alpha : float, default=0.88
        Weight trading off classifier error against the feature-count term.

    Notes
    -----
    ``get_params`` / ``set_params`` are intentionally inherited from
    ``BaseEstimator``: a hand-written ``get_params`` that reports only
    ``model`` makes ``sklearn.base.clone`` drop every other constructor
    argument, and a ``setattr``-based ``set_params`` cannot handle nested
    ``model__<param>`` names.
    """

    def __init__(self, model, n_particles=15, max_iter=200, alpha=0.88):
        self.model = model
        self.n_particles = n_particles
        self.max_iter = max_iter
        self.alpha = alpha

    def fit(self, X, y=None):
        """Run the swarm search and store the best mask in ``best_position``.

        Inputs
        ------
        X : array-like of shape (n_samples, n_features)
            Training data; converted with ``np.asarray`` so that column
            masking works for array-likes as well as ndarrays.
        y : array-like of shape (n_samples,)
            Class labels handed to the wrapped classifier.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        total_features = X.shape[1]

        # Define objective function
        def f_per_particle(m, alpha):
            """Computes the objective function for one particle

            Inputs
            ------
            m : numpy.ndarray
                Binary mask that can be obtained from BinaryPSO, will
                be used to mask features.
            alpha: float
                Constant weight for trading-off classifier performance
                and number of features

            Returns
            -------
            float
                Computed objective function (lower is better)
            """
            # An empty mask cannot be fitted, so it is scored on all features.
            if np.count_nonzero(m) == 0:
                X_subset = X
            else:
                X_subset = X[:, m == 1]
            # Accuracy P is measured on the same data the clone was fitted on.
            classifier = clone(self.model)
            classifier.fit(X_subset, y)
            P = accuracy_score(y, classifier.predict(X_subset))
            # The second term is 1 - (selected / total), i.e. it is smallest
            # when every feature is kept.
            # NOTE(review): confirm this direction is intended; rewarding
            # smaller subsets would need the term inverted.
            j = (alpha * (1.0 - P)
                + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

            return j

        def f(x, alpha):
            """Higher-level method to do classification in the
            whole swarm.

            Inputs
            ------
            x: numpy.ndarray of shape (n_particles, dimensions)
                The swarm that will perform the search

            Returns
            -------
            numpy.ndarray of shape (n_particles, )
                The computed loss for each particle
            """
            # Iterate over the rows actually supplied instead of assuming
            # their number equals self.n_particles.
            return np.array([f_per_particle(mask, alpha) for mask in x])

        # Initialize swarm, arbitrary
        options = {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': self.n_particles, 'p': 2}

        # Call instance of PSO
        optimizer = ps.discrete.BinaryPSO(
            n_particles=self.n_particles,
            dimensions=total_features,
            options=options)

        # Perform optimization
        cost, pos = optimizer.optimize(
            functools.partial(f, alpha=self.alpha),
            iters=self.max_iter,
            verbose=2)

        pos = np.asarray(pos)
        # The objective scores an all-zero mask as "use every feature"; store
        # the equivalent all-ones mask so transform() agrees with that score
        # instead of returning a matrix with zero columns.
        if np.count_nonzero(pos) == 0:
            pos = np.ones(total_features, dtype=int)

        self.best_position = pos
        return self

    def transform(self, X, y=None):
        """Return only the columns of ``X`` selected by ``best_position``."""
        return np.asarray(X)[:, self.best_position == 1]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected features as a list.

        When ``input_features`` is None, positional names ``x0``, ``x1``, ...
        are generated, one per entry of ``best_position``.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(len(self.best_position))]
        return [name for name, selected in zip(input_features, self.best_position) if selected == 1]

In [3]:
# Load the merged dataset. The textual columns are read with the pandas
# "string" dtype, then the index column and the columns that are not used
# further on are dropped straight away.
df = pd.read_csv(
    "data/merged_and_edited_dataset.csv",
    dtype=dict.fromkeys(
        [
            "src_ip",
            "dst_ip",
            "client_fingerprint",
            "application_name",
            "application_category_name",
            "requested_server_name",
            "atk_type",
            "traffic_type",
        ],
        "string",
    ),
).drop(
    columns=[
        "Unnamed: 0",
        "server_fingerprint",
        "user_agent",
        "content_type",
        "src_ip",
        "dst_ip",
        "splt_direction",
        "splt_ps",
        "splt_piat_ms",
        "application_name",
        "application_category_name",
        "requested_server_name",
        "client_fingerprint",
    ]
)

# Feature matrix: everything except the identifier and the two label columns.
X = df.drop(columns=['id', 'traffic_type', 'atk_type'])
# Target: attack type encoded as integer class labels.
y = LabelEncoder().fit_transform(df['atk_type'])

In [4]:
class FeatureSetChromosome(object):
    """Sequence-like GA individual backed by a NumPy array of genes.

    Parameters
    ----------
    genes : sequence
        Allowed gene values; every position is drawn from it with
        ``np.random.choice``.
    size : int
        Number of genes in the chromosome.
    """

    def __init__(self, genes, size):
        self.genes = np.random.choice(genes, size)

    def get_genes(self):
        """Return the underlying gene array (not a copy)."""
        return self.genes

    def __len__(self):
        return len(self.genes)

    def __iter__(self):
        yield from self.genes

    def __getitem__(self, key):
        """Return the gene(s) at ``key``; array results are returned as copies.

        Slicing a NumPy array yields a *view*. Crossover operators written
        for lists swap segments with
        ``a[i:j], b[i:j] = b[i:j], a[i:j]``; with views the second assignment
        reads data the first one has already overwritten, so both children
        end up with the same segment. Copying array-valued reads makes that
        swap behave as it does for plain lists.
        """
        value = self.genes[key]
        if isinstance(value, np.ndarray):
            return value.copy()
        return value

    def __setitem__(self, key, data):
        self.genes[key] = data

# setting individual creator
# Register the DEAP fitness class: one objective with weight 1 (a positive
# weight means the objective is maximised under DEAP's convention).
creator.create('FitnessMax', base.Fitness, weights=(1,))
# Register the individual class: a FeatureSetChromosome subclass that carries
# a ``fitness`` attribute of the FitnessMax type created on the line above,
# so the order of these two calls matters.
creator.create('Individual', FeatureSetChromosome, fitness=creator.FitnessMax)

def mutate(individual, pb=0):
    """Flip a random number of binary genes of ``individual`` in place.

    Between 1 and ``max(1, int(len(individual) * pb))`` distinct positions
    are chosen at random and each chosen gene is toggled (truthy -> 0,
    falsy -> 1).

    Returns
    -------
    tuple
        One-element tuple holding the same (mutated) individual.
    """
    genome_length = len(individual)
    # Upper bound on how many genes may be flipped; never below one.
    flip_limit = max(1, int(genome_length * pb))
    flip_count = random.randint(1, flip_limit)
    # Distinct positions to toggle.
    for position in random.sample(range(genome_length), flip_count):
        if individual[position]:
            individual[position] = 0
        else:
            individual[position] = 1
    return (individual,)

def select_best(individuals, k, fit_attr='fitness'):
    """Return the ``k`` distinct individuals with the highest fitness.

    Inputs
    ------
    individuals : iterable
        Candidate individuals; the same object may appear more than once.
    k : int
        Maximum number of individuals to return.
    fit_attr : str
        Name of the attribute used as the sort key.

    Returns
    -------
    list
        At most ``k`` individuals, best first.
    """
    # Drop repeated objects while keeping first-seen order, so ties are
    # resolved deterministically (iteration order of a set is arbitrary).
    unique = list(dict.fromkeys(individuals))
    # Sort descending: an ascending sort followed by [:k] would keep the
    # *worst* individuals for a fitness that is being maximised.
    return sorted(unique, key=attrgetter(fit_attr), reverse=True)[:k]

def evaluate(individual, model, X, y, n_splits=3):
    """Fitness of a chromosome: mean cross-validated accuracy on its features.

    Inputs
    ------
    individual : object with ``get_genes()``
        Chromosome whose genes equal to 1 mark the selected columns of ``X``.
    model : estimator
        Classifier passed to ``cross_val_score``.
    X : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : array-like
        Class labels.
    n_splits : int
        Number of cross-validation folds.

    Returns
    -------
    tuple
        One-element tuple with the mean accuracy over the folds.
    """
    selected = individual.get_genes() == 1
    # A chromosome that selects nothing would produce a zero-column matrix
    # that no classifier can be fitted on; give it the worst accuracy so the
    # search discards it instead of propagating an error/NaN fitness.
    if not np.any(selected):
        return 0.0,
    acc_folds = cross_val_score(
        model,
        X[:, selected],
        y,
        cv=n_splits,
        scoring='accuracy')
    return acc_folds.mean(),

class GATransformer(BaseEstimator, TransformerMixin):
    """Feature selector that evolves a binary column mask with a (mu + lambda) GA.

    Parameters
    ----------
    model : estimator
        Classifier used by ``evaluate`` to score every chromosome.

    Notes
    -----
    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``,
    which reports the constructor arguments and also supports nested
    ``model__<param>`` names.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Run the genetic search and store the best mask in ``best_features``.

        Inputs
        ------
        X : array-like of shape (n_samples, n_features)
            Training data; converted with ``np.asarray`` so that column
            masking works for array-likes as well as ndarrays.
        y : array-like of shape (n_samples,)
            Class labels used by the fitness evaluation.

        Returns
        -------
        self
        """
        def init_individual(ind_class, genes=None, size=None):
            return ind_class(genes, size)

        X = np.asarray(X)
        toolbox = base.Toolbox()

        n_features = X.shape[1]
        toolbox.register(
            'individual', init_individual, creator.Individual,
            genes=[0, 1], size=n_features)
        toolbox.register(
            'population', tools.initRepeat, list, toolbox.individual)

        # raise population
        self.pop = toolbox.population(10)

        toolbox.register('mate', tools.cxTwoPoint)
        toolbox.register('mutate', mutate, pb=0.2)
        toolbox.register('evaluate', evaluate, model=self.model, X=X, y=y, n_splits=3)
        toolbox.register('select', select_best)

        hof = tools.HallOfFame(3)
        # The worker pool lives only for the duration of the evolution; the
        # context manager shuts the workers down afterwards, so repeated
        # fit() calls (e.g. one per CV fold) do not accumulate processes.
        with multiprocessing.Pool() as pool:
            toolbox.register("map", pool.map)
            algorithms.eaMuPlusLambda(
                self.pop, toolbox,
                mu=10, lambda_=25, cxpb=0.2, mutpb=0.8,
                ngen=15, halloffame=hof, verbose=True)

        # First hall-of-fame entry is the best individual seen during the run.
        self.best_features = hof[0].get_genes()
        return self

    def get_feature_names_out(self, input_features=None):
        """Return the names of the selected features as a list.

        When ``input_features`` is None, positional names ``x0``, ``x1``, ...
        are generated, one per entry of ``best_features``.
        """
        if input_features is None:
            input_features = [f"x{i}" for i in range(len(self.best_features))]
        return [name for name, selected in zip(input_features, self.best_features) if selected == 1]

    def transform(self, X, y=None):
        """Return only the columns of ``X`` selected by ``best_features``."""
        return np.asarray(X)[:, self.best_features == 1]

In [5]:
# Final estimator of the pipeline.
classifier = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.1,
    verbose=None,
    eval_metric='logloss',
)
scaler = StandardScaler()

# scaling -> feature selection (placeholder, filled in by the grid) -> classifier.
# Fitted transformers are cached on disk under "cache/".
pipe = Pipeline(
    steps=[
        ("scaling", scaler),
        ("fs", "passthrough"),
        ("classify", classifier),
    ],
    memory="cache/",
)

# Candidate feature selectors, each wrapping its own LDA instance.
param_grid = [
    {
        "fs": [
            selector(model=LinearDiscriminantAnalysis(solver="svd"))
            for selector in (GATransformer, PSOTransformer)
        ]
    }
]

# Compare the selectors with 3-fold cross-validation and report the winner.
search = GridSearchCV(pipe, param_grid, cv=3, verbose=4)
search.fit(X, y)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
gen	nevals
0  	10    
1  	25    
2  	25    
3  	25    
4  	25    
5  	25    
6  	25    
7  	25    
8  	25    
9  	25    
10 	25    
11 	25    
12 	25    
13 	25    
14 	25    
15 	25    
[CV 1/3] END fs=GATransformer(model=LinearDiscriminantAnalysis());, score=0.986 total time= 1.6min
gen	nevals
0  	10    
1  	25    
2  	25    
3  	25    
4  	25    
5  	25    
6  	25    
7  	25    
8  	25    
9  	25    
10 	25    
11 	25    
12 	25    
13 	25    
14 	25    
15 	25    
[CV 2/3] END fs=GATransformer(model=LinearDiscriminantAnalysis());, score=1.000 total time= 1.6min
gen	nevals
0  	10    
1  	25    
2  	25    
3  	25    
4  	25    
5  	25    
6  	25    
7  	25    
8  	25    
9  	25    
10 	25    
11 	25    
12 	25    
13 	25    
14 	25    
15 	25    


2023-04-19 03:24:11,363 - pyswarms.discrete.binary - INFO - Optimize for 200 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 15, 'p': 2}


[CV 3/3] END fs=GATransformer(model=LinearDiscriminantAnalysis());, score=1.000 total time= 1.6min


pyswarms.discrete.binary: 100%|███████████████████████████████████████|200/200, best_cost=0.0271
2023-04-19 03:38:15,916 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.027124365897561755, best pos: [1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1
 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0]
2023-04-19 03:38:27,691 - pyswarms.discrete.binary - INFO - Optimize for 200 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 15, 'p': 2}


[CV 1/3] END fs=PSOTransformer(model=LinearDiscriminantAnalysis());, score=0.986 total time=14.3min


pyswarms.discrete.binary: 100%|███████████████████████████████████████|200/200, best_cost=0.0468
2023-04-19 03:52:15,921 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.04684519718540339, best pos: [1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1
 1 1 1 0 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0]
2023-04-19 03:52:29,176 - pyswarms.discrete.binary - INFO - Optimize for 200 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 15, 'p': 2}


[CV 2/3] END fs=PSOTransformer(model=LinearDiscriminantAnalysis());, score=1.000 total time=14.0min


pyswarms.discrete.binary: 100%|███████████████████████████████████████|200/200, best_cost=0.0476
2023-04-19 04:06:04,276 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.047568319423989563, best pos: [1 1 1 0 0 1 1 1 0 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 1 1 0 1 0
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1]


[CV 3/3] END fs=PSOTransformer(model=LinearDiscriminantAnalysis());, score=1.000 total time=13.8min
gen	nevals
0  	10    
1  	25    
2  	25    
3  	25    
4  	25    
5  	25    
6  	25    
7  	25    
8  	25    
9  	25    
10 	25    
11 	25    
12 	25    
13 	25    
14 	25    
15 	25    
Best parameter (CV score=0.995):
{'fs': GATransformer(model=LinearDiscriminantAnalysis())}


In [9]:
# Persist the grid-search result table so it can be analysed without re-running the search.
with open("fs_results.score", "wb") as results_file:
    pickle.dump(search.cv_results_, results_file)

In [12]:
# Baseline: the same pipeline with the "fs" step left as "passthrough"
# (no feature selection), scored with 3-fold cross-validation.
no_fs_scores = cross_validate(estimator=pipe, X=X, y=y, cv=3)
print(no_fs_scores)
# Persist the baseline scores next to the feature-selection results.
with open("no_fs_results.score", "wb") as baseline_file:
    pickle.dump(no_fs_scores, baseline_file)

{'fit_time': array([14.09238529, 15.35090399, 14.94798279]), 'score_time': array([0.03921533, 0.03841019, 0.03927851]), 'test_score': array([0.98638521, 1.        , 1.        ])}


In [13]:
# Mean test accuracy of the baseline across the folds.
no_fs_scores['test_score'].mean()

0.9954617356679213