In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matrixprofile as mp
import random
import time
import os
from matplotlib.patches import Rectangle

In [2]:
def plot_ddos(df: pd.DataFrame):
    xAxis = list(range(len(df)))
    yAxis = df["Label"].to_list()
    # Create a figure
    fig = plt.figure(figsize=(25,15))
    # Add a subplot
    ax = fig.add_subplot()

    ax.plot(xAxis, yAxis)
    rect = Rectangle((7200, 0), 300, 1, facecolor='lightgreen')
    ax.add_patch(rect)
    rect = Rectangle((3600*3 + 7200 +280, 0), 300, 1, facecolor='lightgreen')
    ax.add_patch(rect)
    plt.ylabel('Label')
    plt.xlabel('Minute')
    plt.title('Network Traffic')
    plt.show()

In [3]:
def upload_dataset_with_time(path:str):
    startingT = time.perf_counter()
    veriseti = pd.read_csv(path, low_memory=False)
    endingT = time.perf_counter()
    print(f"Dataset is loaded in {endingT - startingT} seconds")
    return veriseti

data_path = '../verisetleri/ddos_dataset_on_seconds.csv'
syn_benign_df = upload_dataset_with_time(data_path)

Dataset is loaded in 0.11131854200000024 seconds


In [4]:
ntp_path = '../verisetleri/ntp_by_seconds_data.csv'
ntp_df = upload_dataset_with_time(ntp_path)

def label_ddos(label:int):
    if label > 20:
        return 1
    else:
        return 0
ntp_df["Label"] = ntp_df["Label"].apply(lambda x: label_ddos(x))
ddos_ntp = ntp_df[ntp_df["Label"] == 1].sample(524).reset_index(drop=True)
ddos_ntp = ddos_ntp[[*(syn_benign_df.columns)]]

Dataset is loaded in 0.01708687500000039 seconds


In [55]:
syn_slice_1 = syn_benign_df.iloc[:12000, :].copy()
syn_slice_2 = syn_benign_df.iloc[12000:, :].copy()
merged_df = pd.concat([syn_slice_1, ddos_ntp ,syn_slice_2, ddos_ntp], axis = 0).reset_index(drop=True)
df = merged_df.copy()

In [56]:
import pandas as pd
import numpy as np
import stumpy
import random

class MatrixProfileManager:
    if 'global_df' not in dir():
        global_df = pd.read_csv('../verisetleri/ddos_dataset_on_seconds.csv', low_memory=True)

    THRESHOLD_BASE_ACTIVE = False
    threshold = 5.0

    def __init__(self, df:pd.DataFrame, window_size:int = 60, discord_number = 476, method='mpx', measure='acc'):
        self.measurement = measure
        self.window_size = window_size
        self.discord_number = discord_number
        self.discord_dict = {}
        self.discords = []
        self.df = df
        self.mp_method = method
        self.curr_mp_dict = {}

    def calculate_mp_multivariate_stumpy(self):
        curr_mps, curr_indices = stumpy.mstump(self.df, self.window_size)
        self.mps = curr_mps

    def calculate_mp_seperately_mpx(self):
        import matrixprofile as mp
        mp_list = []
        
        for ft in self.df.columns:
            inputSignal = self.df[ft].to_list()
            matrix_profile = mp.compute(inputSignal, windows=self.window_size, threshold=0.95, n_jobs=4)
            mp_list.append(matrix_profile['mp'])

        self.mps = np.array(mp_list)
    
    def calculate_discords(self):
        from collections import Counter
        curr_mps_dict = dict()
        curr_mps_dict = {f_idx: np.argsort(self.mps[idx])[::-1][:1000] for idx, f_idx in enumerate(self.df.columns)}
        self.curr_mp_dict = {f_idx: np.sort(self.mps[idx])[::-1][:self.discord_number] for idx, f_idx in enumerate(self.df.columns)}

        for idx, indices in curr_mps_dict.items():
            # print(f'now processing current idx: {idx}') 
            indice_list = []
            for indice in indices:
                #get mp point window
                indice_list.extend(list(range(indice, indice + self.window_size - 1)))
            #sort the indices by count
            if (MatrixProfileManager.THRESHOLD_BASE_ACTIVE == True):
                sorted_discord_indexes = indice_list.copy()
            else:
                sorted_discords = sorted(Counter(indice_list).items(), key=lambda t:t[1], reverse=True)
                sorted_discord_indexes = [elem[0] for elem in sorted_discords[:self.discord_number]]

            self.discord_dict[idx] = sorted_discord_indexes

    def majority_vote_discords(self):
        from collections import Counter
        overall_list = []
        for ft, ids_list in self.discord_dict.items():
            overall_list.extend(ids_list)

        if (MatrixProfileManager.THRESHOLD_BASE_ACTIVE):
            self.discords = list(set(overall_list))
        else:
            sorted_overall = (sorted(Counter(overall_list).items(), key=lambda t:t[1], reverse=True))
            self.discords = [elem[0] for elem in sorted_overall[:self.discord_number]]


    def obtain_y_vals(self):
        df_idxs = list(range(0, len(MatrixProfileManager.global_df)))
        for idx in self.discords:
            try:
                df_idxs.remove(idx)
            except:
                print(f"idx : {idx} not found in global df indexes.")
  
        
        self.pred_df = pd.DataFrame()
        self.pred_df['y_true'] = MatrixProfileManager.global_df["Label"].copy()
        self.pred_df["y_pred"] = MatrixProfileManager.global_df["Label"].copy()
        
        self.pred_df.iloc[df_idxs, 0] = 0
        self.pred_df.iloc[self.discords, 0] = 1

    def calculate_classification_report(self):
        from sklearn.metrics import classification_report
        if 'y_true' not in self.pred_df.columns:
            raise ValueError('true vals not included in df')

        if 'y_pred' not in self.pred_df.columns:
            raise ValueError('pred vals not included in df')

        self.creport = classification_report(self.pred_df["y_true"].to_list(),
                                             self.pred_df["y_pred"].to_list(), output_dict=True)["1"]

    def get_f1_score(self):
        if self.creport is None:
            raise ValueError('Classification Report is not ready!')
            
        return self.creport['f1-score']

    def get_mp_score(self):
        #maximize this
        return sum([sum(mp_score) for mp_score in self.curr_mp_dict.values()]) / len(self.curr_mp_dict.keys())

    def calculate_cost(self):
        if self.mp_method.lower() == 'mpx':
            self.calculate_mp_seperately_mpx()
        else:
            self.calculate_mp_multivariate_stumpy()
        
        self.calculate_discords()
        self.majority_vote_discords()
        self.obtain_y_vals()
        self.calculate_classification_report()

        f1_score = self.get_f1_score()
        mp_score = self.get_mp_score()
        return mp_score, f1_score

    def calculate_thresholded_discords(self):
        from collections import Counter
        curr_mps_dict = dict()
        threshold = MatrixProfileManager.threshold
        curr_mps_dict = {f_idx: np.where(self.mps[idx] > threshold)[0].tolist() for idx, f_idx in enumerate(self.df.columns)}
        self.curr_mp_dict = {f_idx: np.sort(self.mps[idx])[::-1][:10] for idx, f_idx in enumerate(self.df.columns)}

        for idx, indices in curr_mps_dict.items():
            # print(f'now processing current idx: {idx}') 
            indice_list = []
            for indice in indices:
                #get mp point window
                indice_list.extend(list(range(indice, indice + self.window_size - 1)))

            if (MatrixProfileManager.THRESHOLD_BASE_ACTIVE):
                self.discord_dict[idx] = list(set(indice_list))
            else:
                AssertionError("wrong func!")

    def calculate_threshold_based_cost(self):    
        if self.mp_method.lower() == 'mpx':
            self.calculate_mp_seperately_mpx()
        else:
            self.calculate_mp_multivariate_stumpy()

        self.calculate_thresholded_discords()
        self.majority_vote_discords()
        self.obtain_y_vals()
        self.calculate_classification_report()

        f1_score = self.get_f1_score()
        mp_score = self.get_mp_score()
        return mp_score, f1_score


class GeneticAlgo:
    verbosity_level = 0
    thresholded_mp = False
    def __init__(self, df:pd.DataFrame, max_features:int, population_bag_size:int = 3, fitness = 'MP'):
        print('Genetic Algorithm Process is ready to start')
        self.df = df.copy()
        self.y = df[["Label"]]
        self.X = df.drop(["Label"], axis = 1)
        self.feature_map = {i : feat_name for i, feat_name in enumerate(self.X.columns)}
        self.X.columns = list(range(0, len(self.X.columns)))
        self.feature_number = max_features
        self.pop_bag_size = population_bag_size
        self.creport = None
        self.eval_result = None
        self.fitness_type = fitness
        

    def initialize_population(self):
        self.population_bag = []
        for _ in range(self.pop_bag_size):
            #0 veya 1 atiyoruz feature pick or not pick, 1 olanlari appendliyoruz.
            genes = [random.randrange(0,2) for _ in range(self.feature_number)]
            gene_indexes = [idx for idx, f in enumerate(genes) if f == 1]
            if (len(gene_indexes) == 0):
                gene_indexes.append(random.randint(1,self.feature_number))

            self.population_bag.append(self.X.iloc[:, gene_indexes])

        return self.population_bag

    def create_population(self, pop_bag) -> pd.DataFrame:
        self.population_bag.clear()
        for elem in pop_bag:
            self.population_bag.append(self.X.iloc[:, elem])
            
        return self.population_bag

    def fitness_function(self, individual:pd.DataFrame):
        if (GeneticAlgo.thresholded_mp == True):
            MatrixProfileManager.THRESHOLD_BASE_ACTIVE = True
        else:
            MatrixProfileManager.THRESHOLD_BASE_ACTIVE = False

        mp_manager = MatrixProfileManager(individual, window_size=60, discord_number=1000, method='mpx', measure='f1')
        if (GeneticAlgo.thresholded_mp == False):
            cost, f1_score = mp_manager.calculate_cost()
        elif (GeneticAlgo.thresholded_mp == True):
            cost, f1_score = mp_manager.calculate_threshold_based_cost()
            
        if (GeneticAlgo.verbosity_level < 2):
            print(f'processing solution: {individual.columns.to_list()}')
            print(f"f1-score is: {mp_manager.get_f1_score()}")
        #return f1score instead of cost in order to maximize f1-score:
        
        # return cost, f1_score
        del mp_manager
        return cost, f1_score

    def eval_fit_population(self, pop_bag):
        #This evaluation is based on minimizing the cost!
        result = {}
        fit_vals_lst = []
        f1_score_lst = []
        solutions = []
        for individual in pop_bag:
            if (type(individual) != pd.DataFrame):
                assert(True)

            cost, f1_sc = self.fitness_function(individual.copy())
            fit_vals_lst.append(cost)
            f1_score_lst.append(f1_sc)
            solutions.append(individual.columns.to_list())
            
        result["fit_vals"] = fit_vals_lst
        result["f1-scores"] = f1_score_lst 
        if self.fitness_type == "MP":
            min_wgh = [abs(np.min(list(result['fit_vals'])) - i) for i in list(result['fit_vals'])]
        else:
            min_wgh = [abs(np.min(list(result['f1-scores'])) - i) for i in list(result['f1-scores'])]
        
        from scipy.special import logsumexp
        result["fit_wgh"]  = [i/logsumexp(min_wgh) for i in min_wgh]
        result["solution"] = np.array(solutions, dtype=list).tolist()
        
        self.eval_result = result.copy()
        return result

    def find_best(self, eval_result:dict)->dict:
        # Best individual so far
        best_fit = np.max(eval_result["fit_vals"])
        best_fit_index = eval_result["fit_vals"].index(best_fit)
        best_solution  = eval_result["solution"][best_fit_index]
        f1_sc = eval_result["f1-scores"][best_fit_index]
        print(f'best fit: {best_fit}\nsolution: {best_solution}\nf1Score: {f1_sc}')
        return {'best_fit': best_fit, 'index' : best_fit_index,
                 'solution': best_solution, 'f1-score' : f1_sc}

    def pick_one(self, pop_bag):
        
        if self.eval_result is None:
            eval_result = self.eval_fit_population(pop_bag)
        else:
            eval_result = self.eval_result

        notPicked=True
        cnt = 0
        pickedSol = list()
        while (notPicked == True):
            rnIndex = random.randint(0, len(pop_bag)-1)
            rnPick  = eval_result["fit_wgh"][rnIndex]
            r = random.random()
            if  r <= rnPick:
                pickedSol = eval_result["solution"][rnIndex]
                notPicked = False
            if (cnt > 250):
                pickedSol = eval_result["solution"][rnIndex]
                notPicked = False
            cnt += 1

        return pickedSol

    def crossover(self, solA, solB):
        
        n     = len(solA)
        child: list = []

        num_els = random.randint(0, self.feature_number)
        str_pnt = random.randint(0, max(0,n-3))
        end_pnt = n if int(str_pnt+num_els) > n else int(str_pnt+num_els)

        blockA = list(solA[str_pnt:end_pnt])
        child = blockA.copy()

        for elem in solB:
            if len(child) >= num_els:
                break
            if elem not in blockA:
                child.append(elem)  

        if (len(child) < 1):
            return solA

        return child

    def mutation(self,sol):
        
        # n = len(sol)
        # pos_1 = random.randint(0,n-1)
        # pos_2 = random.randint(0,n-1)
        # result = self.swap(sol, pos_1, pos_2)
        if (len(sol) > 2):
            rd_idx = random.randint(0, len(sol) - 1)
            del sol[rd_idx]
        return sol

    def swap(self,sol, posA, posB):
        result = sol.copy()
        elA = sol[posA]
        elB = sol[posB]
        result[posA] = elB
        result[posB] = elA
        return result

In [58]:
import mp_genetic_utils
import pandas as pd



import random as rnd
MatrixProfileManager.THRESHOLD_BASE_ACTIVE = True
MatrixProfileManager.global_df = df.copy()
MatrixProfileManager.threshold = 8
GeneticAlgo.thresholded_mp = True
genetic_algo = GeneticAlgo(df.copy(), max_features=38, population_bag_size=10, fitness="F1")
pop_bag = genetic_algo.initialize_population()
generation_number = 5
for generation in range(generation_number):
    print(f"Generation {generation} is started!")
    
    res = genetic_algo.eval_fit_population(pop_bag)
    best_fit, _, best_solution, f1_score = genetic_algo.find_best(res).values()
    
    if (generation == 0):
        best_fit_global      = best_fit
        best_solution_global = best_solution
        best_f1_global = f1_score
    else:
        if (f1_score >= best_f1_global):
            best_fit_global      = best_fit
            best_solution_global = best_solution
            best_f1_global = f1_score

    new_pop_bag = []
    for i in range(len(genetic_algo.population_bag)):
                # Pick 2 parents from the bag
        pA = genetic_algo.pick_one(pop_bag)
        pB = genetic_algo.pick_one(pop_bag)
        new_element = pA
            # Crossover the parents
        if rnd.random() <= 0.87:
            new_element = genetic_algo.crossover(pA, pB)
            # Mutate the child
        if rnd.random() <= 0.5:
            new_element = genetic_algo.mutation(new_element) 
        new_pop_bag.append(new_element)
            # Set the new bag as the population bag
    pop_bag = genetic_algo.create_population(new_pop_bag)

print("\n\n**** Generations Over ****\n")
print(f"Best Fitness: {best_fit_global}")
print(f"Best Solution: {best_solution_global}")
print(f"F1-Score: {f1_score}")


Genetic Algorithm Process is ready to start
Generation 0 is started!
processing solution: [0, 11, 12, 13, 14, 17, 19, 21, 23, 25, 26, 28, 29, 30, 33, 34, 35, 36, 37]
f1-score is: 0.10783446029666868
processing solution: [2, 9, 10, 12, 13, 14, 19, 20, 23, 24, 30, 32, 33, 35]
f1-score is: 0.07421383647798743
processing solution: [5, 9, 10, 12, 15, 16, 17, 18, 19, 21, 23, 24, 25, 28, 34, 35]
f1-score is: 0.10783446029666868
processing solution: [1, 2, 3, 7, 9, 15, 16, 17, 21, 23, 26, 31, 33, 37]
f1-score is: 0.07421383647798743
processing solution: [2, 6, 7, 8, 9, 11, 13, 15, 19, 20, 21, 22, 24, 26, 27, 29, 30, 31, 32, 33, 34, 36, 37]
f1-score is: 0.10783446029666868
processing solution: [1, 2, 3, 5, 7, 9, 10, 12, 13, 15, 16, 19, 21, 23, 24, 26, 27, 30, 31, 36, 37]
f1-score is: 0.10783446029666868
processing solution: [0, 1, 6, 8, 9, 10, 12, 13, 16, 17, 20, 21, 22, 24, 25, 27, 28, 29, 31, 32, 34, 35]
f1-score is: 0.10783446029666868
processing solution: [0, 1, 4, 5, 8, 9, 13, 14, 16, 17, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


processing solution: [26]
f1-score is: 0.0
processing solution: [26, 27, 30, 31, 37, 0, 4, 8, 14, 17, 18, 20, 24, 25, 33, 34, 36, 7, 9, 10, 12, 13, 15, 16, 19, 1, 5, 29, 35]
f1-score is: 0.10783446029666868
processing solution: [21, 23, 24, 35, 1, 2, 3, 5, 7, 9, 10, 12, 13, 15, 16, 19, 26, 27, 30, 31, 37, 0]
f1-score is: 0.10783446029666868
processing solution: [10, 11, 15, 26, 28, 30, 31, 32, 21, 23, 24, 25, 34, 35, 1, 2, 3, 5, 7, 12, 13, 16, 19, 27, 37]
f1-score is: 0.10783446029666868
processing solution: [21, 23, 24, 25, 34, 35, 1, 2, 5, 7, 10, 12, 13, 15, 16, 19, 27, 37, 22, 28, 29]
f1-score is: 0.10783446029666868
processing solution: [31, 32, 33, 36, 0, 1, 2, 10, 11, 15, 35, 3, 5, 7]
f1-score is: 0.07190737355271176
processing solution: [2, 10, 11, 14, 15, 16, 17, 18, 23, 26, 30, 33, 36, 35, 1, 3, 5, 7, 9, 12, 13, 19, 27, 31, 37, 0, 4, 8, 20, 24, 25, 34]
f1-score is: 0.10783446029666868
processing solution: [0, 1, 2, 10, 11, 14, 15, 16, 17, 18, 23, 26, 30]
f1-score is: 0.0719073

In [48]:
random.randint(0, 3)

3

In [35]:
a_list = list(range(0, len(MatrixProfileManager.global_df)))

In [37]:
len(mp_manager.discords)

635

Genetic Algorithm Process is ready to start


In [42]:
gA = GeneticAlgo(df.copy(), max_features=10, population_bag_size=5, fitness="F1")
first_pop = gA.initialize_population()

MatrixProfileManager.THRESHOLD_BASE_ACTIVE = True
MatrixProfileManager.global_df = df.copy()
MatrixProfileManager.threshold = 5

for pop in first_pop:
    mp_deneme_manager = MatrixProfileManager(pop, window_size=60, discord_number=1000, method='mpx', measure='f1')
    cost, f1_score = mp_deneme_manager.calculate_threshold_based_cost()

Genetic Algorithm Process is ready to start
threshold based discords
threshold based discords
threshold based discords
threshold based discords
threshold based discords
