In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matrixprofile as mp
import random
import time
import os
from matplotlib.patches import Rectangle
from collections import Counter
import math


In [2]:
data_paths = {  'ntp' : '../verisetleri/ntp.pkl',
                'udp' : '../verisetleri/udp.pkl',
                'syn' : '../verisetleri/syn.pkl',}

dataset_dict = {    'ntp' : None, 'udp' : None,
                    'syn' : None}

In [3]:
def upload_dataset_with_time(path:str):
    startingT = time.perf_counter()
    if 'pkl' in path:
        veriseti = pd.read_pickle(path)
    else:
        veriseti = pd.read_csv(path, low_memory=False)
    endingT = time.perf_counter()
    print(f"Dataset is loaded in {endingT - startingT} seconds")
    return veriseti

In [4]:
for data_name, path in data_paths.items():
    data = upload_dataset_with_time(path)
    dataset_dict[data_name] = data

Dataset is loaded in 1.0632842 seconds
Dataset is loaded in 2.5220478 seconds
Dataset is loaded in 1.0451814000000006 seconds


In [5]:
Counter(dataset_dict["syn"][" Label"])

Counter({'Syn': 1582289, 'BENIGN': 392})

In [6]:
Counter(dataset_dict["ntp"][" Label"])

Counter({'DrDoS_NTP': 1202642, 'BENIGN': 14365})

In [7]:
Counter(dataset_dict["udp"][" Label"])

Counter({'DrDoS_UDP': 3134645, 'BENIGN': 2157})

In [8]:
def parse_df_to_time(df:pd.DataFrame)->pd.DataFrame:
    time_df = pd.DataFrame()
    time_df["Time_Date"] = df[" Timestamp"].apply(lambda x: str(x).split(" ")[0])
    time_df["Date_Second"] = df[" Timestamp"].apply(lambda x : str(x).split(" ")[1].split(".")[0])
    time_df["Date_Hour_Minute"] = df[" Timestamp"].apply(lambda x: ":".join(str(x).split(" ")[1].split(":")[:2]))
    
    return time_df

In [9]:
drop_list = [ ' Fwd Header Length.1', 'Fwd Avg Bytes/Bulk',
       ' Fwd Avg Packets/Bulk', ' Fwd Avg Bulk Rate', ' Bwd Avg Bytes/Bulk',
       ' Bwd Avg Packets/Bulk',' Subflow Fwd Bytes',
        ' Subflow Bwd Packets', ' Subflow Bwd Bytes',
       'Init_Win_bytes_forward', ' Init_Win_bytes_backward',
       ' act_data_pkt_fwd', ' min_seg_size_forward', 'Active Mean',
       ' Active Std', ' Active Max', ' Active Min', 'Idle Mean', ' Idle Std',
       ' Idle Max', ' Idle Min', 'SimillarHTTP', ' Inbound', 'Unnamed: 0', ' Source IP', ' Source Port',
       ' Destination IP', ' Destination Port', ' Protocol', ' Timestamp',
        'Flow ID', ' Label']

In [10]:
def prepare_timed_df(time_df, pure_df):
    global drop_list
    concat_pure = pure_df.drop(drop_list, axis = 1)
    
    new_timed_df = pd.concat([time_df, concat_pure], axis = 1)
    new_timed_df["Label"] = pure_df[" Label"].apply(lambda x: 0 if x == "BENIGN" else 1)
    
    unique_seconds = set(new_timed_df["Date_Second"])
    print(f"length of unique seconds: {len(unique_seconds)}")
    return new_timed_df

In [11]:
def save_data_frame(df_name:str, df:pd.DataFrame):
    minutes = math.floor(df.shape[0] / 60)
    data_last_index = minutes * 60
    
    save_df : pd.DataFrame = df.iloc[:data_last_index,:]
    df_name = f"{df_name}_{minutes}_minutes.csv"
    save_df.to_csv(f"{df_name}", index=False)

In [12]:
def prepare_ddos_data(df : pd.DataFrame):
    df_numeric = pd.concat([df[["Date_Second"]], df.select_dtypes(include='number')], axis=1)
    df_ddos = df_numeric[df_numeric["Label"] == 1].reset_index(drop=True)
    
    df_ddos_grouped = df_ddos.groupby("Date_Second").sum().reset_index(drop=False)
    
    df_ddos_grouped["Label"] = df_ddos_grouped["Label"].apply(lambda x: 1 if x > 5 else 0)
    df_ddos_grouped = df_ddos_grouped[df_ddos_grouped["Label"] == 1]
    
    return df_ddos_grouped

### 1. SYN Datasetinden DDoS Verisi Hazırlama

In [13]:
syn_by_time = parse_df_to_time(dataset_dict["syn"])
syn_by_time = prepare_timed_df(syn_by_time, dataset_dict["syn"])
del dataset_dict["syn"]

length of unique seconds: 238


In [14]:
syn_benign = syn_by_time[syn_by_time["Label"] == 0].groupby("Date_Second").sum(numeric_only=True).reset_index(drop=False)
save_data_frame("syn_benign", syn_benign)

In [15]:
grouped_syn_df = prepare_ddos_data(syn_by_time)
del syn_by_time
save_data_frame("syn_ddos", grouped_syn_df)

In [16]:
del syn_benign

### 2.UDP Datasetinden DDoS Verisi Hazırlama

In [17]:
udp_by_time = parse_df_to_time(dataset_dict["udp"])
#out of memory sebebiyle 5 dk udp logu aliyoruz
final_udp_idx = len((udp_by_time[udp_by_time["Date_Hour_Minute"] < "12:41"]["Date_Hour_Minute"]))
dataset_dict["udp"] = dataset_dict["udp"].iloc[:final_udp_idx+1, :]

udp_by_time = prepare_timed_df(udp_by_time, dataset_dict["udp"])
del dataset_dict["udp"]

length of unique seconds: 1521


In [18]:
grouped_udp_df = prepare_ddos_data(udp_by_time)
del udp_by_time
save_data_frame("udp_ddos", grouped_udp_df)

In [19]:
del grouped_udp_df

### 3. NTP Datasetinden Benign ve DDoS Verisi Hazırlama

In [20]:
ntp_by_time = parse_df_to_time(dataset_dict["ntp"])
ntp_by_time = prepare_timed_df(ntp_by_time, dataset_dict["ntp"])
del dataset_dict["ntp"]

length of unique seconds: 3458


In [21]:
ntp_benign = ntp_by_time[ntp_by_time["Label"] == 0].groupby("Date_Second").sum(numeric_only=True).reset_index(drop=False)
save_data_frame("ntp_benign", ntp_benign)

In [22]:
grouped_ntp_df = prepare_ddos_data(ntp_by_time)
del ntp_by_time
save_data_frame("ntp_ddos", grouped_ntp_df)

#### UDP LAG Veriseti

In [23]:
udp_lag_path = "../verisetleri/udp_lag.pkl"
udp_lag = upload_dataset_with_time(udp_lag_path)

Dataset is loaded in 0.3398153999999991 seconds


In [24]:
udp_lag_by_time = parse_df_to_time(udp_lag)
udp_lag_by_time = prepare_timed_df(udp_lag_by_time, udp_lag)
del udp_lag

length of unique seconds: 1164


In [25]:
grouped_udp_lag_df = prepare_ddos_data(udp_lag_by_time)
save_data_frame("udp_lag_ddos", grouped_udp_lag_df)

In [26]:
udp_benign_df = udp_lag_by_time[udp_lag_by_time["Label"] == 0].groupby("Date_Second").sum(numeric_only=True).reset_index(drop=False)

In [27]:
save_data_frame("udp_lag_benign", udp_benign_df)

In [28]:
del udp_benign_df
del grouped_udp_lag_df
del udp_lag_by_time

#### LDAP Veriseti

In [29]:
ldap_path = "../verisetleri/DrDoS_LDAP.csv"
ldap_df = upload_dataset_with_time(ldap_path)

Dataset is loaded in 20.949982700000003 seconds


In [30]:
ldap_df_by_time = parse_df_to_time(ldap_df)
ldap_df_by_time = prepare_timed_df(ldap_df_by_time, ldap_df)
del ldap_df

length of unique seconds: 593


In [31]:
grouped_ldap_df = prepare_ddos_data(ldap_df_by_time)
save_data_frame("ldap_ddos", grouped_ldap_df)

In [32]:
ldap_benign = ldap_df_by_time[ldap_df_by_time["Label"] == 0].groupby("Date_Second").sum(numeric_only=True).reset_index(drop=False)

In [33]:
save_data_frame("ldap_benign", ldap_benign)

In [34]:
del ldap_benign
del grouped_ldap_df
del ldap_df_by_time