# Imports

In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from tqdm import tqdm
from scipy.fft import fft
import os

# Data Explanation

In [2]:
# IMPORTANT: every event is referred to as an "attack" (even though most of them were of course not attacks, only high traffic)

# In short: the rows of the components table are 'snapshots' of the network state taken during individual attacks (one attack may have several such snapshots)
# The rows of the events table each describe one attack; in effect every events row is an aggregation (mean, sum, etc.) of the components that share the same Attack ID

##### Components #####
# Reflects the state of the network at one moment in time: a snapshot
# The Attack ID identifies which attack the snapshot belongs to; in this table the Attack ID is NOT unique
# NOTE: the descriptions below are runtime string values and are intentionally left in their original language.
Components = {
    "Attack ID": "Egy adott tamadashoz (ami egy event) tartozo azonosito. Egy sora a tablanak azt mondja meg, hogy az adott tamadasnak az adott idopillanatban milyen allapotaban van a halozat.",
    "Detect count": "Azt mondja meg, hogy a bejegyzes az adott Attack ID-ju tamadasnak hanyadik detektalasa. pl lehet, hogy egy tamadast csak egyszer detektalunk, de lehet, hogy a tamadas soran tobbszor is valami gyanusat erzekelt a detektor es ezeket mindet feljegyezte",
    "Card": "A detektor melyik kartyaja vegezte az adatgyujtest",
    "Victim IP": "A tamadas aldozatanak IP cime (anonimizalva)",
    "Port number": "A tamadas cel portja",
    "Attack code": "A tamadas jelleget irja le, amibol gyanus volt a detektornak, hogy tamadas tortenik. pl.: High volume / Suspicious traffic, CLDAP. Akar tobb is lehet egy sorban.",
    "Significant flag": "A DDoS detektor belso flagje, SZAMUNKRA NEM FONTOS",
    "Packet speed": "csomagráta (hálózaton áthaladó csomagok száma másodpercenként) [pps]",
    "Data speed": "adatrata (hálózaton áthaladó adatmennyiség másodpercenként) [bps]",
    "Avg packet len": "átlagos csomaghossz [byte]",
    "Source IP count": "Egyedi IP cimek szama, akik az adott idopillanatban tamadnak",
    "Time": "Timestamp, hogy mikor keszult a snapshot"
}


##### Events #####
# Each row represents one attack.
# It is the aggregation of the components that share an Attack ID and describes the whole attack;
# in this table the Attack ID is unique.
# The keys are spelled exactly like the column names in `event_columns` ("Significant flag",
# "Avg source IP count") so the dictionary can be used to look up a column's description.
# NOTE: the descriptions are runtime string values and are intentionally left in their original language.
Events = {
    "Attack ID": "ugyanaz, mint a Components tablaban, de itt egyedi",
    "Card": "ugyanaz, mint a Components tablaban",
    "Victim IP": "ugyanaz, mint a Components tablaban",
    "Port number": "ugyanaz, mint a Components tablaban",
    "Attack code": "uyanaz, mint a Components tablaban, az adott Attack ID-hoz tartozo osszes komponens Attack code-janak az osszessege",
    "Detect count": "Hanyszor volt az adott Attack ID-val rendelkezo tamadas detektalva.",
    "Significant flag": "ugyanugy nem fontos",
    "Packet speed": "Atlagos csomagráta a tamadas soran",
    "Data speed": "Atlagos adatrata a tamadas soran",
    "Avg packet len": "Atlagos csomaghossz a tamadas soran",
    "Avg source IP count": "Atlagososan hany IP-rol tortent a tamadas",
    "Start time": "Timestamp, mikor kezdodott a tamadas",
    "End time": "Timestamp, mikor vegezodott a tamadas",
    "Whitelist flag": "Detektor belso flagje, SZAMUNKRA NEM FONTOS",
    "Type": "Az esemeny kategoriaja, aminek a detektor felcimkezte, ezt kell majd nekunk prediktalni neuralis haloval. Itt ez a 'ground truth' adat. Lehetseges ertekei: DDoS attack, Suspicious traffic vagy Normal traffic"
}

# Initial Feature Engineering

In [3]:
''' We should do the feature engineering here. For the model training this will be exported to a proper DataClass'''

# Column names for the raw components CSV. process_components() assigns this list
# positionally to the loaded frame, so its order must match the file's column order.
component_columns = [
    "Attack ID", "Detect count", "Card", "Victim IP", "Port number",
    "Attack code", "Significant flag", "Packet speed", "Data speed", "Avg packet len",
    "Source IP count", "Time"
]

# Column names for the raw events CSV, likewise assigned positionally by
# process_components(); the order must match the file's column order.
event_columns = [
    "Attack ID", "Card", "Victim IP", "Port number", "Attack code", 
    "Detect count", "Significant flag", "Packet speed", "Data speed", 
    "Avg packet len", "Avg source IP count", "Start time", "End time", 
    "Whitelist flag", "Type"
]

def get_fft(column, rel_times):
    """Summarise the secondary spectral content of one metric of an attack.

    The FFT of ``column`` is computed and only its first ``N // 2`` bins are
    kept. The (up to) three bins with the largest magnitude are selected, the
    strongest of them is discarded, and the magnitudes and frequencies of the
    remaining ones are each summed.

    NOTE(review): the discarded bin is simply the one with the largest
    magnitude. Callers label the result "Ac", so this is presumably meant to
    drop the DC (0 Hz) bin - that holds for non-negative data, but is not
    guaranteed for arbitrary input. Confirm the intent before relying on it.

    Args:
        column: pandas Series with the sampled values of one metric.
        rel_times: indexable sequence of sample times in seconds; only the
            last element is used, as the total time span of the samples
            (a non-positive span is replaced by 1).

    Returns:
        Tuple ``(magnitude_sum, frequency_sum)``. ``(0, 0)`` is returned when
        fewer than two samples are available (including an empty column,
        which previously raised ``IndexError``).
    """
    sample_count = len(column)
    # Nothing meaningful can be computed from zero or one sample.
    if sample_count < 2:
        return 0, 0

    values = column.values
    time_span = rel_times[-1] if rel_times[-1] > 0 else 1  # Time span in seconds

    half = sample_count // 2
    fft_mag = np.abs(fft(values)[:half])  # Magnitudes
    # Frequencies, assuming evenly spaced samples over the whole time span.
    fft_freq = np.fft.fftfreq(sample_count, time_span / sample_count)[:half]

    # Indices of the (up to) three largest magnitudes, strongest first.
    top_indices = np.argsort(fft_mag)[-3:][::-1]
    top_magnitudes = fft_mag[top_indices].tolist()
    top_frequencies = fft_freq[top_indices].tolist()

    # Pad with zeros so that there are always exactly three entries.
    missing = 3 - len(top_magnitudes)
    top_magnitudes += [0] * missing
    top_frequencies += [0] * missing

    # Skip the strongest bin and aggregate the rest.
    return sum(top_magnitudes[1:]), sum(top_frequencies[1:])


# (source column, label used in the "Std" / "Ac" feature names, label used in the "Burst Ratio" names)
_COMPONENT_METRICS = (
    ("Packet speed", "Packet Speed", "Packet Speed"),
    ("Data speed", "Data Speed", "Data Speed"),
    ("Avg packet len", "Avg Packet Len", "Avg Packet Len"),
    ("Source IP count", "Source IP Count", "Source IP"),
)


def _component_group_features(attack_id, group):
    """Compute the engineered features for the component rows of one attack.

    Args:
        attack_id: the "Attack ID" shared by every row of ``group``.
        group: component rows of that attack; "Time" must already be datetime.

    Returns:
        Dict mapping feature name to value. The insertion order defines the
        column order of the resulting feature table.
    """
    group = group.sort_values("Detect count")

    # Seconds elapsed since the first snapshot. Dividing by a one-second
    # timedelta keeps the result correct whatever resolution (ns, us, ms, s)
    # the datetime column happens to have.
    times = group["Time"].values
    relative_time = (times - times[0]) / np.timedelta64(1, "s")

    # Detection frequency (detections per second) [Hz]; 0 when no time elapsed.
    elapsed = relative_time[-1]
    detection_frequency = 0 if elapsed == 0 else len(group) / elapsed

    features = {
        "Attack ID": attack_id,
        "Detection Frequency": detection_frequency,
    }

    # Population standard deviation of every network metric.
    for column, label, _ in _COMPONENT_METRICS:
        features[f"{label} Std"] = group[column].std(ddof=0)

    # Peak relative to the median; the +1 avoids dividing by a zero median.
    for column, _, burst_label in _COMPONENT_METRICS:
        series = group[column]
        features[f"{burst_label} Burst Ratio"] = series.max() / (series.median() + 1)

    packet_speed = group["Packet speed"]
    data_speed = group["Data speed"]
    avg_packet_len = group["Avg packet len"]

    # Maximum per-snapshot ratios; a zero denominator yields 0 for that snapshot.
    # Packets transmitted per unit of average packet length.
    features["Packet Transmission Rate"] = np.where(avg_packet_len == 0, 0, packet_speed / avg_packet_len).max()
    # Packets transmitted per unit of data speed.
    features["Packet Density"] = np.where(data_speed == 0, 0, packet_speed / data_speed).max()
    # Average packet length per unit of data speed.
    features["Packet Size Efficiency"] = np.where(data_speed == 0, 0, avg_packet_len / data_speed).max()

    # Frequency-domain summary of every metric.
    for column, label, _ in _COMPONENT_METRICS:
        magnitude, frequency = get_fft(group[column], relative_time)
        features[f"{label} Ac Magnitude"] = magnitude
        features[f"{label} Ac Frequency"] = frequency

    return features


def process_components(events, components):
    """Attach per-attack features derived from the components table to the events.

    Both inputs are copied and given the column names of ``component_columns``
    / ``event_columns`` (assigned positionally). The components are grouped by
    "Attack ID", one feature row is computed per group, and the result is
    left-merged onto the events on "Attack ID", so events without components
    get NaN features.

    Args:
        events: raw events DataFrame (columns in ``event_columns`` order).
        components: raw components DataFrame (columns in ``component_columns`` order).

    Returns:
        The renamed events DataFrame extended with the feature columns, with a
        fresh 0..n-1 index. The inputs are not modified.
    """
    data = components.copy(deep=True)
    data.columns = component_columns

    e_data = events.copy(deep=True)
    e_data.columns = event_columns

    # Convert 'Time' to datetime once, before grouping.
    data["Time"] = pd.to_datetime(data["Time"])

    # One feature dict per attack.
    calculated_data = [
        _component_group_features(attack_id, group)
        for attack_id, group in tqdm(data.groupby("Attack ID"))
    ]
    calculated_df = pd.DataFrame(calculated_data)

    # Merge the calculated data back into the events DataFrame
    e_data = pd.merge(e_data, calculated_df, on="Attack ID", how="left")
    print(e_data.columns)
    return e_data.reset_index(drop=True)


def add_time_features(p_events):
    """Add the attack duration and reduce the events table to the kept columns.

    Rows whose 'End time' reads '0' are discarded. For the remaining rows the
    'Duration' (whole seconds between 'Start time' and 'End time') is added,
    the identifier / flag / timestamp columns are dropped, and finally the
    column at position 6 is exchanged with the last one. For frames produced
    by process_components() this moves 'Type' to the end and 'Duration' into
    its former place.

    Args:
        p_events: events DataFrame; it is not modified.

    Returns:
        A new DataFrame that keeps the original index labels of the retained rows.
    """
    has_end_time = p_events['End time'].astype(str) != '0'
    kept = p_events.copy(deep=True)[has_end_time]

    started_at = pd.to_datetime(kept['Start time'])
    ended_at = pd.to_datetime(kept['End time'])
    elapsed_seconds = (ended_at - started_at).dt.total_seconds()
    kept = kept.assign(Duration=elapsed_seconds.astype(int))

    dropped = ['Victim IP', 'Significant flag', 'Whitelist flag', 'Start time', 'End time', "Card", "Attack code", "Attack ID"]
    kept = kept.drop(columns=dropped)

    # Positional swap: column 6 <-> last column.
    order = kept.columns.tolist()
    order[6], order[-1] = order[-1], order[6]
    return kept[order]

def convert_to_float32(col):
    """Cast a numeric column to float32; hand back any other column untouched.

    Args:
        col: a pandas Series (or anything accepted by
            ``pd.api.types.is_numeric_dtype`` that offers ``astype``).

    Returns:
        A float32 copy when ``col`` has a numeric dtype, otherwise ``col`` itself.
    """
    if not pd.api.types.is_numeric_dtype(col):
        # Non-numeric data (e.g. strings) is passed through as is.
        return col
    return col.astype('float32')

### Example usage

In [4]:
# Root directory of the SCLDDoS2024 CSV files; change this single line to relocate the data.
DATA_DIR = "/home/appuser/data"

# Dataset letter -> sub-directory of DATA_DIR that holds its files.
DATASET_SPLITS = {"A": "train", "B": "train", "C": "test", "D": "gen"}


def _dataset_csv(set_name, kind):
    """Return the path of one CSV of a set; ``kind`` is "components", "events" or "events_extended"."""
    return os.path.join(DATA_DIR, DATASET_SPLITS[set_name], f"SCLDDoS2024_Set{set_name}_{kind}.csv")


# 1) Load the raw tables of every set (components first, then events).
components_by_set = {}
events_by_set = {}
for set_name in DATASET_SPLITS:
    components_by_set[set_name] = pd.read_csv(_dataset_csv(set_name, "components"))
    events_by_set[set_name] = pd.read_csv(_dataset_csv(set_name, "events"))

# 2) Extend the events of every set with the component-derived features.
#    The raw events are replaced by the processed ones, as before.
for set_name in DATASET_SPLITS:
    events_by_set[set_name] = process_components(events_by_set[set_name], components_by_set[set_name])

# 3) Add the duration, drop the unused columns and export the result next to the inputs.
events_extended_by_set = {}
for set_name in DATASET_SPLITS:
    print(f"processing {set_name.lower()}")
    events_extended_by_set[set_name] = add_time_features(events_by_set[set_name])
    events_extended_by_set[set_name].to_csv(_dataset_csv(set_name, "events_extended"), index=False)

# Keep the per-set variable names this cell has always defined, for any cell that still uses them.
components_a, components_b, components_c, components_d = (components_by_set[s] for s in "ABCD")
events_a, events_b, events_c, events_d = (events_by_set[s] for s in "ABCD")
events_extended_a, events_extended_b, events_extended_c, events_extended_d = (
    events_extended_by_set[s] for s in "ABCD"
)

100%|██████████| 134769/134769 [02:58<00:00, 753.07it/s]


Index(['Attack ID', 'Card', 'Victim IP', 'Port number', 'Attack code',
       'Detect count', 'Significant flag', 'Packet speed', 'Data speed',
       'Avg packet len', 'Avg source IP count', 'Start time', 'End time',
       'Whitelist flag', 'Type', 'Detection Frequency', 'Packet Speed Std',
       'Data Speed Std', 'Avg Packet Len Std', 'Source IP Count Std',
       'Packet Speed Burst Ratio', 'Data Speed Burst Ratio',
       'Avg Packet Len Burst Ratio', 'Source IP Burst Ratio',
       'Packet Transmission Rate', 'Packet Density', 'Packet Size Efficiency',
       'Packet Speed Ac Magnitude', 'Packet Speed Ac Frequency',
       'Data Speed Ac Magnitude', 'Data Speed Ac Frequency',
       'Avg Packet Len Ac Magnitude', 'Avg Packet Len Ac Frequency',
       'Source IP Count Ac Magnitude', 'Source IP Count Ac Frequency'],
      dtype='object')


100%|██████████| 129999/129999 [02:53<00:00, 748.69it/s]


Index(['Attack ID', 'Card', 'Victim IP', 'Port number', 'Attack code',
       'Detect count', 'Significant flag', 'Packet speed', 'Data speed',
       'Avg packet len', 'Avg source IP count', 'Start time', 'End time',
       'Whitelist flag', 'Type', 'Detection Frequency', 'Packet Speed Std',
       'Data Speed Std', 'Avg Packet Len Std', 'Source IP Count Std',
       'Packet Speed Burst Ratio', 'Data Speed Burst Ratio',
       'Avg Packet Len Burst Ratio', 'Source IP Burst Ratio',
       'Packet Transmission Rate', 'Packet Density', 'Packet Size Efficiency',
       'Packet Speed Ac Magnitude', 'Packet Speed Ac Frequency',
       'Data Speed Ac Magnitude', 'Data Speed Ac Frequency',
       'Avg Packet Len Ac Magnitude', 'Avg Packet Len Ac Frequency',
       'Source IP Count Ac Magnitude', 'Source IP Count Ac Frequency'],
      dtype='object')


100%|██████████| 130000/130000 [02:55<00:00, 740.11it/s]


Index(['Attack ID', 'Card', 'Victim IP', 'Port number', 'Attack code',
       'Detect count', 'Significant flag', 'Packet speed', 'Data speed',
       'Avg packet len', 'Avg source IP count', 'Start time', 'End time',
       'Whitelist flag', 'Type', 'Detection Frequency', 'Packet Speed Std',
       'Data Speed Std', 'Avg Packet Len Std', 'Source IP Count Std',
       'Packet Speed Burst Ratio', 'Data Speed Burst Ratio',
       'Avg Packet Len Burst Ratio', 'Source IP Burst Ratio',
       'Packet Transmission Rate', 'Packet Density', 'Packet Size Efficiency',
       'Packet Speed Ac Magnitude', 'Packet Speed Ac Frequency',
       'Data Speed Ac Magnitude', 'Data Speed Ac Frequency',
       'Avg Packet Len Ac Magnitude', 'Avg Packet Len Ac Frequency',
       'Source IP Count Ac Magnitude', 'Source IP Count Ac Frequency'],
      dtype='object')


100%|██████████| 437657/437657 [09:49<00:00, 742.63it/s]


Index(['Attack ID', 'Card', 'Victim IP', 'Port number', 'Attack code',
       'Detect count', 'Significant flag', 'Packet speed', 'Data speed',
       'Avg packet len', 'Avg source IP count', 'Start time', 'End time',
       'Whitelist flag', 'Type', 'Detection Frequency', 'Packet Speed Std',
       'Data Speed Std', 'Avg Packet Len Std', 'Source IP Count Std',
       'Packet Speed Burst Ratio', 'Data Speed Burst Ratio',
       'Avg Packet Len Burst Ratio', 'Source IP Burst Ratio',
       'Packet Transmission Rate', 'Packet Density', 'Packet Size Efficiency',
       'Packet Speed Ac Magnitude', 'Packet Speed Ac Frequency',
       'Data Speed Ac Magnitude', 'Data Speed Ac Frequency',
       'Avg Packet Len Ac Magnitude', 'Avg Packet Len Ac Frequency',
       'Source IP Count Ac Magnitude', 'Source IP Count Ac Frequency'],
      dtype='object')
processing a
processing b
processing c
processing d


# Further Feature Engineering with Inferred Attack Code

In [5]:
# Adding the "Inferred Attack Code" column and saving the result, so we don't need to wait for it multiple times

# Integer code for each of the three traffic categories.
# NOTE(review): not referenced anywhere else in the visible part of the notebook.
CAT_TO_NUM_LABELS = {
    "Normal traffic": 0,
    "Suspicious traffic": 1,
    "DDoS attack": 2,
}

# NOTE(review): component_columns and event_columns repeat, value for value, the
# lists already defined in the feature-engineering cell above; keep both in sync.
component_columns = [
    "Attack ID", "Detect count", "Card", "Victim IP", "Port number",
    "Attack code", "Significant flag", "Packet speed", "Data speed", "Avg packet len",
    "Source IP count", "Time"
]

event_columns = [
    "Attack ID", "Card", "Victim IP", "Port number", "Attack code", 
    "Detect count", "Significant flag", "Packet speed", "Data speed", 
    "Avg packet len", "Avg source IP count", "Start time", "End time", 
    "Whitelist flag", "Type"
]

class DDoSDataset(Dataset):
    """Add an inferred attack code to the extended events of one split and save it.

    Constructing an instance reads the ``*_events_extended.csv`` files of the
    requested split, adds an "Inferred Attack Code" column (see ``load_data``)
    and writes the combined table to a split-specific CSV file.
    """

    def __init__(self, split):
        """Load, extend and save the data of ``split``.

        Args:
            split: one of 'train', 'test' or 'gen'.

        Raises:
            ValueError: if ``split`` is not one of the supported values.
                (Previously a message was printed and an unusable object was
                returned.)
        """
        self.train_data_paths = ['/home/appuser/data/train/SCLDDoS2024_SetA_events_extended.csv',
                                 '/home/appuser/data/train/SCLDDoS2024_SetB_events_extended.csv']
        self.test_data_paths = ['/home/appuser/data/test/SCLDDoS2024_SetC_events_extended.csv']
        self.gen_data_paths = ['/home/appuser/data/gen/SCLDDoS2024_SetD_events_extended.csv']

        self.split = split

        paths_by_split = {
            'train': self.train_data_paths,
            'test': self.test_data_paths,
            'gen': self.gen_data_paths,
        }
        if split not in paths_by_split:
            raise ValueError(f"Invalid split {split!r}. Use 'train', 'test' or 'gen'")

        self.load_data(paths_by_split[split], apply_smote=False)

    def get_ports(self):
        """Return ``self.ddos_ports``.

        NOTE(review): nothing in this class assigns ``ddos_ports``, so this
        raises AttributeError unless the attribute is set from outside.
        """
        return self.ddos_ports

    def get_data(self):
        """Return ``self.features`` and ``self.lables`` converted with ``.numpy()``.

        NOTE(review): nothing in this class assigns ``features`` or ``lables``
        (sic), so this raises AttributeError unless they are set from outside.
        """
        return self.features.numpy(), self.lables.numpy()

    def engineer_features_from_components(self, df_components):
        """Count the distinct ports and victim IPs of every attack.

        Args:
            df_components: components DataFrame with the columns 'Attack ID',
                'Port number' and 'Victim IP'.

        Returns:
            DataFrame with the columns 'Attack ID', 'Unique Ports' and
            'Unique Victim IPs', one row per attack.
        """
        grouped = df_components.groupby('Attack ID')

        features = pd.DataFrame()

        features['Unique Ports'] = grouped['Port number'].nunique()
        features['Unique Victim IPs'] = grouped['Victim IP'].nunique()

        return features.reset_index()

    def infer_attack_code_row(self, row):
        """Guess the attack code(s) of a single event from hand-written thresholds.

        Args:
            row: mapping (e.g. a DataFrame row) providing "Packet speed",
                "Data speed", "Port number", "Detect count" and
                "Avg packet len".

        Returns:
            The matching codes sorted and joined with "; ", or "Unknown" when
            no rule matches.
        """
        codes = set()

        # CHARGEN:
        if row["Packet speed"] > 500000 and row["Data speed"] > 400 and row["Port number"] == 443:
            codes.add("CHARGEN")

        # CLDAP:
        if row["Detect count"] >= 10 and row["Data speed"] > 400 and row["Port number"] in (389, 53, 80, 443, 0):
            codes.add("CLDAP")

        # CoAP: no indicator found.
        # DNS: no active rule (a port / data-speed heuristic was tried and disabled).

        # Generic UDP:
        if row["Data speed"] < 20 and row["Port number"] in (0, 80, 56, 5656, 4500):
            codes.add("Generic UDP")

        # IPV4 fragmentation:
        if row["Packet speed"] > 1000000 and row["Data speed"] > 1000 and row["Port number"] in (0, 80, 443):
            codes.add("IPV4 fragmentation")

        # NTP, RDP, RPC, SNMP, SSDP: no indicator found.

        # SYN Attack:
        if row["Data speed"] <= 10 and row["Avg packet len"] <= 10 and row["Port number"] in (80, 11, 22, 443, 0):
            codes.add("SYN Attack")

        # Sentinel:
        if row["Packet speed"] < 30000 and row["Data speed"] < 10 and row["Port number"] == 0:
            codes.add("Sentinel")

        # TCP Anomaly:
        if row["Avg packet len"] == 0:
            codes.add("TCP Anomaly")

        return "; ".join(sorted(codes)) if codes else "Unknown"

    def load_data(self, data_paths, apply_smote=False, undersample=False, sample_factor=4, add_features=True):
        """Concatenate the event files, add the inferred attack code and save the result.

        The "Inferred Attack Code" column is added only when ``add_features``
        is true and the ``*_components.csv`` file next to at least one of the
        event files exists. Only the existence of the component file matters:
        its content is not used, so it is no longer read (nor is the raw
        ``*_events.csv`` file).

        The new column is swapped with the previously last column, so that
        column stays last. In the extended CSVs written by
        add_time_features() that column is 'Type'.

        Args:
            data_paths: paths of the ``*_events_extended.csv`` files to load.
            apply_smote: currently unused.
            undersample: currently unused.
            sample_factor: currently unused.
            add_features: whether to add the "Inferred Attack Code" column.

        The combined frame is written to the CSV that belongs to
        ``self.split``; nothing is written for any other split value.
        """
        event_frames = []
        components_available = False

        for path in data_paths:
            event_frames.append(pd.read_csv(path).fillna(0))

            if not add_features:
                continue

            comp_path = path.replace('_events_extended.csv', '_components.csv')
            if os.path.exists(comp_path):
                components_available = True
            else:
                print(f"Component file not found: {comp_path}")

        df = pd.concat(event_frames, ignore_index=True)

        if components_available:
            df["Inferred Attack Code"] = df.apply(self.infer_attack_code_row, axis=1)
            # Exchange the last two columns: the new column moves one place forward.
            cols = list(df.columns)
            cols[-2], cols[-1] = cols[-1], cols[-2]
            df = df[cols]
            df = df.dropna(how='all')

        # Save the dataframe to the CSV file of the split
        output_paths = {
            'train': '/home/appuser/data/train/A_B_inferred_attack_code.csv',
            'test': '/home/appuser/data/test/C_inferred_attack_code.csv',
            'gen': '/home/appuser/data/gen/D_inferred_attack_code.csv',
        }
        output_path = output_paths.get(self.split)
        if output_path is not None:
            df.to_csv(output_path, index=False)
       


### Example usage    

In [6]:
# Constructing a DDoSDataset runs load_data() for the given split, which writes the
# split's *_inferred_attack_code.csv file as a side effect.
train = DDoSDataset('train')
test = DDoSDataset('test')
gen = DDoSDataset('gen')