In [1]:
import numpy as np
import pandas as pd
import glob
import os

In [2]:
DATA_DIR = "..//Dataset"
os.environ["LOKY_MAX_CPU_COUNT"] = "16"  
csv_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
dfs = []

In [3]:
for file_path in csv_files:
    filename = os.path.basename(file_path)
    day_label = os.path.splitext(filename)[0]

    df_temp = pd.read_csv(file_path)
    df_temp["Day"] = day_label

    dfs.append(df_temp)

df = pd.concat(dfs, ignore_index=True)

df.columns = df.columns.str.strip()

In [4]:
date_map = {
    'Monday':           '2023-11-06 12:00:00',
    'Tuesday':          '2023-11-07 12:00:00',
    'Wednesday':        '2023-11-08 12:00:00',
    'Thursday-Morning': '2023-11-09 09:00:00',
    'Thursday-Afternoon':'2023-11-09 15:00:00',
    'Friday-Morning':   '2023-11-10 09:00:00',
    'Friday-Afternoon1':'2023-11-10 13:00:00',
    'Friday-Afternoon2':'2023-11-10 17:00:00',
}

In [5]:
df['timestamp'] = pd.to_datetime(df['Day'].map(date_map))
df.set_index('timestamp', inplace=True)
df.drop(columns=['Day'], inplace=True)

df['dow'] = df.index.dayofweek  # 0=Mon,…,6=Sun
df['hour'] = df.index.hour

df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

In [6]:
df.head()

Unnamed: 0_level_0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,dow_sin,dow_cos,hour_sin,hour_cos
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-11-10 13:00:00,22,1266342,41,44,2664,6954,456,0,64.97561,109.864573,...,0,0.0,0.0,0,0,BENIGN,-0.433884,-0.900969,-0.258819,-0.965926
2023-11-10 13:00:00,22,1319353,41,44,2664,6954,456,0,64.97561,109.864573,...,0,0.0,0.0,0,0,BENIGN,-0.433884,-0.900969,-0.258819,-0.965926
2023-11-10 13:00:00,22,160,1,1,0,0,0,0,0.0,0.0,...,0,0.0,0.0,0,0,BENIGN,-0.433884,-0.900969,-0.258819,-0.965926
2023-11-10 13:00:00,22,1303488,41,42,2728,6634,456,0,66.536585,110.129945,...,0,0.0,0.0,0,0,BENIGN,-0.433884,-0.900969,-0.258819,-0.965926
2023-11-10 13:00:00,35396,77,1,2,0,0,0,0,0.0,0.0,...,0,0.0,0.0,0,0,BENIGN,-0.433884,-0.900969,-0.258819,-0.965926


In [17]:
df.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [8]:
print(f"Days of the week unique: {df["dow"].unique()}\n")
print(f"Hours unique: {df["hour"].unique()}\n")
print(f"Labels unique: {df["Label"].unique()}\n")

Days of the week unique: [4 0 3 1 2]

Hours unique: [13 17  9 12 15]

Labels unique: ['BENIGN' 'PortScan' 'DDoS' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']



In [4]:
df["Label"].unique()

array(['BENIGN', 'DDoS', 'PortScan', 'Bot', 'Infiltration',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'FTP-Patator', 'SSH-Patator',
       'DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye',
       'Heartbleed'], dtype=object)

In [15]:
df_labels_count = df.groupby(["Label"]).size().reset_index(name="Count")
sorted_labels_count = df_labels_count.sort_values(by="Count", ascending=False)
sorted_labels_count

Unnamed: 0,Label,Count
0,BENIGN,2273097
4,DoS Hulk,231073
10,PortScan,158930
2,DDoS,128027
3,DoS GoldenEye,10293
7,FTP-Patator,7938
11,SSH-Patator,5897
6,DoS slowloris,5796
5,DoS Slowhttptest,5499
1,Bot,1966


In [16]:
attack_days = df.groupby(["Day", "Label"]).size().reset_index(name="Count")
sorted_attack_days = attack_days.sort_values(by="Count", ascending=False)
sorted_attack_days

Unnamed: 0,Day,Label,Count
6,Monday-WorkingHours.pcap_ISCX,BENIGN,529918
16,Wednesday-workingHours.pcap_ISCX,BENIGN,440031
13,Tuesday-WorkingHours.pcap_ISCX,BENIGN,432074
7,Thursday-WorkingHours-Afternoon-Infilteration....,BENIGN,288566
18,Wednesday-workingHours.pcap_ISCX,DoS Hulk,231073
4,Friday-WorkingHours-Morning.pcap_ISCX,BENIGN,189067
9,Thursday-WorkingHours-Morning-WebAttacks.pcap_...,BENIGN,168186
3,Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX,PortScan,158930
1,Friday-WorkingHours-Afternoon-DDos.pcap_ISCX,DDoS,128027
2,Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX,BENIGN,127537


In [7]:
# перевірка на nan
nan_counts = df.isna().sum()
cols_with_nan = nan_counts[nan_counts > 0]
print(cols_with_nan)
print("")

# перевірка на нескінченні значення
inf_counts = df.isin([np.inf, -np.inf]).sum()
cols_with_inf = inf_counts[inf_counts > 0]
print(cols_with_inf)
print("")

# перевірка, чи порт знаходиться в допустимих значеннях
print(df[(df['Destination Port'] < 0) | (df['Destination Port'] > 65535)].shape[0])
print("")

# перевірка, чи стандартне відхилення в признаків < 0
STD_FEATURES = ["Fwd Packet Length Std", "Bwd Packet Length Std", "Flow IAT Std", "Fwd IAT Std", "Bwd IAT Std", "Packet Length Std", "Active Std", "Idle Std"]
mask_any = (df[STD_FEATURES] < 0).any(axis=1)
print(df[mask_any].shape[0])
print("")

# перевірка передачі пакетів, що нижче 0
print(f"Flow Bytes/s < 0: {df[df['Flow Bytes/s'] < 0].shape[0]}")
print(f"Flow Packets/s < 0: {df[df['Flow Packets/s'] < 0].shape[0]}")

Flow Bytes/s    1358
dtype: int64

Flow Bytes/s      1509
Flow Packets/s    2867
dtype: int64

0

0

Flow Bytes/s < 0: 85
Flow Packets/s < 0: 115


In [6]:
numeric_cols = df.columns.drop(['Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'Label'])
mask = df[numeric_cols] < 0
neg_counts = mask.sum()
cols_with_neg = neg_counts[neg_counts > 0].index.tolist()[:-4]

print("Кількість від'ємних значень:")
print(neg_counts[neg_counts > 0])

Кількість від'ємних значень:
Flow Duration               115
Flow Bytes/s                 85
Flow Packets/s              115
Flow IAT Mean               115
Flow IAT Max                115
Flow IAT Min               2891
Fwd IAT Min                  17
Fwd Header Length            35
Bwd Header Length            22
Fwd Header Length.1          35
min_seg_size_forward         35
dow_sin                  703245
dow_cos                 1854916
hour_sin                 800814
hour_cos                2830743
dtype: int64


In [7]:
# очистка датасету
print(f"До очистки: {df.shape[0]}")

df[cols_with_neg] = df[cols_with_neg].mask(df[cols_with_neg] < 0)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(axis=0, how='any', inplace=True)
df.reset_index(drop=True, inplace=True)

print(f"Після очистки: {df.shape[0]}")

До очистки: 2830743
Після очистки: 2824951


In [8]:
category_labels = {
    'BENIGN': ['BENIGN'],
    'DoS': ['DDoS', 'DoS slowloris', 'DoS Hulk', 'DoS GoldenEye'],
    'PortScan': ['PortScan'],
    'Bot_Infiltration': ['Bot', 'Infiltration'],
    'Web': [
        'Web Attack � Brute Force',
        'Web Attack � XSS',
        'Web Attack � Sql Injection'
    ],
    'FTP_SSH_Patator': ['FTP-Patator', 'SSH-Patator'],
    'Heartbleed': ['Heartbleed']
}

base_features = [
    'Flow Bytes/s', 'Flow Packets/s', 'Average Packet Size', 'Down/Up Ratio',
    'Packet Length Mean', 'Packet Length Std', 'Min Packet Length', 'Max Packet Length',
    'Flow IAT Mean', 'Flow IAT Std', 'Fwd IAT Mean', 'Bwd IAT Mean',
    'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
    'Active Mean', 'Idle Mean', 'Subflow Fwd Packets', 'Subflow Bwd Packets', 'Label', 'dow', 'hour', 'dow_sin', 'dow_cos', 'hour_sin', 'hour_cos'
]

group_features = {
    'dos': [
        'Fwd Packets/s', 'Bwd Packets/s', 'Flow Duration',
        'Flow IAT Min', 'Flow IAT Max', 'SYN Flag Count', 'PSH Flag Count'
    ],
    'portscan': [
        'SYN Flag Count', 'FIN Flag Count', 'RST Flag Count',
        'Total Fwd Packets', 'Total Backward Packets'
    ],
    'bot_infiltration': [
        'Flow Duration', 'Fwd IAT Std', 'Bwd IAT Std',
        'Fwd PSH Flags', 'Bwd URG Flags', 'Down/Up Ratio'
    ],
    'web': [
        'Fwd Header Length', 'Bwd Header Length', 'Packet Length Variance',
        'ACK Flag Count', 'Average Packet Size'
    ],
    'ftp_ssh_patator': [
        'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Bwd Avg Bytes/Bulk',
        'Active Mean', 'Idle Mean', 'Init_Win_bytes_forward'
    ],
    'heartbleed': [
        'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd IAT Min',
        'Total Length of Fwd Packets', 'Packet Length Std'
    ]
}

In [9]:
def save_grouped_by_category(df, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    normalize = lambda cat: cat.lower().replace('�', '_')
    
    for category, labels in category_labels.items():
        if category == 'BENIGN':
            features = base_features
        else:
            key = normalize(category)
            extra = group_features.get(key, [])
            features = list(dict.fromkeys(base_features + extra))
        
        subset = df[df['Label'].isin(labels)][features]

        filename = f"{category}.csv"
        path = os.path.join(output_dir, filename)
        subset.to_csv(path, index=False)
        print(f"Збережено {len(subset)} рядків у {path}")

In [10]:
save_grouped_by_category(df, output_dir='..//Filtered datasets')

Збережено 2268589 рядків у ..//Filtered datasets\BENIGN.csv
Збережено 374055 рядків у ..//Filtered datasets\DoS.csv
Збережено 158804 рядків у ..//Filtered datasets\PortScan.csv
Збережено 1991 рядків у ..//Filtered datasets\Bot_Infiltration.csv
Збережено 2180 рядків у ..//Filtered datasets\Web.csv
Збережено 13826 рядків у ..//Filtered datasets\FTP_SSH_Patator.csv
Збережено 7 рядків у ..//Filtered datasets\Heartbleed.csv
