In [165]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# date = "lr_1e-06_a_5.0_b_0.5_thsh_5.0"
# folder = os.path.join("result", date)
# print(folder)

In [170]:
def filtering(df):
    # 基本邏輯
    # 1. max iat > min iat
    df_filtered = df[df['max iat'] >= df['min iat']]
    # 2. max pkt len > min pkt len
    df_filtered = df_filtered[df_filtered['max pkt_length'] >=  df_filtered['min pkt_length']]
    
    # 3. Flow Duration ≥ Max IAT ≥ Flow Duration/Packet Count - 1 
    # flow duration >= max iat
    df_filtered = df_filtered[df_filtered['flow duration'] >= df_filtered['max iat']]
    df_filtered = df_filtered[
        (df_filtered['max iat']) >= (df_filtered['flow duration'] / (df_filtered['packet count']-1))
    ]
    # 4. FIN Count ≤ ACK Count
    # [FIN, ACK]
    df_filtered = df_filtered[df_filtered['fin count'] <= df_filtered['ack count']]
    
    # 5. PSH Count ≤ ACK Count
    # [PSH, ACK]
    df_filtered = df_filtered[df_filtered['psh count'] <= df_filtered['ack count']]
    
    # 6. Packet Count ≥ 各個 Flags Count
    df_filtered = df_filtered[df_filtered['packet count'] >= df_filtered['fin count']]
    df_filtered = df_filtered[df_filtered['packet count'] >= df_filtered['syn count']]
    df_filtered = df_filtered[df_filtered['packet count'] >= df_filtered['psh count']]
    df_filtered = df_filtered[df_filtered['packet count'] >= df_filtered['ack count']]
    
    # 觀察到的
    # 1. Packet Count >3 ⇔ Flow Duration ≥ 1
    # 新增條件 1: 若 pkt count > 3，則 flow duration >= 1 
    df_filtered = df_filtered[~((df_filtered['packet count'] >= 5) & (df_filtered['flow duration'] < 1))]

    # 新增條件 2: 若 flow duration > 0，則 pkt count > 0
    df_filtered = df_filtered[~((df_filtered['flow duration'] >= 1) & (df_filtered['packet count'] <= 5))]

    # 2. Packet Count × 0.5 ≤ Flags Count 總和 ≤ Packet Count × 2
    df_filtered = df_filtered[
        (df_filtered['fin count'] + df_filtered['syn count'] +
         df_filtered['psh count'] + df_filtered['ack count']) <= (df_filtered['packet count'] * 2)
    ]
    df_filtered = df_filtered[
        (df_filtered['fin count'] + df_filtered['syn count'] +
         df_filtered['psh count'] + df_filtered['ack count']) >= (df_filtered['packet count'] * 0.5)
    ]
    
    # 3. 當 ACK count > 5時，(FIN count + SYN count + PSH count) ≥ 0.25 × ACK count
    df_filtered = df_filtered[~(
    (df_filtered['ack count'] > 5) &
    ((df_filtered['fin count'] + df_filtered['syn count'] + df_filtered['psh count']) < (df_filtered['ack count'] * 0.25))
    )]
    
    # 4. 當 Flow Duration > 0時，Flow Duration ≥ Packet Count × 0.4
    df_filtered = df_filtered[~((df_filtered['flow duration'] > 0) & (
        (df_filtered['flow duration']) < (df_filtered['packet count'] * 0.4)))
    ]


    return df_filtered

In [171]:
features=[
                'flow duration', 'packet count', 'max pkt_length', 'min pkt_length',
                'max iat', 'min iat', 'fin count', 'syn count', 'psh count', 
                'ack count', "predict", "loss", "gen_loss", "bd_loss"
                ]

# features=[
#                 'flow duration', 'packet count', 'max pkt_length', 'min pkt_length',
#                 'max iat', 'min iat', 'fin count', 'syn count', 'psh count', 
#                 'ack count', "predict", "gen_loss",	"disc_loss"
#                 ]

In [172]:
def main(folder):
    for file in os.listdir(folder):
        if file.startswith("generated_data"):
            file_path = os.path.join(folder, file)
            df = pd.read_csv(file_path, header=None, names=features, skiprows=1)
            print("-" * 10)
            print(f'Filtering {file} ...')
            print(f'number of origin data: {len(df)}')
            df_0 = df[df['predict'] == 0]
            print(f'number of label-0 data: {len(df_0)}')
            
    #         print(df)
            df_filtered = filtering(df_0)
            print(f'number of filered data: {len(df_filtered)}')
            percentage = len(df_filtered) / len(df) * 100
            print(f'Percentage: {percentage:.2f}\%')
            # append origin data
#             origin = df.iloc[[0]]
#             df_filtered = pd.concat([origin, df_filtered], ignore_index=True)
            output_dir = os.path.join(folder, 'filtered_data')
            os.makedirs(output_dir, exist_ok=True)
            output_path = os.path.join(output_dir, f'filtered_{file}')
            df_filtered.to_csv(output_path, mode='w', header=features, index=False, encoding="utf-8")
    print("Done!")

In [174]:
root = "result"
# root = os.path.join("result", "colab")
# root = os.path.join(root, "Result_GAN,05_02")
date = "lr_1e-06_"
for dir in os.listdir(root):
#     if dir.startswith(date):
        folder = os.path.join(root, dir)
        print(folder)
        main(folder)

result\0720
----------
Filtering generated_data_200_lr_1e-06_a_0.1_b_0.3_thsh_1.0.csv ...
number of origin data: 50000
number of label-0 data: 9936
number of filered data: 291
Percentage: 0.58\%
----------
Filtering generated_data_200_lr_1e-06_a_0.1_b_0.5_thsh_1.0.csv ...
number of origin data: 50000
number of label-0 data: 1061
number of filered data: 46
Percentage: 0.09\%
----------
Filtering generated_data_200_lr_1e-06_a_0.1_b_1.0_thsh_1.0.csv ...
number of origin data: 50000
number of label-0 data: 8914
number of filered data: 828
Percentage: 1.66\%
----------
Filtering generated_data_200_lr_1e-06_a_0.1_b_3.0_thsh_1.0.csv ...
number of origin data: 50000
number of label-0 data: 34739
number of filered data: 244
Percentage: 0.49\%
----------
Filtering generated_data_200_lr_1e-06_a_0.3_b_0.1_thsh_1.0.csv ...
number of origin data: 50000
number of label-0 data: 42236
number of filered data: 433
Percentage: 0.87\%
----------
Filtering generated_data_200_lr_1e-06_a_0.3_b_0.5_thsh_1.0.cs