In [None]:
# Lookup table from a file-name pattern to its label triple
# (is_benign, category, attack).
#
# Keys are regular expressions. get_category_and_attack() tries them in
# insertion order with re.match(), which only matches at the start of the
# string, so the keys without a leading '^' behave the same as the keys
# that have one. The first pattern that matches wins.
#
# In this table is_benign is 1 for the benign entry and 0 for every attack
# entry.
ATTACK_MAPPING: Dict[str, Tuple[int, str, str]] = {
    # Benign traffic
    r'^Benign': (1, 'BENIGN', 'Benign'),
    # Spoofing
    r'^ARP_Spoofing': (0, 'SPOOFING', 'ARP Spoofing'),
    # Reconnaissance
    r'^Recon-Ping_Sweep': (0, 'RECON', 'Ping Sweep'),
    r'^Recon-VulScan': (0, 'RECON', 'Recon VulScan'),
    r'^Recon-OS_Scan': (0, 'RECON', 'OS Scan'),
    r'^Recon-Port_Scan': (0, 'RECON', 'Port Scan'),
    # MQTT attacks
    r'^MQTT-Malformed_Data': (0, 'MQTT', 'Malformed Data'),
    r'^MQTT-DoS-Connect_Flood': (0, 'MQTT', 'DoS Connect Flood'),
    r'^MQTT-DDoS-Publish_Flood': (0, 'MQTT', 'DDoS Publish Flood'),
    r'^MQTT-DoS-Publish_Flood': (0, 'MQTT', 'DoS Publish Flood'),
    r'^MQTT-DDoS-Connect_Flood': (0, 'MQTT', 'DDoS Connect Flood'),
    # TCP/IP denial of service (single source)
    r'TCP_IP-DoS-TCP': (0, 'DoS', 'DoS TCP'),
    r'TCP_IP-DoS-ICMP': (0, 'DoS', 'DoS ICMP'),
    r'TCP_IP-DoS-SYN': (0, 'DoS', 'DoS SYN'),
    r'TCP_IP-DoS-UDP': (0, 'DoS', 'DoS UDP'),
    # TCP/IP distributed denial of service
    r'TCP_IP-DDoS-SYN': (0, 'DDoS', 'DDoS SYN'),
    r'TCP_IP-DDoS-TCP': (0, 'DDoS', 'DDoS TCP'),
    r'TCP_IP-DDoS-ICMP': (0, 'DDoS', 'DDoS ICMP'),
    r'TCP_IP-DDoS-UDP': (0, 'DDoS', 'DDoS UDP')
}

def get_category_and_attack(filename: str) -> Tuple[int, str, str]:
    """Resolve the label triple for a file from its name.

    The name is tested against every regular-expression key of
    ``ATTACK_MAPPING`` in insertion order using ``re.match`` (so a pattern
    has to match at the beginning of the name). The value stored under the
    first matching key is returned.

    Args:
        filename: Bare file name (no directory part) to classify.

    Returns:
        ``(is_benign, category, attack)`` for the first matching pattern,
        or ``(-1, 'UNKNOWN', 'UNKNOWN')`` when no pattern matches.
    """
    first_hit = next(
        (labels for regex, labels in ATTACK_MAPPING.items()
         if re.match(regex, filename)),
        None,
    )
    if first_hit is None:
        return -1, 'UNKNOWN', 'UNKNOWN'

    benign_flag, group, attack_name = first_hit
    return benign_flag, group, attack_name

def process_file(filepath: str) -> pd.DataFrame:
    """Load one CSV file and tag every row with labels derived from its name.

    The labels come from ``get_category_and_attack`` applied to the base
    name of ``filepath`` and are written as three constant columns:
    ``is_benign``, ``category`` and ``attack``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The file's contents as a DataFrame with the three label columns set.
    """
    # Resolve the labels first, then read the data (same order as before).
    labels = get_category_and_attack(os.path.basename(filepath))
    frame = pd.read_csv(filepath)

    # Scalar assignment broadcasts each label to every row of the frame.
    for column, value in zip(('is_benign', 'category', 'attack'), labels):
        frame[column] = value

    return frame

In [None]:
def merge_csv_files(directory: str, output_file: str) -> pd.DataFrame:
    """Label and merge every CSV file in a directory into one DataFrame.

    Each ``*.csv`` file directly inside ``directory`` is loaded with
    ``process_file``; the resulting frames are concatenated with a fresh
    index, written to ``output_file`` (without the index column) and
    returned.

    Files are processed in sorted name order so the merged row order is the
    same on every run (``os.listdir`` returns entries in arbitrary order).
    If ``output_file`` itself lives inside ``directory`` it is skipped, so
    re-running the function does not merge a previous result into itself.

    Args:
        directory: Directory to scan for ``.csv`` files (not recursive).
        output_file: Path of the merged CSV file to write.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: If no CSV file could be loaded (none found, or every
            file failed). Nothing is written in that case.
    """
    output_path = os.path.abspath(output_file)

    # Sorted for reproducibility; the output file is excluded for idempotence.
    files = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.csv')
        and os.path.abspath(os.path.join(directory, name)) != output_path
    )
    print(f"Found {len(files)} CSV files in {directory}")

    # Best-effort loading: a file that fails is reported and skipped so one
    # bad capture does not abort the whole merge.
    dfs = []
    for filename in files:
        try:
            dfs.append(process_file(os.path.join(directory, filename)))
        except Exception as e:
            print(f"\nError processing {filename}: {e}")

    # pd.concat([]) would raise an opaque "No objects to concatenate";
    # fail with the same exception type but an actionable message.
    if not dfs:
        raise ValueError(
            f"No CSV files could be loaded from {directory!r}; nothing to merge"
        )

    merged_df = pd.concat(dfs, ignore_index=True)
    merged_df.to_csv(output_file, index=False)

    return merged_df

In [None]:
# Parent directory of the per-split folders; the loop below joins it with
# 'train' and 'test' to locate the CSV files to merge.
base_dir = 'dataset/subset'

def show_distribution(df: pd.DataFrame) -> None:
    """Print a table with the number of rows per ``category`` value.

    The counts come from ``df['category'].value_counts()`` and are printed
    with ``tabulate`` in the ``psql`` format under the column headers
    ``category`` and ``count``.

    Args:
        df: DataFrame that contains a ``category`` column.
    """
    category_counts = (
        df['category']
        .value_counts()
        .rename_axis('category')
        .reset_index(name='count')
    )
    print(tabulate(category_counts, headers='keys', tablefmt='psql'))

# For each split: merge and label its CSV files, report the row count and
# the per-category distribution, then plot the per-attack distribution.
# NOTE(review): the output file name is spelled "_labeld" (sic). It is left
# as-is because other code may read these paths — confirm before renaming.
for dataset in ['train', 'test']:
   # Reads <base_dir>/<split>/*.csv and writes dataset/<split>_labeld.csv
   mdf = merge_csv_files(os.path.join(base_dir, dataset), f'dataset/{dataset}_labeld.csv')
   print(f"\nTotal rows: {len(mdf):,}")
   show_distribution(mdf)

   # Bar chart of row counts per attack type; bars follow the order of
   # mdf['attack'].value_counts()
   plt.figure(figsize=(12, 6))
   sns.countplot(data=mdf, x='attack', order=mdf['attack'].value_counts().index)
   plt.title(f'{dataset.capitalize()} Dataset - Distribution by Attack Type')
   # Rotate the tick labels by 90 degrees
   plt.xticks(rotation=90)
   plt.show()