In [2]:
#pip install pandas tqdm numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Path to the extracted CSV files (adjust this path as needed)
DATA_DIR = 'CICIDS2017/MachineLearningCVE'

# List all .csv files
csv_files = [f for f in os.listdir(DATA_DIR) if f.endswith('.csv')]

# Combine all files
df_list = []

print("🔄 Loading and normalizing CSV files...\n")
for file in tqdm(csv_files):
    file_path = os.path.join(DATA_DIR, file)
    try:
        df = pd.read_csv(file_path, low_memory=False)

        # Normalize column names
        df.columns = df.columns.str.strip().str.replace('\r', '', regex=False).str.replace('\n', '', regex=False).str.lower()

        # Only keep if 'label' column is present
        if 'label' in df.columns:
            df_list.append(df)
        else:
            print(f"⚠️ Skipped {file}: 'label' column not found.")

    except Exception as e:
        print(f"❌ Error reading {file}: {e}")

# Combine all valid DataFrames
print("\n📦 Concatenating valid files...")
full_df = pd.concat(df_list, ignore_index=True)

# Replace infinities with NaNs and drop
full_df.replace([np.inf, -np.inf], np.nan, inplace=True)
full_df.dropna(axis=1, how='all', inplace=True)
full_df.dropna(inplace=True)

# Drop irrelevant non-numeric columns if present
non_numeric = ['flow id', 'source ip', 'destination ip', 'timestamp']
for col in non_numeric:
    if col in full_df.columns:
        full_df.drop(columns=[col], inplace=True)

# ----- MULTI-CLASS CLEANING -----
df_multi = full_df.copy()

# Normalize multi-class label text
df_multi['label'] = df_multi['label'].astype(str).str.strip()

# Save multi-class cleaned version
multi_output = 'CICIDS2017_cleaned_multiclass.csv'
df_multi.to_csv(multi_output, index=False)
print(f"✅ Multi-class dataset saved to: {multi_output}")

# ----- BINARY CLEANING -----
df_binary = df_multi.copy()
df_binary['label'] = df_binary['label'].apply(lambda x: 0 if 'BENIGN' in x.upper() else 1)

# Save binary cleaned version
binary_output = 'CICIDS2017_cleaned.csv'
df_binary.to_csv(binary_output, index=False)
print(f"✅ Binary dataset saved to: {binary_output}")

# Final stats
print("\n🔍 Dataset summary:")
print("Multi-class label distribution:")
print(df_multi['label'].value_counts())

print("\nBinary label distribution:")
print(df_binary['label'].value_counts())


🔄 Loading and normalizing CSV files...



100%|█████████████████████████████████████████████| 8/8 [00:16<00:00,  2.02s/it]



📦 Concatenating valid files...
✅ Multi-class dataset saved to: CICIDS2017_cleaned_multiclass.csv
✅ Binary dataset saved to: CICIDS2017_cleaned.csv

🔍 Dataset summary:
Multi-class label distribution:
label
BENIGN                        2271320
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

Binary label distribution:
label
0    2271320
1     556556
Name: count, dtype: int64
