In [1]:
import pandas as pd

# Path to one of the parquet files
file_path = "CIC/LDAP-training.parquet"  # Replace with the actual path to one of your files

# Load the parquet file
df = pd.read_parquet(file_path)

# Display the column names
print("Columns in the dataset:")
print(df.columns)

Columns in the dataset:
Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packets Length Total',
       'Bwd Packets Length Total', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'S

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6715 entries, 0 to 6714
Data columns (total 78 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Protocol                  6715 non-null   int8    
 1   Flow Duration             6715 non-null   int32   
 2   Total Fwd Packets         6715 non-null   int32   
 3   Total Backward Packets    6715 non-null   int16   
 4   Fwd Packets Length Total  6715 non-null   float32 
 5   Bwd Packets Length Total  6715 non-null   float32 
 6   Fwd Packet Length Max     6715 non-null   float32 
 7   Fwd Packet Length Min     6715 non-null   float32 
 8   Fwd Packet Length Mean    6715 non-null   float32 
 9   Fwd Packet Length Std     6715 non-null   float32 
 10  Bwd Packet Length Max     6715 non-null   float32 
 11  Bwd Packet Length Min     6715 non-null   float32 
 12  Bwd Packet Length Mean    6715 non-null   float32 
 13  Bwd Packet Length Std     6715 non-null   float3

In [None]:
import os
import pandas as pd
from tqdm import tqdm  # For progress tracking

# Step 1: Define Desired Columns
def get_desired_columns():
    return [
        "Flow Duration",  # Core Traffic Features
        "Total Fwd Packets",
        "Total Backward Packets",
        "Fwd Packets Length Total",
        "Bwd Packets Length Total",
        "Packet Length Max",  # Advanced Statistical Features
        "Packet Length Min",
        "Flow IAT Mean",
        "Flow IAT Std",
        "Flow IAT Max",
        "Flow IAT Min",
        "Fwd IAT Total",  # Behavior-Based Indicators
        "Fwd IAT Mean",
        "Bwd IAT Total",
        "Bwd IAT Mean",
        "Flow Bytes/s",
        "Flow Packets/s",
        "SYN Flag Count",  # Flag-Based Features
        "RST Flag Count",
        "ACK Flag Count",
        "URG Flag Count",
        "Bwd Packets/s",  # Protocol-Specific Features
        "Fwd Packets/s",
        "Down/Up Ratio",
        "Subflow Fwd Bytes",
        "Subflow Bwd Bytes",
        "Protocol",  # Essential Labels & Metadata
        "Label"
    ]

# Step 2: Load and Filter a Single Parquet File
def load_and_filter_parquet(file_path, desired_columns):
    try:
        # Load the parquet file
        df = pd.read_parquet(file_path)
        
        # Check for missing columns
        missing_columns = [col for col in desired_columns if col not in df.columns]
        if missing_columns:
            print(f"Skipping file {file_path} due to missing columns: {missing_columns}")
            return None
        
        # Extract only the desired columns
        return df[desired_columns]
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Step 3: Combine All Parquet Files
def combine_parquet_files(data_dir, desired_columns):
    # List all parquet files in the directory
    parquet_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".parquet")]
    
    # Create an empty list to store filtered DataFrames
    filtered_dfs = []
    
    # Process each file with a progress bar
    for file in tqdm(parquet_files, desc="Processing Parquet Files"):
        filtered_df = load_and_filter_parquet(file, desired_columns)
        if filtered_df is not None:
            filtered_dfs.append(filtered_df)
    
    # Concatenate all filtered DataFrames into one
    if filtered_dfs:
        return pd.concat(filtered_dfs, ignore_index=True)
    else:
        raise ValueError("No valid data found in the provided parquet files.")

# Step 4: Save Combined Data to CSV
def save_to_csv(df, output_file):
    try:
        df.to_csv(output_file, index=False)
        print(f"Combined dataset saved to '{output_file}'")
    except Exception as e:
        print(f"Error saving CSV file: {e}")

# Step 5: Main Function to Orchestrate the Workflow
def main():
    # Define input and output paths
    data_dir = "CIC"  # Directory containing the parquet files
    output_file = "combineddata.csv"  # Output CSV file
    
    # Get the list of desired columns
    desired_columns = get_desired_columns()
    
    # Combine all parquet files
    print("Combining parquet files...")
    combined_df = combine_parquet_files(data_dir, desired_columns)
    
    # Save the combined dataset to CSV
    print("Saving combined dataset to CSV...")
    save_to_csv(combined_df, output_file)

if __name__ == "__main__":
    main()

Combining parquet files...


Processing Parquet Files: 100%|██████████| 17/17 [00:00<00:00, 111.49it/s]

Saving combined dataset to CSV...





Combined dataset saved to 'combineddata.csv'


In [42]:
data = "CIC2019Data.csv"

df = pd.read_csv(data)
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431371 entries, 0 to 431370
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Flow Duration             431371 non-null  int64  
 1   Total Fwd Packets         431371 non-null  int64  
 2   Total Backward Packets    431371 non-null  int64  
 3   Fwd Packets Length Total  431371 non-null  float64
 4   Bwd Packets Length Total  431371 non-null  float64
 5   Packet Length Max         431371 non-null  float64
 6   Packet Length Min         431371 non-null  float64
 7   Flow IAT Mean             431371 non-null  float64
 8   Flow IAT Std              431371 non-null  float64
 9   Flow IAT Max              431371 non-null  float64
 10  Flow IAT Min              431371 non-null  float64
 11  Fwd IAT Total             431371 non-null  float64
 12  Fwd IAT Mean              431371 non-null  float64
 13  Bwd IAT Total             431371 non-null  f

**DATA PREPROCESSING**

In [59]:
df = pd.read_csv("DDoS_Dataset_v2.csv")
df["Label"].unique()

array([1, 0])

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import pickle

# Load Dataset
df = pd.read_csv("combineddata.csv")

# Print unique labels before fixing
print("🚨 Unique Labels Before Fixing:", df["Label"].unique())

# Convert Labels (Ensure Binary Classification)
df["Label"] = df["Label"].apply(lambda x: 0 if x == "Benign" else 1)

# Print unique labels after fixing
print("✅ Unique Labels After Fixing:", df["Label"].unique())

# Remove One-Hot Encoding for Protocol (if present)
protocol_columns = [col for col in df.columns if col.startswith("Protocol_")]

if protocol_columns:
    print(f"🔥 Found One-Hot Encoded Protocol: {protocol_columns}")
    df["Protocol"] = df[protocol_columns].idxmax(axis=1).str.extract("(\d+)").astype(float)
    df.drop(columns=protocol_columns, inplace=True)  # Remove one-hot columns
else:
    print("✅ Protocol is already a single column, no one-hot encoding found!")

# Define Expected Feature Columns (REMOVE REDUNDANT ONES)
expected_features = [
    "Flow Duration", "Total Fwd Packets", "Total Backward Packets",
    "Fwd Packets Length Total", "Bwd Packets Length Total", "Packet Length Max",
    "Packet Length Min", "Flow IAT Mean", "Flow IAT Std", "Flow IAT Max", "Flow IAT Min",
    "Fwd IAT Total", "Fwd IAT Mean", "Bwd IAT Total", "Bwd IAT Mean",
    "Flow Bytes/s", "Flow Packets/s", "SYN Flag Count", "RST Flag Count",
    "ACK Flag Count", "URG Flag Count", "Bwd Packets/s", "Fwd Packets/s",
    "Down/Up Ratio", "Subflow Fwd Bytes", "Subflow Bwd Bytes",
    "Protocol", "Label"  ]

# Remove Extra Columns & Reorder
df = df[expected_features]

# Balance Dataset (Equal Normal & Attack Samples)
df_attack = df[df["Label"] == 1]
df_normal = df[df["Label"] == 0]
df_attack_balanced = resample(df_attack, replace=True, n_samples=len(df_normal), random_state=42)
df_balanced = pd.concat([df_attack_balanced, df_normal])

# Normalize Features (EXCLUDE LABEL)
feature_columns = df_balanced.columns[:-1]
scaler = StandardScaler()
df_balanced[feature_columns] = scaler.fit_transform(df_balanced[feature_columns])

# Save Preprocessed Dataset
df_balanced.to_csv("DDoS_Dataset_v2.csv", index=False)
print("✅ Final Preprocessed Dataset Saved as `DDoS_Dataset_v2.csv`")

# Save Scaler
pickle.dump(scaler, open("scaler_v2.pkl", "wb"))
print("✅ Scaler saved as `scaler_v2.pkl`")


  df["Protocol"] = df[protocol_columns].idxmax(axis=1).str.extract("(\d+)").astype(float)


🚨 Unique Labels Before Fixing: ['DrDoS_SNMP' 'Benign' 'UDP' 'MSSQL' 'Syn' 'DrDoS_NetBIOS' 'Portmap'
 'UDPLag' 'NetBIOS' 'TFTP' 'DrDoS_MSSQL' 'UDP-lag' 'WebDDoS' 'DrDoS_UDP'
 'LDAP' 'DrDoS_DNS' 'DrDoS_NTP' 'DrDoS_LDAP']
✅ Unique Labels After Fixing: [1 0]
✅ Protocol is already a single column, no one-hot encoding found!
✅ Final Preprocessed Dataset Saved as `DDoS_Dataset_v2.csv`
✅ Scaler saved as `scaler_v2.pkl`


In [65]:
df = pd.read_csv("DDoS_Dataset_v2.csv")
df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Packet Length Max,Packet Length Min,Flow IAT Mean,Flow IAT Std,Flow IAT Max,...,RST Flag Count,ACK Flag Count,URG Flag Count,Bwd Packets/s,Fwd Packets/s,Down/Up Ratio,Subflow Fwd Bytes,Subflow Bwd Bytes,Protocol,Label
0,-0.255397,-0.072027,-0.057648,-0.098205,-0.022748,0.157169,1.242688,0.238427,0.156361,-0.096246,...,-0.252053,-0.476822,-0.511169,-0.065517,-0.287334,-0.588766,-0.098205,-0.022748,0.813563,1
1,-0.368053,-0.072027,-0.057648,-0.112088,-0.022748,-0.014762,0.519947,-0.407044,-0.420737,-0.357211,...,-0.252053,-0.476822,-0.511169,-0.065517,-0.287244,-0.588766,-0.112088,-0.022748,0.813563,1
2,-0.372189,-0.081711,-0.057648,-0.113507,-0.022748,0.388666,1.907143,-0.43074,-0.441909,-0.366787,...,-0.252053,-0.476822,-0.511169,-0.065517,2.193492,-0.588766,-0.113507,-0.022748,0.813563,1
3,-0.372184,0.053872,-0.057648,0.132792,-0.022748,0.054281,-0.451479,-0.430738,-0.441906,-0.366783,...,-0.252053,-0.476822,-0.511169,-0.065517,0.348773,-0.588766,0.132792,-0.022748,0.813563,1
4,-0.249653,-0.072027,-0.057648,-0.098205,-0.022748,0.157169,1.242688,0.271341,0.185787,-0.082939,...,-0.252053,-0.476822,-0.511169,-0.065517,-0.287334,-0.588766,-0.098205,-0.022748,0.813563,1


In [66]:
import pandas as pd
df = pd.read_csv("DDoS_Dataset_v2.csv")
print(df.info())  # Ensure correct columns
print(df["Label"].value_counts())  # Confirm label balance


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195662 entries, 0 to 195661
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Flow Duration             195662 non-null  float64
 1   Total Fwd Packets         195662 non-null  float64
 2   Total Backward Packets    195662 non-null  float64
 3   Fwd Packets Length Total  195662 non-null  float64
 4   Bwd Packets Length Total  195662 non-null  float64
 5   Packet Length Max         195662 non-null  float64
 6   Packet Length Min         195662 non-null  float64
 7   Flow IAT Mean             195662 non-null  float64
 8   Flow IAT Std              195662 non-null  float64
 9   Flow IAT Max              195662 non-null  float64
 10  Flow IAT Min              195662 non-null  float64
 11  Fwd IAT Total             195662 non-null  float64
 12  Fwd IAT Mean              195662 non-null  float64
 13  Bwd IAT Total             195662 non-null  f