In [4]:
import pandas as pd
import numpy as np
import os
import glob
from google.colab import drive

# Mount Google Drive at /content/drive. This runs at module level, before any
# of the functions below; main() reads and writes folders under
# /content/drive/MyDrive, so the mount has to be in place first.
drive.mount('/content/drive')

def load_csv_files_from_folder(folder_path):
    """Load every ``*.csv`` file directly inside a folder into one DataFrame.

    Files are read in sorted path order so the row order of the combined
    frame is the same on every run and every machine. ``glob.glob`` alone
    returns paths in arbitrary, filesystem-dependent order, which would make
    any later seeded shuffle of the result non-reproducible.

    Args:
        folder_path: Directory to search (non-recursive) for ``*.csv`` files.

    Returns:
        A DataFrame with the rows of all files concatenated and a fresh
        0..n-1 index, or ``None`` when the folder contains no CSV files.
    """
    # sorted() pins the concatenation order; see the docstring for why.
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

    if not csv_files:
        print(f"No CSV files found in {folder_path}")
        return None

    print(f"Found {len(csv_files)} CSV files in {folder_path}")

    dataframes = []
    for file in csv_files:
        df = pd.read_csv(file)
        print(f"Loaded {file}: {len(df)} rows")
        dataframes.append(df)

    # Combine all CSV files from the folder, discarding per-file indices.
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Combined {folder_path}: {len(combined_df)} total rows")

    return combined_df

def randomly_merge_datasets(attack_df, benign_df, attack_label='attack', benign_label='benign', label_column='label', random_state=42):
    """Label two DataFrames, stack them, and shuffle the rows.

    The inputs are not modified: the label column is written onto copies.

    Args:
        attack_df: DataFrame whose rows receive ``attack_label``.
        benign_df: DataFrame whose rows receive ``benign_label``.
        attack_label: Value stored in ``label_column`` for attack rows.
        benign_label: Value stored in ``label_column`` for benign rows.
        label_column: Name of the column that holds the label. If a column
            with this name already exists, its values are overwritten.
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle.

    Returns:
        A DataFrame containing all rows of both inputs in shuffled order,
        with a fresh 0..n-1 index.
    """
    # assign() returns a new frame, so the caller's DataFrames stay untouched.
    labelled_attack = attack_df.assign(**{label_column: attack_label})
    labelled_benign = benign_df.assign(**{label_column: benign_label})

    print(f"Attack samples: {len(labelled_attack)}")
    print(f"Benign samples: {len(labelled_benign)}")

    # Stack attack rows first, then benign rows, under a fresh index.
    stacked = pd.concat([labelled_attack, labelled_benign], ignore_index=True)

    # frac=1 draws every row exactly once, i.e. a full permutation.
    shuffled = stacked.sample(frac=1, random_state=random_state)
    merged_df = shuffled.reset_index(drop=True)

    print(f"Merged dataset: {len(merged_df)} total rows")
    class_counts = merged_df[label_column].value_counts()
    print(f"Class distribution:\n{class_counts}")

    return merged_df

def save_merged_dataset(merged_df, output_folder, filename='merged_dataset.csv'):
    """Write a DataFrame to ``output_folder/filename`` as CSV.

    The output folder is created, including missing parents, if it does not
    exist yet. The DataFrame index is not written to the file.

    Args:
        merged_df: DataFrame to save.
        output_folder: Directory that will contain the CSV file.
        filename: Name of the CSV file inside ``output_folder``.

    Returns:
        The full path of the written file.
    """
    destination = os.path.join(output_folder, filename)

    # exist_ok=True makes repeated runs into the same folder harmless.
    os.makedirs(output_folder, exist_ok=True)
    merged_df.to_csv(destination, index=False)

    row_count, column_count = len(merged_df), len(merged_df.columns)
    print(f"Merged dataset saved to: {destination}")
    print(f"File size: {row_count} rows, {column_count} columns")

    return destination

def main():
    """Merge the Testing attack and benign CSV folders into one shuffled CSV.

    Loads every CSV from the hard-coded attack and benign folders on the
    mounted Drive, labels and shuffles the rows, writes the result to the
    ``merged`` folder, and prints a summary. Returns early, without writing
    anything, when either folder yields no data.
    """
    # Input and output locations on the mounted Drive.
    attack_folder = '/content/drive/MyDrive/FYP/sampled_datasets/Testing/attack'
    benign_folder = '/content/drive/MyDrive/FYP/sampled_datasets/Testing/benign'
    output_folder = '/content/drive/MyDrive/FYP/sampled_datasets/Testing/merged'

    print("Loading attack datasets...")
    attack_df = load_csv_files_from_folder(attack_folder)

    print("\nLoading benign datasets...")
    benign_df = load_csv_files_from_folder(benign_folder)

    # The loader returns None for a folder without CSV files.
    if any(frame is None for frame in (attack_df, benign_df)):
        print("Error: Could not load datasets")
        return

    print("\nRandomly merging datasets...")
    merge_options = {
        'attack_label': 'attack',
        'benign_label': 'benign',
        'label_column': 'label',
        'random_state': 42,
    }
    merged_df = randomly_merge_datasets(attack_df, benign_df, **merge_options)

    print("\nSaving merged dataset...")
    output_path = save_merged_dataset(merged_df, output_folder, 'randomly_merged_dataset.csv')

    # Final summary banner.
    rule = "=" * 50
    summary_lines = [
        "\n" + rule,
        "MERGE COMPLETED SUCCESSFULLY!",
        rule,
        f"Attack samples: {len(attack_df)}",
        f"Benign samples: {len(benign_df)}",
        f"Total merged: {len(merged_df)}",
        f"Output file: {output_path}",
    ]
    for line in summary_lines:
        print(line)

# Alternative function for custom paths
def merge_csv_custom_paths(attack_path, benign_path, output_path, filename='merged_dataset.csv'):
    """Load, merge, and save attack/benign CSV folders given explicit paths.

    Uses the default labels, label column, and seed of
    ``randomly_merge_datasets``.

    Args:
        attack_path: Folder containing the attack CSV files.
        benign_path: Folder containing the benign CSV files.
        output_path: Folder the merged CSV is written to.
        filename: Name of the merged CSV file.

    Returns:
        ``(merged_df, output_file)`` on success. A bare ``None`` (not a
        tuple) when either folder could not be loaded, so callers must check
        for ``None`` before unpacking.
    """
    # Both folders are always loaded, attack first, even if the first fails.
    attack_frame, benign_frame = (
        load_csv_files_from_folder(source) for source in (attack_path, benign_path)
    )

    if attack_frame is None or benign_frame is None:
        return None

    combined = randomly_merge_datasets(attack_frame, benign_frame)
    saved_to = save_merged_dataset(combined, output_path, filename)
    return combined, saved_to

# Run the main function only when this code is executed directly (as a script
# or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading attack datasets...
Found 1 CSV files in /content/drive/MyDrive/FYP/sampled_datasets/Testing/attack
Loaded /content/drive/MyDrive/FYP/sampled_datasets/Testing/attack/sampled_dataset_10000_20250602_095721.csv: 10000 rows
Combined /content/drive/MyDrive/FYP/sampled_datasets/Testing/attack: 10000 total rows

Loading benign datasets...
Found 1 CSV files in /content/drive/MyDrive/FYP/sampled_datasets/Testing/benign
Loaded /content/drive/MyDrive/FYP/sampled_datasets/Testing/benign/sampled_dataset_10000_20250602_090502.csv: 10000 rows
Combined /content/drive/MyDrive/FYP/sampled_datasets/Testing/benign: 10000 total rows

Randomly merging datasets...
Attack samples: 10000
Benign samples: 10000
Merged dataset: 20000 total rows
Class distribution:
label
benign    10000
attack    10000
Name: count, dtype: int64

Saving merged dataset...
Merged dataset saved to: /c