In [None]:
import pandas as pd
import numpy as np
import os

print("--- Part 3: Creating Client Datasets (Memory-Efficient Version) ---")

# --- Configuration with your specific paths ---
INPUT_DIR = r"D:\FedShield,Personal\wataiData\csv\CICIoT2023\federated_data"
CLIENTS_DF_PATH = os.path.join(INPUT_DIR, 'clients_df.csv')
NUM_CLIENTS = 3
CHUNK_SIZE = 100000  # Number of CSV rows read per chunk

# --- Load unique labels once to define profiles ---
# Only the 'label' column is read, so the full table is not loaded here.
print("Reading unique labels to define client profiles...")
# dropna(): an empty label cell is parsed as a float NaN, and the substring
# tests below would raise TypeError on a non-string value.
unique_labels = (
    pd.read_csv(CLIENTS_DF_PATH, usecols=['label'])['label'].dropna().unique()
)

# --- Define Attack Profiles (Non-IID) ---
# A label belongs to a client when it contains any of that client's keywords.
print("Defining Non-IID client profiles...")
client1_keywords = ('DDoS', 'DoS')
client2_keywords = ('Recon', 'MITM', 'Spoofing')
client3_keywords = ('Mirai', 'Scan', 'BruteForce', 'Injection', 'XSS', 'Malware')

client1_labels = [
    label for label in unique_labels
    if any(keyword in label for keyword in client1_keywords)
]
client2_labels = [
    label for label in unique_labels
    if any(keyword in label for keyword in client2_keywords)
]
client3_labels = [
    label for label in unique_labels
    if any(keyword in label for keyword in client3_keywords)
]
client_labels_map = {0: client1_labels, 1: client2_labels, 2: client3_labels}

# Matching is by substring, so a label can fall into no profile or into several.
# Surface both cases instead of letting them pass silently; the assignment
# itself is left unchanged.
profile_hits = {}
for profile_labels in client_labels_map.values():
    for label in profile_labels:
        profile_hits[label] = profile_hits.get(label, 0) + 1

unassigned_labels = sorted(
    label for label in unique_labels
    if label != 'BenignTraffic' and label not in profile_hits
)
overlapping_labels = sorted(
    label for label, hits in profile_hits.items() if hits > 1
)
if unassigned_labels:
    print(f"  WARNING: labels matching no client profile: {unassigned_labels}")
if overlapping_labels:
    print(f"  WARNING: labels matching more than one client profile: {overlapping_labels}")

# --- Prepare output files ---
# Start from a clean slate: delete any client CSV left over from an earlier run.
stale_paths = [
    os.path.join(INPUT_DIR, f'client_{idx+1}_dataset.csv')
    for idx in range(NUM_CLIENTS)
]
for stale_path in stale_paths:
    if os.path.exists(stale_path):
        os.remove(stale_path)

# --- Process the large client dataframe in chunks ---
print(f"Loading and processing {CLIENTS_DF_PATH} in chunks...")
headers_written = {i: False for i in range(NUM_CLIENTS)}
rows_written = {i: 0 for i in range(NUM_CLIENTS)}
benign_rows_seen = 0        # running benign row count across all chunks
unassigned_attack_rows = 0  # non-benign rows whose label is in no profile
profiled_labels = {
    label for labels in client_labels_map.values() for label in labels
}


def _append_to_client(frame, client_idx):
    """Append *frame* to the given client's CSV file.

    The first write for a client creates the file and emits the header row;
    every later write appends data rows only. Empty frames are skipped so no
    header-only file is ever created.
    """
    if frame.empty:
        return
    first_write = not headers_written[client_idx]
    frame.to_csv(
        os.path.join(INPUT_DIR, f'client_{client_idx + 1}_dataset.csv'),
        index=False,
        mode='w' if first_write else 'a',
        header=first_write,
    )
    headers_written[client_idx] = True
    rows_written[client_idx] += len(frame)


# The reader is used as a context manager so the underlying file handle is
# released even if processing fails part-way through.
with pd.read_csv(CLIENTS_DF_PATH, chunksize=CHUNK_SIZE) as reader:
    for chunk_num, chunk in enumerate(reader, start=1):
        print(f"  -> Processing chunk {chunk_num}...")

        # Separate benign and attack traffic for the current chunk
        is_benign = chunk['label'] == 'BenignTraffic'
        benign_chunk = chunk[is_benign]
        attack_chunk = chunk[~is_benign]

        # Distribute attack data based on profiles. Clients without a profile
        # entry receive no attack rows instead of raising KeyError.
        for i in range(NUM_CLIENTS):
            wanted = client_labels_map.get(i, [])
            _append_to_client(attack_chunk[attack_chunk['label'].isin(wanted)], i)

        # Count attack rows that no profile claims so the loss is visible.
        unassigned_attack_rows += int(
            (~attack_chunk['label'].isin(profiled_labels)).sum()
        )

        # Distribute benign traffic evenly: the k-th benign row overall goes to
        # client k % NUM_CLIENTS. Counting rows rather than chunks keeps the
        # per-client totals within one row of each other regardless of how the
        # benign rows are spread across chunks.
        if not benign_chunk.empty:
            targets = (np.arange(len(benign_chunk)) + benign_rows_seen) % NUM_CLIENTS
            for i in range(NUM_CLIENTS):
                _append_to_client(benign_chunk[targets == i], i)
            benign_rows_seen += len(benign_chunk)

print("\n✅ Part 3 Complete!")
print("Client datasets have been created successfully.")
for i in range(NUM_CLIENTS):
    if headers_written[i]:
        print(f"  client_{i + 1}_dataset.csv: {rows_written[i]:,} rows")
    else:
        print(f"  client_{i + 1}_dataset.csv: no rows matched, file not written")
if unassigned_attack_rows:
    print(f"  WARNING: {unassigned_attack_rows:,} non-benign rows matched no client profile and were not written.")