# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter

# Seed NumPy's global RNG once so that every random draw made later in this
# script (e.g. the labeled-anomaly sampling) is repeatable between runs.
np.random.seed(42)

# =============================
# 1. Load dataset
# =============================
# Columns removed right after loading: identifiers, payloads and
# high-cardinality free text that are unlikely to generalize. Dropping them
# reduces noise, limits overfitting and keeps the focus on behavioral
# features (Section 3.5.1 Data Cleaning).
drop_cols = [
    # Host / address identifiers
    "ip.src_host",
    "ip.dst_host",
    "arp.dst.proto_ipv4",
    "arp.src.proto_ipv4",
    # HTTP payload and URI text
    "http.file_data",
    "http.request.uri.query",
    "http.referer",
    "http.request.full_uri",
    # TCP options / raw payload
    "tcp.options",
    "tcp.payload",
    # DNS query name fields and MQTT message body
    "dns.qry.name",
    "dns.qry.name.len",
    "mqtt.msg",
]

# Read the raw EdgeIIoTset CSV (packet-level IoT traffic plus labels) and drop
# the columns above in one step; names missing from the file are ignored.
print("ðŸ“‚ Loading dataset...")
df = pd.read_csv("EdgeIIoT-dataset.csv").drop(columns=drop_cols, errors="ignore")

# =============================
# 2. Separate labels AND keep attack types
# =============================
# Take the targets out of the feature frame before any encoding/scaling so
# they cannot leak into the feature space, while keeping them available for
# window-level labeling and analysis:
#   * Attack_label - binary target (0 = normal, 1 = anomalous)
#   * Attack_type  - multi-class attack name
attack_types_original = df["Attack_type"].values  # original attack-type values, one per packet
labels = df[["Attack_label", "Attack_type"]].copy()
df = df.drop(columns=list(labels.columns), errors="ignore")

# =============================
# 3. Encode categorical columns
# =============================
# Turn every object-typed feature (except the timestamp) into numbers:
#   * fewer than 50 distinct values -> integer codes via LabelEncoder
#   * 50 or more distinct values    -> frequency encoding (each value is
#     replaced by its occurrence count), which avoids exploding dimensions
categorical_cols = (
    df.select_dtypes(include=["object"])
    .columns
    .drop("frame.time", errors="ignore")
)
for col in categorical_cols:
    n_unique = df[col].nunique()
    if n_unique >= 50:
        # High cardinality: map each value to how often it occurs.
        # NOTE(review): value_counts() skips missing values by default, so
        # NaN entries presumably stay NaN after the map - confirm whether
        # the dataset contains any.
        freq = df[col].value_counts()
        df[col] = df[col].map(freq)
        continue
    # Low cardinality: values are stringified first, then label-encoded.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# =============================
# 4. Scale numerical features
# =============================
# Standardize all features (zero mean, unit variance) for stable neural
# network training (ConvAE and RL state embeddings). The timestamp column is
# excluded from scaling.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# chronological train/test split performed later in this script, so test-set
# statistics influence the scaling - confirm this is intended.
scaler = StandardScaler()
features = df.drop(columns=["frame.time"], errors="ignore")
features_scaled = scaler.fit_transform(features)

# Rebuild a DataFrame from the scaled matrix and temporarily re-attach the
# labels (and the timestamp, when present) so all columns are reordered
# together by the sort below.
X = pd.DataFrame(features_scaled, columns=features.columns)
X["Attack_label"] = labels["Attack_label"].values
X["Attack_type"] = labels["Attack_type"].values
if "frame.time" in df.columns:
    X["frame.time"] = df["frame.time"].values
    # Sort chronologically by frame.time - critical for preserving temporal
    # sequence in streaming-like data (Section 3.5.3). A stable sort
    # (mergesort) is requested explicitly: the default quicksort does not
    # guarantee that packets sharing a timestamp keep their capture order.
    # NOTE(review): if frame.time is stored as text the ordering is
    # lexicographic, not necessarily chronological - verify the column format.
    X = X.sort_values("frame.time", kind="mergesort").reset_index(drop=True)

# Keep the per-packet attack types aligned with the (possibly re-sorted) rows.
attack_types_original = X["Attack_type"].values

print(f"âœ… Preprocessed features: {X.shape[1]} columns (including temporary labels and timestamp)")

# =============================
# 5. Sliding window (stride = 4) WITH ATTACK TYPE TRACKING
# =============================
def sliding_window_with_attack_types(data, binary_labels, attack_types,
                                      window=32, step=4, anomaly_threshold=0.3):
    """
    Cut a packet sequence into overlapping fixed-size windows.

    Every window starting at 0, step, 2*step, ... that fits completely inside
    the data is emitted, including the one that ends exactly on the last row.

    Args:
        data: 2-D table of numeric features, one row per packet
            (DataFrame or array-like convertible to float32).
        binary_labels: Per-packet binary labels (0 = normal, 1 = anomalous),
            same length as ``data``.
        attack_types: Per-packet attack-type values, same length as ``data``.
        window: Number of consecutive packets per window.
        step: Offset between the starts of consecutive windows.
        anomaly_threshold: A window is labeled anomalous (1) when the fraction
            of anomalous packets inside it is >= this value.

    Returns:
        Tuple ``(X_seq, y_seq, attack_type_seq)``:
            * X_seq - float32 array of shape (n_windows, window, n_features)
            * y_seq - int8 array of window labels, shape (n_windows,)
            * attack_type_seq - array with the most frequent attack type of
              each window (ties resolved in favor of the type seen first)

    Raises:
        ValueError: If ``window`` or ``step`` is not positive, or if the three
            inputs do not have the same length.
    """
    if window <= 0 or step <= 0:
        raise ValueError("window and step must be positive integers")

    # Convert once up front instead of slicing the DataFrame on every
    # iteration; slicing plain arrays is far cheaper and yields the same data.
    values = np.asarray(data, dtype=np.float32)
    packet_labels = np.asarray(binary_labels)
    packet_types = np.asarray(attack_types)

    n_rows = len(values)
    if len(packet_labels) != n_rows or len(packet_types) != n_rows:
        raise ValueError(
            "data, binary_labels and attack_types must have the same length"
        )

    # "+ 1" keeps the final full window (start == n_rows - window); without
    # it the last window is silently dropped. Empty when n_rows < window.
    starts = range(0, n_rows - window + 1, step)
    n_windows = len(starts)

    # Preallocate the outputs so no intermediate list of windows is built and
    # the result keeps its (window, n_features) shape even with zero windows.
    X_seq = np.empty((n_windows, window) + values.shape[1:], dtype=np.float32)
    y_seq = np.empty(n_windows, dtype=np.int8)
    attack_type_seq = []

    for i, start in enumerate(starts):
        end = start + window

        # Feature window (window timesteps x features)
        X_seq[i] = values[start:end]

        # Fraction of anomalous packets, compared against the threshold
        anomaly_ratio = packet_labels[start:end].sum() / window
        y_seq[i] = anomaly_ratio >= anomaly_threshold

        # Dominant attack type, kept for per-attack performance evaluation
        most_common_attack = Counter(packet_types[start:end]).most_common(1)[0][0]
        attack_type_seq.append(most_common_attack)

    return X_seq, y_seq, np.array(attack_type_seq)

# Build the windowed dataset. Only the feature columns are passed as data;
# the temporary label and timestamp columns are stripped first, and the
# packet-level labels / attack types are handed over separately.
print("ðŸªŸ Generating sliding windows (window=32, step=4, â‰¥30% anomalies required for anomalous label)...")
X_seq, y_seq, attack_type_seq = sliding_window_with_attack_types(
    data=X.drop(columns=["Attack_label", "Attack_type", "frame.time"], errors="ignore"),
    binary_labels=X["Attack_label"],
    attack_types=attack_types_original,
    window=32,
    step=4,
    anomaly_threshold=0.3,  # 30% proportional threshold (Section 3.5.2)
)

print(f"âœ… Total windows: {len(X_seq):,}")
print(f"âœ… Window shape: {X_seq.shape[1:]} (time steps Ã— features)")
print(f"âœ… Attack types tracked: {len(np.unique(attack_type_seq))} unique types")

# Report how often each attack type is the dominant one in a window,
# limited to the ten most frequent types.
attack_counts = Counter(attack_type_seq)
print("\nðŸ“Š Attack type distribution in all windows (top 10):")
for attack_type, count in attack_counts.most_common(10):
    pct = count / len(attack_type_seq) * 100
    print(f" {attack_type:30s}: {count:8,} ({pct:5.2f}%)")
# Mention how many types were left out of the top-10 listing, if any.
if len(attack_counts) > 10:
    print(f" ... and {len(attack_counts) - 10} more attack types")

# =============================
# 6. Chronological Train/Test Split (80/20, time-based)
# =============================
# The first 80% of the windows (earlier traffic) form the training split and
# the remaining 20% (later traffic) the test split, mirroring deployment.
# NOTE(review): windows overlap (window=32, step=4 above), so the last
# training windows presumably share packets with the first test windows -
# confirm whether a gap at the boundary is wanted.
split_idx = int(0.8 * len(X_seq))
X_train_seq, X_test = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_test = y_seq[:split_idx], y_seq[split_idx:]
attack_types_train, attack_types_test = attack_type_seq[:split_idx], attack_type_seq[split_idx:]

# Semi-supervised setup: a random 5% of the anomalous training windows are
# treated as labeled; every other training window is treated as unlabeled.
anomaly_indices = np.flatnonzero(y_train_seq == 1)
num_labeled = int(0.05 * len(anomaly_indices))
labeled_anom_idx = np.random.choice(anomaly_indices, size=num_labeled, replace=False)

X_labeled, y_labeled = X_train_seq[labeled_anom_idx], y_train_seq[labeled_anom_idx]

# Unlabeled pool = all training positions that were not drawn above.
unlabeled_idx = np.setdiff1d(np.arange(len(X_train_seq)), labeled_anom_idx)
X_unlabeled = X_train_seq[unlabeled_idx]
y_unlabeled = y_train_seq[unlabeled_idx]  # Retained only for sanity checks; hidden during RL training

# The ConvAE sees only normal (label 0) windows from the training split.
normal_indices = np.flatnonzero(y_train_seq == 0)
X_ae = X_train_seq[normal_indices]

# =============================
# 7. Save outputs WITH INDICES AND ATTACK TYPES
# =============================
# Every array is written as a .npy file in the current working directory, in
# the order listed: first the data used for ConvAE pretraining and RL
# training/inference, then the attack types and index arrays kept for
# reproducibility and per-attack analysis.
for out_path, out_array in [
    ("X_ae.npy", X_ae),                      # Normal sequences for ConvAE
    ("X_unlabeled.npy", X_unlabeled),
    ("y_unlabeled.npy", y_unlabeled),        # For verification only
    ("X_labeled.npy", X_labeled),
    ("y_labeled.npy", y_labeled),
    ("X_test.npy", X_test),
    ("y_test.npy", y_test),
    ("y_train_seq.npy", y_train_seq),
    ("attack_types_test.npy", attack_types_test),
    ("attack_types_train.npy", attack_types_train),
    ("labeled_indices.npy", labeled_anom_idx),
    ("unlabeled_indices.npy", unlabeled_idx),
    ("train_split_size.npy", np.array([len(X_train_seq)])),
]:
    np.save(out_path, out_array)

# Summarize the shape and in-memory size of the large feature arrays.
print("\nðŸ’¾ Saved arrays (sizes in GB where applicable):")
print(f" X_ae          : {X_ae.shape} ({X_ae.nbytes / 1e9:.2f} GB)")
print(f" X_unlabeled   : {X_unlabeled.shape} ({X_unlabeled.nbytes / 1e9:.2f} GB)")
print(f" X_labeled     : {X_labeled.shape} ({X_labeled.nbytes / 1e9:.2f} GB)")
print(f" X_test        : {X_test.shape} ({X_test.nbytes / 1e9:.2f} GB)")

# =============================
# 8. Dataset Statistics
# =============================
# Summary counts printed for transparency and verification (aligns with
# thesis reporting). All figures are computed first, then reported.

# Window counts per class in the training split
train_total = len(y_train_seq)
train_normal = (y_train_seq == 0).sum()
train_anomaly = (y_train_seq == 1).sum()

# Window counts per class in the test split
test_total = len(y_test)
test_normal = (y_test == 0).sum()
test_anomaly = (y_test == 1).sum()

# Dominant attack type per test window
test_attack_counts = Counter(attack_types_test)

print("\n" + "="*60)
print("ðŸ“Š DATASET STATISTICS")
print("="*60)

print(f"\nðŸ”¹ TRAINING SPLIT (First 80% chronologically):")
print(f" Total windows   : {train_total:,}")
print(f" Normal windows  : {train_normal:,} ({train_normal/train_total*100:.2f}%)")
print(f" Anomaly windows : {train_anomaly:,} ({train_anomaly/train_total*100:.2f}%)")

print(f"\nðŸ”¹ TEST SPLIT (Last 20% chronologically):")
print(f" Total windows   : {test_total:,}")
print(f" Normal windows  : {test_normal:,} ({test_normal/test_total*100:.2f}%)")
print(f" Anomaly windows : {test_anomaly:,} ({test_anomaly/test_total*100:.2f}%)")

# Every attack type present in the test split, most frequent first
print(f"\nðŸ”¹ TEST SET ATTACK TYPES (dominant per window):")
for attack_type, count in test_attack_counts.most_common():
    pct = count / len(attack_types_test) * 100
    print(f" {attack_type:30s}: {count:6,} ({pct:5.2f}%)")

# Totals across both splits
print(f"\nðŸ”¹ OVERALL:")
print(f" Total windows   : {train_total + test_total:,}")
print(f" Normal windows  : {train_normal + test_normal:,}")
print(f" Anomaly windows : {train_anomaly + test_anomaly:,}")
print(f" Unique attacks  : {len(np.unique(attack_type_seq))}")

# Sizes of the semi-supervised subsets built from the training split
print(f"\nðŸ”¹ LABELED ANOMALIES (for RL training):")
print(f" Labeled anomalies : {len(X_labeled):,} ({len(X_labeled)/train_anomaly*100:.2f}% of train anomalies)")
print(f" Unlabeled data    : {len(X_unlabeled):,}")
print(f" ConvAE data       : {len(X_ae):,} (normal sequences only)")

print("\n" + "="*60)
print(f"âœ… Preprocessing complete! Ready for ConvAE pretraining and DRL training.")
print(f"âœ… Index files saved for full reproducibility.")
print(f"âœ… Attack type information preserved for per-attack analysis.")