# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import Counter

# Seed NumPy's global RNG so the random subsampling done later in this
# script (np.random.choice) is reproducible from run to run.
np.random.seed(42)

# -----------------------------
# 1. Load dataset
# -----------------------------
print("ðŸ“‚ Loading dataset...")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference can leave a single column
# holding a mix of numbers and strings (e.g. 1 and "1"), which the categorical
# encoding below would then treat as different categories.
df = pd.read_csv("EdgeIIoT-dataset.csv", low_memory=False)

# Drop identifier/payload columns that are not used as features.
# errors="ignore" tolerates dataset variants where some of them are absent.
drop_cols = [
    "ip.src_host", "ip.dst_host", "arp.dst.proto_ipv4", "arp.src.proto_ipv4",
    "http.file_data", "http.request.uri.query", "http.referer",
    "http.request.full_uri", "tcp.options", "tcp.payload",
    "dns.qry.name", "dns.qry.name.len", "mqtt.msg"
]
df = df.drop(columns=drop_cols, errors="ignore")

# -----------------------------
# 2. Separate labels AND keep attack types
# -----------------------------
# Pull both target columns out of the feature frame. `labels` is an
# independent copy of the two columns; `attack_types_original` is the
# per-row attack-type array in the current row order.
target_cols = ["Attack_label", "Attack_type"]
attack_types_original = df["Attack_type"].values
labels = df[target_cols].copy()
df = df.drop(columns=target_cols, errors="ignore")

# -----------------------------
# 3. Encode categorical columns
# -----------------------------
# Every object-dtype column except the timestamp is converted to numbers:
#   * fewer than 50 distinct values -> integer codes via LabelEncoder
#   * otherwise                     -> frequency encoding (occurrence count)
categorical_cols = df.select_dtypes(include=["object"]).columns.drop("frame.time", errors="ignore")
for col in categorical_cols:
    n_unique = df[col].nunique()
    if n_unique < 50:
        # astype(str) hands the encoder a single comparable type even when
        # the column mixes strings, numbers and missing values.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
    else:
        # value_counts() skips missing values, so map() alone would leave NaN
        # in those rows and the NaN would flow through scaling into the saved
        # windows. Treat a missing value as "seen 0 times" instead.
        freq = df[col].value_counts()
        df[col] = df[col].map(freq).fillna(0)

# -----------------------------
# 4. Scale numerical features
# -----------------------------
# Standardise every feature column. The timestamp is left out of scaling
# because it is only used to order the rows.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test-period statistics influence the
# training features - confirm this is acceptable for the evaluation.
scaler = StandardScaler()
features = df.drop(columns=["frame.time"], errors="ignore")
features_scaled = scaler.fit_transform(features)

# Re-attach the targets to the scaled features. `.values` is used so rows are
# matched by position rather than by index label.
X = pd.DataFrame(features_scaled, columns=features.columns)
X["Attack_label"] = labels["Attack_label"].values
X["Attack_type"] = labels["Attack_type"].values

if "frame.time" in df.columns:
    X["frame.time"] = df["frame.time"].values

    # Sort chronologically (important for sequential structure).
    # kind="stable" keeps rows that share a timestamp in their original file
    # order; the default sort algorithm gives no such guarantee, which would
    # make the window contents depend on an arbitrary tie order.
    # NOTE(review): the column is sorted exactly as loaded; if it holds text
    # timestamps the order is lexicographic, not necessarily chronological -
    # TODO confirm the format or parse it before sorting.
    X = X.sort_values("frame.time", kind="stable").reset_index(drop=True)
    # Refresh the attack-type array so it follows the new row order.
    attack_types_original = X["Attack_type"].values

print(f"âœ… Preprocessed features: {X.shape[1]} columns")

# -----------------------------
# 5. Sliding window (stride = 4) WITH ATTACK TYPE TRACKING
# -----------------------------
def sliding_window_with_attack_types(data, binary_labels, attack_types,
                                     window=32, step=4, anomaly_threshold=0.4):
    """
    Cut row-ordered data into fixed-size windows with a label and attack type each.

    Windows start at rows 0, step, 2*step, ... and every window that fits
    completely inside the data is produced, including the one that ends
    exactly on the last row.

    Args:
        data: Feature data, one row per packet (DataFrame or 2-D array-like
            convertible to float32)
        binary_labels: Binary labels (0=normal, 1=anomaly), one per row
        attack_types: Attack type for each row
        window: Window size (number of time steps)
        step: Stride between consecutive window starts
        anomaly_threshold: Minimum proportion of anomalous rows for a window
            to be labelled anomalous (default: 0.4 = 40%)

    Returns:
        X_seq: float32 array of windows (n_windows, window, n_features);
            the trailing dimensions are kept even when n_windows is 0
        y_seq: int8 binary labels for windows (n_windows,)
        attack_type_seq: Most prevalent attack type per window (n_windows,);
            ties go to the type that appears first in the window

    Raises:
        ValueError: If window or step is not positive, or if the labels or
            attack types do not have one entry per data row.
    """
    if window <= 0 or step <= 0:
        raise ValueError("window and step must be positive integers")

    # Convert once up front: slicing plain arrays per window is much cheaper
    # than repeated DataFrame.iloc / Series.iloc calls.
    features = np.asarray(data, dtype=np.float32)
    labels_arr = np.asarray(binary_labels)
    types_arr = np.asarray(attack_types)

    n_rows = features.shape[0]
    if len(labels_arr) != n_rows or len(types_arr) != n_rows:
        raise ValueError(
            "data, binary_labels and attack_types must have the same length"
        )

    # "+ 1" so the last full window (start == n_rows - window) is included;
    # the range is empty when the data is shorter than one window.
    starts = range(0, n_rows - window + 1, step)

    # Preallocate the outputs instead of collecting per-window copies in a
    # list, which would briefly hold the whole result twice.
    X_seq = np.empty((len(starts), window) + features.shape[1:], dtype=np.float32)
    y_seq = np.empty(len(starts), dtype=np.int8)
    attack_type_seq = []

    for i, start in enumerate(starts):
        end = start + window
        X_seq[i] = features[start:end]

        # Label as anomalous only if the share of anomalous rows >= threshold
        anomaly_ratio = labels_arr[start:end].sum() / window
        y_seq[i] = anomaly_ratio >= anomaly_threshold

        # Most prevalent attack type in the window
        type_counts = Counter(types_arr[start:end].tolist())
        attack_type_seq.append(type_counts.most_common(1)[0][0])

    return X_seq, y_seq, np.array(attack_type_seq)

print("ðŸªŸ Generating sliding windows (window=32, step=4, â‰¥40% anomalies required)...")
# Only feature columns go into the windows: the two targets and the timestamp
# are stripped here, and the labels / attack types are passed separately.
X_seq, y_seq, attack_type_seq = sliding_window_with_attack_types(
    data=X.drop(columns=["Attack_label", "Attack_type", "frame.time"], errors="ignore"),
    binary_labels=X["Attack_label"],
    attack_types=attack_types_original,
    window=32,
    step=4,
    anomaly_threshold=0.4,
)

print(f"âœ… Total windows: {len(X_seq):,}")
print(f"âœ… Window shape: {X_seq.shape[1:]} (time steps Ã— features)")
print(f"âœ… Attack types tracked: {len(np.unique(attack_type_seq))} unique types")

# Report the ten most frequent window-level attack types, then summarise
# however many further types exist.
print("\nðŸ“Š Attack type distribution in all windows:")
attack_counts = Counter(attack_type_seq)
top_attack_types = attack_counts.most_common(10)
for attack_type, count in top_attack_types:
    pct = count / len(attack_type_seq) * 100
    print(f"  {attack_type:30s}: {count:8,} ({pct:5.2f}%)")
if not len(attack_counts) <= 10:
    print(f"  ... and {len(attack_counts) - 10} more attack types")

# -----------------------------
# 6. Chronological Train/Test Split (80/20, time-based)
# -----------------------------
# Windows keep the row order of X, so a positional cut puts the first 80% of
# windows in train and the last 20% in test.
# NOTE(review): windows are built with window=32 and step=4, so windows on
# either side of the cut share rows - confirm whether a gap is wanted.
split_idx = int(0.8 * len(X_seq))
X_train_seq, X_test = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_test = y_seq[:split_idx], y_seq[split_idx:]
attack_types_train = attack_type_seq[:split_idx]
attack_types_test = attack_type_seq[split_idx:]

# ConvAE set: the normal windows of the training split only.
normal_indices = np.flatnonzero(y_train_seq == 0)
X_ae = X_train_seq[normal_indices]

# PPO labeled set: a random 5% of the training anomalies, drawn without
# replacement from the globally seeded RNG.
anomaly_indices = np.flatnonzero(y_train_seq == 1)
num_labeled = int(0.05 * len(anomaly_indices))
labeled_anom_idx = np.random.choice(anomaly_indices, num_labeled, replace=False)
X_labeled, y_labeled = X_train_seq[labeled_anom_idx], y_train_seq[labeled_anom_idx]

# Everything in train that was not picked as labeled is the unlabeled pool.
all_train_idx = np.arange(len(X_train_seq))
unlabeled_idx = np.setdiff1d(all_train_idx, labeled_anom_idx)
X_unlabeled = X_train_seq[unlabeled_idx]
y_unlabeled = y_train_seq[unlabeled_idx]  # For sanity checks only; hide in training

# -----------------------------
# 7. Save outputs WITH INDICES AND ATTACK TYPES
# -----------------------------
# Output file -> array written to it, in write order: the data arrays first,
# then the per-window attack types, then the index bookkeeping.
outputs = {
    "X_ae.npy": X_ae,
    "X_unlabeled.npy": X_unlabeled,
    "y_unlabeled.npy": y_unlabeled,
    "X_labeled.npy": X_labeled,
    "y_labeled.npy": y_labeled,
    "X_test.npy": X_test,
    "y_test.npy": y_test,
    "y_train_seq.npy": y_train_seq,
    "attack_types_test.npy": attack_types_test,
    "attack_types_train.npy": attack_types_train,
    "labeled_indices.npy": labeled_anom_idx,
    "unlabeled_indices.npy": unlabeled_idx,
    "train_split_size.npy": np.array([len(X_train_seq)]),
}
for out_path, out_array in outputs.items():
    np.save(out_path, out_array)

# Summary of what was written: shape and in-memory size for the feature
# arrays, shape only for the attack-type and index arrays.
print("\nðŸ’¾ Saved arrays:")
print(f"  X_ae         : {X_ae.shape} ({X_ae.nbytes / 1e9:.2f} GB)")
print(f"  X_unlabeled  : {X_unlabeled.shape} ({X_unlabeled.nbytes / 1e9:.2f} GB)")
print(f"  X_labeled    : {X_labeled.shape} ({X_labeled.nbytes / 1e9:.2f} GB)")
print(f"  X_test       : {X_test.shape} ({X_test.nbytes / 1e9:.2f} GB)")
print(f"\n  attack_types_test  : {attack_types_test.shape}")
print(f"  attack_types_train : {attack_types_train.shape}")
print(f"  labeled_indices    : {labeled_anom_idx.shape}")
print(f"  unlabeled_indices  : {unlabeled_idx.shape}")

# -----------------------------
# 8. Dataset Statistics
# -----------------------------
# All counts are gathered first; the report below only formats them.
train_total = len(y_train_seq)
train_normal = (y_train_seq == 0).sum()
train_anomaly = (y_train_seq == 1).sum()

test_total = len(y_test)
test_normal = (y_test == 0).sum()
test_anomaly = (y_test == 1).sum()

test_attack_counts = Counter(attack_types_test)

banner = "=" * 60
print("\n" + banner)
print("ðŸ“Š DATASET STATISTICS")
print(banner)

# Training split
print(f"\nðŸ”¹ TRAINING SPLIT (First 80% chronologically):")
print(f"  Total windows    : {train_total:,}")
print(f"  Normal windows   : {train_normal:,} ({train_normal/train_total*100:.2f}%)")
print(f"  Anomaly windows  : {train_anomaly:,} ({train_anomaly/train_total*100:.2f}%)")

# Test split
print(f"\nðŸ”¹ TEST SPLIT (Last 20% chronologically):")
print(f"  Total windows    : {test_total:,}")
print(f"  Normal windows   : {test_normal:,} ({test_normal/test_total*100:.2f}%)")
print(f"  Anomaly windows  : {test_anomaly:,} ({test_anomaly/test_total*100:.2f}%)")

# Per-attack-type breakdown of the test windows, most frequent first
print(f"\nðŸ”¹ TEST SET ATTACK TYPES:")
for attack_type, count in test_attack_counts.most_common():
    pct = count / len(attack_types_test) * 100
    print(f"  {attack_type:30s}: {count:6,} ({pct:5.2f}%)")

# Train and test combined
print(f"\nðŸ”¹ OVERALL:")
print(f"  Total windows    : {train_total + test_total:,}")
print(f"  Normal windows   : {train_normal + test_normal:,}")
print(f"  Anomaly windows  : {train_anomaly + test_anomaly:,}")
print(f"  Unique attacks   : {len(np.unique(attack_type_seq))}")

# Sizes of the three training subsets derived from the train split
print(f"\nðŸ”¹ LABELED ANOMALIES (for PPO training):")
print(f"  Labeled anomalies: {len(X_labeled):,} ({len(X_labeled)/train_anomaly*100:.2f}% of train anomalies)")
print(f"  Unlabeled data   : {len(X_unlabeled):,}")
print(f"  ConvAE data      : {len(X_ae):,} (normal sequences only)")

print("\n" + banner)
print(f"âœ… Preprocessing complete! Ready for ConvAE and PPO training.")
print(f"âœ… Index files saved for reconstruction: labeled_indices.npy, unlabeled_indices.npy")
print(f"âœ… Attack type information saved: attack_types_test.npy, attack_types_train.npy")