In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# 1. CONFIGURATION
###############################################################################
# Seed NumPy and TensorFlow so weight initialisation and shuffling start from
# the same state on every fresh run.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Location of the dataset CSV. Set the NIDS_DATA_PATH environment variable to
# point at the file on another machine; when it is unset the original local
# path is used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "NIDS_DATA_PATH",
    "/mnt/c/Users/mikig/Desktop/UPC/PAE/Datasets/"
    "9810e03bba4983da_MOHANAD_A4706/"
    "9810e03bba4983da_MOHANAD_A4706/data/NF-UQ-NIDS-v2.csv",
)

# Column holding the class label, and columns removed before building features.
TARGET_COLUMN = "Attack"
DROP_COLUMNS = ["Dataset", "Label"]

# If EPOCHS=0 => use large epochs + early stopping
# If EPOCHS!=0 => train for exactly EPOCHS
EPOCHS = 0  # Toggle here. 0 => Early stopping, >0 => fixed epochs.

# The StandardScaler is fitted on at most MAX_SCALER_CHUNKS chunks of
# SCALER_CHUNKSIZE rows each, read from the start of the file.
SCALER_CHUNKSIZE = 100_000
MAX_SCALER_CHUNKS = 5

# Rows per CSV chunk; each chunk is yielded to the model as one batch.
CHUNK_SIZE = 1024  # Good default chunk size for memory/performance
# Fraction of the samples assigned to training; the rest is validation.
TRAIN_SPLIT_RATIO = 0.8
# Feature values are clipped to [-CLAMP_VALUE, CLAMP_VALUE] before scaling.
CLAMP_VALUE = 1e9
# Size (in chunks) of the tf.data shuffle buffer used for training data.
CHUNK_SHUFFLE_BUFFER = 10

In [102]:
def collect_all_labels(csv_path, target_column=TARGET_COLUMN, chunksize=200000):
    """
    Stream the target column of the CSV and return every distinct label.

    Only `target_column` is loaded, `chunksize` rows at a time. Missing values
    are ignored. Progress is printed every 10 chunks.

    Returns:
        list: the distinct labels found, in sorted order.
    """
    print(f"[INFO] Collecting all unique labels from '{target_column}'...")
    seen = set()
    label_chunks = pd.read_csv(csv_path, usecols=[target_column], chunksize=chunksize)
    for n_done, frame in enumerate(label_chunks, start=1):
        # Drop NaN first so a missing label never ends up in the set.
        seen.update(frame[target_column].dropna().unique())
        if n_done % 10 == 0:
            print(
                f"  Processed {n_done} label-only chunks... (unique labels so far={len(seen)})"
            )
    all_labels = sorted(seen)
    print(f"[INFO] Found {len(all_labels)} unique labels total.")
    return all_labels


def compute_class_distribution(csv_path, target_col=TARGET_COLUMN, chunksize=100000):
    """
    Count how many rows carry each label, reading only `target_col` in chunks.

    Rows whose label is missing are not counted. Progress is printed every
    10 chunks.

    Args:
        csv_path: path of the CSV file to scan.
        target_col: name of the label column.
        chunksize: number of rows read per chunk.

    Returns:
        collections.Counter mapping label -> number of rows with that label.
    """
    print("[INFO] Computing class distribution (for class weights)...")
    counter = Counter()
    reader = pd.read_csv(csv_path, usecols=[target_col], chunksize=chunksize)
    for cidx, chunk in enumerate(reader, start=1):
        # value_counts() aggregates inside pandas and skips NaN by default, so
        # the Counter is updated once per distinct label instead of once per row.
        for label, n_rows in chunk[target_col].value_counts().items():
            counter[label] += int(n_rows)
        if cidx % 10 == 0:
            print(f"  Processed {cidx} distribution chunks.")
    return counter


def make_class_weights(label_encoder, class_counter):
    """
    Build the index -> weight mapping expected by Keras' `class_weight`.

    Each class present in `class_counter` is weighted as
    total_samples / (num_classes * class_count); a class with no samples
    gets weight 0.0. Indices follow the order of `label_encoder.classes_`.
    """
    total_samples = sum(class_counter.values())
    classes = label_encoder.classes_
    n_classes = len(classes)

    def _weight_for(label):
        # Guard against division by zero for classes that never occur.
        occurrences = class_counter.get(label, 0)
        return total_samples / (n_classes * occurrences) if occurrences != 0 else 0.0

    return {index: _weight_for(label) for index, label in enumerate(classes)}

In [103]:
def partial_fit_scaler(
    csv_path,
    target_column=TARGET_COLUMN,
    drop_columns=DROP_COLUMNS,
    clamp_value=CLAMP_VALUE,
    scaler_chunksize=SCALER_CHUNKSIZE,
    max_chunks=MAX_SCALER_CHUNKS,
):
    """
    Fit a StandardScaler incrementally on the first `max_chunks` chunks of the CSV.

    Each chunk gets the same feature preparation that chunk_generator applies:
    the drop/target columns are removed, object columns are replaced by integer
    hash codes, +/-inf becomes NaN, values are clipped to
    [-clamp_value, clamp_value] and remaining NaN are set to 0.

    Args:
        csv_path: path of the CSV file.
        target_column: label column, excluded from the features.
        drop_columns: extra columns to exclude (may be empty or None).
        clamp_value: absolute bound applied to every feature value.
        scaler_chunksize: rows per chunk.
        max_chunks: maximum number of chunks used for fitting.

    Returns:
        The StandardScaler. It is left unfitted if no chunk was processed
        (empty file or max_chunks <= 0).
    """
    print("[INFO] Incrementally partial-fitting a StandardScaler...")
    scaler = StandardScaler()
    excluded = list(drop_columns or []) + [target_column]

    # The context manager closes the file handle even though we normally stop
    # long before the end of the file.
    with pd.read_csv(csv_path, chunksize=scaler_chunksize, low_memory=True) as reader:
        # zip() pulls from range() first, so once max_chunks is reached the
        # reader is not asked to parse one more (discarded) chunk.
        for chunk_index, chunk in zip(range(1, max_chunks + 1), reader):
            chunk = chunk.drop(columns=excluded, errors="ignore")

            # NOTE: hash() of a str is salted per Python process unless
            # PYTHONHASHSEED is fixed, so these codes are only consistent
            # within a single kernel session.
            for col in chunk.columns:
                if chunk[col].dtype == object:
                    chunk[col] = chunk[col].apply(
                        lambda x: hash(x) % (2**31) if pd.notna(x) else 0
                    )

            numeric_chunk = chunk.select_dtypes(include=[np.number]).astype(np.float64)
            numeric_chunk = numeric_chunk.replace([np.inf, -np.inf], np.nan)
            numeric_chunk = numeric_chunk.clip(-clamp_value, clamp_value)
            numeric_chunk = numeric_chunk.fillna(0)

            scaler.partial_fit(numeric_chunk)
            print(
                f"  [DEBUG] partial_fit chunk #{chunk_index}, shape={numeric_chunk.shape}"
            )

    print("[INFO] Done partial-fitting the StandardScaler.")
    return scaler

In [104]:
def count_chunks(
    csv_path,
    chunk_size=CHUNK_SIZE,
    train_ratio=TRAIN_SPLIT_RATIO,
    total_samples=75987975,
):
    """
    Compute how many chunks belong to the training and validation splits.

    The split is derived arithmetically from `total_samples`; `csv_path` is
    accepted for interface symmetry but the file is not read here.

    Args:
        csv_path: path of the dataset (unused by the computation).
        chunk_size: rows per chunk; must be positive.
        train_ratio: fraction of the samples assigned to training.
        total_samples: number of data rows in the CSV. Defaults to the
            previously hard-coded row count; pass the real value when using
            a different file.

    Returns:
        (train_chunks, val_chunks): number of chunks needed to cover the
        training and validation samples, each rounded up.

    Raises:
        ValueError: if chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    train_samples = int(total_samples * train_ratio)
    validation_samples = total_samples - train_samples

    # Integer ceiling division. The earlier round()-based logic added a
    # spurious extra chunk whenever the sample count was an exact multiple of
    # chunk_size (and reported 1 chunk for 0 samples).
    train_chunks = -(-train_samples // chunk_size)
    val_chunks = -(-validation_samples // chunk_size)

    print(f"Total samples: {total_samples}")
    print(f"Training samples: {train_samples}")
    print(f"Validation samples: {validation_samples}")
    print(f"Training chunks: {train_chunks}")
    print(f"Validation chunks: {val_chunks}")

    return train_chunks, val_chunks

In [105]:
def chunk_generator(
    csv_path,
    chunk_size,
    target_column,
    drop_columns,
    scaler,
    label_encoder,
    clamp_value,
    shuffle_seed,
    start_chunk,
    end_chunk,
    is_val=False,
):
    """
    Generator that yields (X, y) for chunk indices [start_chunk, end_chunk).

    The CSV is read from the beginning in chunks of `chunk_size` rows; chunks
    before `start_chunk` are read and discarded. For every selected chunk:
    `drop_columns` are removed, rows with a missing target are dropped, object
    columns are replaced by integer hash codes, +/-inf becomes NaN, values are
    clipped to [-clamp_value, clamp_value], NaN becomes 0, and `scaler` (if
    given) is applied. Chunks left empty after dropping rows are skipped.

    Args:
        csv_path: path of the CSV file.
        chunk_size: rows per chunk.
        target_column: name of the label column.
        drop_columns: columns to discard (may be empty or None).
        scaler: object with a `transform` method, or None to skip scaling.
        label_encoder: object with a `transform` method mapping labels to ints.
        clamp_value: absolute bound applied to every feature value.
        shuffle_seed: seed for the within-chunk row shuffle.
        start_chunk: first chunk index to yield (inclusive).
        end_chunk: chunk index at which to stop (exclusive).
        is_val: when True, rows keep their file order (no shuffling).

    Yields:
        (X, y): X is a float32 ndarray of shape (rows, features); y holds the
        encoded labels for the same rows.
    """
    rng = np.random.default_rng(shuffle_seed)

    # The context manager guarantees the file handle is released when the
    # generator finishes, breaks at end_chunk, or is closed early by tf.data.
    with pd.read_csv(csv_path, chunksize=chunk_size, low_memory=True) as reader:
        for cidx, chunk in enumerate(reader):
            if cidx < start_chunk:
                continue
            if cidx >= end_chunk:
                break

            # Drop columns not needed
            chunk = chunk.drop(columns=list(drop_columns or []), errors="ignore")

            # Drop rows with missing target
            chunk.dropna(subset=[target_column], inplace=True)
            if chunk.empty:
                continue

            labels = chunk.pop(target_column)

            # Convert object columns -> numeric.
            # NOTE: hash() of a str is salted per Python process unless
            # PYTHONHASHSEED is fixed; the codes match the ones seen by the
            # scaler only when both run in the same kernel session.
            for col in chunk.columns:
                if chunk[col].dtype == object:
                    chunk[col] = chunk[col].apply(
                        lambda x: hash(x) % (2**31) if pd.notna(x) else 0
                    )

            features = chunk.select_dtypes(include=[np.number]).astype(np.float64)
            features = features.replace([np.inf, -np.inf], np.nan)
            features = features.clip(-clamp_value, clamp_value)
            features = features.fillna(0)

            # Scale
            if scaler is not None:
                features = scaler.transform(features)
            # Always end up with a plain ndarray: without a scaler the data
            # would otherwise stay a DataFrame, and the positional shuffle
            # below would be interpreted as a column lookup and fail.
            X_chunk = np.asarray(features, dtype=np.float32)

            y_encoded = np.asarray(label_encoder.transform(labels))

            # Shuffle rows if training
            if not is_val:
                idx = np.arange(len(X_chunk))
                rng.shuffle(idx)
                X_chunk = X_chunk[idx]
                y_encoded = y_encoded[idx]

            yield X_chunk, y_encoded


def create_dataset(
    csv_path,
    chunk_size,
    target_column,
    drop_columns,
    scaler,
    label_encoder,
    clamp_value,
    shuffle_seed,
    start_chunk,
    end_chunk,
    is_val=False,
    repeat_epochs=1,
):
    """
    Wrap chunk_generator in a tf.data.Dataset covering chunks [start_chunk, end_chunk).

    Each dataset element is one (features, labels) chunk, declared as
    float32 with shape (None, None) and int32 with shape (None,).

    - repeat_epochs > 0: the stream is repeated that many times.
    - repeat_epochs == 0: no repeat is applied; iteration is left to the caller.
    - Training datasets (is_val=False) additionally shuffle whole chunks
      through a buffer of CHUNK_SHUFFLE_BUFFER elements.
    """
    print(
        f"[INFO] Building dataset for chunks [{start_chunk}, {end_chunk}) (is_val={is_val}), repeat={repeat_epochs}"
    )

    def _new_chunk_stream():
        # Called by tf.data every time the dataset is iterated from the start.
        return chunk_generator(
            csv_path=csv_path,
            chunk_size=chunk_size,
            target_column=target_column,
            drop_columns=drop_columns,
            scaler=scaler,
            label_encoder=label_encoder,
            clamp_value=clamp_value,
            shuffle_seed=shuffle_seed,
            start_chunk=start_chunk,
            end_chunk=end_chunk,
            is_val=is_val,
        )

    dataset = tf.data.Dataset.from_generator(
        _new_chunk_stream,
        output_types=(tf.float32, tf.int32),
        output_shapes=(tf.TensorShape([None, None]), tf.TensorShape([None])),
    )

    if repeat_epochs > 0:
        dataset = dataset.repeat(repeat_epochs)

    if not is_val:
        dataset = dataset.shuffle(
            buffer_size=CHUNK_SHUFFLE_BUFFER, reshuffle_each_iteration=True
        )

    return dataset.prefetch(tf.data.AUTOTUNE)

In [106]:
def build_model(input_dim, num_classes):
    """
    Build and compile the dense classifier.

    Architecture: input of width `input_dim`, then Dense(256, relu) and
    Dense(128, relu), each followed by Dropout(0.3), then a softmax layer with
    `num_classes` units. Compiled with Adam (learning rate 1e-3), sparse
    categorical cross-entropy loss and the accuracy metric.
    """
    layers = [keras.layers.InputLayer(input_shape=(input_dim,))]
    for units in (256, 128):
        layers.append(keras.layers.Dense(units, activation="relu"))
        layers.append(keras.layers.Dropout(0.3))
    layers.append(keras.layers.Dense(num_classes, activation="softmax"))

    model = keras.Sequential(layers)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [107]:
def plot_training_curves(history):
    """
    Plot training/validation accuracy and loss per epoch, side by side.

    `history` is expected to expose a `history` dict with the keys
    "accuracy", "val_accuracy", "loss" and "val_loss"; a missing key is
    treated as an empty series. The x axis runs from 1 to the number of
    recorded "accuracy" values.
    """
    logs = history.history
    epochs_range = range(1, len(logs.get("accuracy", [])) + 1)

    # (train key, val key, train legend, val legend, title, y label)
    panels = (
        ("accuracy", "val_accuracy", "Train Acc", "Val Acc", "Accuracy Over Epochs", "Accuracy"),
        ("loss", "val_loss", "Train Loss", "Val Loss", "Loss Over Epochs", "Loss"),
    )

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (train_key, val_key, train_lbl, val_lbl, title, y_label) in zip(axes, panels):
        ax.plot(epochs_range, logs.get(train_key, []), label=train_lbl)
        ax.plot(epochs_range, logs.get(val_key, []), label=val_lbl)
        ax.set_title(title)
        ax.set_xlabel("Epoch")
        ax.set_ylabel(y_label)
        ax.legend()

    fig.tight_layout()
    plt.show()


def plot_confusion_matrix(cm, labels):
    """
    Draw `cm` as a blue heatmap without cell annotations.

    `labels` is used for the tick labels on both axes; the x axis is titled
    "Predicted" and the y axis "True Label".
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    heatmap_options = dict(
        annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels, fmt="d"
    )
    sns.heatmap(cm, ax=ax, **heatmap_options)
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True Label")
    plt.show()

In [108]:
def main():
    """
    Run the full pipeline: label discovery, scaler fitting, class weighting,
    chunk-based train/validation split, training, evaluation and model saving.

    Training length is controlled by the module-level EPOCHS constant:
    0 trains for up to 50 epochs with early stopping on val_loss, any other
    value trains for exactly that many epochs.

    Raises:
        RuntimeError: if the training dataset yields no batch at all.
    """
    # 1) Collect all unique labels => fit LabelEncoder
    all_labels = collect_all_labels(
        DATA_PATH, target_column=TARGET_COLUMN, chunksize=100_000
    )
    label_encoder = LabelEncoder()
    label_encoder.fit(all_labels)
    num_classes = len(all_labels)
    print(f"[INFO] LabelEncoder fitted on {num_classes} classes.")

    # 2) Partial-fit StandardScaler
    scaler = partial_fit_scaler(
        csv_path=DATA_PATH,
        target_column=TARGET_COLUMN,
        drop_columns=DROP_COLUMNS,
        clamp_value=CLAMP_VALUE,
        scaler_chunksize=SCALER_CHUNKSIZE,
        max_chunks=MAX_SCALER_CHUNKS,
    )

    # 3) Class distribution => class weights
    class_counter = compute_class_distribution(
        DATA_PATH, TARGET_COLUMN, chunksize=100_000
    )
    class_weight_dict = make_class_weights(label_encoder, class_counter)
    print("[INFO] Class weights:")
    for idx, w in class_weight_dict.items():
        print(f"  index={idx}, label={label_encoder.classes_[idx]}, weight={w:.3f}")

    # 4) Count total chunks => train/val split
    train_chunks, val_chunks = count_chunks(
        DATA_PATH, chunk_size=CHUNK_SIZE, train_ratio=TRAIN_SPLIT_RATIO
    )

    # 5) Decide how many epochs (0 => large epoch + early stopping, else fixed)
    if EPOCHS == 0:
        print("[INFO] EPOCHS=0 => Using early stopping with a maximum of 50 epochs.")
        epochs_for_fit = 50
        callbacks_list = [
            keras.callbacks.EarlyStopping(
                monitor="val_loss", patience=3, restore_best_weights=True
            )
        ]
    else:
        print(
            f"[INFO] EPOCHS={EPOCHS} => Using exactly {EPOCHS} epochs (no early stopping)."
        )
        epochs_for_fit = EPOCHS
        callbacks_list = []

    # 6) Create train & val datasets.
    # Both are built as a single finite pass (repeat_epochs=0): Keras iterates
    # a finite dataset once per epoch, so `epochs` alone controls how often
    # the data is seen. Repeating inside the dataset as well would multiply
    # the passes per epoch and duplicate samples in the final evaluation.
    dataset_kwargs = dict(
        csv_path=DATA_PATH,
        chunk_size=CHUNK_SIZE,
        target_column=TARGET_COLUMN,
        drop_columns=DROP_COLUMNS,
        scaler=scaler,
        label_encoder=label_encoder,
        clamp_value=CLAMP_VALUE,
        shuffle_seed=SEED,
        repeat_epochs=0,
    )
    train_ds = create_dataset(
        start_chunk=0, end_chunk=train_chunks, is_val=False, **dataset_kwargs
    )
    val_ds = create_dataset(
        start_chunk=train_chunks,
        end_chunk=train_chunks + val_chunks,
        is_val=True,
        **dataset_kwargs,
    )

    # 7) Determine input_dim from the first training batch
    first_batch = next(iter(train_ds.take(1)), None)
    if first_batch is None:
        raise RuntimeError(
            "Training dataset is empty; cannot determine the model input dimension."
        )
    input_dim = first_batch[0].shape[1]
    print(f"[INFO] Detected input_dim={input_dim} from the first training batch.")

    # 8) Build the model
    model = build_model(input_dim, num_classes)
    model.summary()

    # 9) Train
    # The dataset is passed without .repeat(): an endlessly repeating dataset
    # given to fit() without steps_per_epoch never reaches the end of an epoch.
    print("[INFO] Starting training now...")
    history = model.fit(
        train_ds,
        epochs=epochs_for_fit,
        validation_data=val_ds,
        class_weight=class_weight_dict,
        callbacks=callbacks_list,
    )
    print("[INFO] Training complete.")

    # 10) Plot training curves
    plot_training_curves(history)

    # 11) Evaluate on validation data fully
    print("[INFO] Evaluating on validation data to build confusion matrix ...")
    pred_parts = []
    true_parts = []
    for X_val, y_val in val_ds:
        # predict_on_batch avoids the per-call setup and progress output that
        # model.predict() incurs when invoked once per batch.
        probs = model.predict_on_batch(X_val)
        pred_parts.append(np.argmax(probs, axis=1))
        true_parts.append(y_val.numpy())

    if true_parts:
        all_preds = np.concatenate(pred_parts)
        all_labels_eval = np.concatenate(true_parts)
    else:
        all_preds = np.array([], dtype=np.int64)
        all_labels_eval = np.array([], dtype=np.int64)

    # Pin the label set so the matrix and report always have one row per known
    # class, even when some classes do not occur in the validation chunks;
    # otherwise the tick labels / target_names no longer line up.
    class_ids = np.arange(num_classes)

    cm = confusion_matrix(all_labels_eval, all_preds, labels=class_ids)
    print("\n[INFO] Confusion Matrix:\n", cm)
    plot_confusion_matrix(cm, labels=label_encoder.classes_)

    print("\n[INFO] Classification Report:")
    print(
        classification_report(
            all_labels_eval,
            all_preds,
            labels=class_ids,
            target_names=label_encoder.classes_,
            zero_division=0,
        )
    )

    # 12) Save model
    model.save("netflow_classification_model_conditional_epochs.keras")
    print(
        "[INFO] Model saved to 'netflow_classification_model_conditional_epochs.keras'."
    )