In [91]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Normalization,
    StringLookup,
    IntegerLookup,
    Embedding,
    Flatten,
    Concatenate,
    Dense,
    BatchNormalization,
    Dropout,
    Input,
)
from tensorflow.keras import Model
from sklearn.utils import class_weight
import numpy as np
import os

# Ensure TensorFlow version compatibility
print(f"TensorFlow Version: {tf.__version__}")

TensorFlow Version: 2.18.0


In [92]:
# Cell 2: Configuration Parameters

# File path to the CSV dataset
file_path = "/mnt/c/Users/mikig/Desktop/UPC/PAE/Datasets/9810e03bba4983da_MOHANAD_A4706/9810e03bba4983da_MOHANAD_A4706/data/NF-UQ-NIDS-v2.csv"

# Batch size for training
batch_size = 64

# Number of batches to use for adapting normalization and lookup layers
batches_for_adaptation = 5000

# Validation split (20% for validation)
validation_split = 0.2

# Random seed for reproducibility
seed = 42

# Early stopping and model checkpointing parameters
patience = 3  # For EarlyStopping
model_checkpoint_path = "best_model.keras"

In [93]:
# Cell 3: Define Column Defaults

# Define column defaults with appropriate data types
column_defaults = [
    tf.string,  # 0: IPV4_SRC_ADDR
    tf.int64,  # 1: L4_SRC_PORT
    tf.string,  # 2: IPV4_DST_ADDR
    tf.int64,  # 3: L4_DST_PORT
    tf.int64,  # 4: PROTOCOL
    tf.float32,  # 5: L7_PROTO
    tf.int64,  # 6: IN_BYTES
    tf.int64,  # 7: IN_PKTS
    tf.int64,  # 8: OUT_BYTES
    tf.int64,  # 9: OUT_PKTS
    tf.int64,  # 10: TCP_FLAGS
    tf.int64,  # 11: CLIENT_TCP_FLAGS
    tf.int64,  # 12: SERVER_TCP_FLAGS
    tf.int64,  # 13: FLOW_DURATION_MILLISECONDS
    tf.int64,  # 14: DURATION_IN
    tf.int64,  # 15: DURATION_OUT
    tf.int64,  # 16: MIN_TTL
    tf.int64,  # 17: MAX_TTL
    tf.int64,  # 18: LONGEST_FLOW_PKT
    tf.int64,  # 19: SHORTEST_FLOW_PKT
    tf.int64,  # 20: MIN_IP_PKT_LEN
    tf.int64,  # 21: MAX_IP_PKT_LEN
    tf.float32,  # 22: SRC_TO_DST_SECOND_BYTES
    tf.float32,  # 23: DST_TO_SRC_SECOND_BYTES
    tf.int64,  # 24: RETRANSMITTED_IN_BYTES
    tf.int64,  # 25: RETRANSMITTED_IN_PKTS
    tf.int64,  # 26: RETRANSMITTED_OUT_BYTES
    tf.int64,  # 27: RETRANSMITTED_OUT_PKTS
    tf.float32,  # 28: SRC_TO_DST_AVG_THROUGHPUT
    tf.float32,  # 29: DST_TO_SRC_AVG_THROUGHPUT
    tf.int64,  # 30: NUM_PKTS_UP_TO_128_BYTES
    tf.int64,  # 31: NUM_PKTS_128_TO_256_BYTES
    tf.int64,  # 32: NUM_PKTS_256_TO_512_BYTES
    tf.int64,  # 33: NUM_PKTS_512_TO_1024_BYTES
    tf.int64,  # 34: NUM_PKTS_1024_TO_1514_BYTES
    tf.int64,  # 35: TCP_WIN_MAX_IN
    tf.int64,  # 36: TCP_WIN_MAX_OUT
    tf.int64,  # 37: ICMP_TYPE
    tf.int64,  # 38: ICMP_IPV4_TYPE
    tf.int64,  # 39: DNS_QUERY_ID
    tf.int64,  # 40: DNS_QUERY_TYPE
    tf.int64,  # 41: DNS_TTL_ANSWER
    tf.float32,  # 42: FTP_COMMAND_RET_CODE
    tf.int64,  # 43: Label
    tf.string,  # 44: Attack
    tf.string,  # 45: Dataset
]

In [94]:
# Cell 4: Feature Categorization

# Define feature categories
numerical_features = [
    "IN_BYTES",
    "OUT_BYTES",
    "RETRANSMITTED_IN_BYTES",
    "RETRANSMITTED_OUT_BYTES",
    "IN_PKTS",
    "OUT_PKTS",
    "RETRANSMITTED_IN_PKTS",
    "RETRANSMITTED_OUT_PKTS",
    "FLOW_DURATION_MILLISECONDS",
    "DURATION_IN",
    "DURATION_OUT",
    "SRC_TO_DST_SECOND_BYTES",
    "DST_TO_SRC_SECOND_BYTES",
    "SRC_TO_DST_AVG_THROUGHPUT",
    "DST_TO_SRC_AVG_THROUGHPUT",
    "MIN_IP_PKT_LEN",
    "MAX_IP_PKT_LEN",
    "LONGEST_FLOW_PKT",
    "SHORTEST_FLOW_PKT",
    "TCP_WIN_MAX_IN",
    "TCP_WIN_MAX_OUT",
    "FTP_COMMAND_RET_CODE",
]

categorical_features = ["PROTOCOL", "ICMP_TYPE", "ICMP_IPV4_TYPE", "DNS_QUERY_TYPE"]

identifier_features = ["IPV4_SRC_ADDR", "IPV4_DST_ADDR", "L4_SRC_PORT", "L4_DST_PORT"]

columns_to_remove = ["Label", "Dataset"]

label_column = "Attack"

In [100]:
# Cell 5: Create Training and Validation Datasets (Using Take and Skip)

# Define total number of samples (you can adjust this if known)
total_samples = 75987975  # Replace with your actual total samples if different

# Calculate the number of training and validation samples
train_samples = int(total_samples * 0.8)
validation_samples = total_samples - train_samples

# Calculate number of training and validation steps
train_steps = round((train_samples / batch_size), 0)
train_steps = int(
    train_steps if train_samples / batch_size < train_steps else train_steps + 1
)
validation_steps = round((validation_samples / batch_size), 0)
validation_steps = int(
    validation_steps
    if validation_samples / batch_size < validation_steps
    else validation_steps + 1
)

print(f"Total samples: {total_samples}")
print(f"Training samples: {train_samples}")
print(f"Validation samples: {validation_samples}")
print(f"Training steps: {train_steps}")
print(f"Validation steps: {validation_steps}")

# Create a single dataset without validation_split and subset
full_dataset = tf.data.experimental.make_csv_dataset(
    file_path,
    label_name=label_column,
    batch_size=batch_size,
    column_defaults=column_defaults,
    shuffle=True,  # Shuffle the data
    num_epochs=1,  # Repeat once
    ignore_errors=False,
    shuffle_seed=seed,  # Use 'shuffle_seed' instead of 'seed'
)

# Split the dataset using take and skip
train_dataset = full_dataset.take(train_samples // batch_size).prefetch(
    tf.data.AUTOTUNE
)
validation_dataset = full_dataset.skip(train_samples // batch_size).prefetch(
    tf.data.AUTOTUNE
)

print("Training and validation datasets created successfully using take and skip.")

Total samples: 75987975
Training samples: 60790380
Validation samples: 15197595
Training steps: 949850
Validation steps: 237463
Training and validation datasets created successfully using take and skip.


In [None]:
# Cell 6: Label Encoding

# Create a StringLookup layer to encode string labels into integers
label_lookup = StringLookup(output_mode="int")

print("Constructing vocabulary for labels...")
# Adapt the label_lookup layer to the 'Attack' column
attack_column_data = train_dataset.map(lambda features, label: label).take(
    batches_for_adaptation
)
label_lookup.adapt(attack_column_data)

# After adapting the label_lookup layer
vocab = label_lookup.get_vocabulary()
print("Vocabulary size:", len(vocab))
print("Vocabulary:", vocab)

In [None]:
# Cell 7: Normalization of Numerical Features

# Create and adapt Normalization layers for each numerical feature
normalizers = {}
for feature in numerical_features:
    normalizer = Normalization(
        axis=None
    )  # axis=None normalizes over the entire feature
    # Extract the feature data for adaptation
    feature_data = (
        train_dataset.map(
            lambda features, label: tf.expand_dims(features[feature], axis=-1)
        )
        .unbatch()
        .take(batches_for_adaptation)
    )
    # Adapt the normalizer with feature values
    normalizer.adapt(feature_data)
    normalizers[feature] = normalizer
    print(f"Normalization for {feature} adapted.")

In [None]:
# Cell 8: Categorical Feature Encoding

from tensorflow.keras.layers import IntegerLookup, Embedding

# Define lookup layers for categorical features
categorical_lookup_layers = {}
embedding_layers = {}
embedding_dim = 8  # You can adjust this based on experimentation

for feature in categorical_features:
    # Create and adapt IntegerLookup layers
    lookup = IntegerLookup(output_mode="int", mask_token=None)
    # Extract the feature data for adaptation
    feature_data = train_dataset.map(lambda features, label: features[feature]).take(
        batches_for_adaptation
    )
    lookup.adapt(feature_data)
    categorical_lookup_layers[feature] = lookup
    print(f"Lookup layer for {feature} adapted.")

    # Create Embedding layers
    vocab_size = len(lookup.get_vocabulary()) + 1  # +1 for masking if necessary
    embedding = Embedding(
        input_dim=vocab_size, output_dim=embedding_dim, name=f"{feature}_embedding"
    )
    embedding_layers[feature] = embedding
    print(f"Embedding layer for {feature} created with vocab size {vocab_size}.")

In [None]:
# Pre-adapt Normalization for `bytes_per_packet`
bytes_per_packet_normalizer = Normalization(axis=None)
bytes_pp_data = (
    train_dataset.map(
        lambda features, label: tf.cast(features["IN_BYTES"], tf.float32)
        / (tf.cast(features["IN_PKTS"], tf.float32) + 1e-6)
    )
    .unbatch()
    .take(batches_for_adaptation)
)

# Adapt normalizer
bytes_per_packet_normalizer.adapt(bytes_pp_data)
normalizers["bytes_per_packet"] = bytes_per_packet_normalizer
print("Normalization for 'bytes_per_packet' pre-adapted.")

In [88]:
def preprocess(features, label):
    def ip_to_int(ip_string):
        try:
            octets = tf.strings.split(ip_string, ".")
            octets = tf.strings.to_number(octets, out_type=tf.int64)
            ip_int = (
                tf.bitwise.left_shift(octets[:, 0], 24)
                + tf.bitwise.left_shift(octets[:, 1], 16)
                + tf.bitwise.left_shift(octets[:, 2], 8)
                + octets[:, 3]
            )
            return ip_int
        except Exception:
            return tf.zeros_like(ip_string, dtype=tf.int64)

    # Remove unnecessary columns
    for col in columns_to_remove:
        features.pop(col, None)

    # Convert IP columns to integers and ensure correct batch shape
    for col in identifier_features[:2]:  # IPV4_SRC_ADDR, IPV4_DST_ADDR
        features[col] = ip_to_int(features[col])  # Ensure batch shape (64,)

    # Normalize numerical features
    for feature in numerical_features:
        if feature in features:
            features[feature] = tf.squeeze(
                normalizers[feature](
                    tf.expand_dims(tf.cast(features[feature], tf.float32), axis=-1)
                )
            )
        else:
            features[feature] = 0.0  # Default value for missing features

    # Compute and normalize `bytes_per_packet`
    if "IN_BYTES" in features and "IN_PKTS" in features:
        bytes_per_packet = tf.cast(features["IN_BYTES"], tf.float32) / (
            tf.cast(features["IN_PKTS"], tf.float32) + 1e-6
        )
        features["bytes_per_packet"] = tf.squeeze(
            normalizers["bytes_per_packet"](tf.expand_dims(bytes_per_packet, axis=-1))
        )
    else:
        features["bytes_per_packet"] = 0.0

    # Encode categorical features
    for feature in categorical_features:
        if feature in features:
            features[feature] = categorical_lookup_layers[feature](features[feature])
        else:
            features[feature] = 0  # Default for missing categorical features

    for feature in features:
        features[feature] = tf.expand_dims(features[feature], axis=-1)

    # Create input dictionary
    input_dict = {
        key: features[key]
        for key in features
        if key
        in numerical_features
        + identifier_features
        + categorical_features
        + ["bytes_per_packet"]
    }

    # Encode the label
    label = label_lookup(label)

    return input_dict, label

In [89]:
# Cell 10: Apply Preprocessing to Datasets

train_dataset = train_dataset.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

validation_dataset = validation_dataset.map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [None]:
# Define Inputs for each feature type
numerical_inputs = [
    Input(shape=(1,), name=feature, dtype=tf.float32)
    for feature in numerical_features + ["bytes_per_packet"]
]

identifier_inputs = [
    Input(shape=(1,), name=feature, dtype=tf.int64) for feature in identifier_features
]

categorical_inputs = [
    Input(shape=(1,), name=feature, dtype=tf.int64) for feature in categorical_features
]

# Process categorical inputs with embeddings
embedded_categorical = []
for feature, input_layer in zip(categorical_features, categorical_inputs):
    embedding_layer = embedding_layers[feature]  # Retrieve pre-defined embedding layer
    embedded_output = Flatten()(
        embedding_layer(input_layer)
    )  # Flatten (batch_size, 1, embedding_dim) -> (batch_size, embedding_dim)
    embedded_categorical.append(embedded_output)

# Concatenate all processed inputs
# Shapes:
# - Numerical inputs: (batch_size, 1)
# - Identifier inputs: (batch_size, 1)
# - Embedded categorical: (batch_size, embedding_dim)
all_features = Concatenate(axis=-1)(
    numerical_inputs + identifier_inputs + embedded_categorical
)
print(f"Shape after concatenation: {all_features.shape}")  # Debugging output

# Define dense layers
x = Dense(128, activation="relu", name="dense_1")(all_features)
x = BatchNormalization(name="batch_norm_1")(x)
x = Dropout(0.5, name="dropout_1")(x)

x = Dense(64, activation="relu", name="dense_2")(x)
x = BatchNormalization(name="batch_norm_2")(x)
x = Dropout(0.3, name="dropout_2")(x)

# Output layer with softmax activation for multi-class classification
num_classes = len(vocab)
output = Dense(num_classes, activation="softmax", name="output")(x)

# Build the model
model = Model(
    inputs=numerical_inputs + identifier_inputs + categorical_inputs,
    outputs=output,
    name="network_traffic_classifier",
)

# Compile the model
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Print model summary
model.summary()

In [None]:
# Cell 12: Compute Class Weights

# Collect labels from a subset for class weight computation
labels_subset = []
for _, label in train_dataset.take(batches_for_adaptation):
    labels_subset.extend(label.numpy().tolist())

# Compute class weights using scikit-learn
class_weights_array = class_weight.compute_class_weight(
    class_weight="balanced", classes=np.unique(labels_subset), y=labels_subset
)

# Convert to dictionary
class_weights = {i: weight for i, weight in enumerate(class_weights_array)}
print("Class Weights:", class_weights)

In [None]:
# Cell 13: Calculate Steps Per Epoch


def count_lines(file_path):
    """Counts the number of lines in a file."""
    with open(file_path, "r") as f:
        for i, _ in enumerate(f):
            pass
    return i + 1  # Total number of lines including header


# Total number of samples (subtract header)
# total_lines = count_lines(file_path)
total_lines = 75987977  # Since we know the total number of lines
total_samples = total_lines - 1
print(f"Total lines in CSV (including header): {total_lines}")
print(f"Total samples: {total_samples}")

# Since validation_split=0.2
steps_per_epoch = int((total_samples * 0.8) // batch_size)
validation_steps = int((total_samples * 0.2) // batch_size)
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Validation steps: {validation_steps}")

In [None]:
print("Vocabulary:", label_lookup.get_vocabulary())
for features, label in train_dataset.take(1):
    print("Sample labels:", label)

print("Label values:", label.numpy())

In [None]:
# Cell 14: Train the Model

history = model.fit(
    train_dataset,
    epochs=10,  # Adjust based on convergence
    # steps_per_epoch=steps_per_epoch,
    validation_data=validation_dataset,
    # validation_steps=validation_steps,
    class_weight=class_weights,  # Include if handling class imbalance
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=patience, restore_best_weights=True
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=model_checkpoint_path, monitor="val_loss", save_best_only=True
        ),
    ],
)

In [None]:
# Cell 15: Evaluate the Model

# Evaluate the model on the validation dataset
val_loss, val_accuracy = model.evaluate(validation_dataset, steps=validation_steps)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

# Plot training & validation accuracy and loss values
import matplotlib.pyplot as plt

# Retrieve history data
acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]

loss = history.history["loss"]
val_loss = history.history["val_loss"]

epochs_range = range(len(acc))

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label="Training Accuracy")
plt.plot(epochs_range, val_acc, label="Validation Accuracy")
plt.legend(loc="lower right")
plt.title("Training and Validation Accuracy")

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label="Training Loss")
plt.plot(epochs_range, val_loss, label="Validation Loss")
plt.legend(loc="upper right")
plt.title("Training and Validation Loss")

plt.show()

In [None]:
# Cell 16: Save and Load the Model

# Save the trained model
model.save("final_model.h5")
print("Model saved to 'final_model.h5'.")

# Load the model (if needed)
# loaded_model = tf.keras.models.load_model("final_model.h5")
# print("Model loaded from 'final_model.h5'.")

In [None]:
# Cell 17: Make Predictions

# Example: Get a batch of validation data
for batch_features, batch_labels in validation_dataset.take(1):
    predictions = model.predict(batch_features)
    predicted_classes = tf.argmax(predictions, axis=1).numpy()
    true_classes = batch_labels.numpy()
    print("Predicted classes:", predicted_classes)
    print("True classes:", true_classes)
    break

In [None]:
# Cell 18: Additional Evaluation Metrics

from sklearn.metrics import classification_report

# Collect all predictions and true labels
y_true = []
y_pred = []

for batch_features, batch_labels in validation_dataset:
    preds = model.predict(batch_features)
    preds = tf.argmax(preds, axis=1).numpy()
    y_pred.extend(preds)
    y_true.extend(batch_labels.numpy())

# Generate classification report
report = classification_report(
    y_true, y_pred, target_names=vocab[1:]
)  # Exclude '[UNK]' if present
print("Classification Report:\n", report)

In [None]:
# Cell 19: Feature Importance and Interpretation

import shap

# Select a subset of the validation data for SHAP
sample_features, sample_labels = next(iter(validation_dataset.take(1)))


# Define a prediction function for SHAP
def model_predict(inputs):
    return model(inputs)


# Initialize SHAP explainer
explainer = shap.GradientExplainer(model_predict, sample_features)

# Compute SHAP values
shap_values = explainer.shap_values(sample_features)

# Plot SHAP summary
shap.summary_plot(
    shap_values, sample_features, feature_names=list(sample_features.keys())
)