In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.preprocessing import StandardScaler
import os


In [None]:
# --- 1. Define Column Names and Constants ---
# Based on the official NSL-KDD dataset documentation
# These are the 41 features + 1 label column
COLUMNS = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty'
]

# Identify which columns are categorical (text) and which are numeric
CATEGORICAL_COLS = ['protocol_type', 'service', 'flag']
# All other columns (except 'label' and 'difficulty') are numeric
NUMERIC_COLS = list(set(COLUMNS[:-2]) - set(CATEGORICAL_COLS))

# File path for saving the trained Keras model
MODEL_FILE = 'kdd_detection_model.keras'


In [None]:
# --- 2. Load Data ---
def load_data(train_path, test_path):
    # Load data, skipping the last 'difficulty' column (index 42)
    train_df = pd.read_csv(train_path, 
                           names=COLUMNS[:-1],  # Use the first 42 names (features + label)
                           usecols=range(42))   # Read the first 42 columns by index (0-41)

    test_df = pd.read_csv(test_path, 
                          names=COLUMNS[:-1],   # Use the first 42 names
                          usecols=range(42))    # Read the first 42 columns by index

    return train_df, test_df


TRAIN_FILE = "data/KDDTrain+.txt"
TEST_FILE = "data/KDDTest+.txt"

train_df, test_df = load_data(TRAIN_FILE, TEST_FILE)
train_df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [None]:
# --- 3. Preprocess Data ---
def preprocess(train_df, test_df):
    print("Starting preprocessing...")

    # --- 3.1. Handle Labels (Target Variable) ---
    # Simplify to binary classification: 0 = 'normal', 1 = 'attack'
    y_train = train_df['label'].apply(lambda x: 0 if x == 'normal' else 1).values
    y_test = test_df['label'].apply(lambda x: 0 if x == 'normal' else 1).values

    # Drop the label column from the feature dataframes
    X_train = train_df.drop('label', axis=1)
    X_test = test_df.drop('label', axis=1)

    # --- 3.2. One-Hot Encoding for Categorical Features ---
    # Combine train and test for consistent one-hot encoding
     # The get_dummies function creates new columns for each unique value
    # in the categorical columns.
    combined = pd.concat([X_train, X_test], axis=0)
    combined_encoded = pd.get_dummies(combined, columns=CATEGORICAL_COLS, dtype=float)

    X_train_enc = combined_encoded.iloc[:len(X_train)]
    X_test_enc = combined_encoded.iloc[len(X_train):]

    # Scaling numeric columns
    scaler = StandardScaler()
    scaler.fit(X_train_enc[NUMERIC_COLS])

    # Separate back into train and test
    X_train_enc[NUMERIC_COLS] = scaler.transform(X_train_enc[NUMERIC_COLS])
    X_test_enc[NUMERIC_COLS] = scaler.transform(X_test_enc[NUMERIC_COLS])

    print("Preprocessing complete.")
    return X_train_enc.values, X_test_enc.values, y_train, y_test


X_train, X_test, y_train, y_test = preprocess(train_df, test_df)
X_train.shape


Starting preprocessing...
Preprocessing complete.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_enc[NUMERIC_COLS] = scaler.transform(X_train_enc[NUMERIC_COLS])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_enc[NUMERIC_COLS] = scaler.transform(X_test_enc[NUMERIC_COLS])


(125973, 122)

In [None]:
# --- 4. Build the Neural Network ---
def build_model(input_shape):
    model = keras.Sequential([
        # Input layer: Must match the number of features
        keras.layers.Input(shape=(input_shape,)),
        # Hidden layer 1
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.3), # Dropout for regularization
        # Hidden layer 2
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.3), # Dropout for regularization
        # Output layer: 1 neuron with sigmoid activation for binary classification
        keras.layers.Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


In [None]:
# Train or load model
if os.path.exists(MODEL_FILE):
    # 5.1. Load existing model
    print("Loading existing model...")
    model = keras.models.load_model(MODEL_FILE)
else:
    # Train the model
    print("Training new model...")
    model = build_model(X_train.shape[1])

    history = model.fit(
        X_train, y_train,
        epochs=20,
        batch_size=64,
        validation_split=0.1,
        verbose=1
    )

    # Save the trained model
    model.save(MODEL_FILE)
    print("Model saved.")


Training new model...
Epoch 1/20
[1m1772/1772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9810 - loss: 0.0561 - val_accuracy: 0.9919 - val_loss: 0.0233
Epoch 2/20
[1m1772/1772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9909 - loss: 0.0276 - val_accuracy: 0.9937 - val_loss: 0.0172
Epoch 3/20
[1m1772/1772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9927 - loss: 0.0217 - val_accuracy: 0.9948 - val_loss: 0.0159
Epoch 4/20
[1m1772/1772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9930 - loss: 0.0203 - val_accuracy: 0.9950 - val_loss: 0.0141
Epoch 5/20
[1m1772/1772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9933 - loss: 0.0188 - val_accuracy: 0.9937 - val_loss: 0.0150
Epoch 6/20
[1m1772/1772[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9940 - loss: 0.0166 - val_accuracy: 0.9957 - val_loss: 0

In [None]:
# Evaluate the model on the unseen test data 
print("\nEvaluating on test set...")
results = model.evaluate(X_test, y_test)
print(f"Test Loss: {results[0]:.4f}")
print(f"Test Accuracy: {results[1]*100:.2f}%")



Evaluating on test set...
[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8008 - loss: 2.4302
Test Loss: 2.4302
Test Accuracy: 80.08%


In [None]:
# Makes predictions on all values in KDDTest+.txt
print("\n--- Making Predictions ---")

num_of_predictions = 22544
predictions = model.predict(X_test[:num_of_predictions])

correct_predictions = 0

for i in range(num_of_predictions):
    pred_class = "Attack" if predictions[i] > 0.5 else "Normal"
    actual_class = "Attack" if y_test[i] == 1 else "Normal"

    if pred_class == actual_class:
        correct_predictions += 1

print(f"Prediction accuracy over {num_of_predictions} samples: {correct_predictions/num_of_predictions*100:.2f}%")


[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Prediction accuracy over 22544 samples: 80.08%
