Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


Upload Files

In [3]:
from google.colab import files

print("Upload KDDTrain_.txt and KDDTest_.txt")
uploaded = files.upload()


Upload KDDTrain_.txt and KDDTest_.txt


Saving KDDTest_.txt to KDDTest_ (1).txt
Saving KDDTrain_.txt to KDDTrain_.txt


 Read Sample Lines

In [4]:
# Preview raw file format
with open("KDDTrain_.txt") as f:
    for _ in range(3):
        print(f.readline().strip())


0,tcp,ftp_data,SF,491,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.00,0.00,0.00,0.00,1.00,0.00,0.00,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.00,0.00,0.00,0.00,0.08,0.15,0.00,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.00,1.00,0.00,0.00,0.05,0.07,0.00,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19


Define Columns and Load Dataset

In [5]:
columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes",
    "land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted",
    "num_root","num_file_creations","num_shells","num_access_files",
    "num_outbound_cmds","is_host_login","is_guest_login","count",
    "srv_count","serror_rate","srv_serror_rate","rerror_rate",
    "srv_rerror_rate","same_srv_rate","diff_srv_rate","srv_diff_host_rate",
    "dst_host_count","dst_host_srv_count","dst_host_same_srv_rate",
    "dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate",
    "dst_host_srv_serror_rate","dst_host_rerror_rate",
    "dst_host_srv_rerror_rate","label","difficulty"
]

train_df = pd.read_csv("KDDTrain_.txt", names=columns)
test_df = pd.read_csv("KDDTest_.txt", names=columns)

print("Train shape:", train_df.shape, "| Test shape:", test_df.shape)
train_df.head()


Train shape: (125973, 43) | Test shape: (22544, 43)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


Clean and Convert Labels

In [6]:
# Drop unused column
train_df.drop("difficulty", axis=1, inplace=True)
test_df.drop("difficulty", axis=1, inplace=True)

# Binary label mapping: 0 = normal, 1 = attack
train_df["label"] = (train_df["label"] != "normal").astype(int)
test_df["label"] = (test_df["label"] != "normal").astype(int)


Check Missing Values


In [7]:
print("Missing values:\n", train_df.isnull().sum().value_counts())

Missing values:
 0    42
Name: count, dtype: int64


 Split Features and Labels

In [8]:
X_train_raw, y_train = train_df.drop("label", axis=1), train_df["label"]
X_test_raw, y_test = test_df.drop("label", axis=1), test_df["label"]


Encode Categorical + Scale Features

In [9]:
cat_cols = ["protocol_type", "service", "flag"]
num_cols = [col for col in X_train_raw.columns if col not in cat_cols]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

print("Final feature shapes →", X_train.shape, X_test.shape)


Final feature shapes → (125973, 122) (22544, 122)


***Phase 3 (DNN)***

---



Build the DNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Train the Model

In [11]:
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=1024,
    callbacks=[early_stop],
    verbose=2
)


Epoch 1/20
99/99 - 3s - 29ms/step - accuracy: 0.9415 - loss: 0.1802 - val_accuracy: 0.9784 - val_loss: 0.0674
Epoch 2/20
99/99 - 2s - 19ms/step - accuracy: 0.9768 - loss: 0.0665 - val_accuracy: 0.9829 - val_loss: 0.0393
Epoch 3/20
99/99 - 1s - 8ms/step - accuracy: 0.9822 - loss: 0.0457 - val_accuracy: 0.9904 - val_loss: 0.0289
Epoch 4/20
99/99 - 1s - 14ms/step - accuracy: 0.9879 - loss: 0.0354 - val_accuracy: 0.9935 - val_loss: 0.0241
Epoch 5/20
99/99 - 1s - 12ms/step - accuracy: 0.9903 - loss: 0.0289 - val_accuracy: 0.9927 - val_loss: 0.0224
Epoch 6/20
99/99 - 2s - 15ms/step - accuracy: 0.9907 - loss: 0.0275 - val_accuracy: 0.9942 - val_loss: 0.0195
Epoch 7/20
99/99 - 1s - 12ms/step - accuracy: 0.9915 - loss: 0.0248 - val_accuracy: 0.9948 - val_loss: 0.0182
Epoch 8/20
99/99 - 1s - 12ms/step - accuracy: 0.9921 - loss: 0.0233 - val_accuracy: 0.9950 - val_loss: 0.0177
Epoch 9/20
99/99 - 1s - 12ms/step - accuracy: 0.9924 - loss: 0.0216 - val_accuracy: 0.9945 - val_loss: 0.0176
Epoch 10/20

Evaluate the Model

In [12]:
from sklearn.metrics import classification_report, roc_auc_score

y_prob = model.predict(X_test).ravel()
y_pred = (y_prob > 0.5).astype(int)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))


[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Classification Report:

              precision    recall  f1-score   support

           0       0.69      0.97      0.81      9711
           1       0.97      0.67      0.79     12833

    accuracy                           0.80     22544
   macro avg       0.83      0.82      0.80     22544
weighted avg       0.85      0.80      0.80     22544

ROC-AUC Score: 0.9260038634017055


Save Model

In [13]:
model.save("nsl_dnn_model.h5")




View it

In [None]:
# Load saved model
model = load_model("nsl_dnn_model.h5")

# Print model summary
model.summary()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>