# Intrusion Detection System using Autoencoders (Semi-Supervised Learning)

In [3]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloa

In [1]:
# Step 1: Imports
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed


In [2]:
# Step 2: Read the sample CSV into a DataFrame
df = pd.read_csv(
    "UNSW-NB15P-MM-SAMPLE.csv",  # relative path; update as needed
)

In [3]:
# Step 3: Partition rows by the 'Class' label and drop the label column
#   Dn -> rows with Class == 0 (normal), Da -> rows with Class == 1 (attack)
Dn = df.loc[df['Class'].eq(0)].drop(columns='Class')
Da = df.loc[df['Class'].eq(1)].drop(columns='Class')

In [4]:
# Step 4: Hold out 20% of the normal rows (Dnts); the other 80% (Dntr) is used for training
Dntr, Dnts = train_test_split(
    Dn,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [5]:
# Step 5: Build the test set Dts: held-out normal rows first, then every attack row
Dts = pd.concat((Dnts, Da), ignore_index=True)
# Labels follow the concatenation order: 0 per Dnts row, then 1 per Da row
Dts_labels = [0] * Dnts.shape[0] + [1] * Da.shape[0]

In [6]:
# Step 6: Standardize features; the scaler is fitted on the normal training rows only
# and the same fitted scaler is then applied to the combined test set
scaler = StandardScaler().fit(Dntr)
X_train = scaler.transform(Dntr)
X_test = scaler.transform(Dts)

In [7]:
# Step 7: Define Autoencoder
# Seed the Python, NumPy and backend RNGs before any layer is created so the
# initial weights are the same on every Restart & Run All.
set_random_seed(42)

input_dim = X_train.shape[1]  # one input unit per feature column
input_layer = Input(shape=(input_dim,))

# Encoder: input_dim -> 32 -> 16 (bottleneck)
encoded = Dense(32, activation="relu")(input_layer)
encoded = Dense(16, activation="relu")(encoded)

# Decoder: 16 -> 32 -> input_dim; the output is linear so it can take any real value
decoded = Dense(32, activation="relu")(encoded)
output_layer = Dense(input_dim, activation="linear")(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
# Mean squared error between input and reconstruction is the training objective
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

In [8]:
# Step 8: Train the autoencoder on normal traffic
# Input and target are both X_train: the model learns to reconstruct its own input.
# The returned History is kept (history.history holds per-epoch loss / val_loss)
# instead of being discarded and echoed as a bare object repr.
history = autoencoder.fit(
    X_train, X_train,
    epochs=10,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,  # fraction of X_train held out to compute val_loss
    verbose=1,
)

Epoch 1/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 705us/step - loss: 0.5146 - val_loss: 0.1363
Epoch 2/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457us/step - loss: 0.1162 - val_loss: 0.1059
Epoch 3/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 478us/step - loss: 0.0875 - val_loss: 0.0750
Epoch 4/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 430us/step - loss: 0.0757 - val_loss: 0.0648
Epoch 5/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 507us/step - loss: 0.0627 - val_loss: 0.0581
Epoch 6/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 474us/step - loss: 0.0571 - val_loss: 0.0516
Epoch 7/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 514us/step - loss: 0.0475 - val_loss: 0.0468
Epoch 8/10
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 431us/step - loss: 0.0437 - val_loss: 0.0515
Epoch 9/10
[1m965/965[

<keras.src.callbacks.history.History at 0x14ec5fb30>

In [9]:
# Step 9: Reconstruct the training rows, then the combined test rows, with the trained model
X_train_pred, X_test_pred = (
    autoencoder.predict(features) for features in (X_train, X_test)
)

[1m8570/8570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 181us/step
[1m2837/2837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161us/step


In [10]:
# Step 10: Per-row reconstruction error = mean of squared differences across axis 1
train_errors = ((X_train - X_train_pred) ** 2).mean(axis=1)
test_errors = ((X_test - X_test_pred) ** 2).mean(axis=1)

In [11]:
# Step 11: Use the 95th percentile of the training errors as the decision threshold;
# a test row is labelled 1 when its error is strictly above it, otherwise 0
threshold = np.percentile(train_errors, q=95)
predicted_labels = np.greater(test_errors, threshold).astype(int)

In [12]:
# Step 12: Report mean reconstruction errors and classification metrics on Dts
print("Train Mean Error:", train_errors.mean())
print("Test Mean Error:", test_errors.mean())
# Thresholded predictions feed the confusion matrix and the per-class report
print("\nConfusion Matrix:\n", confusion_matrix(y_true=Dts_labels, y_pred=predicted_labels))
print("\nClassification Report:\n", classification_report(y_true=Dts_labels, y_pred=predicted_labels))
# ROC-AUC is computed from the raw errors as scores, not from the thresholded labels
print("\nROC-AUC Score:", roc_auc_score(y_true=Dts_labels, y_score=test_errors))

Train Mean Error: 0.03300393458801275
Test Mean Error: 0.35155288240900207

Confusion Matrix:
 [[65062  3495]
 [  799 21416]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97     68557
           1       0.86      0.96      0.91     22215

    accuracy                           0.95     90772
   macro avg       0.92      0.96      0.94     90772
weighted avg       0.96      0.95      0.95     90772


ROC-AUC Score: 0.9892369289459102
