# Intrusion Detection System using Autoencoders (Semi-Supervised Learning)

In [1]:
pip install tensorflow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Step 1: Imports
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed


In [15]:
# Step 2: Read the CSV named below into a DataFrame
# (edit the path if the file is stored somewhere else)
dataset_csv = "UNSW-NB15P-MM-SAMPLE.csv"
df = pd.read_csv(dataset_csv)

In [16]:
# Step 3: Partition rows on the 'Class' column: 0 goes to Dn (normal) and
# 1 goes to Da (attack). The label column is dropped from both frames.
is_normal = df['Class'].eq(0)
is_attack = df['Class'].eq(1)
Dn = df.loc[is_normal].drop(columns=['Class'])
Da = df.loc[is_attack].drop(columns=['Class'])

In [17]:
# Step 4: Hold out 20% of the normal rows as Dnts; the remaining 80% (Dntr)
# is the training portion. A fixed random_state keeps the split repeatable.
Dntr, Dnts = train_test_split(
    Dn,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Step 5: Assemble the test set Dts: held-out normal rows first, then all
# attack rows. Dts_labels is built in the same order (0 = normal, 1 = attack).
test_parts = [Dnts, Da]
Dts = pd.concat(test_parts, ignore_index=True)
Dts_labels = [0 for _ in range(len(Dnts))] + [1 for _ in range(len(Da))]

In [19]:
# Step 6: Standardize the features. The scaler is fitted on the normal
# training rows only, then applied unchanged to both train and test data.
scaler = StandardScaler()
scaler.fit(Dntr)
X_train = scaler.transform(Dntr)
X_test = scaler.transform(Dts)

In [20]:
# Step 7: Define Autoencoder
# Seed Python, NumPy and the Keras backend before any layer is created, so the
# randomly initialised weights are the same on every fresh run of the notebook.
set_random_seed(42)

# One input unit per column of the scaled training matrix
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))

# Encoder: 32 units, then a 16-unit bottleneck
encoded = Dense(32, activation="relu")(input_layer)
encoded = Dense(16, activation="relu")(encoded)

# Decoder: back up to 32 units, then to the input width. The output is linear
# because standardized inputs can take any real value.
decoded = Dense(32, activation="relu")(encoded)
output_layer = Dense(input_dim, activation="linear")(decoded)

# Optimise mean squared reconstruction error with Adam (learning rate 0.001)
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=Adam(0.001), loss='mse')

In [21]:
# Step 8: Fit the autoencoder to reproduce its own input; the normal-traffic
# training matrix is passed as both the input and the target.
fit_options = dict(
    epochs=10,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,  # fraction of X_train held back to report val_loss
    verbose=1,
)
autoencoder.fit(X_train, X_train, **fit_options)

Epoch 1/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 1s 613us/step - loss: 0.5275 - val_loss: 0.1471
Epoch 2/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 458us/step - loss: 0.1251 - val_loss: 0.0963
Epoch 3/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 449us/step - loss: 0.0842 - val_loss: 0.0707
Epoch 4/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 419us/step - loss: 0.0656 - val_loss: 0.0602
Epoch 5/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 420us/step - loss: 0.0568 - val_loss: 0.0554
Epoch 6/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 428us/step - loss: 0.0509 - val_loss: 0.0510
Epoch 7/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 422us/step - loss: 0.0457 - val_loss: 0.0429
Epoch 8/10
965/965 ━━━━━━━━━━━━━━━━━━━━ 0s 453us/step - loss: 0.0403 - val_loss: 0.0384
Epoch 9/10
[1m965/965[

<keras.src.callbacks.history.History at 0x177e3d070>

In [22]:
# Step 9: Reconstruct the training matrix first, then the combined test matrix
X_train_pred, X_test_pred = (
    autoencoder.predict(features) for features in (X_train, X_test)
)

8570/8570 ━━━━━━━━━━━━━━━━━━━━ 1s 171us/step
2837/2837 ━━━━━━━━━━━━━━━━━━━━ 1s 175us/step


In [23]:
# Step 10: Per-row reconstruction error, i.e. the mean of the squared
# differences between each input row and its reconstruction (axis=1).
train_residuals = X_train - X_train_pred
test_residuals = X_test - X_test_pred
train_errors = np.square(train_residuals).mean(axis=1)
test_errors = np.square(test_residuals).mean(axis=1)

In [24]:
# Step 11: Label a test row 1 when its error is strictly above the 95th
# percentile of the training errors, and 0 otherwise.
threshold = np.percentile(train_errors, q=95)
exceeds_threshold = np.greater(test_errors, threshold)
predicted_labels = exceeds_threshold.astype(int)

In [25]:
# Step 12: Report the mean errors, then score the predictions against
# Dts_labels. ROC-AUC is computed from the raw errors rather than the
# thresholded labels.
print("Train Mean Error:", train_errors.mean())
print("Test Mean Error:", test_errors.mean())

conf_matrix = confusion_matrix(Dts_labels, predicted_labels)
print("\nConfusion Matrix:\n", conf_matrix)

report_text = classification_report(Dts_labels, predicted_labels)
print("\nClassification Report:\n", report_text)

auc_score = roc_auc_score(Dts_labels, test_errors)
print("\nROC-AUC Score:", auc_score)

Train Mean Error: 0.03262766844910162
Test Mean Error: 0.3035194184978005

Confusion Matrix:
 [[65052  3505]
 [ 1572 20643]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96     68557
           1       0.85      0.93      0.89     22215

    accuracy                           0.94     90772
   macro avg       0.92      0.94      0.93     90772
weighted avg       0.95      0.94      0.94     90772


ROC-AUC Score: 0.985990369343307
