#  Autoencoder for Anomaly Detection

This notebook trains an autoencoder to detect anomalies in Kubernetes resource usage data. Anomalies are detected based on high reconstruction error.

- Dataset: `k8_synthetic_dataset.csv`
- Evaluation metric: **Macro F1-Score**
- Author: Ammar Yousuf Abrahani
- Date: June 2025

In [3]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

In [5]:
# 📂 Load Dataset
data = pd.read_csv('../data/raw/k8_synthetic_dataset.csv')
data.head()

Unnamed: 0,cpu_usage,memory_usage,network_io,disk_io,label
0,54.967142,41.71005,337.849431,107.373466,0.0
1,48.617357,44.39819,253.891734,92.133224,0.0
2,56.476885,57.472936,343.480296,100.574896,0.0
3,65.230299,56.103703,367.781893,125.569037,0.0
4,47.658466,49.790984,320.671745,103.821981,0.0


In [6]:
# 🎯 Features and Labels
X = data[['cpu_usage', 'memory_usage', 'network_io', 'disk_io']].values
y = data['label'].values.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

Train samples: 210, Test samples: 90


In [7]:
# 🛠️ Define Autoencoder Model
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(8, activation="relu")(input_layer)
decoder = Dense(input_dim, activation="linear")(encoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

In [8]:
# 🏋️ Train Autoencoder
autoencoder.fit(X_train, X_train, epochs=20, batch_size=16, validation_split=0.2)

Epoch 1/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 101459.6953 - val_loss: 86096.3828
Epoch 2/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 97259.6484 - val_loss: 79158.4453
Epoch 3/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 81509.8047 - val_loss: 72843.3984
Epoch 4/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 76396.1562 - val_loss: 67064.8281
Epoch 5/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 72168.7578 - val_loss: 61795.7070
Epoch 6/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 65633.8672 - val_loss: 56991.9805
Epoch 7/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 62267.5781 - val_loss: 52600.7148
Epoch 8/20
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 58999.3359 - val_loss:

<keras.src.callbacks.history.History at 0x183c89e43e0>

In [9]:
# 📉 Compute Reconstruction Error on Test Set
reconstructions = autoencoder.predict(X_test)
mse = np.mean(np.square(X_test - reconstructions), axis=1)
threshold = np.percentile(mse, 95)  # Top 5% as anomalies
y_pred = (mse > threshold).astype(int)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


In [10]:
# 📊 Evaluate Anomaly Detection
report = classification_report(y_test, y_pred, target_names=['Normal', 'Anomaly'], digits=4)
f1_macro = f1_score(y_test, y_pred, average='macro')
print("📊 Autoencoder Performance:\n")
print(report)
print(f"🔍 Macro Average F1-Score: {f1_macro:.4f}")

📊 Autoencoder Performance:

              precision    recall  f1-score   support

      Normal     0.9765    1.0000    0.9881        83
     Anomaly     1.0000    0.7143    0.8333         7

    accuracy                         0.9778        90
   macro avg     0.9882    0.8571    0.9107        90
weighted avg     0.9783    0.9778    0.9761        90

🔍 Macro Average F1-Score: 0.9107
