# Teacher-Student Distillation for Intrusion Detection Autoencoder
This notebook builds on the dropout autoencoder (teacher) and trains a smaller student model to mimic the teacher’s outputs, improving generalization.

In [9]:
# Step 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [10]:
# Step 2: Load & Prepare Data
df = pd.read_csv("UNSW-NB15P-MM-SAMPLE.csv")
Dn = df[df.Class == 0].drop(columns='Class')
Da = df[df.Class == 1].drop(columns='Class')
Dntr, Dnts = train_test_split(Dn, test_size=0.2, random_state=42)
Dts = pd.concat([Dnts, Da], ignore_index=True)
y_test = np.array([0]*len(Dnts) + [1]*len(Da))

In [11]:
# Step 3: Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(Dntr)
X_test = scaler.transform(Dts)

In [12]:
# Step 4: Build & Train Teacher Model (Dropout AE)
input_dim = X_train.shape[1]
inp = Input(shape=(input_dim,))
x = Dense(64, activation='relu')(inp)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.2)(x)
encoded = Dense(16, activation='relu')(x)
x = Dense(32, activation='relu')(encoded)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
teacher_out = Dense(input_dim, activation='linear')(x)
teacher = Model(inp, teacher_out)
teacher.compile(optimizer=Adam(0.001), loss='mse')
teacher.fit(X_train, X_train, epochs=20, batch_size=256, validation_split=0.1, verbose=1)

Epoch 1/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 771us/step - loss: 0.6091 - val_loss: 0.2343
Epoch 2/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 668us/step - loss: 0.3159 - val_loss: 0.1891
Epoch 3/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 757us/step - loss: 0.2684 - val_loss: 0.1624
Epoch 4/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 759us/step - loss: 0.2493 - val_loss: 0.1450
Epoch 5/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 712us/step - loss: 0.2307 - val_loss: 0.1448
Epoch 6/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 712us/step - loss: 0.2254 - val_loss: 0.1410
Epoch 7/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 684us/step - loss: 0.2195 - val_loss: 0.1343
Epoch 8/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 687us/step - loss: 0.2178 - val_loss: 0.1339
Epoch 9/20
[1m965/965[

<keras.src.callbacks.history.History at 0x373445a30>

In [13]:
# Step 5: Teacher Reconstructions
T_train = teacher.predict(X_train)
T_test = teacher.predict(X_test)

[1m8570/8570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171us/step
[1m2837/2837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170us/step


In [14]:
# Step 6: Build Student Model (Smaller AE)
def build_student():
    inp_s = Input(shape=(input_dim,))
    x = Dense(32, activation='relu')(inp_s)
    encoded_s = Dense(16, activation='relu')(x)
    x = Dense(32, activation='relu')(encoded_s)
    out_s = Dense(input_dim, activation='linear')(x)
    return Model(inp_s, out_s)

student = build_student()

In [15]:
# Step 7: Distillation Training
gamma = 0.5  # distillation weight
inp_s = student.input
stud_out = student(inp_s)
distill = Model(inp_s, [stud_out, stud_out])  # two identical outputs
distill.compile(
    optimizer=Adam(0.001),
    loss=['mse','mse'],
    loss_weights=[1.0, gamma]
)
distill.fit(
    X_train, [X_train, T_train],
    epochs=20, batch_size=256,
    validation_split=0.1, verbose=1
)


Epoch 1/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 529us/step - functional_4_loss: 0.3321 - loss: 0.6676 - val_functional_4_loss: 0.0756 - val_loss: 0.1868
Epoch 2/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 520us/step - functional_4_loss: 0.0712 - loss: 0.1691 - val_functional_4_loss: 0.0587 - val_loss: 0.1303
Epoch 3/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460us/step - functional_4_loss: 0.0589 - loss: 0.1279 - val_functional_4_loss: 0.0550 - val_loss: 0.1110
Epoch 4/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451us/step - functional_4_loss: 0.0570 - loss: 0.1124 - val_functional_4_loss: 0.0518 - val_loss: 0.0996
Epoch 5/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 453us/step - functional_4_loss: 0.0524 - loss: 0.0995 - val_functional_4_loss: 0.0527 - val_loss: 0.0940
Epoch 6/20
[1m965/965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 454us/step - 

<keras.src.callbacks.history.History at 0x3249beae0>

In [16]:
# Step 8: Evaluate Student Model
S_pred = student.predict(X_test)
errors_s = np.mean((X_test - S_pred)**2, axis=1)
from sklearn.metrics import precision_recall_curve
prec_s, rec_s, thr_s = precision_recall_curve(y_test, errors_s)
f1_s = 2*(prec_s*rec_s)/(prec_s+rec_s+1e-8)
best_thr_s = thr_s[np.argmax(f1_s)]
y_pred_s = (errors_s > best_thr_s).astype(int)
cm = confusion_matrix(y_test, y_pred_s)
print('Confusion Matrix:', cm)
print(classification_report(y_test, y_pred_s, target_names=['Normal','Attack']))
tn, fp, fn, tp = cm.ravel()
print('FPR:', fp/(fp+tn), 'FNR:', fn/(fn+tp))
print('ROC-AUC:', roc_auc_score(y_test, errors_s))

[1m2837/2837[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163us/step
Confusion Matrix: [[64781  3776]
 [ 1079 21136]]
              precision    recall  f1-score   support

      Normal       0.98      0.94      0.96     68557
      Attack       0.85      0.95      0.90     22215

    accuracy                           0.95     90772
   macro avg       0.92      0.95      0.93     90772
weighted avg       0.95      0.95      0.95     90772

FPR: 0.05507825604971046 FNR: 0.04857078550528922
ROC-AUC: 0.988282251360906
