In [1]:
# Shell escape: print the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Sun Feb  8 05:29:37 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   38C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
from google.colab import drive
import os

In [6]:
# Mount Google Drive into the Colab filesystem; the dataset and model paths used later live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Parquet file on the mounted Drive that the notebook reads as training data.
TRAIN_DATA_PATH = "/content/drive/MyDrive/ciciot2023_clean.parquet"

# Report up front whether the file is visible from this runtime.
print(
    f"✅ Success! File found at: {TRAIN_DATA_PATH}"
    if os.path.exists(TRAIN_DATA_PATH)
    else f"❌ File not found at {TRAIN_DATA_PATH}"
)

✅ Success! File found at: /content/drive/MyDrive/ciciot2023_clean.parquet


In [8]:
import sys
import os
import joblib
import numpy as np
import polars as pl
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [9]:
# Folder on Drive that receives the saved model and preprocessing artifacts.
DRIVE_ROOT = "/content/drive/MyDrive/IoT_Models"
MODELS_DIR = DRIVE_ROOT
os.makedirs(MODELS_DIR, exist_ok=True)

# Input columns fed to the model. The order matters: it fixes the column
# order of the feature matrix and therefore of the fitted scaler.
SELECTED_FEATURES = [
    "Header_Length",
    "Protocol Type",
    "Rate",
    "fin_flag_number",
    "syn_flag_number",
    "rst_flag_number",
    "psh_flag_number",
    "ack_flag_number",
    "ack_count",
    "syn_count",
    "rst_count",
    "TCP",
    "ICMP",
    "Tot sum",
    "Min",
    "Max",
    "AVG",
    "Std",
    "IAT",
    "Number",
    "HTTP",
    "HTTPS",
    "DNS",
    "Telnet",
    "SMTP",
    "SSH",
    "DHCP",
    "ARP",
    "IRC",
]

In [10]:
# Training hyper-parameters.
BATCH_SIZE = 2_048   # rows per batch for both model.fit and model.predict
EPOCHS = 20          # upper bound on epochs; early stopping may end training sooner
# NOTE(review): presumably the number of timesteps per sample. No visible cell
# reads this constant - the reshape and Input layer hard-code a time axis of 1.
SEQUENCE_LENGTH = 1

In [11]:
# Scan lazily so the column projection is pushed down into the parquet reader:
# only the model inputs and the label column are read from Drive, instead of
# materialising every column of the file and discarding the rest afterwards.
df = (
    pl.scan_parquet(TRAIN_DATA_PATH)
    .select(SELECTED_FEATURES + ["label_category"])
    .collect()
)
df.head()

Header_Length,Protocol Type,Rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ack_count,syn_count,rst_count,TCP,ICMP,Tot sum,Min,Max,AVG,Std,IAT,Number,HTTP,HTTPS,DNS,Telnet,SMTP,SSH,DHCP,ARP,IRC,label_category
f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,str
32.0,6.0,10041.426758,0.0,0.0,0.0,0.5,1.0,10.0,0.0,0.0,1.0,0.0,16016.0,66.0,4254.0,1601.599976,1536.304932,0.000103,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Benign"""
32.0,6.0,641.311279,0.0,0.0,0.0,0.2,1.0,10.0,0.0,0.0,1.0,0.0,9794.0,66.0,2962.0,979.400024,974.595459,0.002116,10.0,0.2,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Benign"""
29.6,6.0,62.990967,0.0,0.0,0.0,0.3,0.9,9.0,0.0,0.0,0.9,0.0,5965.0,66.0,2962.0,596.5,947.314941,0.022838,10.0,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Benign"""
25.200001,6.0,410.273102,0.0,0.0,0.0,0.1,0.8,8.0,0.0,0.0,0.8,0.0,12395.0,60.0,2962.0,1239.5,920.812805,0.003697,10.0,0.0,0.8,0.1,0.0,0.0,0.0,0.0,0.1,0.0,"""Benign"""
32.0,6.0,2895.019287,0.0,0.0,0.0,0.0,1.0,10.0,0.0,0.0,1.0,0.0,9348.0,66.0,1514.0,934.799988,747.743958,0.000378,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Benign"""


In [12]:
# Class balance check: number of rows per label category.
df.get_column("label_category").value_counts()

label_category,count
str,u32
"""DDoS""",5888229
"""Web_BruteForce""",6552
"""Recon""",119664
"""Spoofing""",84236
"""Mirai""",456365
"""Benign""",190197
"""DoS""",1401918


In [13]:
# Feature matrix (one column per entry of SELECTED_FEATURES, in that order)
# and the string class label of every row.
X = df.select(SELECTED_FEATURES).to_numpy()
y_raw = df["label_category"].to_list()

print("Encoding Labels...")
# Map class names to integer ids, then one-hot encode them for the
# categorical cross-entropy loss.
le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# Split BEFORE scaling. Fitting the scaler on the full dataset lets the
# validation rows influence the mean/variance applied to the training data,
# which leaks information and makes validation metrics optimistic.
# random_state and stratify are unchanged, so the same rows land in each
# partition as before.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Scaling Features...")
# Learn the scaling statistics from the training rows only, then apply the
# same transform to the validation rows. This is also the scaler that gets
# persisted for inference.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_val_scaled = scaler.transform(X_val_raw)

# Insert a length-1 time axis: (samples, features) -> (samples, 1, features),
# the 3-D layout the LSTM layers expect.
X_train = X_train_scaled[:, np.newaxis, :]
X_val = X_val_scaled[:, np.newaxis, :]

# Cheap invariants: nothing lost in the split, and the expected tensor layout.
assert X_train.shape[0] + X_val.shape[0] == X.shape[0]
assert X_train.shape[1:] == (1, len(SELECTED_FEATURES))

Scaling Features...
Encoding Labels...


In [14]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across "Restart & Run All".
# NOTE(review): some GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility is required.
tf.keras.utils.set_random_seed(42)

# Two stacked LSTM layers with dropout, followed by a softmax over the classes
# known to the label encoder. The input shape (timesteps, features) is taken
# from the training tensor instead of hard-coding the time axis.
model = Sequential([
    Input(shape=X_train.shape[1:]),
    LSTM(64, return_sequences=True),   # emits the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(32, return_sequences=False),  # emits only the final state
    Dropout(0.2),
    Dense(len(le.classes_), activation='softmax'),
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [15]:
# Stop once validation loss has gone 3 epochs without improving, and roll the
# model back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train on the training split and evaluate on the validation split after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stop],
    verbose=1,
)

Epoch 1/20
[1m3183/3183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 11ms/step - accuracy: 0.8044 - loss: 0.5167 - val_accuracy: 0.8397 - val_loss: 0.3196
Epoch 2/20
[1m3183/3183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.8393 - loss: 0.3233 - val_accuracy: 0.8445 - val_loss: 0.3129
Epoch 3/20
[1m3183/3183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 10ms/step - accuracy: 0.8432 - loss: 0.3168 - val_accuracy: 0.8456 - val_loss: 0.3102
Epoch 4/20
[1m3183/3183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.8446 - loss: 0.3145 - val_accuracy: 0.8479 - val_loss: 0.3078
Epoch 5/20
[1m3183/3183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 10ms/step - accuracy: 0.8460 - loss: 0.3122 - val_accuracy: 0.8492 - val_loss: 0.3066
Epoch 6/20
[1m3183/3183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 10ms/step - accuracy: 0.8467 - loss: 0.3108 - val_accuracy: 0.8499 - val_loss: 0.3050
Epoc

In [16]:
# Ground-truth class ids, recovered from the one-hot validation targets.
y_true = y_val.argmax(axis=1)

# Per-class probabilities for every validation row, reduced to the most likely class id.
y_pred_probs = model.predict(X_val, batch_size=BATCH_SIZE)
y_pred = y_pred_probs.argmax(axis=1)

# Per-class precision / recall / F1, labelled with the original class names.
print(classification_report(y_true, y_pred, target_names=le.classes_))

[1m796/796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
                precision    recall  f1-score   support

        Benign       0.75      0.87      0.80     38040
          DDoS       0.86      0.97      0.91   1177646
           DoS       0.72      0.33      0.45    280384
         Mirai       1.00      1.00      1.00     91273
         Recon       0.76      0.70      0.73     23933
      Spoofing       0.89      0.71      0.79     16847
Web_BruteForce       0.83      0.08      0.15      1310

      accuracy                           0.85   1629433
     macro avg       0.83      0.67      0.69   1629433
  weighted avg       0.84      0.85      0.83   1629433



In [17]:
# Persist the trained network together with the fitted scaler and label
# encoder, so the same preprocessing can be re-applied when the model is loaded.
model.save(os.path.join(MODELS_DIR, "lstm_model_v1.keras"))
joblib.dump(value=scaler, filename=os.path.join(MODELS_DIR, "lstm_scaler.pkl"))
joblib.dump(value=le, filename=os.path.join(MODELS_DIR, "lstm_label_encoder.pkl"))

['/content/drive/MyDrive/IoT_Models/lstm_label_encoder.pkl']