In [7]:
# 🚀 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# ✅ 2. Load dataset
df = pd.read_csv("/server_df.csv")
print("✅ Dataset loaded:", df.shape)

target_col = "label"
if target_col not in df.columns:
    raise ValueError("❌ Target column 'label' not found in dataset")

# ✅ 3. Clean labels (remove spaces, make consistent)
df[target_col] = df[target_col].astype(str).str.strip()

# ✅ 4. Show initial class distribution
print("\n📊 Original Class Distribution:")
print(df[target_col].value_counts())

# ✅ 5. Remove rare classes (<2 samples)
class_counts = df[target_col].value_counts()
rare_classes = class_counts[class_counts < 2].index
if len(rare_classes) > 0:
    print("⚠️ Dropping rare classes with <2 samples:", list(rare_classes))
    df = df[~df[target_col].isin(rare_classes)]

# ✅ 6. Re-check after filtering
print("\n📊 Class distribution after filtering:")
print(df[target_col].value_counts())

# ✅ 7. Reset index
df = df.reset_index(drop=True)

# ✅ 8. Split features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# ✅ 9. Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_classes = len(np.unique(y_encoded))
print("✅ Classes after encoding:", num_classes)

# Save label encoder
joblib.dump(le, "iot_label_encoder.joblib")

# ✅ 10. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "iot_scaler.joblib")

# ✅ 11. Check minimum class size before splitting
from collections import Counter
counts = Counter(y_encoded)
min_class = min(counts.values())
print("📊 Smallest class size:", min_class)
if min_class < 2:
    raise ValueError("❌ Some classes still have <2 samples. Please review class counts above.")

# ✅ 12. Train-test split (will now work)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("✅ Train/Test split done:", X_train.shape, X_test.shape)

# ✅ 13. Build model
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# ✅ 14. Train model
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=32,
    callbacks=[es],
    verbose=1
)

# ✅ 15. Evaluate model
y_pred = np.argmax(model.predict(X_test), axis=1)
print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))

# ✅ 16. Fix classification report label mismatch
used_labels = np.unique(y_encoded)
used_names = le.inverse_transform(used_labels)

print("\n📊 Classification Report:\n", classification_report(y_test, y_pred, target_names=used_names))
print("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Dataset loaded: (542703, 47)

📊 Original Class Distribution:
label
DDoS-ICMP_Flood            83801
DDoS-UDP_Flood             62996
DDoS-TCP_Flood             52383
DDoS-PSHACK_Flood          47571
DDoS-SYN_Flood             47242
DDoS-RSTFINFlood           46970
DDoS-SynonymousIP_Flood    41695
DoS-UDP_Flood              38566
DoS-TCP_Flood              31007
DoS-SYN_Flood              23459
BenignTraffic              12754
Mirai-greeth_flood         11499
Mirai-udpplain             10376
Mirai-greip_flood           8738
DDoS-ICMP_Fragmentation     5251
MITM-ArpSpoofing            3581
DDoS-UDP_Fragmentation      3381
DDoS-ACK_Fragmentation      3329
DNS_Spoofing                2104
Recon-HostDiscovery         1556
Recon-OSScan                1147
Recon-PortScan               964
DoS-HTTP_Flood               835
VulnerabilityScan            435
DDoS-HTTP_Flood              341
DDoS-SlowLoris               280
DictionaryBruteForce         143
BrowserHijacking              63
Command

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m10854/10854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - accuracy: 0.7570 - loss: 0.6164 - val_accuracy: 0.8004 - val_loss: 0.4348
Epoch 2/30
[1m10854/10854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.7994 - loss: 0.4488 - val_accuracy: 0.8207 - val_loss: 0.4031
Epoch 3/30
[1m10854/10854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3ms/step - accuracy: 0.8148 - loss: 0.4203 - val_accuracy: 0.8433 - val_loss: 0.3698
Epoch 4/30
[1m10854/10854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3ms/step - accuracy: 0.8359 - loss: 0.3942 - val_accuracy: 0.8496 - val_loss: 0.3534
Epoch 5/30
[1m10854/10854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - accuracy: 0.8420 - loss: 0.3751 - val_accuracy: 0.8675 - val_loss: 0.3058
Epoch 6/30
[1m10854/10854[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3ms/step - accuracy: 0.8632 - loss: 0.3319 - val_accuracy: 0.9022 - val_loss: 0.256

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
