**Step 1: Import Libraries & Load Dataset**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
import kagglehub




import kagglehub

# Fetch the most recent published copy of the RT-IoT2022 dataset from Kaggle.
# `path` is the local directory holding the downloaded files; later cells
# list it and read the CSV from it.
dataset_handle = "supplejade/rt-iot2022real-time-internet-of-things"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/supplejade/rt-iot2022real-time-internet-of-things?dataset_version_number=3...


100%|██████████| 3.64M/3.64M [00:00<00:00, 40.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/supplejade/rt-iot2022real-time-internet-of-things/versions/3


**Step 2: Set Dataset Path & Explore Files**

In [None]:
# Inspect the download directory so the CSV file name can be confirmed
# before it is loaded in the next step.
import os

dataset_files = os.listdir(path)
print(dataset_files)


['RT_IOT2022.csv']


**Step 3: Load Dataset into DataFrame**

In [None]:
import pandas as pd
import os

# Build the full path to the single CSV found in the dataset directory and
# read it into memory.
csv_path = os.path.join(path, "RT_IOT2022.csv")
df = pd.read_csv(csv_path)

# Preview the first five rows as the cell output.
df.head(5)


Unnamed: 0,no,id.orig_p,id.resp_p,proto,service,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,...,active.std,idle.min,idle.max,idle.tot,idle.avg,idle.std,fwd_init_window_size,bwd_init_window_size,fwd_last_window_size,Attack_type
0,0,38667,1883,tcp,mqtt,32.011598,9,5,3,3,...,0.0,29729180.0,29729180.0,29729180.0,29729180.0,0.0,64240,26847,502,MQTT_Publish
1,1,51143,1883,tcp,mqtt,31.883584,9,5,3,3,...,0.0,29855280.0,29855280.0,29855280.0,29855280.0,0.0,64240,26847,502,MQTT_Publish
2,2,44761,1883,tcp,mqtt,32.124053,9,5,3,3,...,0.0,29842150.0,29842150.0,29842150.0,29842150.0,0.0,64240,26847,502,MQTT_Publish
3,3,60893,1883,tcp,mqtt,31.961063,9,5,3,3,...,0.0,29913770.0,29913770.0,29913770.0,29913770.0,0.0,64240,26847,502,MQTT_Publish
4,4,51087,1883,tcp,mqtt,31.902362,9,5,3,3,...,0.0,29814700.0,29814700.0,29814700.0,29814700.0,0.0,64240,26847,502,MQTT_Publish


CHOOSING SIZE

**Step 4: Stratified Sampling of Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Draw a fixed-size, class-stratified subsample of the full dataset.
# train_test_split is used purely as a sampler here: the "train" part is the
# 60,000-row sample that is kept, the remainder is discarded.
target_column = 'Attack_type'

sampling_options = dict(
    train_size=60000,
    stratify=df[target_column],
    random_state=42,
)
df_sampled, _ = train_test_split(df, **sampling_options)

# Sanity check: sample size and per-class counts.
print(df_sampled.shape)
print(df_sampled[target_column].value_counts())


(60000, 85)
Attack_type
DOS_SYN_Hping                 46131
Thing_Speak                    3951
ARP_poisioning                 3777
MQTT_Publish                   2021
NMAP_UDP_SCAN                  1262
NMAP_XMAS_TREE_SCAN             980
NMAP_OS_DETECTION               975
NMAP_TCP_scan                   488
DDOS_Slowloris                  260
Wipro_bulb                      123
Metasploit_Brute_Force_SSH       18
NMAP_FIN_SCAN                    14
Name: count, dtype: int64


**Step 5: Check for Missing Values**

In [None]:
# Count missing values per column (isna is the same check as isnull).
df_sampled.isna().sum()


Unnamed: 0,0
no,0
id.orig_p,0
id.resp_p,0
proto,0
service,0
...,...
idle.std,0
fwd_init_window_size,0
bwd_init_window_size,0
fwd_last_window_size,0


DROPPING COLUMNS

**Step 6: Remove Zero-Variance Features**

In [None]:
# Columns removed here carry no legitimate signal for the classifier:
#   * zero-variance columns (at most one distinct value in the sample);
#   * 'no', the CSV's row counter. In the Step 3 preview it equals the row
#     index (0, 1, 2, ...) and the first rows all share one Attack_type.
#     NOTE(review): presumably the file is grouped by attack type, in which
#     case the row number leaks the label - confirm against the raw CSV.
ROW_ID_COLUMNS = ['no']

# 1) Find row-identifier columns that are actually present, then the
#    zero-variance columns among the rest.
id_cols = [col for col in ROW_ID_COLUMNS if col in df_sampled.columns]
zero_variance_cols = [
    col for col in df_sampled.columns
    if col not in id_cols and df_sampled[col].nunique() <= 1
]

# 2) Show them (so you can review)
print("Zero-variance columns (will be dropped):", zero_variance_cols)
print("Count:", len(zero_variance_cols))
print("Row-identifier columns (will be dropped):", id_cols)

# 3) Drop them. Reassigning (instead of inplace=True) keeps the cell
#    idempotent: re-running it finds nothing left to drop.
df_sampled = df_sampled.drop(columns=zero_variance_cols + id_cols)

# 4) Show new shape and a quick peek at remaining columns
print("New shape:", df_sampled.shape)
print(df_sampled.columns.tolist()[:20])  # first 20 column names


Zero-variance columns (will be dropped): ['bwd_URG_flag_count']
Count: 1
New shape: (60000, 84)
['no', 'id.orig_p', 'id.resp_p', 'proto', 'service', 'flow_duration', 'fwd_pkts_tot', 'bwd_pkts_tot', 'fwd_data_pkts_tot', 'bwd_data_pkts_tot', 'fwd_pkts_per_sec', 'bwd_pkts_per_sec', 'flow_pkts_per_sec', 'down_up_ratio', 'fwd_header_size_tot', 'fwd_header_size_min', 'fwd_header_size_max', 'bwd_header_size_tot', 'bwd_header_size_min', 'bwd_header_size_max']


**Step 7: Identify Numerical and Categorical Features**

In [None]:
# Partition the column names by dtype: 64-bit int/float columns versus
# object (string) columns.
numeric_cols = list(df_sampled.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(df_sampled.select_dtypes(include=['object']).columns)

for label, cols in (("numeric", numeric_cols), ("categorical", categorical_cols)):
    print(f"Number of {label} columns:", len(cols))

print("\nCategorical columns:")
print(categorical_cols)


Number of numeric columns: 81
Number of categorical columns: 3

Categorical columns:
['proto', 'service', 'Attack_type']


**Step 8: Encode Target Variable**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each attack-type string with an integer class id. The fitted
# encoder stays available as `le` so ids can be mapped back to names later.
le = LabelEncoder()
encoded_target = le.fit_transform(df_sampled['Attack_type'])
df_sampled['Attack_type'] = encoded_target

# Per-class counts, now keyed by the integer ids.
df_sampled['Attack_type'].value_counts()


Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
2,46131
10,3951
0,3777
3,2021
8,1262
9,980
6,975
7,488
1,260
11,123


**Step 9: One-Hot Encode Categorical Features**

In [None]:
# Expand the two remaining string columns into indicator columns.
# drop_first=True omits the first level of each column from the output.
categorical_features = ['proto', 'service']
df_sampled = pd.get_dummies(
    df_sampled,
    columns=categorical_features,
    drop_first=True,
)

print("New shape after one-hot encoding:", df_sampled.shape)


New shape after one-hot encoding: (60000, 93)


**Step 10: Feature Scaling (Standardization)**

In [None]:
from sklearn.preprocessing import StandardScaler

# Re-derive the numeric column list after one-hot encoding. The boolean
# indicator columns are not matched by this dtype filter, so they are left
# unscaled. The integer-encoded target is matched and is removed explicitly.
numeric_frame = df_sampled.select_dtypes(include=['int64', 'float64'])
numeric_cols = list(numeric_frame.columns)
numeric_cols.remove('Attack_type')  # the target must not be scaled

# NOTE(review): the scaler is fitted on the whole sample, before the
# train/test split made later in the notebook, so test-row statistics
# influence the scaling. Consider fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_sampled[numeric_cols])
df_sampled[numeric_cols] = scaled_values

# Preview the first five rows after scaling.
df_sampled.head(5)


Unnamed: 0,no,id.orig_p,id.resp_p,flow_duration,fwd_pkts_tot,bwd_pkts_tot,fwd_data_pkts_tot,bwd_data_pkts_tot,fwd_pkts_per_sec,bwd_pkts_per_sec,...,proto_udp,service_dhcp,service_dns,service_http,service_irc,service_mqtt,service_ntp,service_radius,service_ssh,service_ssl
95645,1.242828,0.191237,-0.190608,-0.034697,-0.085013,-0.059279,-0.035707,-0.066468,-0.283461,-0.283346,...,False,False,False,False,False,False,False,False,False,False
51401,-0.211893,-0.062545,-0.190608,-0.034697,-0.085013,-0.131831,-0.035707,-0.066468,-0.948339,-0.948167,...,False,False,False,False,False,False,False,False,False,False
60081,0.073501,0.393401,-0.190608,-0.034697,-0.085013,-0.059279,-0.035707,-0.066468,-0.241906,-0.241794,...,False,False,False,False,False,False,False,False,False,False
110142,1.719482,-1.675398,-0.190608,-0.034697,-0.085013,-0.059279,-0.035707,-0.066468,-0.283461,-0.283346,...,False,False,False,False,False,False,False,False,False,False
7939,-1.093623,0.153114,-0.184538,-0.034369,-0.011775,0.013272,0.055912,0.117234,-0.948164,-0.947992,...,True,False,True,False,False,False,False,False,False,False


**Install Required Package**

In [None]:
!pip install pyloras

Collecting pyloras
  Downloading pyloras-0.1.0b6-py3-none-any.whl.metadata (4.5 kB)
Downloading pyloras-0.1.0b6-py3-none-any.whl (13 kB)
Installing collected packages: pyloras
Successfully installed pyloras-0.1.0b6


**Step 11: Hybrid Class Balancing (SMOTE + LORAS + Tiny Class Oversampling)**

In [None]:
# -----------------------------
# Hybrid Balancing: SMOTE + LORAS + Tiny Class Oversampling
# -----------------------------

import pandas as pd
from imblearn.over_sampling import SMOTE
from pyloras import LORAS
from sklearn.utils import resample
from collections import Counter
import numpy as np

# 1️⃣ Split features & target
X = df_sampled.drop('Attack_type', axis=1)
y = df_sampled['Attack_type']

# NOTE(review): balancing is applied to the full sample here, and the
# train/test split is only made in the next step. Synthetic rows and exact
# duplicates can therefore land on both sides of that split, which inflates
# test metrics. Splitting first and balancing only the training part would
# avoid this.

# 2) Partition classes by size: classes below the threshold are handled by
#    LORAS, all others by SMOTE.
TINY_CLASS_THRESHOLD = 50
class_counts = y.value_counts()
loras_classes = class_counts[class_counts < TINY_CLASS_THRESHOLD].index.tolist()
smote_classes = class_counts[class_counts >= TINY_CLASS_THRESHOLD].index.tolist()
print("Tiny classes for LORAS:", loras_classes)

# 3) Apply SMOTE on medium/large classes. An oversampler needs at least two
#    classes to balance against each other; with fewer, rows pass through.
smote_mask = y.isin(smote_classes)
X_smote = X[smote_mask]
y_smote = y[smote_mask]

smote = SMOTE(random_state=42)
if len(smote_classes) >= 2:
    X_sm, y_sm = smote.fit_resample(X_smote, y_smote)
else:
    X_sm, y_sm = X_smote, y_smote

# 4) Apply LORAS on tiny classes (same two-class guard as above).
loras_mask = y.isin(loras_classes)
X_loras = X[loras_mask]
y_loras = y[loras_mask]

lrs = LORAS(random_state=42)
if len(loras_classes) >= 2:
    X_lr, y_lr = lrs.fit_resample(X_loras.values, y_loras.values)
else:
    X_lr, y_lr = X_loras.values, y_loras.values

# LORAS was given plain arrays, so restore the column names / series name.
X_lr_df = pd.DataFrame(X_lr, columns=X_loras.columns)
y_lr_df = pd.Series(y_lr, name='Attack_type')

# 5) Combine SMOTE + LORAS. ignore_index gives the combined frames a fresh
#    unique index instead of two overlapping 0..n ranges, so the boolean
#    masks below do not depend on duplicated labels lining up.
X_res = pd.concat(
    [pd.DataFrame(X_sm, columns=X_smote.columns), X_lr_df],
    axis=0, ignore_index=True,
)
y_res = pd.concat(
    [pd.Series(y_sm, name='Attack_type'), y_lr_df],
    axis=0, ignore_index=True,
)

# 6) Classes still below the largest class after SMOTE/LORAS.
res_counts = y_res.value_counts()
max_count = res_counts.max()  # majority class count = oversampling target
remaining_tiny_classes = res_counts[res_counts < max_count].index.tolist()

# 7) Randomly oversample (with replacement) each remaining class up to
#    max_count; rows of already-balanced classes are kept as they are.
tiny_mask = y_res.isin(remaining_tiny_classes)
X_rest, y_rest = X_res[~tiny_mask], y_res[~tiny_mask]
X_tiny, y_tiny = X_res[tiny_mask], y_res[tiny_mask]

X_tiny_resampled = []
y_tiny_resampled = []

for cls in remaining_tiny_classes:
    cls_mask = y_tiny == cls
    X_resampled_cls, y_resampled_cls = resample(
        X_tiny[cls_mask], y_tiny[cls_mask],
        replace=True,
        n_samples=max_count,
        random_state=42
    )
    X_tiny_resampled.append(X_resampled_cls)
    y_tiny_resampled.append(y_resampled_cls)

# 8) Combine all. pd.concat raises on an empty list, so fall back to empty
#    slices when every class was already at max_count.
if X_tiny_resampled:
    X_tiny_final = pd.concat(X_tiny_resampled)
    y_tiny_final = pd.concat(y_tiny_resampled)
else:
    X_tiny_final = X_tiny.iloc[0:0]
    y_tiny_final = y_tiny.iloc[0:0]

X_final = pd.concat([X_rest, X_tiny_final]).reset_index(drop=True)
y_final = pd.concat([y_rest, y_tiny_final]).reset_index(drop=True)

# 9) Check final results
print("Final balanced dataset shape:", X_final.shape)
print("Class distribution after full balancing:")
print(y_final.value_counts())


Tiny classes for LORAS: [4, 5]
Final balanced dataset shape: (553572, 92)
Class distribution after full balancing:
Attack_type
2     46131
10    46131
3     46131
6     46131
8     46131
0     46131
9     46131
7     46131
1     46131
11    46131
5     46131
4     46131
Name: count, dtype: int64


**Step 12: Build, Train, and Evaluate 1D Convolutional Neural Network (1D-CNN) Model**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.keras import TqdmCallback

# -----------------------------
# Reproducibility
# -----------------------------
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation,
# dropout masks and batch shuffling repeat across runs. (GPU kernels may
# still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# -----------------------------
# Ensure numeric features
# -----------------------------
# Coerce every column to a number (unparseable values become NaN, then 0)
# and store as float32, the dtype fed to the network.
X_final_numeric = (
    X_final
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype('float32')
)

# Train-test split
# NOTE(review): X_final/y_final were oversampled before this split, so the
# test part contains synthetic rows and duplicates of training rows. The
# scores below are therefore optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X_final_numeric, y_final,
    test_size=0.2,
    random_state=42,
    stratify=y_final
)

# One-hot targets for categorical_crossentropy.
num_classes = y_final.nunique()
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# Reshape for Conv1D: (samples, n_features, 1 channel)
X_train_1d = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test_1d = X_test.values.reshape(-1, X_test.shape[1], 1)

# -----------------------------
# 1) Build 1D-CNN
# -----------------------------
inputs = layers.Input(shape=(X_train_1d.shape[1], 1))
x = layers.Conv1D(16, kernel_size=3, activation='relu', padding='same')(inputs)
x = layers.Dropout(0.5)(x)
x = layers.Conv1D(8, kernel_size=3, activation='relu', padding='same')(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(16, activation='relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(num_classes, activation='softmax')(x)

cnn_model = models.Model(inputs, outputs)
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# -----------------------------
# 2) Train 1D-CNN (10 epochs)
# -----------------------------
# verbose=0 silences Keras' own log; progress is reported by TqdmCallback.
history_cnn = cnn_model.fit(
    X_train_1d, y_train_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    verbose=0,
    callbacks=[TqdmCallback(verbose=1)]
)

# -----------------------------
# 3) Evaluate 1D-CNN
# -----------------------------
# argmax over the softmax outputs gives the predicted class id per row.
y_pred_cnn = cnn_model.predict(X_test_1d).argmax(axis=1)
acc_cnn = accuracy_score(y_test, y_pred_cnn)
print("1D-CNN Accuracy:", acc_cnn)
print("\nClassification Report:\n", classification_report(y_test, y_pred_cnn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_cnn))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[1m3460/3460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
1D-CNN Accuracy: 0.8112992819401165

Classification Report:
               precision    recall  f1-score   support

           0       0.23      0.14      0.17      9226
           1       0.83      0.99      0.90      9226
           2       1.00      0.90      0.95      9226
           3       0.86      1.00      0.92      9227
           4       0.74      0.84      0.79      9226
           5       0.95      0.89      0.92      9226
           6       0.90      1.00      0.95      9227
           7       1.00      0.99      0.99      9227
           8       0.74      0.94      0.83      9226
           9       1.00      0.99      1.00      9226
          10       0.56      0.63      0.59      9226
          11       0.73      0.45      0.55      9226

    accuracy                           0.81    110715
   macro avg       0.79      0.81      0.80    110715
weighted avg       0.79      0.81      0.80    110

**Step 13: Build, Train, and Evaluate DenseNet 1D Model**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm.keras import TqdmCallback

# Add a trailing channel axis so each sample has shape (n_features, 1),
# the layout passed to the Conv1D layers below.
n_features = X_train.shape[1]
X_train_1d = X_train.to_numpy().reshape(-1, n_features, 1)
X_test_1d = X_test.to_numpy().reshape(-1, X_test.shape[1], 1)

# -----------------------------
# DenseNet blocks
# -----------------------------
def dense_block(x, num_layers=1, growth_rate=5):
    """Apply `num_layers` convolutions, concatenating each output onto its input.

    Each step runs a kernel-size-3, same-padded, ReLU Conv1D with
    `growth_rate` filters on the running tensor and joins the result to that
    tensor with Concatenate() (default axis).

    Args:
        x: input Keras tensor.
        num_layers: number of convolution + concatenate steps.
        growth_rate: number of Conv1D filters added at each step.

    Returns:
        The running tensor after the last concatenation.
    """
    features = x
    for _step in range(num_layers):
        new_maps = layers.Conv1D(
            filters=growth_rate,
            kernel_size=3,
            padding='same',
            activation='relu',
        )(features)
        features = layers.Concatenate()([features, new_maps])
    return features

def transition_layer(x, reduction=0.5):
    """Compress channels with a 1x1 convolution, then halve the sequence length.

    Args:
        x: input Keras tensor; its channel count is read from x.shape[-1].
        reduction: factor applied to the channel count to get the number of
            filters of the 1x1 convolution.

    Returns:
        The tensor after Conv1D(kernel_size=1, ReLU) followed by
        AveragePooling1D(pool_size=2, strides=2, padding='same').
    """
    # int() truncates, so a small channel count (e.g. 1 channel * 0.5) would
    # request 0 filters, which Conv1D rejects. Always keep at least one.
    filters = max(1, int(x.shape[-1] * reduction))
    x = layers.Conv1D(filters=filters, kernel_size=1, padding='same', activation='relu')(x)
    x = layers.AveragePooling1D(pool_size=2, strides=2, padding='same')(x)
    return x

# -----------------------------
# Build DenseNet
# -----------------------------
# Seed the Python, NumPy and TensorFlow RNGs before any weighted layer is
# created so initialisation, noise, dropout and shuffling repeat across runs.
# (GPU kernels may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

inputs_dn = layers.Input(shape=(X_train_1d.shape[1],1))
# GaussianNoise and Dropout are regularisers: active during fit, inactive
# during predict.
x_dn = layers.GaussianNoise(0.5)(inputs_dn)
x_dn = dense_block(x_dn, num_layers=1, growth_rate=5)
x_dn = transition_layer(x_dn)
x_dn = dense_block(x_dn, num_layers=1, growth_rate=5)
x_dn = layers.GlobalAveragePooling1D()(x_dn)
x_dn = layers.Dense(32, activation='relu')(x_dn)
x_dn = layers.Dropout(0.7)(x_dn)
x_dn = layers.Dense(16, activation='relu')(x_dn)
x_dn = layers.Dropout(0.6)(x_dn)
outputs_dn = layers.Dense(num_classes, activation='softmax')(x_dn)

densenet_model = models.Model(inputs=inputs_dn, outputs=outputs_dn)
densenet_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# -----------------------------
# Train DenseNet (10 epochs)
# -----------------------------
# verbose=0 silences Keras' own log; progress is reported by TqdmCallback.
history_dn = densenet_model.fit(
    X_train_1d, y_train_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    verbose=0,
    callbacks=[TqdmCallback(verbose=1)]
)

# -----------------------------
# Evaluate DenseNet
# -----------------------------
# argmax over the softmax outputs gives the predicted class id per row.
y_pred_dn = densenet_model.predict(X_test_1d).argmax(axis=1)
acc_dn = accuracy_score(y_test, y_pred_dn)
print("DenseNet Accuracy:", acc_dn)
print("\nClassification Report:\n", classification_report(y_test, y_pred_dn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_dn))


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

[1m3460/3460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
DenseNet Accuracy: 0.7187553628686266

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.05      0.10      9226
           1       0.61      0.39      0.48      9226
           2       1.00      0.90      0.94      9226
           3       0.39      0.99      0.56      9227
           4       0.67      0.84      0.75      9226
           5       0.98      0.89      0.93      9226
           6       0.87      1.00      0.93      9227
           7       0.98      1.00      0.99      9227
           8       0.78      0.92      0.84      9226
           9       1.00      1.00      1.00      9226
          10       0.49      0.49      0.49      9226
          11       0.48      0.17      0.26      9226

    accuracy                           0.72    110715
   macro avg       0.74      0.72      0.69    110715
weighted avg       0.74      0.72      0.69    1

**Step 14: Build, Train, and Evaluate Hybrid CNN + DenseNet Model**

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ----------------------------------------------------
# 🔹 REPRODUCIBILITY
# ----------------------------------------------------
# Seed the Python, NumPy and TensorFlow RNGs before the first weighted layer
# is created so initialisation, dropout and shuffling repeat across runs.
# (GPU kernels may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# ----------------------------------------------------
# 🔹 RESHAPE FOR 1D CNN + DENSENET
# ----------------------------------------------------
# (samples, n_features, 1 channel)
X_train_1d = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test_1d = X_test.values.reshape(-1, X_test.shape[1], 1)

# ----------------------------------------------------
# 🔹 CNN
# ----------------------------------------------------
# One shared input feeds both branches; this is the CNN branch, ending in a
# 16-unit feature vector.
inputs = layers.Input(shape=(X_train_1d.shape[1], 1))

x_cnn = layers.Conv1D(16, kernel_size=3, activation='relu', padding='same')(inputs)
x_cnn = layers.Dropout(0.3)(x_cnn)
x_cnn = layers.Conv1D(8, kernel_size=3, activation='relu', padding='same')(x_cnn)
x_cnn = layers.GlobalAveragePooling1D()(x_cnn)
x_cnn = layers.Dense(16, activation='relu')(x_cnn)
x_cnn = layers.Dropout(0.3)(x_cnn)

# ----------------------------------------------------
# 🔹 DENSENET
# ----------------------------------------------------
def dense_block(x, num_layers=1, growth_rate=5):
    """Apply `num_layers` convolutions, concatenating each output onto its input.

    Each step runs a kernel-size-3, same-padded, ReLU Conv1D with
    `growth_rate` filters on the running tensor and joins the result to that
    tensor with Concatenate() (default axis).

    Args:
        x: input Keras tensor.
        num_layers: number of convolution + concatenate steps.
        growth_rate: number of Conv1D filters added at each step.

    Returns:
        The running tensor after the last concatenation.
    """
    features = x
    for _step in range(num_layers):
        conv = layers.Conv1D(
            filters=growth_rate,
            kernel_size=3,
            padding='same',
            activation='relu',
        )
        features = layers.Concatenate()([features, conv(features)])
    return features

def transition_layer(x, reduction=0.5):
    """Compress channels with a 1x1 convolution, then halve the sequence length.

    Args:
        x: input Keras tensor; its channel count is read from x.shape[-1].
        reduction: factor applied to the channel count to get the number of
            filters of the 1x1 convolution.

    Returns:
        The tensor after Conv1D(kernel_size=1, ReLU) followed by
        AveragePooling1D(pool_size=2, strides=2, padding='same').
    """
    # int() truncates, so a small channel count (e.g. 1 channel * 0.5) would
    # request 0 filters, which Conv1D rejects. Always keep at least one.
    filters = max(1, int(x.shape[-1] * reduction))
    x = layers.Conv1D(filters=filters, kernel_size=1,
                      padding='same', activation='relu')(x)
    x = layers.AveragePooling1D(pool_size=2, strides=2, padding='same')(x)
    return x

# DenseNet branch on the same input as the CNN branch, ending in a 16-unit
# feature vector. GaussianNoise/Dropout are active only during fit.
x_dn = layers.GaussianNoise(0.5)(inputs)
x_dn = dense_block(x_dn, num_layers=1, growth_rate=5)
x_dn = transition_layer(x_dn)
x_dn = dense_block(x_dn, num_layers=1, growth_rate=5)
x_dn = layers.GlobalAveragePooling1D()(x_dn)
x_dn = layers.Dense(32, activation='relu')(x_dn)
x_dn = layers.Dropout(0.3)(x_dn)
x_dn = layers.Dense(16, activation='relu')(x_dn)
x_dn = layers.Dropout(0.3)(x_dn)

# Feature-level fusion: concatenate both branch vectors and mix them with a
# dense layer. This layer's output is hybrid_model.layers[-2].
fusion = layers.Concatenate()([x_cnn, x_dn])

fusion = layers.Dense(32, activation='relu')(fusion)

# ----------------------------------------------------
# 🔹 FINAL CLASSIFIER
# ----------------------------------------------------
outputs = layers.Dense(num_classes, activation='softmax')(fusion)

# ----------------------------------------------------
# BUILD & COMPILE
# ----------------------------------------------------
hybrid_model = models.Model(inputs, outputs)
hybrid_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# ----------------------------------------------------
# 🔹 TRAIN (10 EPOCHS)
# ----------------------------------------------------
history_hybrid = hybrid_model.fit(
    X_train_1d, y_train_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    verbose=1
)

# ----------------------------------------------------
# 🔹 EVALUATE
# ----------------------------------------------------
# argmax over the softmax outputs gives the predicted class id per row.
y_pred_h = hybrid_model.predict(X_test_1d).argmax(axis=1)
acc_h = accuracy_score(y_test, y_pred_h)

# The label previously said "(NO FUSION)", but this model does fuse the two
# branches (Concatenate + Dense above), so the misleading tag is removed.
print("🔥 HYBRID CNN + DenseNet Accuracy:", acc_h)
print("\nClassification Report:\n", classification_report(y_test, y_pred_h))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_h))


Epoch 1/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 9ms/step - accuracy: 0.5388 - loss: 1.3077 - val_accuracy: 0.8506 - val_loss: 0.5093
Epoch 2/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 9ms/step - accuracy: 0.7819 - loss: 0.6433 - val_accuracy: 0.8848 - val_loss: 0.3637
Epoch 3/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 9ms/step - accuracy: 0.8272 - loss: 0.5099 - val_accuracy: 0.8875 - val_loss: 0.3534
Epoch 4/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 8ms/step - accuracy: 0.8479 - loss: 0.4519 - val_accuracy: 0.8999 - val_loss: 0.2920
Epoch 5/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 9ms/step - accuracy: 0.8612 - loss: 0.4105 - val_accuracy: 0.9153 - val_loss: 0.2770
Epoch 6/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 8ms/step - accuracy: 0.8704 - loss: 0.3832 - val_accuracy: 0.8949 - val_loss: 0.

**Step 15: RAM-Friendly Core-Set Active Learning with Hybrid Model**

In [None]:
# ===============================
# 🔹 IMPORTS
# ===============================
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import euclidean_distances
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# ===============================
# 🔹 PREPARE DATA
# ===============================
AL_SEED = 42
rng = np.random.default_rng(AL_SEED)  # drives the random subsampling below

# Ensure X_train/X_test are numeric numpy arrays and reshaped for 1D-CNN
X_train_1d = X_train.values.astype(np.float32).reshape(-1, X_train.shape[1], 1)
X_test_1d  = X_test.values.astype(np.float32).reshape(-1, X_test.shape[1], 1)
num_classes = len(np.unique(y_train))

# Pool and evaluation sets.
# The query pool was previously the *entire* test set, so the model was
# fine-tuned on test labels and then scored on those same rows. The test data
# is now split once into a query pool and a disjoint evaluation set; every
# accuracy reported in this cell is measured on the evaluation set only.
y_test_arr = y_test.to_numpy()
pool_idx, eval_idx = train_test_split(
    np.arange(len(y_test_arr)),
    test_size=0.5,
    stratify=y_test_arr,
    random_state=AL_SEED,
)
X_pool_1d, y_pool = X_test_1d[pool_idx], y_test_arr[pool_idx]
X_eval_1d, y_eval = X_test_1d[eval_idx], y_test_arr[eval_idx]

# Labeled set starts as the full training data. No defensive copy is needed:
# the arrays are only ever extended through np.concatenate, which allocates.
X_labeled = X_train_1d
y_train_base_cat = to_categorical(y_train, num_classes=num_classes)

# ===============================
# 🔹 EMBEDDINGS MODEL
# ===============================
# Output of the second-to-last layer of the trained hybrid_model. The
# sub-model shares hybrid_model's layers, so embeddings follow fine-tuning.
embedding_model = tf.keras.Model(inputs=hybrid_model.input,
                                 outputs=hybrid_model.layers[-2].output)

def get_embeddings(X):
    """Return embedding_model's output for the samples in X (silent predict)."""
    return embedding_model.predict(X, verbose=0)

# ===============================
# 🔹 RAM-FRIENDLY CORE-SET SAMPLING
# ===============================
def core_set_sampling_ram_friendly(X_pool, X_labeled, k=200,
                                   subset_pool=2000, subset_labeled=500,
                                   rng=None):
    """Pick up to k pool samples that lie farthest from the labeled data.

    To bound memory, distances are computed between random subsets only:
    at most `subset_pool` pool rows against at most `subset_labeled` labeled
    rows, both embedded with get_embeddings().

    Args:
        X_pool: array of candidate samples.
        X_labeled: array of already-labeled samples.
        k: maximum number of pool samples to return.
        subset_pool: maximum number of pool rows considered.
        subset_labeled: maximum number of labeled rows compared against.
        rng: optional numpy Generator used for the subsampling; when None the
            global np.random state is used (the original behavior).

    Returns:
        Integer indices into X_pool, ordered from largest to smallest
        distance to the nearest labeled subset row. Empty when X_pool is empty.
    """
    if len(X_pool) == 0:
        return np.empty(0, dtype=int)

    sampler = np.random if rng is None else rng

    # Subsample pool
    if len(X_pool) <= subset_pool:
        X_subset = X_pool
        idx_subset = np.arange(len(X_pool))
    else:
        idx_subset = sampler.choice(len(X_pool), subset_pool, replace=False)
        X_subset = X_pool[idx_subset]

    # Subsample labeled
    if len(X_labeled) <= subset_labeled:
        X_label_sub = X_labeled
    else:
        idx_label_sub = sampler.choice(len(X_labeled), subset_labeled, replace=False)
        X_label_sub = X_labeled[idx_label_sub]

    # Compute embeddings
    emb_pool = get_embeddings(X_subset)
    emb_label = get_embeddings(X_label_sub)

    # Distance to nearest labeled sample
    dists = euclidean_distances(emb_pool, emb_label)
    min_dists = dists.min(axis=1)

    # Pick top k most distant (diverse) samples and map the subset-local
    # positions back to positions in X_pool.
    top_local_idx = np.argsort(-min_dists)[:k]
    top_idx = idx_subset[top_local_idx]
    return top_idx

# ===============================
# 🔹 ACTIVE LEARNING LOOP (Optimized)
# ===============================
n_iterations = 5
batch_size = 128
k = 400                # samples queried per iteration
subset_pool = 4000     # pool rows considered per iteration
subset_labeled = 1000  # labeled rows compared against per iteration
fine_tune_epochs = 3   # fine-tuning epochs per iteration

for i in tqdm(range(n_iterations), desc="RAM-Friendly Core-Set AL Optimized"):
    top_idx = core_set_sampling_ram_friendly(
        X_pool_1d, X_labeled, k=k,
        subset_pool=subset_pool, subset_labeled=subset_labeled,
        rng=rng,
    )
    if len(top_idx) == 0:
        tqdm.write("Pool exhausted; stopping early.")
        break

    # Select samples (their labels are "revealed" by the query)
    X_selected = X_pool_1d[top_idx]
    y_selected = y_pool[top_idx]
    y_selected_cat = to_categorical(y_selected, num_classes=num_classes)

    # Combine with labeled set
    X_train_aug = np.concatenate([X_labeled, X_selected], axis=0)
    y_train_aug = np.concatenate([y_train_base_cat, y_selected_cat], axis=0)

    # Safety check
    assert X_train_aug.shape[0] == y_train_aug.shape[0], f"{X_train_aug.shape[0]} vs {y_train_aug.shape[0]}"

    # Fine-tune hybrid model on the whole augmented labeled set
    hybrid_model.fit(X_train_aug, y_train_aug,
                     epochs=fine_tune_epochs, batch_size=batch_size, verbose=0)

    # Update labeled set (np.concatenate already produced fresh arrays)
    X_labeled = X_train_aug
    y_train_base_cat = y_train_aug

    # Remove selected from pool
    mask = np.ones(len(X_pool_1d), dtype=bool)
    mask[top_idx] = False
    X_pool_1d = X_pool_1d[mask]
    y_pool = y_pool[mask]

    # Evaluate after each iteration on rows never offered to the query step
    y_pred = hybrid_model.predict(X_eval_1d, verbose=0).argmax(axis=1)
    acc = accuracy_score(y_eval, y_pred)
    tqdm.write(f"Iteration {i+1}/{n_iterations}, Accuracy: {acc:.4f}")

# ===============================
# 🔹 FINAL EVALUATION
# ===============================
# NOTE(review): hybrid_model has now been fine-tuned on pool rows drawn from
# X_test. Any later cell that scores hybrid_model on the full X_test_1d /
# y_test is no longer a clean hold-out measurement; use X_eval_1d / y_eval.
y_pred_final = hybrid_model.predict(X_eval_1d, verbose=0).argmax(axis=1)
print("🔥 FINAL RAM-Friendly Core-Set Accuracy (Optimized):", accuracy_score(y_eval, y_pred_final))
print("\nClassification Report:\n", classification_report(y_eval, y_pred_final))
print("\nConfusion Matrix:\n", confusion_matrix(y_eval, y_pred_final))


RAM-Friendly Core-Set AL Optimized:  20%|██        | 1/5 [03:56<15:46, 236.73s/it]

Iteration 1/5, Accuracy: 0.9340


RAM-Friendly Core-Set AL Optimized:  40%|████      | 2/5 [07:10<10:33, 211.31s/it]

Iteration 2/5, Accuracy: 0.9245


RAM-Friendly Core-Set AL Optimized:  60%|██████    | 3/5 [11:09<07:28, 224.18s/it]

Iteration 3/5, Accuracy: 0.9381


RAM-Friendly Core-Set AL Optimized:  80%|████████  | 4/5 [14:49<03:42, 222.57s/it]

Iteration 4/5, Accuracy: 0.9436


RAM-Friendly Core-Set AL Optimized: 100%|██████████| 5/5 [18:48<00:00, 225.74s/it]


Iteration 5/5, Accuracy: 0.9469
🔥 FINAL RAM-Friendly Core-Set Accuracy (Optimized): 0.946899697421307

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.78      0.77      9226
           1       0.92      0.99      0.96      9226
           2       1.00      1.00      1.00      9226
           3       1.00      0.99      0.99      9227
           4       0.96      0.90      0.93      9226
           5       0.94      1.00      0.97      9226
           6       0.99      1.00      0.99      9227
           7       0.98      0.99      0.99      9227
           8       0.99      0.93      0.96      9226
           9       1.00      1.00      1.00      9226
          10       0.90      0.85      0.88      9226
          11       0.92      0.93      0.93      9226

    accuracy                           0.95    110715
   macro avg       0.95      0.95      0.95    110715
weighted avg       0.95      0.95      0.95    110715


Confu

**CNN RESULT**

In [None]:
from sklearn.metrics import roc_auc_score

# -----------------------------
# Accuracy
# -----------------------------
acc_cnn = accuracy_score(y_test, y_pred_cnn)
print("1D-CNN Accuracy:", acc_cnn)

# -----------------------------
# Classification report: dict form for the summary numbers, text form for display
# -----------------------------
report_cnn = classification_report(y_test, y_pred_cnn, output_dict=True)
print("\nClassification Report (Full):\n", classification_report(y_test, y_pred_cnn))

# Pull the macro- and weighted-average rows out of the report dict.
metric_keys = ('f1-score', 'precision', 'recall')
macro_f1_cnn, macro_precision_cnn, macro_recall_cnn = (
    report_cnn['macro avg'][key] for key in metric_keys
)
weighted_f1_cnn, weighted_precision_cnn, weighted_recall_cnn = (
    report_cnn['weighted avg'][key] for key in metric_keys
)

print("\nTotal Metrics:")
summary_rows = (
    ("Macro F1-Score", macro_f1_cnn),
    ("Macro Precision", macro_precision_cnn),
    ("Macro Recall", macro_recall_cnn),
    ("Weighted F1-Score", weighted_f1_cnn),
    ("Weighted Precision", weighted_precision_cnn),
    ("Weighted Recall", weighted_recall_cnn),
)
for label, value in summary_rows:
    print(f"{label}: {value:.4f}")

# -----------------------------
# Confusion matrix
# -----------------------------
cm_cnn = confusion_matrix(y_test, y_pred_cnn)
print("\nConfusion Matrix:\n", cm_cnn)

# -----------------------------
# ROC-AUC (macro & micro) from one-hot labels and softmax scores
# -----------------------------
y_test_onehot = to_categorical(y_test, num_classes=num_classes)
y_score_cnn = cnn_model.predict(X_test_1d)

macro_roc_auc_cnn, micro_roc_auc_cnn = (
    roc_auc_score(y_test_onehot, y_score_cnn, average=avg, multi_class='ovr')
    for avg in ('macro', 'micro')
)

print(f"\nMacro ROC-AUC: {macro_roc_auc_cnn:.4f}")
print(f"Micro ROC-AUC: {micro_roc_auc_cnn:.4f}")


1D-CNN Accuracy: 0.8112992819401165

Classification Report (Full):
               precision    recall  f1-score   support

           0       0.23      0.14      0.17      9226
           1       0.83      0.99      0.90      9226
           2       1.00      0.90      0.95      9226
           3       0.86      1.00      0.92      9227
           4       0.74      0.84      0.79      9226
           5       0.95      0.89      0.92      9226
           6       0.90      1.00      0.95      9227
           7       1.00      0.99      0.99      9227
           8       0.74      0.94      0.83      9226
           9       1.00      0.99      1.00      9226
          10       0.56      0.63      0.59      9226
          11       0.73      0.45      0.55      9226

    accuracy                           0.81    110715
   macro avg       0.79      0.81      0.80    110715
weighted avg       0.79      0.81      0.80    110715


Total Metrics:
Macro F1-Score: 0.7962
Macro Precision: 0.7949
Ma

**DenseNet Result**

In [None]:
from sklearn.metrics import roc_auc_score
from tensorflow.keras.utils import to_categorical

# Evaluate `densenet_model` on the test split: accuracy, per-class report,
# aggregate precision/recall/F1, confusion matrix and ROC-AUC.

# -----------------------------
# Predictions
# -----------------------------
# Run the model over the test set once. The score matrix is reused for the
# ROC-AUC section below, so no second forward pass is needed.
y_score_dn = densenet_model.predict(X_test_1d)
y_pred_dn = y_score_dn.argmax(axis=1)

# -----------------------------
# Accuracy
# -----------------------------
acc_dn = accuracy_score(y_test, y_pred_dn)
print("DenseNet Accuracy:", acc_dn)

# -----------------------------
# Classification Report (full)
# -----------------------------
# The dict form feeds the aggregate metrics below; the text form is for display.
report = classification_report(y_test, y_pred_dn, output_dict=True)
print("\nClassification Report (Full):\n", classification_report(y_test, y_pred_dn))

# Macro and Weighted averages
macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']
weighted_precision = report['weighted avg']['precision']
weighted_recall = report['weighted avg']['recall']

print("\nTotal Metrics:")
print(f"Macro F1-Score: {macro_f1:.4f}")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall: {macro_recall:.4f}")
print(f"Weighted F1-Score: {weighted_f1:.4f}")
print(f"Weighted Precision: {weighted_precision:.4f}")
print(f"Weighted Recall: {weighted_recall:.4f}")

# -----------------------------
# Confusion Matrix
# -----------------------------
cm = confusion_matrix(y_test, y_pred_dn)
print("\nConfusion Matrix:\n", cm)

# -----------------------------
# ROC-AUC (macro & micro)
# -----------------------------
# Convert y_test to one-hot so it lines up column-by-column with the scores.
y_test_onehot = to_categorical(y_test, num_classes=num_classes)

macro_roc_auc = roc_auc_score(y_test_onehot, y_score_dn, average='macro', multi_class='ovr')
micro_roc_auc = roc_auc_score(y_test_onehot, y_score_dn, average='micro', multi_class='ovr')

print(f"\nMacro ROC-AUC: {macro_roc_auc:.4f}")
print(f"Micro ROC-AUC: {micro_roc_auc:.4f}")


[1m3460/3460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
DenseNet Accuracy: 0.7187553628686266

Classification Report (Full):
               precision    recall  f1-score   support

           0       0.59      0.05      0.10      9226
           1       0.61      0.39      0.48      9226
           2       1.00      0.90      0.94      9226
           3       0.39      0.99      0.56      9227
           4       0.67      0.84      0.75      9226
           5       0.98      0.89      0.93      9226
           6       0.87      1.00      0.93      9227
           7       0.98      1.00      0.99      9227
           8       0.78      0.92      0.84      9226
           9       1.00      1.00      1.00      9226
          10       0.49      0.49      0.49      9226
          11       0.48      0.17      0.26      9226

    accuracy                           0.72    110715
   macro avg       0.74      0.72      0.69    110715
weighted avg       0.74      0.72      0.

**HYBRID RESULT**

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from tensorflow.keras.utils import to_categorical

# Evaluate `hybrid_model` on the test split: accuracy, per-class report,
# aggregate precision/recall/F1, confusion matrix and ROC-AUC.
# NOTE(review): `hybrid_model` must already exist in the kernel when this cell
# runs. In this notebook a cell that builds and trains `hybrid_model` appears
# after this one — confirm the cell order works under Restart & Run All.

# -----------------------------
# Predictions
# -----------------------------
# Run the model over the test set once. The score matrix is reused for the
# ROC-AUC section below, so no second forward pass is needed.
y_score_hybrid = hybrid_model.predict(X_test_1d)
y_pred_hybrid = y_score_hybrid.argmax(axis=1)

# -----------------------------
# Accuracy
# -----------------------------
acc_hybrid = accuracy_score(y_test, y_pred_hybrid)
print("Hybrid CNN + DenseNet Accuracy:", acc_hybrid)

# -----------------------------
# Classification Report (full)
# -----------------------------
# The dict form feeds the aggregate metrics below; the text form is for display.
report = classification_report(y_test, y_pred_hybrid, output_dict=True)
print("\nClassification Report (Full):\n", classification_report(y_test, y_pred_hybrid))

# Macro and Weighted averages
macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']
weighted_f1 = report['weighted avg']['f1-score']
weighted_precision = report['weighted avg']['precision']
weighted_recall = report['weighted avg']['recall']

print("\nTotal Metrics:")
print(f"Macro F1-Score: {macro_f1:.4f}")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall: {macro_recall:.4f}")
print(f"Weighted F1-Score: {weighted_f1:.4f}")
print(f"Weighted Precision: {weighted_precision:.4f}")
print(f"Weighted Recall: {weighted_recall:.4f}")

# -----------------------------
# Confusion Matrix
# -----------------------------
cm = confusion_matrix(y_test, y_pred_hybrid)
print("\nConfusion Matrix:\n", cm)

# -----------------------------
# ROC-AUC (macro & micro)
# -----------------------------
# One-hot encode y_test so it lines up column-by-column with the scores.
y_test_onehot = to_categorical(y_test, num_classes=num_classes)

macro_roc_auc = roc_auc_score(y_test_onehot, y_score_hybrid, average='macro', multi_class='ovr')
micro_roc_auc = roc_auc_score(y_test_onehot, y_score_hybrid, average='micro', multi_class='ovr')

print(f"\nMacro ROC-AUC: {macro_roc_auc:.4f}")
print(f"Micro ROC-AUC: {micro_roc_auc:.4f}")


[1m3460/3460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step
Hybrid CNN + DenseNet Accuracy: 0.946899697421307

Classification Report (Full):
               precision    recall  f1-score   support

           0       0.76      0.78      0.77      9226
           1       0.92      0.99      0.96      9226
           2       1.00      1.00      1.00      9226
           3       1.00      0.99      0.99      9227
           4       0.96      0.90      0.93      9226
           5       0.94      1.00      0.97      9226
           6       0.99      1.00      0.99      9227
           7       0.98      0.99      0.99      9227
           8       0.99      0.93      0.96      9226
           9       1.00      1.00      1.00      9226
          10       0.90      0.85      0.88      9226
          11       0.92      0.93      0.93      9226

    accuracy                           0.95    110715
   macro avg       0.95      0.95      0.95    110715
weighted avg       0.95     

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# One seed for both the data split and the model. set_random_seed seeds
# Python's `random`, NumPy and TensorFlow, so weight initialisation, dropout
# masks and batch shuffling repeat from run to run. Some GPU kernels may still
# be nondeterministic.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# ----------------------------------------------------
# 🔹 PREPARE DATA (mirrors Step 12 so this cell does not depend on that
#    cell's variables; only X_final / y_final are taken from kernel state)
# ----------------------------------------------------
# Coerce every column to numeric (unparseable values become NaN), replace NaN
# with 0 and store as float32. X_final itself is left untouched.
X_final_numeric = (
    X_final
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype('float32')
)

# Stratified 80/20 train-test split.
# NOTE(review): the recorded test reports in this notebook show the same
# support for every class, which suggests X_final / y_final were class-balanced
# (e.g. oversampled) before this split — confirm. If so, resampled rows end up
# in the test set; resampling should be fitted on the training fold only.
X_train, X_test, y_train, y_test = train_test_split(
    X_final_numeric, y_final,
    test_size=0.2,
    random_state=SEED,
    stratify=y_final
)

# One-hot targets for the categorical cross-entropy loss.
# Presumably y_final holds integer labels 0..num_classes-1 — verify upstream.
num_classes = y_final.nunique()
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)  # kept for later cells; not used in this one

# ----------------------------------------------------
# 🔹 RESHAPE DATA
# ----------------------------------------------------
# Conv1D expects (samples, steps, channels): each feature becomes one step
# with a single channel.
X_train_1d = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test_1d = X_test.values.reshape(-1, X_test.shape[1], 1)

# ----------------------------------------------------
# 🔹 CNN BLOCK
# ----------------------------------------------------
# Shared model input: one step per feature, one channel per step.
n_features = X_train_1d.shape[1]
inputs = layers.Input(shape=(n_features, 1))

# The CNN branch is a plain stack, listed in the order the layers are applied:
# two same-padded convolutions with dropout in between, global average pooling
# over the feature axis, then a small dense head with dropout.
cnn_stack = [
    layers.Conv1D(16, kernel_size=3, activation='relu', padding='same'),
    layers.Dropout(0.3),
    layers.Conv1D(8, kernel_size=3, activation='relu', padding='same'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.3),
]

x_cnn = inputs
for cnn_layer in cnn_stack:
    x_cnn = cnn_layer(x_cnn)

# ----------------------------------------------------
# 🔹 DENSENET BLOCK
# ----------------------------------------------------
def dense_block(x, num_layers=1, growth_rate=5):
    """Stack `num_layers` densely connected Conv1D layers on top of `x`.

    Each step runs a same-padded, ReLU-activated Conv1D (kernel size 3) with
    `growth_rate` filters over everything accumulated so far and concatenates
    its output onto that tensor, so each step adds `growth_rate` channels.

    Args:
        x: Input tensor to the block.
        num_layers: Number of convolution + concatenation steps.
        growth_rate: Number of filters (new channels) added per step.

    Returns:
        The input concatenated with the output of every convolution step;
        `x` itself when `num_layers` is 0.
    """
    features = x
    for _step in range(num_layers):
        conv = layers.Conv1D(filters=growth_rate, kernel_size=3,
                             padding='same', activation='relu')
        new_maps = conv(features)
        features = layers.Concatenate()([features, new_maps])
    return features

def transition_layer(x, reduction=0.5):
    """Compress channels with a 1x1 convolution, then halve the sequence length.

    Args:
        x: Input tensor; its last dimension is taken as the channel count.
        reduction: Fraction of channels to keep. The result is truncated to an
            integer and never drops below one filter.

    Returns:
        The tensor after a ReLU-activated Conv1D (kernel size 1) followed by
        same-padded average pooling with pool size 2 and stride 2.
    """
    # int() truncates, so a narrow input (e.g. 1 channel at reduction=0.5)
    # would otherwise request a convolution with 0 filters.
    filters = max(1, int(x.shape[-1] * reduction))
    x = layers.Conv1D(filters=filters, kernel_size=1,
                      padding='same', activation='relu')(x)
    x = layers.AveragePooling1D(pool_size=2, strides=2, padding='same')(x)
    return x

# DenseNet-style branch. It reads the same `inputs` tensor as the CNN branch.
# GaussianNoise adds zero-mean noise (stddev 0.5) as a regulariser; Keras
# applies it during training only.
x_dn = layers.GaussianNoise(0.5)(inputs)
# 1 input channel + 5 new feature maps -> 6 channels.
x_dn = dense_block(x_dn, num_layers=1, growth_rate=5)
# Default reduction of 0.5: 6 -> 3 channels, and pooling halves the sequence length.
x_dn = transition_layer(x_dn)
# 3 channels + 5 new feature maps -> 8 channels.
x_dn = dense_block(x_dn, num_layers=1, growth_rate=5)
# Collapse the sequence axis, then a two-layer dense head with dropout after each layer.
x_dn = layers.GlobalAveragePooling1D()(x_dn)
x_dn = layers.Dense(32, activation='relu')(x_dn)
x_dn = layers.Dropout(0.3)(x_dn)
x_dn = layers.Dense(16, activation='relu')(x_dn)
x_dn = layers.Dropout(0.3)(x_dn)

# ----------------------------------------------------
# 🔹 FUSION
# ----------------------------------------------------
# Join the 16-unit CNN vector and the 16-unit DenseNet vector, then mix them
# through one dense layer.
fusion = layers.Concatenate()([x_cnn, x_dn])
fusion = layers.Dense(32, activation='relu')(fusion)

# ----------------------------------------------------
# 🔹 FINAL CLASSIFIER
# ----------------------------------------------------
# One softmax unit per class.
outputs = layers.Dense(num_classes, activation='softmax')(fusion)

# ----------------------------------------------------
# 🔹 BUILD & COMPILE
# ----------------------------------------------------
# Single-input, single-output functional model covering both branches.
hybrid_model = models.Model(inputs, outputs)
# categorical_crossentropy matches the one-hot targets produced by to_categorical.
hybrid_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# ----------------------------------------------------
# 🔹 TRAIN
# ----------------------------------------------------
# Training settings gathered in one place. Keras holds out the last 20% of the
# training arrays as validation data.
hybrid_fit_kwargs = dict(
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    verbose=1,
)
history_hybrid = hybrid_model.fit(X_train_1d, y_train_cat, **hybrid_fit_kwargs)

# ----------------------------------------------------
# 🔹 EVALUATE
# ----------------------------------------------------
# Hard labels are the arg-max over the softmax scores of the test set.
hybrid_test_scores = hybrid_model.predict(X_test_1d)
y_pred_h = hybrid_test_scores.argmax(axis=1)
acc_h = accuracy_score(y_test, y_pred_h)

print("🔥 HYBRID CNN + DenseNet Accuracy:", acc_h)

hybrid_report_text = classification_report(y_test, y_pred_h)
print("\nClassification Report:\n", hybrid_report_text)

hybrid_conf_matrix = confusion_matrix(y_test, y_pred_h)
print("\nConfusion Matrix:\n", hybrid_conf_matrix)

Epoch 1/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 9ms/step - accuracy: 0.5456 - loss: 1.3000 - val_accuracy: 0.8030 - val_loss: 0.5901
Epoch 2/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 9ms/step - accuracy: 0.8012 - loss: 0.6047 - val_accuracy: 0.8663 - val_loss: 0.4110
Epoch 3/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 9ms/step - accuracy: 0.8394 - loss: 0.4767 - val_accuracy: 0.8778 - val_loss: 0.3638
Epoch 4/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 9ms/step - accuracy: 0.8600 - loss: 0.4156 - val_accuracy: 0.8884 - val_loss: 0.3290
Epoch 5/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 9ms/step - accuracy: 0.8724 - loss: 0.3772 - val_accuracy: 0.8718 - val_loss: 0.3295
Epoch 6/10
[1m11072/11072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 9ms/step - accuracy: 0.8789 - loss: 0.3588 - val_accuracy: 0.8935 - val_loss: 0.