In [None]:
# Notebook-wide inputs handed to load_device_data for every device:
# the file pattern to match and the directory to look in.
file_extension = '*.csv'
base_directory = "../rawdata"

In [None]:
from scripts.utils import load_device_data

def _load(device_name):
    """Call load_device_data for one device with the shared directory and file pattern."""
    return load_device_data(base_directory, file_extension, device_name)


# Doorbells
danmini_doorbell_df = _load('Danmini_Doorbell')
ennio_doorbell_df = _load('Ennio_Doorbell')

# Ecobee Thermostat
ecobee_thermostat_df = _load('Ecobee_Thermostat')

# Web cam
samsung_cam_df = _load('Samsung_SNH_1011_N_Webcam')

# Baby monitor
baby_monitor_df = _load('Philips_B120N10_Baby_Monitor')

# Security cams
provision_cam1_df = _load('Provision_PT_737E_Security_Camera')
provision_cam2_df = _load('Provision_PT_838_Security_Camera')
simplehome_cam1_df = _load('SimpleHome_XCS7_1002_WHT_Security_Camera')
simplehome_cam2_df = _load('SimpleHome_XCS7_1003_WHT_Security_Camera')

In [None]:
# Maps device name -> loaded DataFrame.
# The keys are passed on as `device_name` by the loops further down, where they
# become part of the saved-model path (models/<device_name>/...), so they are
# kept in the same identifier style as the names given to load_device_data.
dataframe = {
    "Danmini_Doorbell": danmini_doorbell_df,
    "Ecobee_Thermostat": ecobee_thermostat_df,
    "Ennio_Doorbell": ennio_doorbell_df,
    "Philips_B120N10_Baby_Monitor": baby_monitor_df,
    "Provision_PT_737E_Security_Camera": provision_cam1_df,
    "Provision_PT_838_Security_Camera": provision_cam2_df,
    "Samsung_SNH_1011_N_Webcam": samsung_cam_df,
    "SimpleHome_XCS7_1002_WHT_Security_Camera": simplehome_cam1_df,
    "SimpleHome_XCS7_1003_WHT_Security_Camera": simplehome_cam2_df,
}

In [None]:
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight
from sklearn.metrics import precision_score, recall_score, f1_score # <-- Added this import

# Helper function to create the sliding windows
def create_sequences(X, y, time_steps=10):
    """
    Creates sliding-window sequences from row-ordered data.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with ``y[i + time_steps]``, i.e. the label of the row immediately *after*
    the window. ``len(X) - time_steps`` windows are produced.

    X: Input features (scaled); array-like whose first axis indexes rows
    y: Input labels (encoded); array-like with at least ``len(X)`` entries.
       Labels are taken by position, so a pandas Series with any index works.
    time_steps: How many previous rows to use for prediction (must be >= 1)

    Returns ``(Xs, ys)`` as numpy arrays. ``Xs`` has shape
    ``(n_windows, time_steps, *X.shape[1:])`` and ``ys`` has ``n_windows``
    entries. When ``len(X) <= time_steps`` both have a first dimension of 0.

    Raises ValueError if ``time_steps < 1`` or ``y`` is shorter than ``X``.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")

    # Work on plain arrays so that all indexing below is positional.
    features = np.asarray(X)
    labels = np.asarray(y)
    if len(labels) < len(features):
        raise ValueError(
            f"y has {len(labels)} entries but X has {len(features)} rows"
        )

    n_windows = max(len(features) - time_steps, 0)

    # Row i of `window_rows` is [i, i + 1, ..., i + time_steps - 1].
    window_rows = np.arange(n_windows)[:, None] + np.arange(time_steps)[None, :]

    Xs = features[window_rows]  # integer-array indexing returns a new array
    ys = labels[time_steps:time_steps + n_windows].copy()

    return Xs, ys

# --- Training entry point: CNN-LSTM classifier ---

def cnn_lstm_classifier(data, device_name, n_steps=10):
    """
    Trains, evaluates and saves a Conv1D + LSTM classifier for one device.

    Steps: drop the 'label' and 'device' columns to obtain the features,
    integer-encode the labels, make a stratified 70/30 split, standardise with
    statistics fitted on the training rows only, build sliding windows with
    ``create_sequences``, train for 10 epochs using balanced class weights,
    report accuracy, loss and weighted precision / recall / F1 on the test
    windows, then save the model to
    ``models/<device_name>/<device_name>_cnn_lstm_model.h5``.

    data: Your input pandas DataFrame (feature columns plus 'label' and 'device')
    device_name: String name for saving the model
    n_steps: The number of time steps (rows) to look back (hyperparameter)

    Returns a status string: the success message with the metrics dict
    embedded, or "Training failed." when no windows could be built for the
    training or the test split.
    """

    print("Starting model training...")

    # --- 1. Initial Data Prep ---
    # Separate features and labels
    X = data.drop(['label', 'device'], axis=1)
    y = data['label']

    # Get feature count *before* scaling
    n_features = X.shape[1]
    print(f"Original data shape: {X.shape}")

    # Encode labels to consecutive integers 0 .. n_classes - 1
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Get the number of unique classes
    n_classes = len(label_encoder.classes_)
    print(f"Found {n_features} features and {n_classes} classes.")

    # --- 2. Stratified Split FIRST ---
    # Splitting before windowing keeps train and test windows from sharing rows.
    # NOTE(review): a stratified train_test_split shuffles the rows, so the
    # windows built in step 4 are made of rows that were not necessarily
    # adjacent in `data` -- confirm that this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=0.3,
        random_state=42,
        stratify=y_encoded  # keep class proportions equal in both splits
    )

    # --- 3. Scale Data ---
    # Fit *only* on training data so no test statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- 4. Create Sequences ---
    # Sliding windows of shape [samples, n_steps, n_features]
    print(f"Creating sequences with n_steps={n_steps}...")
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, n_steps)
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test, n_steps)

    # Both splits are needed below: training windows for fit(), test windows
    # for validation and for the reported metrics.
    if X_train_seq.shape[0] == 0 or X_test_seq.shape[0] == 0:
        print("Error: No sequences created. Is n_steps larger than your data?")
        return "Training failed."

    print(f"New training data shape: {X_train_seq.shape}") # Should be [samples, n_steps, n_features]
    print(f"New training label shape: {y_train_seq.shape}") # Should be [samples]

    # --- 5. One-Hot Encode Labels ---
    # Done after windowing so the one-hot rows line up with the windows.
    y_train_cat = to_categorical(y_train_seq, num_classes=n_classes)
    y_test_cat = to_categorical(y_test_seq, num_classes=n_classes)

    # --- 6. Handle Class Imbalance ---
    # 'balanced' weights are returned in the order of `present_classes`.
    # Key the dict by the actual class ids: enumerating the weights would
    # shift them onto the wrong classes if any class is missing from the
    # training windows.
    present_classes = np.unique(y_train_seq)
    cw = class_weight.compute_class_weight(
        'balanced',
        classes=present_classes,
        y=y_train_seq
    )
    class_weights = {
        int(class_id): float(weight)
        for class_id, weight in zip(present_classes, cw)
    }
    print(f"Calculated class weights: {class_weights}")

    # --- 7. Build and Compile Model ---
    model = Sequential()
    # The input shape is (timesteps, features)
    model.add(Conv1D(
        filters=64,
        kernel_size=3,
        activation='relu',
        input_shape=(n_steps, n_features)
    ))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dropout(0.5))
    # Output layer has one neuron per class
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    # --- 8. Train the Model ---
    model.fit(
        X_train_seq,
        y_train_cat,
        epochs=10,  # You may need more epochs
        batch_size=64,
        validation_data=(X_test_seq, y_test_cat),
        class_weight=class_weights,
        verbose=1
    )

    # --- 9. Evaluate ---
    print("\nEvaluating model on test data...")
    loss, accuracy = model.evaluate(X_test_seq, y_test_cat, verbose=0)

    # --- 10. Calculate Precision, Recall, F1-Score ---
    print("Calculating additional metrics...")
    y_pred_prob = model.predict(X_test_seq)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Compare against the integer-encoded test labels (y_test_seq).
    # 'weighted' averages the per-class scores by class support.
    precision = precision_score(y_test_seq, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test_seq, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test_seq, y_pred, average='weighted', zero_division=0)

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Loss: {loss:.4f}")
    print(f"Test Precision (Weighted): {precision:.4f}")
    print(f"Test Recall (Weighted): {recall:.4f}")
    print(f"Test F1-Score (Weighted): {f1:.4f}")

    results = {
        'Accuracy': accuracy,
        'Loss': loss,
        'Precision_Weighted': precision,
        'Recall_Weighted': recall,
        'F1_Score_Weighted': f1,
        'n_steps': n_steps,
        'features': n_features
    }

    # --- 11. Save ---
    # exist_ok avoids the check-then-create race and handles nested paths.
    modelpath = f'models/{device_name}'
    os.makedirs(modelpath, exist_ok=True)

    model.save(f'{modelpath}/{device_name}_cnn_lstm_model.h5')

    return f'CNN-LSTM Model trained and saved successfully \n {results}'


In [None]:
# Train, evaluate and save one CNN-LSTM model per device, framing each report.
for k, device_df in dataframe.items():
    print(
        "----------------------xxxxxxx----------------------",
        k,
        "----------------------xxxxxxx----------------------",
        sep="\n",
    )

    # New CNN-LSTM Classifier
    results_cnn_lstm = cnn_lstm_classifier(device_df, k)
    print("CNN-LSTM Results:", results_cnn_lstm, sep="\n")

    print("---------------------xxxxxxx-----------------------")

In [None]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import load_model

# --- This helper function MUST be identical to the one used in training ---
def create_sequences(X, y, time_steps=10):
    """
    Creates sliding-window sequences from row-ordered data.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with ``y[i + time_steps]``, i.e. the label of the row immediately *after*
    the window. ``len(X) - time_steps`` windows are produced.

    X: Input features (scaled); array-like whose first axis indexes rows
    y: Input labels (encoded); array-like with at least ``len(X)`` entries.
       Labels are taken by position, so a pandas Series with any index works.
    time_steps: How many previous rows to use for prediction (must be >= 1)

    Returns ``(Xs, ys)`` as numpy arrays. ``Xs`` has shape
    ``(n_windows, time_steps, *X.shape[1:])`` and ``ys`` has ``n_windows``
    entries. When ``len(X) <= time_steps`` both have a first dimension of 0.

    Raises ValueError if ``time_steps < 1`` or ``y`` is shorter than ``X``.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")

    # Work on plain arrays so that all indexing below is positional.
    features = np.asarray(X)
    labels = np.asarray(y)
    if len(labels) < len(features):
        raise ValueError(
            f"y has {len(labels)} entries but X has {len(features)} rows"
        )

    n_windows = max(len(features) - time_steps, 0)

    # Row i of `window_rows` is [i, i + 1, ..., i + time_steps - 1].
    window_rows = np.arange(n_windows)[:, None] + np.arange(time_steps)[None, :]

    Xs = features[window_rows]  # integer-array indexing returns a new array
    ys = labels[time_steps:time_steps + n_windows].copy()

    return Xs, ys

def plot_model_predictions(data, device_name, n_steps=10):
    """
    Loads a saved CNN-LSTM model, re-creates the test set,
    and plots the actual vs. predicted results.

    The test windows are rebuilt with the same steps as in training (same
    column drop, label encoding, stratified 70/30 split with random_state=42,
    scaler fitted on the training rows, same ``create_sequences``), then the
    function shows a confusion-matrix heatmap, a line plot of actual vs.
    predicted class ids for up to the first 500 test windows, and prints a
    classification report.

    data: The *original* pandas DataFrame used for training
    device_name: String name used when saving the model
    n_steps: The number of time steps (must match training)

    Returns "Plotting and reporting complete." on success, or None (after
    printing an error) when the model file is missing or no test windows can
    be built.
    """

    print("Loading model and preparing data for plotting...")

    # --- 1. Check for the saved model before doing any preprocessing ---
    model_path = f'models/{device_name}/{device_name}_cnn_lstm_model.h5'
    if not os.path.exists(model_path):
        print(f"Error: Model file not found at {model_path}")
        return

    # --- 2. Re-create the Test Set (CRITICAL) ---
    # Follow the *exact* same preprocessing steps as in training
    # to get the identical test set.
    X = data.drop(['label', 'device'], axis=1)
    y = data['label']

    n_features = X.shape[1]

    # Encode labels to consecutive integers 0 .. n_classes - 1
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    n_classes = len(label_encoder.classes_)

    print(f"Loaded {n_features} features and {n_classes} classes.")

    # Stratified Split (must use same random_state as training)
    X_train, X_test, _y_train, y_test = train_test_split(
        X, y_encoded,
        test_size=0.3,
        random_state=42,
        stratify=y_encoded
    )

    # Scale Data: fit on the training rows, transform only the test rows
    # (the scaled training rows themselves are not needed here).
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create Sequences
    print(f"Creating test sequences with n_steps={n_steps}...")
    X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test, n_steps)

    if X_test_seq.shape[0] == 0:
        print("Error: No test sequences created. Check n_steps and data size.")
        return

    # --- 3. Load the Saved Model ---
    print(f"Loading model from {model_path}...")
    model = load_model(model_path)
    model.summary()

    # --- 4. Make Predictions ---
    print("Generating predictions on test set...")
    y_pred_prob = model.predict(X_test_seq)
    y_pred = np.argmax(y_pred_prob, axis=1) # Get the class with highest probability

    # y_test_seq contains the true, integer-encoded labels
    # y_pred contains the predicted, integer-encoded labels

    # Pin the label set to every encoded class. Without it, a class that
    # appears in neither y_test_seq nor y_pred is dropped, which shifts the
    # heatmap tick labels and makes classification_report reject target_names.
    class_ids = np.arange(n_classes)
    class_names = [str(name) for name in label_encoder.classes_]

    # --- 5. Plot Confusion Matrix ---
    print("Generating Confusion Matrix...")
    cm = confusion_matrix(y_test_seq, y_pred, labels=class_ids)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names
    )
    plt.title(f'Confusion Matrix - {device_name}', fontsize=16)
    plt.ylabel('Actual Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.show()

    # --- 6. Plot Actual vs. Predicted Time Series ---
    print("Generating Actual vs. Predicted sequence plot...")
    plt.figure(figsize=(15, 6))

    # Plot a subset if the test set is too large to see
    plot_limit = min(500, len(y_test_seq)) # Plot first 500 samples or all

    plt.plot(
        y_test_seq[:plot_limit],
        label='Actual Labels',
        linestyle='-',
        marker='o',
        alpha=0.7,
        markersize=6
    )
    plt.plot(
        y_pred[:plot_limit],
        label='Predicted Labels',
        linestyle='--',
        marker='x',
        alpha=0.7,
        markersize=6
    )

    plt.title(f'Actual vs. Predicted Labels (First {plot_limit} Test Samples)', fontsize=16)
    plt.xlabel('Test Sample Index', fontsize=12)
    plt.ylabel('Class Label', fontsize=12)
    # Set y-ticks to match class labels
    plt.yticks(ticks=class_ids, labels=class_names)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

    # --- 7. Print Classification Report ---
    print("\n--- Classification Report ---\n")
    report = classification_report(
        y_test_seq,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        zero_division=0
    )
    print(report)

    return "Plotting and reporting complete."

In [None]:
# Produce the evaluation plots and report for every device, framing each one.
for k, device_df in dataframe.items():
    print(
        "----------------------xxxxxxx----------------------",
        k,
        "----------------------xxxxxxx----------------------",
        sep="\n",
    )

    results = plot_model_predictions(device_df, k)
    print(results)

    print("---------------------xxxxxxx-----------------------")