# AdvancedBMT Synthetic IIoT Data Generator 

>
> Apr 14, 2025 v0.0.1
>
> - Python version: 3.12.7
> - Pandas version: 1.4.2
> - Matplotlib version: 3.10.0
> - Scikit-learn version: 1.6.1
> - NumPy version: 1.26.4



In [None]:
# ### run this cell for version check
# !python --version
# import pandas
# import matplotlib
# import sklearn
# print("Pandas version:", pandas.__version__)
# print("Matplotlib version:", matplotlib.__version__)
# print("Scikit-learn version:", sklearn.__version__)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# === CONFIG ===
# Input: CSV file to load and the sensor names to process from it.
sensors_to_test = ["temperature"]
csv_path = "../test_csv/motor_temp_low.csv"

# Sliding-window length (rows per window) and model training settings.
window_size, epochs, batch_size = 30, 10, 64

# Settings intended for the train/test split (hold-out fraction and seed).
test_size, random_state = 0.2, 42

# === PROCESSING ===
# For each configured sensor: build sliding windows over the scaled features,
# train an LSTM autoencoder to reconstruct them, pick the reconstruction-error
# threshold that maximises F1 on the training split, then evaluate on the
# held-out split (classification report, confusion matrix, error plot).

# The CSV is the same for every sensor, so load it once instead of per loop.
df = pd.read_csv(csv_path, parse_dates=["timestamp"])

for sensor_name in sensors_to_test:
    print(f"\n🔍 Processing sensor: {sensor_name}")
    sensor_df = df[df["sensor"] == sensor_name].copy()
    sensor_df.sort_values(by="timestamp", inplace=True)

    # Without any labelled anomaly there is nothing to tune a threshold on.
    if sensor_df["is_anomaly"].sum() == 0:
        print(f"⚠️ No anomalies in {sensor_name}")
        continue
    print(sensor_df.query("is_anomaly == True")[["timestamp", "feature_0", "is_anomaly"]])

    # Feature engineering.
    # Drop incomplete rows *before* encoding: the encoder's integer codes are
    # never NaN, so filtering on "state_encoded" afterwards cannot remove rows
    # whose "state" is missing.
    sensor_df = sensor_df.dropna(subset=["feature_0", "state"]).reset_index(drop=True)

    le = LabelEncoder()
    sensor_df["state_encoded"] = le.fit_transform(sensor_df["state"])

    scaler = MinMaxScaler()
    features = scaler.fit_transform(sensor_df[["feature_0", "state_encoded"]])

    # Create sliding windows (stride 1). "+ 1" so the final full window, the
    # one ending on the last row, is included as well.
    n_windows = len(features) - window_size + 1
    if n_windows < 2:
        # Too few rows to form windows for both a train and a test split;
        # continuing would fail later with an opaque shape/index error.
        print(f"⚠️ Not enough rows in {sensor_name} for window_size={window_size}")
        continue

    anomaly_flags = sensor_df["is_anomaly"]
    row_timestamps = sensor_df["timestamp"]

    X, y_true, timestamps = [], [], []
    for start in range(n_windows):
        stop = start + window_size
        X.append(features[start:stop])

        # A window is labelled anomalous if any row inside it is anomalous.
        has_anomaly = bool(anomaly_flags.iloc[start:stop].any())
        if has_anomaly:
            window = sensor_df.iloc[start:stop]
            anomaly_rows = window[window["is_anomaly"] == True]
            print(f"Anomaly detected in window {start}-{stop}:")
            print(anomaly_rows[["timestamp", "feature_0", "is_anomaly"]])
        y_true.append(int(has_anomaly))

        # Each window is identified by the timestamp of its last row.
        timestamps.append(row_timestamps.iloc[stop - 1])

    X, y_true = np.array(X), np.array(y_true)

    # Train-test split, driven by the CONFIG values (test_size, random_state)
    # rather than hard-coded numbers.
    # NOTE(review): windows overlap (stride 1) and the split is shuffled, so
    # train and test windows share rows; reported metrics are optimistic.
    X_train, X_test, y_train, y_test, _, ts_test = train_test_split(
        X, y_true, timestamps,
        test_size=test_size, shuffle=True, random_state=random_state,
    )

    # LSTM autoencoder: reconstructs every time step of the input window.
    n_features = X.shape[2]
    model = Sequential([
        LSTM(64, activation='relu', return_sequences=True, input_shape=(window_size, n_features)),
        Dropout(0.2),
        LSTM(32, activation='relu', return_sequences=True),
        Dropout(0.2),
        TimeDistributed(Dense(n_features))
    ])
    model.compile(optimizer='adam', loss='mse')

    history = model.fit(
        X_train, X_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        callbacks=[EarlyStopping(patience=3)],
        verbose=1
    )

    # Per-window reconstruction error on the training split.
    train_reconstructions = model.predict(X_train)
    train_mse = np.mean(np.power(X_train - train_reconstructions, 2), axis=(1, 2))

    # Threshold: scan 100 evenly spaced candidates over the training error
    # range and keep the one with the best F1 against the training labels.
    # (An unsupervised alternative is a fixed percentile, e.g. the 95th, of
    # the training error.)
    thresholds = np.linspace(np.min(train_mse), np.max(train_mse), 100)
    f1_scores = [
        f1_score(y_train, train_mse > t, zero_division=0) for t in thresholds
    ]
    best_threshold = thresholds[np.argmax(f1_scores)]

    # Predictions on the held-out split using the training-derived threshold.
    reconstructions = model.predict(X_test)
    mse = np.mean(np.power(X_test - reconstructions, 2), axis=(1, 2))
    y_pred = (mse > best_threshold).astype(int)

    # Results
    print("\n📊 Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Optimal Threshold: {best_threshold:.6f}")

    # Confusion matrix. Passing labels explicitly keeps it 2x2 (matching the
    # tick labels) even when one class is absent from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Anomaly'],
                yticklabels=['Normal', 'Anomaly'])
    plt.title(f"Confusion Matrix - {sensor_name}")
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.show()

    # Reconstruction error over time. The split was shuffled, so restore
    # chronological order using the per-window timestamps from the split.
    ts_test = np.array(ts_test)
    sorted_indices = np.argsort(ts_test)
    ts_sorted = ts_test[sorted_indices]
    mse_sorted = mse[sorted_indices]
    y_sorted = y_test[sorted_indices]

    plt.figure(figsize=(12, 5))
    plt.plot(ts_sorted, mse_sorted, label="Reconstruction Error")
    plt.scatter(ts_sorted[y_sorted == 1], mse_sorted[y_sorted == 1],
                color="red", label="True Anomalies")
    plt.axhline(best_threshold, color='black', linestyle='--',
                label=f'Threshold: {best_threshold:.4f}')
    plt.title(f"Anomaly Detection Results - {sensor_name}")
    plt.xlabel("Timestamp")
    plt.ylabel("Reconstruction Error (MSE)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()