In [2]:
# ========================
# 0. IMPORTS
# ========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, LSTM, RepeatVector, TimeDistributed,
                                    MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D,
                                    Conv1D)
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.python.client import device_lib
import os
# ========================
# 1. CONFIGURATION
# ========================
INPUT_STEPS = 10
FORECAST_STEPS = 10
TEST_RATIO = 0.05
# Tuning parameters
EPOCHS_LIST = [10]
BATCH_SIZES = [128]
WINDOW_SIZE_SIMULATION = 10  # 6h window
THRESHOLD_PERCENTILE = 90
# Transformer parameters for anomaly detection
EMBED_DIM = 128
NUM_HEADS = 4
FF_DIM = 256
DROPOUT_RATE = 0.1
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
# ========================
# 2. DEVICE SETUP
# ========================
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.set_visible_devices(gpus[0], 'GPU')
        print("✅ GPU is available and will be used.")
    except RuntimeError as e:
        print(e)
else:
    print("⚠️ No GPU detected, running on CPU.")
# ========================
# 3. LOAD AND PREPROCESS DATA
# ========================
file_path = '../../data/cleaned_labeled_dataset.csv'
df = pd.read_csv(file_path, delimiter=',')
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')
df.set_index('DateTime', inplace=True)

# Extract labels column if it exists and drop it from the dataset
if 'labels' in df.columns:
    print("✅ Found 'labels' column. Extracting for later evaluation.")
    labels_series = df['labels'].copy()
    df = df.drop(columns=['labels'])
else:
    print("⚠️ No 'labels' column found. Will assume all samples are normal.")
    labels_series = pd.Series(np.zeros(len(df)))

# Clean
df.dropna(axis=1, thresh=int(0.7 * len(df)), inplace=True)
df.ffill(inplace=True)
df.bfill(inplace=True)

# Normalize
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df.values)
df_scaled = pd.DataFrame(scaled_data, index=df.index, columns=df.columns).astype(np.float32)
print(f"✅ Scaled dataset shape: {df_scaled.shape}")

# ========================
# 4. SEQUENTIAL TRAIN/TEST SPLIT
# ========================
split_idx = int((1 - TEST_RATIO) * len(df_scaled))
train_data = df_scaled.iloc[:split_idx]
test_data = df_scaled.iloc[split_idx:]
# Split labels in the same way
train_labels = labels_series.iloc[:split_idx]
test_labels = labels_series.iloc[split_idx:]
print(f"✅ Training samples: {len(train_data)}, Testing samples: {len(test_data)}")
# ========================
# 5. CREATE SEQUENCES
# ========================
def create_sequences(data, input_steps, forecast_steps):
    X, y = [], []
    for i in range(len(data) - input_steps - forecast_steps):
        X.append(data[i:i+input_steps])
        y.append(data[i+input_steps:i+input_steps+forecast_steps])
    return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

# Create sequence labels (a sequence is anomalous if any point in it is anomalous)
def create_sequence_labels(labels, input_steps, forecast_steps):
    seq_labels = []
    for i in range(len(labels) - input_steps - forecast_steps):
        # If any point in the forecast window is anomalous, mark the sequence as anomalous
        forecast_window_labels = labels[i+input_steps:i+input_steps+forecast_steps]
        seq_labels.append(1 if any(forecast_window_labels > 0) else 0)
    return np.array(seq_labels, dtype=np.int32)

X_train_seq, y_train_seq = create_sequences(train_data.values, INPUT_STEPS, FORECAST_STEPS)
X_test_seq, y_test_seq = create_sequences(test_data.values, INPUT_STEPS, FORECAST_STEPS)

# Create labels for test sequences - used for anomaly evaluation
test_seq_labels = create_sequence_labels(test_labels.values, INPUT_STEPS, FORECAST_STEPS)
print(f"✅ Training sequences: {X_train_seq.shape}, Testing sequences: {X_test_seq.shape}")
print(f"✅ Test sequence labels shape: {test_seq_labels.shape}, with {np.sum(test_seq_labels)} anomalous sequences")

# ========================
# 6. BUILD LSTM SEQ2SEQ MODEL
# ========================
def build_lstm_seq2seq(input_steps, forecast_steps, input_dim, units=128):
    inputs = Input(shape=(input_steps, input_dim))
    encoded = LSTM(units)(inputs)
    repeated = RepeatVector(forecast_steps)(encoded)
    decoded = LSTM(units, return_sequences=True)(repeated)
    outputs = TimeDistributed(Dense(input_dim))(decoded)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mse')
    return model

# ========================
# 7. TRAINING + TUNING
# ========================
best_val_rmse = np.inf
best_model = None
history_records = []
for epochs in EPOCHS_LIST:
    for batch_size in BATCH_SIZES:
        print(f"\n🔵 Training LSTM Seq2Seq with epochs={epochs}, batch_size={batch_size}")

        model = build_lstm_seq2seq(INPUT_STEPS, FORECAST_STEPS, X_train_seq.shape[2])
        es = EarlyStopping(patience=5, restore_best_weights=True)
        history = model.fit(X_train_seq, y_train_seq,
                            validation_split=0.1,
                            epochs=epochs,
                            batch_size=batch_size,
                            callbacks=[es],
                            verbose=1,
                            shuffle=False)

        val_preds = model.predict(X_test_seq, batch_size=batch_size)
        val_rmse = np.sqrt(mean_squared_error(y_test_seq.reshape(-1), val_preds.reshape(-1)))
        val_mae = mean_absolute_error(y_test_seq.reshape(-1), val_preds.reshape(-1))
        print(f"✅ Validation RMSE: {val_rmse:.5f}, MAE: {val_mae:.5f}")
        history_records.append({
            "epochs": epochs,
            "batch_size": batch_size,
            "val_rmse": val_rmse,
            "val_mae": val_mae
        })
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model = model
# Save tuning history
history_df = pd.DataFrame(history_records)
history_df.to_csv("lstm_seq2seq_tuning_history.csv", index=False)
print("\n📋 Tuning Results Summary:")
print(history_df)
# Save best model
best_model.save("best_lstm_seq2seq_forecaster.keras")
print("\n✅ Best LSTM Seq2Seq model saved.")

# ========================
# 8. BUILD AND TRAIN TRANSFORMER AUTOENCODER FOR ANOMALY DETECTION
# ========================
def create_ae_sequences(data, seq_len):
    return np.array([data[i:i+seq_len] for i in range(len(data) - seq_len)], dtype=np.float32)
X_ae_train = create_ae_sequences(train_data.values, FORECAST_STEPS)
def transformer_encoder(inputs, embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    # Multi-head self-attention
    attention_output = MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attention_output = Dropout(dropout_rate)(attention_output)
    attention_output = LayerNormalization(epsilon=1e-6)(inputs + attention_output)

    # Feed-forward network
    ffn_output = Dense(ff_dim, activation="relu")(attention_output)
    ffn_output = Dense(embed_dim)(ffn_output)
    ffn_output = Dropout(dropout_rate)(ffn_output)

    # Add & Norm
    return LayerNormalization(epsilon=1e-6)(attention_output + ffn_output)
def build_transformer_autoencoder(input_steps, input_dim, embed_dim=128, num_heads=4, ff_dim=256, dropout_rate=0.1):
    inputs = Input(shape=(input_steps, input_dim))

    # Initial projection to embed_dim
    x = Conv1D(filters=embed_dim, kernel_size=1, activation='relu')(inputs)

    # Encoder: Transformer blocks
    x = transformer_encoder(x, embed_dim, num_heads, ff_dim, dropout_rate)
    x = transformer_encoder(x, embed_dim, num_heads, ff_dim, dropout_rate)

    # Bottleneck
    encoded = GlobalAveragePooling1D()(x)

    # Decoder: Expand to sequence
    x = RepeatVector(input_steps)(encoded)

    # Decoder: Transformer blocks
    x = transformer_encoder(x, embed_dim, num_heads, ff_dim, dropout_rate)
    x = transformer_encoder(x, embed_dim, num_heads, ff_dim, dropout_rate)

    # Output projection back to original dimensions
    outputs = TimeDistributed(Dense(input_dim))(x)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model
# Build and train the transformer autoencoder for anomaly detection
transformer_ae = build_transformer_autoencoder(
    FORECAST_STEPS, 
    X_ae_train.shape[2],
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    ff_dim=FF_DIM,
    dropout_rate=DROPOUT_RATE
)
# Train Transformer AE
es = EarlyStopping(patience=5, restore_best_weights=True)
transformer_ae.fit(
    X_ae_train, 
    X_ae_train, 
    validation_split=0.1, 
    epochs=20, 
    batch_size=64, 
    callbacks=[es], 
    verbose=1
)
transformer_ae.save("best_transformer_autoencoder.h5")
print("\n✅ Transformer Autoencoder trained and saved.")

# Use the last 3000 points of the test data for simulation
test_data_tail = test_data.tail(3000).reset_index(drop=True)
test_labels_tail = test_labels.tail(3000).reset_index(drop=True)

# ========================
# 9. REAL-TIME SIMULATION ON TEST SET
# ========================
simulation_X, simulation_y = create_sequences(test_data_tail.values, INPUT_STEPS, FORECAST_STEPS)
# Create corresponding labels for evaluation
simulation_labels = create_sequence_labels(test_labels_tail.values, INPUT_STEPS, FORECAST_STEPS)

forecast_list = []
reconstruction_list = []
reconstruction_errors = []
anomaly_flags_list = []
true_windows = []
for i in range(0, len(simulation_X), WINDOW_SIZE_SIMULATION):
    window_X = simulation_X[i:i+1]
    window_y_true = simulation_y[i]
    # Use LSTM Seq2Seq for forecasting
    y_pred_future = best_model.predict(window_X, batch_size=128, verbose=1)[0]
    X_forecast = np.expand_dims(y_pred_future, axis=0)

    # Use Transformer Autoencoder for anomaly detection
    y_reconstructed = transformer_ae.predict(X_forecast, batch_size=128, verbose=1)[0]
    reconstruction_error = np.mean((y_pred_future - y_reconstructed)**2, axis=1)
    threshold = np.percentile(reconstruction_error, THRESHOLD_PERCENTILE)
    anomaly_flags = reconstruction_error > threshold
    forecast_list.append(y_pred_future)
    reconstruction_list.append(y_reconstructed)
    reconstruction_errors.append(reconstruction_error)
    anomaly_flags_list.append(anomaly_flags)
    true_windows.append(window_y_true)
print("\n✅ Real-time simulation complete.")

# ========================
# 10. EVALUATION
# ========================
# Forecasting metrics
y_pred_all = np.vstack(forecast_list)
y_true_all = np.vstack(true_windows)
forecast_rmse = np.sqrt(mean_squared_error(y_true_all.reshape(-1), y_pred_all.reshape(-1)))
forecast_mae = mean_absolute_error(y_true_all.reshape(-1), y_pred_all.reshape(-1))
print(f"\n📈 Forecasting Evaluation on Test:")
print(f"RMSE: {forecast_rmse:.5f}")
print(f"MAE:  {forecast_mae:.5f}")

# Anomaly detection metrics
all_detected = np.hstack(anomaly_flags_list)

# Use the true labels from the dataset instead of assuming all normal
# Take only the labels corresponding to the windows we evaluated
true_labels_subset = simulation_labels[:len(all_detected)]

print(f"\nAnomaly detection evaluation using true labels:")
print(f"Number of true anomalies: {np.sum(true_labels_subset)}")
print(f"Number of detected anomalies: {np.sum(all_detected)}")

# Compute metrics using true labels
precision = precision_score(true_labels_subset, all_detected, zero_division=0)
recall = recall_score(true_labels_subset, all_detected, zero_division=0)
f1 = f1_score(true_labels_subset, all_detected, zero_division=0)

print(f"\n📈 Anomaly Detection Evaluation:")
print(f"Precision: {precision:.5f}")
print(f"Recall:    {recall:.5f}")
print(f"F1 Score:  {f1:.5f}")

# ========================
# 11. SAVE METRICS
# ========================
metrics_results = {
    "Model": "LSTM Seq2Seq + Transformer AE",
    "Forecast_RMSE": forecast_rmse,
    "Forecast_MAE": forecast_mae,
    "Anomaly_Precision": precision,
    "Anomaly_Recall": recall,
    "Anomaly_F1": f1
}
metrics_df = pd.DataFrame([metrics_results])
metrics_df.to_csv("metrics_lstm_seq2seq_transformer_ae_pipeline.csv", index=False)
print("\n✅ Metrics saved to 'metrics_lstm_seq2seq_transformer_ae_pipeline.csv'.")

# ========================
# 12. PLOTS
# ========================
# Plot Reconstruction Errors with True Labels
plt.figure(figsize=(14,5))
all_errors = np.hstack(reconstruction_errors)
plt.plot(all_errors, label='Reconstruction Error')
plt.axhline(np.percentile(all_errors, THRESHOLD_PERCENTILE), color='red', linestyle='--', label='Threshold')

# Detected anomalies
detected_indices = np.where(all_errors > np.percentile(all_errors, THRESHOLD_PERCENTILE))[0]
plt.scatter(detected_indices,
            all_errors[detected_indices],
            color='red', label='Detected Anomalies', s=10)

# True anomalies
true_anomaly_indices = np.where(true_labels_subset == 1)[0]
plt.scatter(true_anomaly_indices,
            np.ones_like(true_anomaly_indices) * np.max(all_errors)*0.9,
            color='green', marker='*', label='True Anomalies', s=20)

plt.title("Reconstruction Errors vs True Anomalies")
plt.xlabel("Forecast Steps")
plt.ylabel("Error")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("reconstruction_errors_lstm_transformer_pipeline.png")
plt.show()

# Plot Confusion Matrix as a heatmap
from sklearn.metrics import confusion_matrix
import seaborn as sns

conf_matrix = confusion_matrix(true_labels_subset, all_detected)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Normal', 'Anomaly'],
            yticklabels=['Normal', 'Anomaly'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Anomaly Detection Confusion Matrix')
plt.tight_layout()
plt.savefig('anomaly_detection_confusion_matrix.png')
plt.show()

# Plot Metrics
metric_names = ["Forecast_RMSE", "Forecast_MAE", "Anomaly_Precision", "Anomaly_Recall", "Anomaly_F1"]
metric_values = [forecast_rmse, forecast_mae, precision, recall, f1]
plt.figure(figsize=(10,6))
plt.bar(metric_names, metric_values, color='gold')
plt.title("LSTM Seq2Seq + Transformer AE Performance Metrics")
plt.ylabel("Score / Error")
plt.grid(axis='y')
plt.tight_layout()
plt.savefig("metrics_bar_chart_lstm_transformer_pipeline.png")
plt.show()

✅ GPU is available and will be used.
✅ Found 'labels' column. Extracting for later evaluation.
✅ Scaled dataset shape: (18652, 26)
✅ Training samples: 17719, Testing samples: 933
✅ Training sequences: (17699, 10, 26), Testing sequences: (913, 10, 26)
✅ Test sequence labels shape: (913,), with 1 anomalous sequences

🔵 Training LSTM Seq2Seq with epochs=10, batch_size=128
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
✅ Validation RMSE: 0.27874, MAE: 0.19559

📋 Tuning Results Summary:
   epochs  batch_size  val_rmse  val_mae
0      10         128  0.278739  0.19559

✅ Best LSTM Seq2Seq model saved.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

✅ Transformer Autoencoder trained and saved.

✅ Real-time simulation complete.

📈 Forecasting Evaluat

ValueError: Found input variables with inconsistent numbers of samples: [913, 920]