In [None]:
# ========================
# 0. IMPORTS
# ========================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Dense, LSTM, RepeatVector, TimeDistributed,
                                    MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D,
                                    Conv1D)
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.python.client import device_lib
import os
# ========================
# 1. CONFIGURATION
# ========================
# Forecaster window sizes: the LSTM reads INPUT_STEPS consecutive rows and
# predicts the next FORECAST_STEPS rows. FORECAST_STEPS is also the window
# length of the autoencoder, which is applied to the forecaster's output.
INPUT_STEPS = 10
FORECAST_STEPS = 10
# Fraction of rows (taken from the end of the series) held out for testing.
TEST_RATIO = 0.2
# Tuning parameters: every (epochs, batch_size) combination is trained once.
EPOCHS_LIST = [20]
BATCH_SIZES = [128]
# Number of sequences processed per iteration of the simulation loop.
# NOTE(review): "6h" presumably refers to the dataset's sampling rate — confirm.
WINDOW_SIZE_SIMULATION = 10  # 6h window
# Transformer parameters for anomaly detection
EMBED_DIM = 128
NUM_HEADS = 4
FF_DIM = 256
DROPOUT_RATE = 0.1
# Seed both NumPy and TensorFlow so runs are repeatable.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
# ========================
# 2. DEVICE SETUP
# ========================
# Use the first GPU if TensorFlow reports any; otherwise fall back to CPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth on every detected GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Restrict TensorFlow to the first GPU only.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        print("✅ GPU is available and will be used.")
    except RuntimeError as e:
        # Best effort: the error is only printed and execution continues.
        # Per the TensorFlow docs these settings raise RuntimeError when
        # changed after the devices have been initialized.
        print(e)
else:
    print("⚠️ No GPU detected, running on CPU.")
# ========================
# 3. LOAD AND PREPROCESS DATA
# ========================
# Two datasets are loaded and prepared the same way (parse 'DateTime', use it
# as the index, split off an optional 'labels' column, min-max scale the rest):
#   * df          -> training data for the anomaly-detection autoencoder
#   * df_forecast -> data for the LSTM forecaster and the labelled evaluation
# NOTE: errors='coerce' turns unparseable timestamps into NaT instead of raising.
# NOTE(review): section 4 derives a single split index from df_scaled and
# applies it to both frames, so both CSVs are expected to have the same number
# of rows — confirm.
file_path = '../../data/cleaned_labeled_dataset.csv'
df = pd.read_csv(file_path, delimiter=',')
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')
df.set_index('DateTime', inplace=True)

file_path2 = '../../data/preprocessed_data.csv'
# Read the forecast dataset from file_path2. Reading file_path here (as the
# code previously did) silently loaded the labeled dataset a second time and
# left file_path2 unused.
df_forecast = pd.read_csv(file_path2, delimiter=',')
df_forecast['DateTime'] = pd.to_datetime(df_forecast['DateTime'], errors='coerce')
df_forecast.set_index('DateTime', inplace=True)

# Extract labels column if it exists and drop it from the dataset
if 'labels' in df.columns:
    print("✅ Found 'labels' column. Extracting for later evaluation.")
    labels_series = df['labels'].copy()
    df = df.drop(columns=['labels'])
else:
    print("⚠️ No 'labels' column found. Will assume all samples are normal.")
    labels_series = pd.Series(np.zeros(len(df)))

# Same extraction for the forecast dataset
if 'labels' in df_forecast.columns:
    print("✅ Found 'labels' column. Extracting for later evaluation.")
    labels_df_forecast = df_forecast['labels'].copy()
    df_forecast = df_forecast.drop(columns=['labels'])
else:
    print("⚠️ No 'labels' column found. Will assume all samples are normal.")
    # Keep df_forecast intact and create all-zero labels. Assigning the zeros
    # to df_forecast itself would destroy the features and leave
    # labels_df_forecast undefined for the split in section 4.
    labels_df_forecast = pd.Series(np.zeros(len(df_forecast)))


# Normalize the detection dataset to [0, 1] per column
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df.values)
df_scaled = pd.DataFrame(scaled_data, index=df.index, columns=df.columns).astype(np.float32)
print(f"✅ Scaled dataset shape: {df_scaled.shape}")


# Normalize the forecast dataset with its own scaler.
# NOTE: `scaler` is rebound here, so after this section it refers to the
# scaler fitted on the forecast data, not the detection data.
scaler = MinMaxScaler()
scaled_data_forecast = scaler.fit_transform(df_forecast.values)
scaled_data_forecast2 = pd.DataFrame(scaled_data_forecast, index=df_forecast.index, columns=df_forecast.columns).astype(np.float32)
print(f"✅ Scaled dataset shape: {scaled_data_forecast2.shape}")


# ========================
# 4. SEQUENTIAL TRAIN/TEST SPLIT
# ========================
# Chronological split: the first (1 - TEST_RATIO) of the rows are used for
# training, the remainder for testing. No shuffling is involved.
# NOTE(review): split_idx is computed from df_scaled but also applied to
# scaled_data_forecast2 and labels_df_forecast; the split ratio only holds for
# those if they have the same number of rows as df_scaled — confirm.
split_idx = int((1 - TEST_RATIO) * len(df_scaled))
# Training rows for the anomaly-detection autoencoder.
train_data_detection = df_scaled.iloc[:split_idx]
# Training / test rows for the forecaster.
train_data = scaled_data_forecast2.iloc[:split_idx]
test_data = scaled_data_forecast2.iloc[split_idx:]
# Split labels in the same way
train_labels = labels_df_forecast.iloc[:split_idx]
test_labels = labels_df_forecast.iloc[split_idx:]
print(f"✅ Training samples: {len(train_data)}, Testing samples: {len(test_data)}")
# ========================
# 5. CREATE SEQUENCES
# ========================
def create_sequences(data, input_steps, forecast_steps):
    """
    Slice a series into (history, future) training pairs with stride 1.

    Args:
        data: Sliceable sequence of rows (e.g. a 2-D array, one row per step)
        input_steps: Number of rows in each history window
        forecast_steps: Number of rows in each future window, taken
            immediately after the history window

    Returns:
        Tuple (X, y) of float32 arrays. One pair is produced for every start
        index in range(len(data) - input_steps - forecast_steps); when that
        range is empty both arrays are empty.
    """
    window_starts = range(len(data) - input_steps - forecast_steps)
    history = [data[start:start + input_steps] for start in window_starts]
    future = [
        data[start + input_steps:start + input_steps + forecast_steps]
        for start in window_starts
    ]
    return np.array(history, dtype=np.float32), np.array(future, dtype=np.float32)

# Create sequence labels (a sequence is anomalous if any point in its forecast window is anomalous)
def create_sequence_labels(labels, input_steps, forecast_steps):
    """
    Derive one binary label per sequence from point-level labels.

    Sequence k covers the forecast window
    labels[k + input_steps : k + input_steps + forecast_steps] and is marked
    anomalous (1) when any value in that window is greater than zero.

    Args:
        labels: Sliceable sequence of point-level labels (0=normal, >0=anomaly)
        input_steps: Number of input steps; only used as the offset of the
            forecast window, the input window itself is not inspected
        forecast_steps: Number of forecast steps

    Returns:
        int32 array with one entry per start index in
        range(len(labels) - input_steps - forecast_steps)
    """
    horizon_end = input_steps + forecast_steps
    flags = [
        # Any strictly positive label inside the forecast window -> 1, else 0
        int(np.any(np.array(labels[k + input_steps:k + horizon_end]) > 0))
        for k in range(len(labels) - horizon_end)
    ]
    return np.array(flags, dtype=np.int32)

# Build (input window, forecast window) pairs separately for the train and
# test partitions, so no window spans the split boundary.
X_train_seq, y_train_seq = create_sequences(train_data.values, INPUT_STEPS, FORECAST_STEPS)
X_test_seq, y_test_seq = create_sequences(test_data.values, INPUT_STEPS, FORECAST_STEPS)

# Create labels for test sequences - used for anomaly evaluation
# (same window arithmetic as create_sequences, so entry k labels X_test_seq[k])
test_seq_labels = create_sequence_labels(test_labels.values, INPUT_STEPS, FORECAST_STEPS)
print(f"✅ Training sequences: {X_train_seq.shape}, Testing sequences: {X_test_seq.shape}")
print(f"✅ Test sequence labels shape: {test_seq_labels.shape}, with {np.sum(test_seq_labels)} anomalous sequences")

# ========================
# 6. BUILD LSTM SEQ2SEQ MODEL
# ========================
def build_lstm_seq2seq(input_steps, forecast_steps, input_dim, units=128):
    """
    Build and compile an LSTM encoder-decoder forecaster.

    The encoder LSTM condenses the input window into a single vector, which is
    repeated once per forecast step and decoded by a second LSTM; a
    time-distributed Dense layer maps each decoder step back to input_dim.

    Args:
        input_steps: Length of the input window
        forecast_steps: Number of steps to predict
        input_dim: Number of features per step (input and output)
        units: Hidden size of both LSTM layers

    Returns:
        Keras Model compiled with the 'adam' optimizer and 'mse' loss
    """
    window_in = Input(shape=(input_steps, input_dim))

    # Encoder: only the final LSTM output is kept
    context = LSTM(units)(window_in)

    # Decoder: feed the same context vector at every forecast step
    context_per_step = RepeatVector(forecast_steps)(context)
    decoder_out = LSTM(units, return_sequences=True)(context_per_step)

    forecast = TimeDistributed(Dense(input_dim))(decoder_out)

    forecaster = Model(inputs=window_in, outputs=forecast)
    forecaster.compile(loss='mse', optimizer='adam')
    return forecaster

# ========================
# 7. TRAINING + TUNING
# ========================
# Grid search over (epochs, batch_size): train one forecaster per combination,
# score it, and keep the one with the lowest RMSE.
best_val_rmse = np.inf
best_model = None
history_records = []
for epochs in EPOCHS_LIST:
    for batch_size in BATCH_SIZES:
        print(f"\n🔵 Training LSTM Seq2Seq with epochs={epochs}, batch_size={batch_size}")

        # Fresh model for every combination; feature count comes from the data.
        model = build_lstm_seq2seq(INPUT_STEPS, FORECAST_STEPS, X_train_seq.shape[2])
        # Stop after 5 epochs without improvement and roll back to the best
        # weights (monitored quantity is the Keras default).
        es = EarlyStopping(patience=5, restore_best_weights=True)
        # shuffle=False keeps the training samples in chronological order.
        history = model.fit(X_train_seq, y_train_seq,
                            validation_split=0.1,
                            epochs=epochs,
                            batch_size=batch_size,
                            callbacks=[es],
                            verbose=1,
                            shuffle=False)

        # NOTE(review): despite the "val_" names, these metrics are computed on
        # the TEST sequences, so model selection below uses the test set.
        val_preds = model.predict(X_test_seq, batch_size=batch_size)
        # Errors are computed over all flattened (sample, step, feature) values,
        # in scaled units.
        val_rmse = np.sqrt(mean_squared_error(y_test_seq.reshape(-1), val_preds.reshape(-1)))
        val_mae = mean_absolute_error(y_test_seq.reshape(-1), val_preds.reshape(-1))
        print(f"✅ Validation RMSE: {val_rmse:.5f}, MAE: {val_mae:.5f}")
        history_records.append({
            "epochs": epochs,
            "batch_size": batch_size,
            "val_rmse": val_rmse,
            "val_mae": val_mae
        })
        # NOTE(review): if val_rmse is NaN for every run this comparison is
        # never true and best_model stays None, so the save below would fail.
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model = model
# Save tuning history
history_df = pd.DataFrame(history_records)
history_df.to_csv("lstm_seq2seq_tuning_history.csv", index=False)
print("\n📋 Tuning Results Summary:")
print(history_df)
# Save best model
best_model.save("best_lstm_seq2seq_forecaster.keras")
print("\n✅ Best LSTM Seq2Seq model saved.")



In [None]:
# ========================
# 8. BUILD AND TRAIN TRANSFORMER AUTOENCODER FOR ANOMALY DETECTION
# ========================
def create_ae_sequences(data, seq_len):
    """
    Cut a series into stride-1 windows of seq_len rows for the autoencoder.

    Args:
        data: Sliceable sequence of rows
        seq_len: Number of rows per window

    Returns:
        float32 array with one window per start index in
        range(len(data) - seq_len); empty when that range is empty
    """
    windows = []
    for start in range(len(data) - seq_len):
        windows.append(data[start:start + seq_len])
    return np.array(windows, dtype=np.float32)
# Autoencoder training windows are FORECAST_STEPS long, matching the shape of
# the forecaster output that the autoencoder reconstructs during simulation.
X_ae_train = create_ae_sequences(train_data_detection.values, FORECAST_STEPS)
def transformer_encoder(inputs, embed_dim, num_heads, ff_dim, dropout_rate=0.1):
    """
    Apply one Transformer block: self-attention then feed-forward, each
    followed by dropout, a residual connection and layer normalization.

    Args:
        inputs: Keras tensor whose last dimension must equal embed_dim so the
            residual additions are valid
        embed_dim: Width of the block output; also passed as key_dim to the
            attention layer
        num_heads: Number of attention heads
        ff_dim: Hidden width of the feed-forward network
        dropout_rate: Dropout rate used after both sub-layers

    Returns:
        Keras tensor produced by the final layer normalization
    """
    # --- Self-attention sub-layer (query and value are both `inputs`) ---
    self_attention = MultiHeadAttention(key_dim=embed_dim, num_heads=num_heads)
    attended = self_attention(inputs, inputs)
    attended = Dropout(dropout_rate)(attended)
    attended_normed = LayerNormalization(epsilon=1e-6)(inputs + attended)

    # --- Feed-forward sub-layer: expand to ff_dim, project back to embed_dim ---
    hidden = Dense(ff_dim, activation="relu")(attended_normed)
    projected = Dense(embed_dim)(hidden)
    projected = Dropout(dropout_rate)(projected)

    # Residual connection around the feed-forward sub-layer, then normalize
    return LayerNormalization(epsilon=1e-6)(attended_normed + projected)
def build_transformer_autoencoder(input_steps, input_dim, embed_dim=128, num_heads=4, ff_dim=256, dropout_rate=0.1):
    """
    Build and compile a Transformer-based sequence autoencoder.

    Layout: kernel-size-1 Conv1D projection to embed_dim -> two Transformer
    blocks -> average pooling over the time axis (bottleneck vector) ->
    bottleneck repeated input_steps times -> two Transformer blocks ->
    time-distributed Dense back to input_dim.

    Args:
        input_steps: Number of steps per window
        input_dim: Number of features per step
        embed_dim: Internal feature width of the Transformer blocks
        num_heads: Attention heads per block
        ff_dim: Feed-forward hidden width per block
        dropout_rate: Dropout rate per block

    Returns:
        Keras Model compiled with Adam (learning rate 0.001) and 'mse' loss
    """
    block_args = (embed_dim, num_heads, ff_dim, dropout_rate)
    window_in = Input(shape=(input_steps, input_dim))

    # Per-step projection from input_dim to embed_dim
    hidden = Conv1D(filters=embed_dim, kernel_size=1, activation='relu')(window_in)

    # Encoder stack
    for _ in range(2):
        hidden = transformer_encoder(hidden, *block_args)

    # Bottleneck: collapse the time axis into a single vector
    latent = GlobalAveragePooling1D()(hidden)

    # Decoder: every step starts from the same bottleneck vector
    hidden = RepeatVector(input_steps)(latent)
    for _ in range(2):
        hidden = transformer_encoder(hidden, *block_args)

    # Map each step back to the original feature count
    reconstruction = TimeDistributed(Dense(input_dim))(hidden)

    autoencoder = Model(inputs=window_in, outputs=reconstruction)
    autoencoder.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return autoencoder
# Build and train the transformer autoencoder for anomaly detection
# Window length is FORECAST_STEPS and the feature count is taken from the
# training windows, so the model accepts the forecaster's output directly.
transformer_ae = build_transformer_autoencoder(
    FORECAST_STEPS, 
    X_ae_train.shape[2],
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    ff_dim=FF_DIM,
    dropout_rate=DROPOUT_RATE
)
# Train Transformer AE
# Input and target are the same windows (reconstruction objective).
# NOTE(review): epochs and batch_size are hard-coded here rather than taken
# from EPOCHS_LIST / BATCH_SIZES, and `shuffle` is left at the Keras default,
# unlike the forecaster training which passes shuffle=False.
es = EarlyStopping(patience=5, restore_best_weights=True)
transformer_ae.fit(
    X_ae_train, 
    X_ae_train, 
    validation_split=0.1, 
    epochs=20, 
    batch_size=128, 
    callbacks=[es], 
    verbose=1
)
# NOTE(review): saved with an .h5 extension while the forecaster is saved as
# .keras — confirm the format difference is intentional.
transformer_ae.save("best_transformer_autoencoder.h5")
print("\n✅ Transformer Autoencoder trained and saved.")

# Use the last 3000 points of the test data for simulation
# (index is reset so data and labels are addressed by position 0..N-1)
test_data_tail = test_data.tail(3000).reset_index(drop=True)
test_labels_tail = test_labels.tail(3000).reset_index(drop=True)


In [None]:


# Use the last 3000 points of the test data for simulation
# (recomputed at the top of this cell so the cell can run on its own;
# the index is reset so data and labels are addressed by position)
test_data_tail = test_data.tail(3000).reset_index(drop=True)
test_labels_tail = test_labels.tail(3000).reset_index(drop=True)

# ========================
# 9. REAL-TIME SIMULATION WITH MANUAL THRESHOLD CONTROL
# ========================
# Import necessary libraries for ROC curve analysis
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Function to find optimal threshold using ROC curve
def find_optimal_threshold_roc(y_true, reconstruction_errors):
    """
    Pick the anomaly threshold that maximizes Youden's J statistic.

    The ROC curve is computed with the reconstruction errors as scores; the
    returned threshold is the candidate at which TPR - FPR is largest (the
    first such candidate if several tie). The chosen threshold, the AUC and
    the TPR/FPR at that point are printed.

    Args:
        y_true: Binary ground-truth labels, one per score
        reconstruction_errors: Anomaly scores (higher = more anomalous)

    Returns:
        The selected threshold, taken from the thresholds returned by roc_curve
    """
    false_pos_rate, true_pos_rate, candidates = roc_curve(y_true, reconstruction_errors)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    # Youden's J = TPR - FPR; argmax returns the first maximum
    best = np.argmax(true_pos_rate - false_pos_rate)
    optimal_threshold = candidates[best]

    print(f"✅ ROC analysis - Optimal threshold: {optimal_threshold:.5f}")
    print(f"✅ ROC analysis - AUC: {area_under_curve:.5f}")
    print(f"✅ ROC analysis - At optimal threshold: TPR={true_pos_rate[best]:.5f}, FPR={false_pos_rate[best]:.5f}")

    return optimal_threshold

# Function to detect anomalies with manual threshold
def detect_anomalies(reconstruction_errors, threshold=None, percentile=None, y_true=None):
    """
    Flag reconstruction errors that exceed a threshold.

    The threshold is chosen from the first option that is not None:
    1. threshold  - used as given
    2. percentile - that percentile of all errors
    3. y_true     - ROC-optimal threshold from find_optimal_threshold_roc
    4. otherwise  - the 99.9th percentile of all errors

    Args:
        reconstruction_errors: Either one array of errors or a list of arrays
            (one per batch); a list is concatenated for threshold selection
        threshold: Manual threshold value
        percentile: Percentile (0-100) of the errors to use as threshold
        y_true: Ground-truth labels aligned with the flattened errors

    Returns:
        Tuple (anomaly_flags, used_threshold). anomaly_flags has the same
        structure as the input (array, or list of arrays) and is True where
        the error is strictly greater than used_threshold.
    """
    is_batched = isinstance(reconstruction_errors, list)
    if is_batched:
        flat_errors = np.concatenate(reconstruction_errors)
    else:
        flat_errors = reconstruction_errors

    # Resolve the threshold in priority order
    if threshold is not None:
        used_threshold = threshold
        print(f"Using manually specified threshold: {used_threshold:.5f}")
    elif percentile is not None:
        used_threshold = np.percentile(flat_errors, percentile)
        print(f"Using {percentile}th percentile threshold: {used_threshold:.5f}")
    elif y_true is not None:
        used_threshold = find_optimal_threshold_roc(y_true, flat_errors)
        print(f"Using ROC optimal threshold: {used_threshold:.5f}")
    else:
        used_threshold = np.percentile(flat_errors, 99.9)
        print(f"Using default 99.9th percentile threshold: {used_threshold:.5f}")

    # Flag errors, preserving the batch structure of the input
    if is_batched:
        return [chunk > used_threshold for chunk in reconstruction_errors], used_threshold
    return reconstruction_errors > used_threshold, used_threshold

# Create sequences for simulation
simulation_X, simulation_y = create_sequences(test_data_tail.values, INPUT_STEPS, FORECAST_STEPS)
# Create corresponding labels for evaluation
simulation_labels = create_sequence_labels(test_labels_tail.values, INPUT_STEPS, FORECAST_STEPS)

# Per-batch results, appended in processing order
forecast_list = []
reconstruction_list = []
reconstruction_errors = []
true_windows = []

# Process all samples without overlapping
# (i.e. each sequence is processed in exactly one batch of up to step_size
# sequences; the sequences themselves are stride-1 windows and still overlap
# in time)
step_size = WINDOW_SIZE_SIMULATION  # Use the window size as step size to avoid overlap
for i in range(0, len(simulation_X), step_size):
    # Get the current window batch (up to step_size samples)
    window_X = simulation_X[i:i+step_size]
    window_y_true = simulation_y[i:i+step_size]
    
    # Defensive guard; i is always < len(simulation_X), so the slice is non-empty
    if len(window_X) == 0:
        continue
    
    # Use LSTM Seq2Seq for forecasting
    y_pred_future = best_model.predict(window_X, batch_size=128, verbose=1)
    X_forecast = y_pred_future  # No need to expand dims as we're processing batches
    
    # Use Transformer Autoencoder for anomaly detection
    y_reconstructed = transformer_ae.predict(X_forecast, batch_size=128, verbose=1)
    
    # Calculate reconstruction error for each sample in the batch
    # (mean squared difference between the FORECAST and its reconstruction,
    # averaged over steps and features; window_y_true is not used here)
    batch_reconstruction_errors = np.mean((y_pred_future - y_reconstructed)**2, axis=(1, 2))
    
    # Store predictions and errors
    forecast_list.append(y_pred_future)
    reconstruction_list.append(y_reconstructed)
    reconstruction_errors.append(batch_reconstruction_errors)
    true_windows.append(window_y_true)

print("\n✅ Real-time simulation complete.")

# Flatten the reconstruction errors list for threshold calculation
# (one error per simulated sequence, in processing order)
all_reconstruction_errors = np.concatenate(reconstruction_errors)

# Make sure we're only considering windows where we have both predictions and labels
min_length = min(len(simulation_labels), len(all_reconstruction_errors))
true_labels_subset = simulation_labels[:min_length]
errors_subset = all_reconstruction_errors[:min_length]

# Calculate optimal threshold using ROC curve analysis
# NOTE(review): the threshold is tuned on simulation_labels, i.e. on the same
# labelled data the detector is evaluated on afterwards.
optimal_threshold = find_optimal_threshold_roc(true_labels_subset, errors_subset)

# Calculate optimal percentile
# (percentage of all errors that lie strictly below the ROC threshold)
optimal_percentile = 100 * (1 - np.mean(np.where(all_reconstruction_errors >= optimal_threshold, 1, 0)))
print(f"✅ Optimal percentile corresponding to ROC threshold: {optimal_percentile:.5f}th percentile")

# Store optimal values to be used for manual control
OPTIMAL_THRESHOLD = optimal_threshold
OPTIMAL_PERCENTILE = optimal_percentile

# Function to evaluate anomaly detection performance
def evaluate_anomaly_detection(y_true, y_pred):
    """
    Compute and print precision, recall and F1 for binary anomaly flags.

    All three scores are computed with zero_division=0.

    Args:
        y_true: Ground-truth binary labels
        y_pred: Predicted binary anomaly flags, aligned with y_true

    Returns:
        Tuple (precision, recall, f1)
    """
    scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        scorer(y_true, y_pred, zero_division=0) for scorer in scorers
    )

    print("\n📈 Anomaly Detection Evaluation:")
    print(f"Precision: {precision:.5f}")
    print(f"Recall:    {recall:.5f}")
    print(f"F1 Score:  {f1:.5f}")

    return precision, recall, f1

# Example of using the manual threshold control
# You can set your own threshold value or percentile
# (MANUAL_THRESHOLD takes priority over MANUAL_PERCENTILE if both are set)
MANUAL_THRESHOLD = None  # Set to a float value to use a specific threshold
MANUAL_PERCENTILE = None  # Set to a percentile value (0-100) to use a percentile-based threshold

# If both are None, it will use the optimal threshold from ROC curve
# reconstruction_errors is the per-batch list, so anomaly_flags comes back as
# a list of boolean arrays with the same batch structure.
# NOTE(review): y_true is the same label set used for the evaluation below, so
# with the ROC option the reported scores are optimistic.
anomaly_flags, used_threshold = detect_anomalies(
    reconstruction_errors,
    threshold=MANUAL_THRESHOLD,
    percentile=MANUAL_PERCENTILE,
    y_true=true_labels_subset
)

# Apply threshold to get anomaly flags (flattened)
all_detected = np.concatenate([flags for flags in anomaly_flags])
# Truncate flags and labels to a common length before scoring
min_length = min(len(true_labels_subset), len(all_detected))
all_detected_subset = all_detected[:min_length]

# Evaluate anomaly detection performance using the current threshold
precision, recall, f1 = evaluate_anomaly_detection(true_labels_subset[:min_length], all_detected_subset)

# ========================
# 10. VISUALIZATION WITH MANUAL THRESHOLD
# ========================
def plot_anomaly_detection(errors, labels, threshold, detected_anomalies=None):
    """
    Plot reconstruction errors with the threshold and anomaly markers.

    Draws the error curve, a dashed horizontal threshold line, optional red
    markers at detected anomalies (placed on the curve) and green stars for
    true anomalies (placed at 90% of the maximum plotted error). Errors and
    labels are truncated to their common length before plotting.

    Args:
        errors: Array of reconstruction errors, one per sample
        labels: Array of ground-truth labels; entries equal to 1 are plotted
        threshold: Threshold value drawn as a horizontal line
        detected_anomalies: Optional array of flags; entries equal to 1
            (or True) are plotted as detected

    Returns:
        The matplotlib.pyplot module, with the new figure as current figure
    """
    plt.figure(figsize=(14,5))

    # Only plot the range for which both errors and labels exist
    n_points = min(len(errors), len(labels))
    shown_errors = errors[:n_points]
    shown_labels = labels[:n_points]

    # Error curve and threshold line
    plt.plot(shown_errors, label='Reconstruction Error')
    plt.axhline(threshold, color='red', linestyle='--', label=f'Threshold: {threshold:.5f}')

    # Detected anomalies sit on the error curve
    if detected_anomalies is not None:
        flagged_at = np.where(detected_anomalies[:n_points] == 1)[0]
        plt.scatter(flagged_at,
                    shown_errors[flagged_at],
                    color='red', label='Detected Anomalies', s=10)

    # True anomalies are drawn on a fixed line near the top of the plot
    actual_at = np.where(shown_labels == 1)[0]
    if len(actual_at) == 0:
        print("No true anomalies found in the subset of data being visualized")
    else:
        plt.scatter(actual_at,
                    np.ones_like(actual_at) * np.max(shown_errors)*0.9,
                    color='green', marker='*', label='True Anomalies', s=20)

    plt.title("Reconstruction Errors vs True Anomalies")
    plt.xlabel("Sample Index")
    plt.ylabel("Reconstruction Error")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    return plt

# Example of visualizing results with the current threshold
# plot_anomaly_detection returns the pyplot module, so savefig below writes
# the current figure; it is saved before plt.show() is called.
plt_fig = plot_anomaly_detection(
    all_reconstruction_errors, 
    true_labels_subset, 
    used_threshold, 
    all_detected
)
plt_fig.savefig("reconstruction_errors_with_threshold.png")
plt.show()

# Summary of the ROC-derived values and a reminder of the manual controls
print("\n✅ Analysis complete.")
print(f"✅ Optimal threshold from ROC curve: {OPTIMAL_THRESHOLD:.5f}")
print(f"✅ Optimal percentile: {OPTIMAL_PERCENTILE:.5f}th")
print("\nTo use manual threshold control:")
print("1. Set MANUAL_THRESHOLD to a specific value, or")
print("2. Set MANUAL_PERCENTILE to use a percentile-based threshold")
print("3. If both are None, the optimal ROC threshold will be used")