In [None]:
!pip install tensorflow matplotlib numpy scikit-learn seaborn nltk opencv-python

In [None]:
# Use Autoencoder to implement anomaly detection. Build the model by using:
# a. Import required libraries
# b. Upload / access the dataset
# c. Encoder converts it into latent representation
# d. Decoder networks convert it back to the original input
# e. Compile the models with Optimizer, Loss, and Evaluation Metrics

In [1]:
"""
AUTOENCODER FOR ANOMALY DETECTION - SYNTHETIC ECG DATASET
==========================================================
Fixed implementation with synthetic ECG data to avoid URL issues
"""

# =============================================================================
# a. IMPORT REQUIRED LIBRARIES
# =============================================================================

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report, 
                           roc_curve, auc, precision_recall_curve)
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("✅ All required libraries imported successfully!")

# =============================================================================
# b. UPLOAD / ACCESS THE DATASET - SYNTHETIC ECG DATA
# =============================================================================

def generate_synthetic_ecg_data(n_samples=5000, sequence_length=140, noise_level=0.1):
    """
    Generate synthetic ECG-like data for demonstration
    """
    print("Generating synthetic ECG data...")
    
    t = np.linspace(0, 4*np.pi, sequence_length)
    
    # Generate different types of ECG patterns
    data = []
    labels = []
    
    for i in range(n_samples):
        # Class 0: Normal sinus rhythm (majority class)
        if i < 4000:  # 80% normal
            # Normal ECG pattern
            ecg = (np.sin(t) + 
                   0.5 * np.sin(2*t) + 
                   0.2 * np.sin(3*t) +
                   0.1 * np.random.normal(0, noise_level, sequence_length))
            label = 0
        
        # Class 1-4: Various anomalies (20% total)
        elif i < 4250:  # 5% - PVC (Premature Ventricular Contraction)
            ecg = (0.7 * np.sin(1.5*t) + 
                   0.3 * np.sin(3*t) +
                   0.4 * np.random.normal(0, noise_level*2, sequence_length))
            label = 1
        
        elif i < 4500:  # 5% - Tachycardia
            ecg = (1.2 * np.sin(1.8*t) + 
                   0.3 * np.sin(2*t) +
                   0.1 * np.random.normal(0, noise_level, sequence_length))
            label = 2
        
        elif i < 4750:  # 5% - Bradycardia
            ecg = (0.6 * np.sin(0.8*t) + 
                   0.2 * np.sin(1.5*t) +
                   0.1 * np.random.normal(0, noise_level, sequence_length))
            label = 3
        
        else:  # 5% - Atrial Fibrillation
            ecg = (0.8 * np.sin(t + 0.5*np.sin(0.5*t)) + 
                   0.4 * np.random.normal(0, noise_level*3, sequence_length))
            label = 4
        
        data.append(ecg)
        labels.append(label)
    
    data = np.array(data)
    labels = np.array(labels)
    
    # Normalize data
    data = (data - data.min()) / (data.max() - data.min())
    
    return data, labels

def load_and_preprocess_data():
    """
    Load and preprocess the synthetic ECG dataset for anomaly detection
    """
    print("\n" + "="*60)
    print("STAGE b: DATASET LOADING AND PREPROCESSING")
    print("="*60)
    
    # Generate synthetic ECG data
    data, labels = generate_synthetic_ecg_data()
    
    print(f"Dataset shape: {data.shape}")
    print(f"Labels shape: {labels.shape}")
    
    # Display dataset information
    print("\nDataset Information:")
    print(f"Number of samples: {len(data)}")
    print(f"Time steps per sample: {data.shape[1]}")
    
    # Label distribution
    unique_labels, counts = np.unique(labels, return_counts=True)
    label_names = ['Normal', 'PVC', 'Tachycardia', 'Bradycardia', 'AFib']
    
    print("\nLabel Distribution:")
    for label, count in zip(unique_labels, counts):
        print(f"Class {label} ({label_names[label]}): {count} samples ({count/len(labels)*100:.1f}%)")
    
    # Prepare data for anomaly detection
    # Class 0: Normal heartbeats, Classes 1-4: Anomalous heartbeats
    normal_data = data[labels == 0]  # Normal samples
    anomalous_data = data[labels != 0]  # Anomalous samples
    
    print(f"\nNormal samples (Class 0): {len(normal_data)}")
    print(f"Anomalous samples (Classes 1-4): {len(anomalous_data)}")
    
    # Normalize the data (already normalized, but ensure proper scaling)
    scaler = MinMaxScaler()
    normal_data_scaled = scaler.fit_transform(normal_data)
    anomalous_data_scaled = scaler.transform(anomalous_data)
    all_data_scaled = scaler.transform(data)
    
    # Split normal data for training and validation
    train_data, val_data = train_test_split(
        normal_data_scaled, 
        test_size=0.2, 
        random_state=42
    )
    
    # Create test set with both normal and anomalous samples
    test_data = np.vstack([normal_data_scaled, anomalous_data_scaled])
    test_labels = np.array([0] * len(normal_data_scaled) + [1] * len(anomalous_data_scaled))
    
    print(f"\nData splits:")
    print(f"Training data (normal only): {train_data.shape}")
    print(f"Validation data (normal only): {val_data.shape}")
    print(f"Test data: {test_data.shape}")
    print(f"Test labels - Normal: {sum(test_labels == 0)}, Anomalous: {sum(test_labels == 1)}")
    
    return (train_data, val_data, test_data, test_labels, 
            normal_data_scaled, anomalous_data_scaled, scaler, data.shape[1], label_names)

# =============================================================================
# c. ENCODER NETWORK - LATENT REPRESENTATION
# =============================================================================

def build_encoder(input_dim, latent_dim=32):
    """
    Build encoder network that converts input to latent representation
    """
    print("\n" + "="*60)
    print("STAGE c: BUILDING ENCODER NETWORK")
    print("="*60)
    
    encoder = models.Sequential([
        layers.Input(shape=(input_dim,), name='encoder_input'),
        
        # First encoding layer
        layers.Dense(128, activation='relu', name='encoder_dense1'),
        layers.BatchNormalization(name='encoder_bn1'),
        layers.Dropout(0.2, name='encoder_dropout1'),
        
        # Second encoding layer
        layers.Dense(64, activation='relu', name='encoder_dense2'),
        layers.BatchNormalization(name='encoder_bn2'),
        layers.Dropout(0.2, name='encoder_dropout2'),
        
        # Third encoding layer
        layers.Dense(32, activation='relu', name='encoder_dense3'),
        layers.BatchNormalization(name='encoder_bn3'),
        
        # Latent space representation
        layers.Dense(latent_dim, activation='relu', name='latent_space')
    ], name='Encoder')
    
    print("Encoder Architecture:")
    encoder.summary()
    
    return encoder

# =============================================================================
# d. DECODER NETWORK - RECONSTRUCTION
# =============================================================================

def build_decoder(output_dim, latent_dim=32):
    """
    Build decoder network that converts latent representation back to original input
    """
    print("\n" + "="*60)
    print("STAGE d: BUILDING DECODER NETWORK")
    print("="*60)
    
    decoder = models.Sequential([
        layers.Input(shape=(latent_dim,), name='decoder_input'),
        
        # First decoding layer
        layers.Dense(32, activation='relu', name='decoder_dense1'),
        layers.BatchNormalization(name='decoder_bn1'),
        layers.Dropout(0.2, name='decoder_dropout1'),
        
        # Second decoding layer
        layers.Dense(64, activation='relu', name='decoder_dense2'),
        layers.BatchNormalization(name='decoder_bn2'),
        layers.Dropout(0.2, name='decoder_dropout2'),
        
        # Third decoding layer
        layers.Dense(128, activation='relu', name='decoder_dense3'),
        layers.BatchNormalization(name='decoder_bn3'),
        
        # Output layer - reconstruct original input
        layers.Dense(output_dim, activation='sigmoid', name='decoder_output')
    ], name='Decoder')
    
    print("Decoder Architecture:")
    decoder.summary()
    
    return decoder

# =============================================================================
# e. COMPILE MODELS WITH OPTIMIZER, LOSS, AND EVALUATION METRICS
# =============================================================================

def build_and_compile_autoencoder(encoder, decoder, input_dim):
    """
    Build and compile the complete autoencoder model
    """
    print("\n" + "="*60)
    print("STAGE e: BUILDING AND COMPILING AUTOENCODER")
    print("="*60)
    
    # Create autoencoder model
    autoencoder = models.Sequential([
        encoder,
        decoder
    ], name='Autoencoder')
    
    # Compile the model
    autoencoder.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',  # Mean Squared Error for reconstruction
        metrics=['mae']  # Mean Absolute Error as additional metric
    )
    
    print("Autoencoder Architecture:")
    autoencoder.summary()
    
    print("\nModel Compilation Details:")
    print(f"Optimizer: Adam (learning_rate=0.001)")
    print(f"Loss Function: Mean Squared Error (MSE)")
    print(f"Metrics: Mean Absolute Error (MAE)")
    
    return autoencoder

# =============================================================================
# MODEL TRAINING AND EVALUATION
# =============================================================================

def train_autoencoder(autoencoder, train_data, val_data):
    """
    Train the autoencoder model
    """
    print("\n" + "="*60)
    print("TRAINING AUTOENCODER")
    print("="*60)
    
    # Define callbacks
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            min_lr=1e-7,
            verbose=1
        )
    ]
    
    # Training parameters
    batch_size = 32
    epochs = 100
    
    print(f"Training Parameters:")
    print(f"Batch size: {batch_size}")
    print(f"Epochs: {epochs}")
    print(f"Training samples: {len(train_data)}")
    print(f"Validation samples: {len(val_data)}")
    
    # Train the model
    history = autoencoder.fit(
        train_data, train_data,  # Autoencoder: input = target
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(val_data, val_data),
        callbacks=callbacks,
        verbose=1,
        shuffle=True
    )
    
    print("Training completed!")
    return history

def evaluate_anomaly_detection(autoencoder, test_data, test_labels, threshold=None):
    """
    Evaluate the autoencoder for anomaly detection
    """
    print("\n" + "="*60)
    print("ANOMALY DETECTION EVALUATION")
    print("="*60)
    
    # Get reconstructions
    reconstructions = autoencoder.predict(test_data, verbose=0)
    
    # Calculate reconstruction error (MSE per sample)
    reconstruction_errors = np.mean(np.square(test_data - reconstructions), axis=1)
    
    # Determine optimal threshold if not provided
    if threshold is None:
        # Use 95th percentile of training reconstruction errors as threshold
        normal_indices = test_labels == 0
        normal_errors = reconstruction_errors[normal_indices]
        threshold = np.percentile(normal_errors, 95)
    
    # Make predictions (1 = anomaly, 0 = normal)
    predictions = (reconstruction_errors > threshold).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)
    
    print("Anomaly Detection Results:")
    print(f"Threshold: {threshold:.6f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    
    # Classification report
    print("\nClassification Report:")
    print(classification_report(test_labels, predictions, 
                              target_names=['Normal', 'Anomaly']))
    
    # Confusion matrix
    cm = confusion_matrix(test_labels, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Anomaly'],
                yticklabels=['Normal', 'Anomaly'])
    plt.title('Confusion Matrix - Anomaly Detection', fontsize=14, fontweight='bold')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig('anomaly_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Plot reconstruction error distribution
    plt.figure(figsize=(10, 6))
    normal_errors = reconstruction_errors[test_labels == 0]
    anomalous_errors = reconstruction_errors[test_labels == 1]
    
    plt.hist(normal_errors, bins=50, alpha=0.7, label='Normal', color='blue')
    plt.hist(anomalous_errors, bins=50, alpha=0.7, label='Anomalous', color='red')
    plt.axvline(threshold, color='black', linestyle='--', label=f'Threshold: {threshold:.4f}')
    plt.xlabel('Reconstruction Error (MSE)')
    plt.ylabel('Frequency')
    plt.title('Distribution of Reconstruction Errors')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('error_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    return reconstruction_errors, predictions, threshold, {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'threshold': threshold
    }

def plot_training_history(history):
    """Plot training history"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot loss
    ax1.plot(history.history['loss'], label='Training Loss', linewidth=2)
    ax1.plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
    ax1.set_title('Autoencoder Training Loss', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss (MSE)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # Plot MAE
    ax2.plot(history.history['mae'], label='Training MAE', linewidth=2)
    ax2.plot(history.history['val_mae'], label='Validation MAE', linewidth=2)
    ax2.set_title('Autoencoder Training MAE', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('MAE')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('autoencoder_training_history.png', dpi=300, bbox_inches='tight')
    plt.show()

def visualize_reconstructions(autoencoder, normal_data, anomalous_data, num_samples=5):
    """Visualize original vs reconstructed signals"""
    # Select samples
    normal_samples = normal_data[:num_samples]
    anomalous_samples = anomalous_data[:num_samples]
    
    # Get reconstructions
    normal_recon = autoencoder.predict(normal_samples, verbose=0)
    anomalous_recon = autoencoder.predict(anomalous_samples, verbose=0)
    
    # Create subplots
    fig, axes = plt.subplots(2, num_samples, figsize=(15, 6))
    
    # Plot normal samples
    for i in range(num_samples):
        if num_samples > 1:
            ax = axes[0, i]
        else:
            ax = axes[0]
        ax.plot(normal_samples[i], 'b-', label='Original', linewidth=2)
        ax.plot(normal_recon[i], 'r--', label='Reconstructed', linewidth=2)
        ax.set_title(f'Normal Sample {i+1}')
        ax.legend()
        ax.grid(True, alpha=0.3)
    
    # Plot anomalous samples
    for i in range(num_samples):
        if num_samples > 1:
            ax = axes[1, i]
        else:
            ax = axes[1]
        ax.plot(anomalous_samples[i], 'b-', label='Original', linewidth=2)
        ax.plot(anomalous_recon[i], 'r--', label='Reconstructed', linewidth=2)
        ax.set_title(f'Anomalous Sample {i+1}')
        ax.legend()
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('reconstruction_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

def visualize_latent_space(encoder, test_data, test_labels):
    """Visualize the latent space representation"""
    # Get latent representations
    latent_representations = encoder.predict(test_data, verbose=0)
    
    # If latent dimension > 2, use PCA to reduce to 2D
    if latent_representations.shape[1] > 2:
        from sklearn.decomposition import PCA
        pca = PCA(n_components=2)
        latent_2d = pca.fit_transform(latent_representations)
        print(f"Explained variance by 2 principal components: {pca.explained_variance_ratio_.sum():.3f}")
    else:
        latent_2d = latent_representations
    
    # Plot latent space
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(latent_2d[:, 0], latent_2d[:, 1], 
                         c=test_labels, cmap='viridis', alpha=0.7)
    plt.colorbar(scatter, label='Anomaly (0=Normal, 1=Anomaly)')
    plt.xlabel('Latent Dimension 1')
    plt.ylabel('Latent Dimension 2')
    plt.title('Latent Space Representation')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('latent_space.png', dpi=300, bbox_inches='tight')
    plt.show()

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """
    Main function to execute the complete autoencoder anomaly detection pipeline
    """
    print("AUTOENCODER ANOMALY DETECTION - SYNTHETIC ECG DATASET")
    print("="*60)
    
    try:
        # b. Load and preprocess dataset
        (train_data, val_data, test_data, test_labels, 
         normal_data, anomalous_data, scaler, input_dim, label_names) = load_and_preprocess_data()
        
        # c. Build encoder
        latent_dim = 16
        encoder = build_encoder(input_dim, latent_dim)
        
        # d. Build decoder
        decoder = build_decoder(input_dim, latent_dim)
        
        # e. Build and compile autoencoder
        autoencoder = build_and_compile_autoencoder(encoder, decoder, input_dim)
        
        # Train autoencoder
        history = train_autoencoder(autoencoder, train_data, val_data)
        
        # Plot training history
        plot_training_history(history)
        
        # Evaluate anomaly detection
        reconstruction_errors, predictions, threshold, metrics = evaluate_anomaly_detection(
            autoencoder, test_data, test_labels
        )
        
        # Visualize reconstructions
        visualize_reconstructions(autoencoder, normal_data, anomalous_data)
        
        # Visualize latent space
        visualize_latent_space(encoder, test_data, test_labels)
        
        # Save models
        autoencoder.save('autoencoder_anomaly_detection.h5')
        encoder.save('encoder_model.h5')
        decoder.save('decoder_model.h5')
        
        print("\n" + "="*60)
        print("FINAL SUMMARY")
        print("="*60)
        print(f"Final Test Accuracy: {metrics['accuracy']:.4f}")
        print(f"Final F1-Score: {metrics['f1']:.4f}")
        print(f"Optimal Threshold: {metrics['threshold']:.6f}")
        print(f"Models saved: autoencoder_anomaly_detection.h5, encoder_model.h5, decoder_model.h5")
        print("Anomaly detection pipeline completed successfully!")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

✅ All required libraries imported successfully!
AUTOENCODER ANOMALY DETECTION - SYNTHETIC ECG DATASET

STAGE b: DATASET LOADING AND PREPROCESSING
Generating synthetic ECG data...
Dataset shape: (5000, 140)
Labels shape: (5000,)

Dataset Information:
Number of samples: 5000
Time steps per sample: 140

Label Distribution:
Class 0 (Normal): 4000 samples (80.0%)
Class 1 (PVC): 250 samples (5.0%)
Class 2 (Tachycardia): 250 samples (5.0%)
Class 3 (Bradycardia): 250 samples (5.0%)
Class 4 (AFib): 250 samples (5.0%)

Normal samples (Class 0): 4000
Anomalous samples (Classes 1-4): 1000

Data splits:
Training data (normal only): (3200, 140)
Validation data (normal only): (800, 140)
Test data: (5000, 140)
Test labels - Normal: 4000, Anomalous: 1000

STAGE c: BUILDING ENCODER NETWORK
Encoder Architecture:



STAGE d: BUILDING DECODER NETWORK
Decoder Architecture:



STAGE e: BUILDING AND COMPILING AUTOENCODER
Autoencoder Architecture:



Model Compilation Details:
Optimizer: Adam (learning_rate=0.001)
Loss Function: Mean Squared Error (MSE)
Metrics: Mean Absolute Error (MAE)

TRAINING AUTOENCODER
Training Parameters:
Batch size: 32
Epochs: 100
Training samples: 3200
Validation samples: 800
Epoch 1/100
[1m 53/100[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 4ms/step - loss: 0.0511 - mae: 0.1813


KeyboardInterrupt


KeyboardInterrupt

