In [None]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import (
    accuracy_score,
    auc,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout
from tensorflow.keras.models import Model, Sequential, load_model

from google.colab import drive

# Make Google Drive available inside the Colab runtime
drive.mount('/content/drive')

# Read the system statistics CSV into a DataFrame
df = pd.read_csv('drive/MyDrive/Colab Notebooks/dataset/system_stats.csv')

# Quick overview: dimensions first, then a preview, summary statistics and
# the per-column count of missing values
print("Dataset shape: {}".format(df.shape))
for heading, table in (
    ("\nFirst 5 rows of the dataset:", df.head()),
    ("\nStatistical summary:", df.describe()),
    ("\nMissing values:", df.isnull().sum()),
):
    print(heading)
    display(table)

In [None]:
# Visualize the distribution of every numerical feature.
# The subplot grid is sized from the number of features: a fixed 4x4 grid
# raises ValueError as soon as there are more than 16 numerical columns.
numeric_view = df.select_dtypes(include=['float64', 'int64'])
features = numeric_view.columns
n_cols = 4
n_rows = max(1, math.ceil(len(features) / n_cols))
plt.figure(figsize=(18, 3 * n_rows))
for i, feature in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(df[feature], kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()  # a single layout pass once every panel exists
plt.show()

# Time series plot of key features
plt.figure(figsize=(15, 8))
for feature in features[:5]:  # Plot first 5 features
    plt.plot(df[feature], label=feature)
plt.legend()
plt.title('Time Series of Key Features')
plt.xlabel('Time Steps')
plt.ylabel('Values')
plt.grid(True)
plt.show()

# Correlation matrix (upper triangle masked because the matrix is symmetric)
plt.figure(figsize=(14, 12))
corr = numeric_view.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, cmap='coolwarm',
            vmin=-1, vmax=1, fmt='.2f', linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Select numerical features for modeling
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Chronological 80/20 split on the raw values (no shuffle for time series).
# The split is done BEFORE scaling so that the scaler never sees test rows.
train_raw, test_raw = train_test_split(numerical_df, test_size=0.2,
                                       shuffle=False)

# Fit the scaler on the training portion only; fitting on the full dataset
# would leak the test set's min/max into the training features.
scaler = MinMaxScaler()
scaler.fit(train_raw)

# Apply the train-fitted scaling to every row. Test values outside the
# training range can therefore fall outside [0, 1].
scaled_data = scaler.transform(numerical_df)
scaled_df = pd.DataFrame(scaled_data, columns=numerical_df.columns)

# With shuffle=False the training rows are exactly the first len(train_raw) rows
n_train = len(train_raw)
train_data = scaled_df.iloc[:n_train].copy()
test_data = scaled_df.iloc[n_train:].copy()

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")

In [None]:
# Autoencoder input/output width = number of feature columns in the training frame
input_dim = len(train_data.columns)

# Build the autoencoder model
def build_autoencoder(input_dim):
    """
    Build and compile a symmetric dense autoencoder.

    Layer widths are fractions of ``input_dim`` (0.75, 0.5, 0.33 on the way
    down, 0.25 at the bottleneck, then mirrored on the way up). Every hidden
    width is clamped to at least 1 unit: plain ``int(input_dim * fraction)``
    truncates to 0 for small inputs (e.g. ``input_dim`` < 4), which is not a
    valid ``Dense`` size.

    Parameters:
    - input_dim: number of input features (positive integer)

    Returns:
    - Model compiled with the Adam optimizer and mean-squared-error loss;
      the output layer uses a sigmoid activation and has ``input_dim`` units

    Raises:
    - ValueError: if ``input_dim`` is smaller than 1
    """
    if input_dim < 1:
        raise ValueError(f"input_dim must be a positive integer, got {input_dim}")

    def _width(fraction):
        # Truncate like the original sizing rule, but never below one unit
        return max(1, int(input_dim * fraction))

    encoder_fractions = (0.75, 0.5, 0.33)

    # Encoder
    input_layer = Input(shape=(input_dim,))
    hidden = input_layer
    for fraction in encoder_fractions:
        hidden = Dense(_width(fraction), activation="relu")(hidden)

    # Bottleneck layer
    bottleneck = Dense(_width(0.25), activation="relu")(hidden)

    # Decoder mirrors the encoder widths in reverse order
    hidden = bottleneck
    for fraction in reversed(encoder_fractions):
        hidden = Dense(_width(fraction), activation="relu")(hidden)
    output_layer = Dense(input_dim, activation="sigmoid")(hidden)

    # Create the autoencoder model
    autoencoder = Model(inputs=input_layer, outputs=output_layer)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder

# Build the autoencoder and print its layer summary
autoencoder = build_autoencoder(input_dim)
autoencoder.summary()

# Hold out the most recent 10% of the training window for validation.
# The test set must not drive early stopping / best-weight restoration,
# otherwise the metrics later reported on that same test set are biased.
n_val = max(1, int(len(train_data) * 0.1))
ae_fit_data = train_data.iloc[:-n_val]
ae_val_data = train_data.iloc[-n_val:]

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5,
                              restore_best_weights=True)

# Train the autoencoder to reproduce its own input
history_autoencoder = autoencoder.fit(
    ae_fit_data, ae_fit_data,
    epochs=50,
    batch_size=32,
    validation_data=(ae_val_data, ae_val_data),
    callbacks=[early_stopping],
    verbose=1
)


In [None]:
# Plot training and validation loss
plt.figure(figsize=(10, 6))
plt.plot(history_autoencoder.history['loss'], label='Training Loss')
plt.plot(history_autoencoder.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Autoencoder Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

# Make predictions on test data
autoencoder_test_pred = autoencoder.predict(test_data)

# Calculate per-row reconstruction errors (averaged over the feature axis)
test_mae = np.mean(np.abs(test_data.values - autoencoder_test_pred), axis=1)
test_mse = np.mean(np.square(test_data.values - autoencoder_test_pred), axis=1)
test_rmse = np.sqrt(test_mse)

# Calculate overall metrics
overall_mae = mean_absolute_error(test_data, autoencoder_test_pred)
overall_mse = mean_squared_error(test_data, autoencoder_test_pred)
overall_rmse = math.sqrt(overall_mse)

print(f"Autoencoder Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {overall_mae:.4f}")
print(f"Mean Squared Error (MSE): {overall_mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {overall_rmse:.4f}")

# Anomaly threshold: 95th percentile of the per-row reconstruction error.
# Computed once so the plotted line and the flagging rule cannot drift apart.
# NOTE(review): the percentile is taken over the test rows themselves, so
# roughly 5% of them are flagged by construction - confirm this is intended.
threshold = np.percentile(test_mae, 95)

# Visualize the reconstruction error
plt.figure(figsize=(12, 6))
plt.hist(test_mae, bins=50, alpha=0.75)
plt.axvline(x=threshold, color='r', linestyle='--',
            label='95th Percentile Threshold')
plt.title('Distribution of Reconstruction Errors (MAE)')
plt.xlabel('MAE')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

# Visualize actual vs reconstructed for a sample of features
plt.figure(figsize=(16, 12))
for i, col in enumerate(numerical_df.columns[:6]):  # Plot first 6 features
    plt.subplot(2, 3, i+1)
    plt.scatter(test_data.iloc[:, i], autoencoder_test_pred[:, i], alpha=0.5)
    plt.plot([0, 1], [0, 1], 'r--')  # Perfect reconstruction line
    plt.title(f'Actual vs Reconstructed: {col}')
    plt.xlabel('Actual')
    plt.ylabel('Reconstructed')
    plt.grid(True)
plt.tight_layout()
plt.show()

print(f"Anomaly threshold: {threshold:.4f}")

# Flag anomalies
anomalies = test_mae > threshold
print(f"Number of anomalies detected: {np.sum(anomalies)}")
print(f"Percentage of anomalies: {np.sum(anomalies) / len(test_mae) * 100:.2f}%")

# Visualize anomalies over time for a key feature.
# Normal and anomalous points are drawn as two labelled series: passing a list
# of names to legend() on a single scatter only labels one handle, so the
# 'Anomaly' entry never appeared.
time_steps = np.arange(len(test_data))
first_feature = test_data.iloc[:, 0].values
plt.figure(figsize=(15, 6))
plt.scatter(time_steps[~anomalies], first_feature[~anomalies],
            c='blue', alpha=0.5, label='Normal')
plt.scatter(time_steps[anomalies], first_feature[anomalies],
            c='red', alpha=0.5, label='Anomaly')
plt.title(f'Anomaly Detection for {test_data.columns[0]}')
plt.xlabel('Time Steps')
plt.ylabel('Normalized Value')
plt.legend()
plt.grid(True)
plt.show()

# Save the autoencoder model and its threshold; create the target directory
# first so a fresh Drive without a models/ folder does not fail the save.
model_dir = 'drive/MyDrive/Colab Notebooks/models'
os.makedirs(model_dir, exist_ok=True)
autoencoder.save(f'{model_dir}/autoencoder_model.h5')
np.save(f'{model_dir}/anomaly_threshold.npy', threshold)


In [None]:
# Pseudo-labels for the LSTM: 1 where the autoencoder flagged a row, 0 otherwise.
# They stand in for ground truth; a real scenario would use actual labeled data.
y_pseudo = np.where(anomalies, 1, 0)

# LSTM requires sequence data
def create_sequences(data, seq_length, labels=None):
    """
    Turn a table of time-ordered rows into overlapping windows for the LSTM.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` and is paired with the
    label of row ``i + seq_length`` (the step right after the window).

    Parameters:
    - data: DataFrame (or 2-D array) of features, one row per time step
    - seq_length: number of consecutive rows per window (>= 1)
    - labels: per-row labels aligned with ``data``; when omitted, the
      notebook-level ``y_pseudo`` array is used (original behaviour)

    Returns:
    - (X, y): X has shape (n_windows, seq_length, n_features) and y has shape
      (n_windows,), where n_windows = len(data) - seq_length; both are empty
      when the data is not longer than ``seq_length``

    Raises:
    - ValueError: if ``seq_length`` < 1 or ``labels`` and ``data`` differ in length
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    if labels is None:
        labels = y_pseudo  # fall back to the notebook's pseudo-labels
    labels = np.asarray(labels)

    # Pull the underlying array once instead of slicing the frame per window
    values = np.asarray(data)
    if len(labels) != len(values):
        raise ValueError(
            f"labels ({len(labels)}) and data ({len(values)}) must have the same length"
        )

    n_windows = max(0, len(values) - seq_length)
    xs = [values[start:start + seq_length] for start in range(n_windows)]
    ys = [labels[start + seq_length] for start in range(n_windows)]
    return np.array(xs), np.array(ys)

# Each LSTM sample looks back this many time steps
seq_length = 10

# Window the scaled test rows; the frame is re-indexed from 0 before windowing
X_seq, y_seq = create_sequences(test_data.reset_index(drop=True), seq_length)

print("LSTM input shape: {}".format(X_seq.shape))
print("LSTM output shape: {}".format(y_seq.shape))

# Seeded random 80/20 split of the windows into training and validation sets.
# NOTE(review): consecutive windows overlap in all but one row, so a shuffled
# split puts near-duplicate samples on both sides - validation scores are
# likely optimistic; consider a chronological split.
X_train, X_val, y_train, y_val = train_test_split(
    X_seq,
    y_seq,
    test_size=0.2,
    random_state=42,
)

# Build LSTM model
def build_lstm_model(seq_length, n_features):
    """
    Assemble and compile the stacked-LSTM binary classifier.

    Parameters:
    - seq_length: number of time steps in each input window
    - n_features: number of features per time step

    Returns:
    - Sequential model: LSTM(64, returning sequences) -> Dropout(0.2) ->
      LSTM(32) -> Dropout(0.2) -> Dense(16, relu) -> Dense(1, sigmoid),
      compiled with Adam, binary cross-entropy loss and the accuracy metric
    """
    layer_stack = [
        LSTM(64, input_shape=(seq_length, n_features), return_sequences=True),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layer_stack)

    compile_options = {
        'optimizer': 'adam',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
    }
    classifier.compile(**compile_options)
    return classifier

# Instantiate the LSTM classifier; the feature count is read from axis 2 of
# the training windows
lstm_model = build_lstm_model(seq_length, X_train.shape[2])
lstm_model.summary()

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Fit on the training windows, monitoring the held-out validation windows
lstm_fit_options = dict(
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1,
)
history_lstm = lstm_model.fit(X_train, y_train, **lstm_fit_options)


In [None]:
# Plot training and validation metrics
plt.figure(figsize=(15, 6))

plt.subplot(1, 2, 1)
plt.plot(history_lstm.history['accuracy'], label='Training Accuracy')
plt.plot(history_lstm.history['val_accuracy'], label='Validation Accuracy')
plt.title('LSTM Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(history_lstm.history['loss'], label='Training Loss')
plt.plot(history_lstm.history['val_loss'], label='Validation Loss')
plt.title('LSTM Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# Evaluate LSTM model.
# predict() returns one column per output unit; flatten it so predictions
# have the same 1-D shape as y_val.
y_pred_proba = lstm_model.predict(X_val).ravel()
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate metrics (zero_division=0 keeps the scores defined when a class
# is never predicted)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, zero_division=0)
recall = recall_score(y_val, y_pred, zero_division=0)
f1 = f1_score(y_val, y_pred, zero_division=0)

print(f"LSTM Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Plot confusion matrix. labels=[0, 1] pins the matrix to 2x2 even when one
# class is absent from both y_val and y_pred.
class_names = ['Normal', 'Anomaly']
cm = confusion_matrix(y_val, y_pred, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Plot ROC curve - only defined when both classes occur in the validation labels
if np.unique(y_val).size < 2:
    print("ROC curve skipped: validation labels contain a single class.")
else:
    fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

# Save the LSTM model; make sure the target directory exists first
lstm_model_dir = 'drive/MyDrive/Colab Notebooks/models'
os.makedirs(lstm_model_dir, exist_ok=True)
lstm_model.save(f'{lstm_model_dir}/lstm_model.h5')


In [None]:
# Function for real-time anomaly detection
def detect_anomalies_realtime(new_data, autoencoder, threshold, 
                             lstm_model=None, sequence_buffer=None, 
                             seq_length=None, scaler=None):
    """
    Detect anomalies in real-time data
    
    The autoencoder check always runs. The LSTM check runs in addition once
    ``sequence_buffer`` holds ``seq_length`` scaled points. The buffer is
    updated on EVERY call (when an LSTM model, a buffer and a positive
    ``seq_length`` are all supplied), so a buffer that starts empty fills up
    by itself; previously it was only appended to after it was already full,
    which meant an empty buffer never enabled the LSTM.
    
    Parameters:
    - new_data: raw new data point (1-D array-like of feature values)
    - autoencoder: trained autoencoder model (needs ``predict(x, verbose=0)``)
    - threshold: anomaly threshold on the mean absolute reconstruction error
    - lstm_model: trained LSTM model (optional)
    - sequence_buffer: list of recent scaled data points for the LSTM
      (optional); mutated in place and trimmed to the last ``seq_length`` items
    - seq_length: sequence length for LSTM (optional)
    - scaler: fitted scaler for data normalization (optional; when omitted the
      data is used as-is)
    
    Returns:
    - Dictionary with detection results: 'is_anomaly', 'reconstruction_error',
      'threshold', 'detection_method' and, when the LSTM ran,
      'is_anomaly_lstm' and 'lstm_confidence'
    """
    # Preprocess the data (normalize); accept lists as well as arrays
    sample = np.asarray(new_data).reshape(1, -1)
    if scaler is not None:
        new_data_scaled = scaler.transform(sample)
    else:
        new_data_scaled = sample
    
    # Get autoencoder prediction
    reconstruction = autoencoder.predict(new_data_scaled, verbose=0)
    
    # Calculate reconstruction error (mean absolute error over the features)
    reconstruction_error = np.mean(np.abs(new_data_scaled - reconstruction))
    
    # Detect anomaly with autoencoder
    is_anomaly_autoencoder = reconstruction_error > threshold
    
    result = {
        'is_anomaly': bool(is_anomaly_autoencoder),
        'reconstruction_error': float(reconstruction_error),
        'threshold': float(threshold),
        'detection_method': 'autoencoder'
    }
    
    # The LSTM stage needs a model, a buffer and a usable window length
    lstm_enabled = (lstm_model is not None and sequence_buffer is not None
                    and seq_length is not None and seq_length > 0)
    if not lstm_enabled:
        return result
    
    # Always record the new point, then keep only the most recent window
    sequence_buffer.append(new_data_scaled[0])
    if len(sequence_buffer) > seq_length:
        del sequence_buffer[:-seq_length]
    
    # Still warming up: not enough history for a full window yet
    if len(sequence_buffer) < seq_length:
        return result
    
    # Prepare sequence for LSTM: shape (1, seq_length, n_features)
    # NOTE(review): create_sequences in this notebook labels each training
    # window with the step AFTER the window, while this window ends at the
    # current point - confirm which step the LSTM score is meant to describe.
    lstm_input = np.array([list(sequence_buffer)])
    
    # Get LSTM prediction
    lstm_prediction = lstm_model.predict(lstm_input, verbose=0)
    lstm_score = float(lstm_prediction[0][0])
    is_anomaly_lstm = lstm_score > 0.5
    
    # Update result with LSTM information
    result['is_anomaly_lstm'] = bool(is_anomaly_lstm)
    result['lstm_confidence'] = lstm_score
    result['detection_method'] = 'ensemble'
    
    # Final decision (can be customized): flag if either detector fires
    result['is_anomaly'] = result['is_anomaly'] or result['is_anomaly_lstm']
    
    return result

# Walk-through of how the detector would be wired into a polling loop.
# This cell only prints the instructions; it does not run them.
usage_lines = (
    "\nReal-time Anomaly Detection Example:",
    "1. Load the saved models:",
    "   autoencoder = load_model('drive/MyDrive/Colab Notebooks/models/autoencoder_model.h5')",
    "   lstm_model = load_model('drive/MyDrive/Colab Notebooks/models/lstm_model.h5')",
    "   threshold = np.load('drive/MyDrive/Colab Notebooks/models/anomaly_threshold.npy')",
    "\n2. Initialize a buffer for storing recent data points:",
    "   sequence_buffer = []",
    "\n3. Every 15 seconds, fetch new data and detect anomalies:",
    "   while True:",
    "       new_data = fetch_new_data()  # Your function to get fresh data",
    "       result = detect_anomalies_realtime(new_data, autoencoder, threshold,",
    "                                         lstm_model, sequence_buffer, seq_length, scaler)",
    "       if result['is_anomaly']:",
    "           send_alert(result)",
    "       time.sleep(15)",
)
for usage_line in usage_lines:
    print(usage_line)
