In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive
import datetime

# Mount Google Drive so the dataset and the output locations are reachable
drive.mount('/content/drive')

# Column names for the stats file, which is read without a header row.
# Layout: timestamp, overall CPU usage, five (process name, usage) pairs for
# CPU, overall memory usage, five (process name, usage) pairs for memory, and
# two nginx metrics.
header = [
    "timestamp",
    "cpu_usage",
    "top_1_cpu_proc_name",
    "top_1_cpu_proc_usage",
    "top_2_cpu_proc_name",
    "top_2_cpu_proc_usage",
    "top_3_cpu_proc_name",
    "top_3_cpu_proc_usage",
    "top_4_cpu_proc_name",
    "top_4_cpu_proc_usage",
    "top_5_cpu_proc_name",
    "top_5_cpu_proc_usage",
    "mem_usage",
    "top_1_mem_proc_name",
    "top_1_mem_proc_usage",
    "top_2_mem_proc_name",
    "top_2_mem_proc_usage",
    "top_3_mem_proc_name",
    "top_3_mem_proc_usage",
    "top_4_mem_proc_name",
    "top_4_mem_proc_usage",
    "top_5_mem_proc_name",
    "top_5_mem_proc_usage",
    "nginx_active_connections",
    "nginx_rps"
]

# Load the semicolon-delimited dataset.
# The path is absolute (rooted at the mount point above) so the load does not
# depend on the notebook's current working directory, matching the absolute
# paths used for every file written later in this notebook.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/dataset/system_stats.csv",
    header=None,          # the file has no header row; names come from `header`
    names=header,
    sep=';',
    low_memory=False,     # parse the file in one pass instead of in chunks
    on_bad_lines='skip'   # malformed lines are dropped instead of raising
)

# Display basic information.
# display() is required here: a bare `df.head()` in the middle of a cell is
# evaluated and discarded, so the preview would never be rendered.
print(f"Dataset shape: {df.shape}")
display(df.head())

# Handle timestamp parsing
# Normalise the raw strings first: replace the 'T' separator with a space and
# keep only the text before the first '.' (drops a fractional-seconds suffix).
df['timestamp'] = df['timestamp'].str.replace('T', ' ', regex=False)
df['timestamp'] = df['timestamp'].str.split('.').str[0]

# Try the strict, fast format first; fall back to lenient parsing only when
# the strict parse rejects the data. Catch just the parsing errors so that
# unrelated failures (and KeyboardInterrupt) are not silently swallowed.
try:
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')
except (ValueError, TypeError):
    print("Warning: Some timestamp parsing failed, using a more flexible approach")
    # errors='coerce' turns unparseable values into NaT instead of raising
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Drop rows with invalid timestamps.
# The index is rebuilt as 0..n-1 afterwards: later steps combine `df` with
# positional arrays (model predictions) and with frames that carry a default
# RangeIndex, so gaps left by dropped rows would misalign those results.
df = df.dropna(subset=['timestamp']).reset_index(drop=True)
print(f"Dataset shape after cleaning timestamps: {df.shape}")

# Derive calendar features from the parsed timestamp column
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['day_of_week'] = timestamp_parts.dayofweek
df['day_of_month'] = timestamp_parts.day

# Metric columns that must be numeric: overall usage, the five per-process
# usage columns for CPU and memory, and the two nginx counters.
numeric_cols = (
    ['cpu_usage']
    + [f'top_{rank}_cpu_proc_usage' for rank in range(1, 6)]
    + ['mem_usage']
    + [f'top_{rank}_mem_proc_usage' for rank in range(1, 6)]
    + ['nginx_active_connections', 'nginx_rps']
)

# Convert each metric column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') and are handled by the fill step below.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Model inputs: every numeric metric plus the derived calendar features.
# Process-name columns are intentionally left out.
numerical_features = numeric_cols + ['hour', 'day_of_week', 'day_of_month']

# These are the features we'll use for training (independent copy of the list)
features = list(numerical_features)

# Report missing values per feature, then replace them with 0
missing_counts = df[features].isna().sum()
print(f"Missing values:\n{missing_counts}")
df[features] = df[features].fillna(0)

# Normalize the data to the scaler's default [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# the held-out rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()

# Keep df's row labels on the scaled frame. Without `index=df.index` the frame
# gets a fresh 0..n-1 index, and any Series derived from it would be aligned
# by label against `df`, misplacing values whenever rows were dropped earlier.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[features]),
    columns=features,
    index=df.index,
)

# Split the data: 80% for training, 20% held out (fixed seed for repeatability)
X_train, X_test = train_test_split(df_scaled, test_size=0.2, random_state=42)

# Build the dense autoencoder: input -> 64 -> 32 -> 16 -> 32 -> 64 -> output
input_dim = X_train.shape[1]  # one input unit per feature column

input_layer = Input(shape=(input_dim,))

# Encoder: progressively narrower ReLU layers, each followed by 20% dropout
encoder = input_layer
for units in (64, 32):
    encoder = Dense(units, activation='relu')(encoder)
    encoder = Dropout(0.2)(encoder)

# Bottleneck: the compressed 16-unit representation
bottleneck = Dense(16, activation='relu')(encoder)

# Decoder: mirror image of the encoder
decoder = bottleneck
for units in (32, 64):
    decoder = Dense(units, activation='relu')(decoder)
    decoder = Dropout(0.2)(decoder)

# Output layer: same width as the input; sigmoid keeps values in (0, 1)
output_layer = Dense(input_dim, activation='sigmoid')(decoder)

# Assemble, compile with mean-squared reconstruction loss, and summarise
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

# Train the autoencoder to reproduce its own input.
# Training stops once val_loss has not improved for 5 epochs, and the weights
# from the best epoch are restored.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, X_test),
    callbacks=[early_stopping],
    verbose=1,
)

# Plot training and validation loss per epoch
plt.figure(figsize=(12, 4))
loss_curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))
for history_key, curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.title('Autoencoder Training History')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

# Get reconstruction error on the entire dataset
reconstructed = autoencoder.predict(df_scaled)

# Per-row mean squared error across all features, computed on plain arrays.
# Keeping `mse` as an ndarray (rather than a Series carrying df_scaled's
# index) makes the assignment below positional: row i of the error lands on
# row i of `df`, even if df's index has gaps from previously dropped rows.
mse = np.mean(np.square(df_scaled.to_numpy() - reconstructed), axis=1)

# Add reconstruction error to original dataframe (same row order as df_scaled)
df['reconstruction_error'] = mse

# Determine anomaly threshold (you can adjust this based on your needs)
# Here we use mean + 2*std of the reconstruction error as threshold
threshold = np.mean(mse) + 2 * np.std(mse)
print(f"Anomaly threshold: {threshold}")

# Flag rows whose reconstruction error exceeds the threshold
df['is_anomaly'] = df['reconstruction_error'] > threshold
anomaly_count = df['is_anomaly'].sum()
print(f"Number of anomalies detected: {anomaly_count} ({anomaly_count/len(df)*100:.2f}%)")

# Plot reconstruction error against time, with the anomaly threshold drawn as
# a horizontal line and the area above it shaded.
timestamps = df['timestamp']
error_series = df['reconstruction_error']
above_threshold = error_series > threshold

plt.figure(figsize=(15, 6))
plt.plot(timestamps, error_series, label='Reconstruction Error')
plt.axhline(
    y=threshold,
    color='r',
    linestyle='-',
    label=f'Threshold ({threshold:.4f})',
)
plt.fill_between(
    timestamps,
    threshold,
    error_series,
    where=above_threshold,
    color='red',
    alpha=0.3,
)
plt.title('Reconstruction Error Over Time')
plt.xlabel('Timestamp')
plt.ylabel('Reconstruction Error (MSE)')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Show the ten flagged rows with the largest reconstruction error
flagged_rows = df[df['is_anomaly']]
ranked_rows = flagged_rows.sort_values('reconstruction_error', ascending=False)
top_anomalies = ranked_rows.iloc[:10]
print("Top 10 anomalies by reconstruction error:")
summary_columns = ['timestamp', 'cpu_usage', 'mem_usage', 'nginx_rps', 'reconstruction_error']
display(top_anomalies[summary_columns])

# Feature contribution to anomalies: average squared reconstruction error per
# feature, computed over the flagged rows only.
if anomaly_count > 0:
    plt.figure(figsize=(15, 8))
    # Row *positions* of the anomalies. Index labels must not be used here:
    # `.iloc[...]` and the `reconstructed` ndarray are both positional, so
    # labels would select the wrong rows (or raise IndexError) whenever df's
    # index is not exactly 0..n-1. df, df_scaled and reconstructed share the
    # same row order, so positions taken from df are valid for all three.
    anomaly_indices = np.flatnonzero(df['is_anomaly'].to_numpy())
    feature_errors = np.power(
        df_scaled.iloc[anomaly_indices].values - reconstructed[anomaly_indices],
        2,
    )
    feature_error_df = pd.DataFrame(feature_errors, columns=features)
    # Largest average error first
    feature_contribution = feature_error_df.mean().sort_values(ascending=False)

    plt.bar(feature_contribution.index, feature_contribution.values)
    plt.title('Feature Contribution to Anomalies')
    plt.xlabel('Features')
    plt.ylabel('Average Squared Error')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

# Compare the distributions of normal vs anomalous rows for the six features
# with the highest average reconstruction error (one histogram per feature).
if anomaly_count > 0:
    top_features = feature_contribution.head(6).index

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()  # 2x3 grid -> flat sequence of six axes
    anomaly_palette = {True: 'red', False: 'blue'}

    for panel_idx, (panel_ax, feature_name) in enumerate(zip(axes, top_features)):
        sns.histplot(
            data=df,
            x=feature_name,
            hue='is_anomaly',
            bins=30,
            ax=panel_ax,
            palette=anomaly_palette,
            alpha=0.5,
            legend=(panel_idx == 0),  # draw the legend on the first panel only
        )
        panel_ax.set_title(f'Distribution of {feature_name}')

    plt.tight_layout()
    plt.show()

# Save the trained autoencoder to Google Drive
autoencoder.save('/content/drive/MyDrive/Colab Notebooks/system_anomaly_detector.h5')
print("Model saved to Google Drive")

# Build the export frame: all original columns plus reconstruction_error and
# is_anomaly, without the temporary calendar columns added for analysis.
columns_to_drop = ['hour', 'day_of_week', 'day_of_month']
results_df = df.drop(columns=columns_to_drop)

# Convert the flag to the strings "True"/"False" for readability in the CSV.
# This is done on the export copy only: `df['is_anomaly']` stays boolean, so
# cells that use it as a row mask (e.g. `df[df['is_anomaly']]`) keep working
# when they are re-run after this cell.
results_df['is_anomaly'] = results_df['is_anomaly'].astype(str)

# Save to CSV
results_df.to_csv('/content/drive/MyDrive/Colab Notebooks/system_anomaly_full_results.csv', index=False)
print("Full results with original features and anomaly flags saved to Google Drive")

# Also save a focused version with just the anomalies
anomaly_df = results_df[results_df['is_anomaly'] == 'True']
anomaly_df.to_csv('/content/drive/MyDrive/Colab Notebooks/system_anomalies.csv', index=False)
print(f"Saved {len(anomaly_df)} anomalies to a separate file")


In [None]:
# Building on our existing autoencoder code
# This would be implemented after the autoencoder section

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed

# Create sequences from your time series data
def create_sequences(data, seq_length):
    """Slice *data* into overlapping windows of ``seq_length`` consecutive rows.

    Window ``i`` is ``data[i:i + seq_length]``. Exactly
    ``len(data) - seq_length`` windows are produced, i.e. the window that
    would end on the final row is not emitted; callers in this notebook rely
    on that count, so it is kept.

    Parameters:
    - data: array-like whose first axis is time (rows in chronological order)
    - seq_length: number of consecutive rows per window; must be >= 1

    Returns:
    - ndarray of shape ``(len(data) - seq_length, seq_length, *data.shape[1:])``.
      When *data* has ``seq_length`` rows or fewer, the result is an empty
      array with the same trailing shape (first dimension 0), so downstream
      shape-dependent code still sees a consistently shaped array.

    Raises:
    - ValueError: if ``seq_length`` is smaller than 1
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    data = np.asarray(data)
    n_windows = len(data) - seq_length
    if n_windows <= 0:
        # Not enough rows for a single window: return a correctly shaped
        # empty array instead of a shapeless (0,) one.
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + seq_length] for start in range(n_windows)])

# Define sequence length (window size)
seq_length = 24  # For example, 24 data points (could be 24 hours if hourly data)

# Use the same features from our autoencoder
lstm_features = features.copy()

# Sort by timestamp to ensure proper sequence.
# A stable sort is used because timestamps were truncated to whole seconds
# during cleaning, so ties are possible; with the default (unstable) sort the
# relative order of tied rows — and therefore the windows — is not guaranteed.
df_sorted = df.sort_values('timestamp', kind='mergesort')
sequence_frame = df_sorted[lstm_features]
sequence_data = sequence_frame.values

# Scale the data using the same scaler as the autoencoder.
# The scaler was fitted on a DataFrame, so it is given a DataFrame with the
# same columns here; passing the bare ndarray makes scikit-learn emit a
# "X does not have valid feature names" warning on every call.
sequence_data_scaled = scaler.transform(sequence_frame)

# Create overlapping windows of seq_length consecutive rows
X_seq = create_sequences(sequence_data_scaled, seq_length)

# Chronological split: first 80% of the windows for training, the rest held out.
# NOTE(review): windows near the boundary share rows across the two sets.
train_size = int(len(X_seq) * 0.8)
X_train_seq = X_seq[:train_size]
X_test_seq = X_seq[train_size:]

# Build the LSTM autoencoder layer by layer
n_lstm_features = len(lstm_features)
lstm_autoencoder = Sequential()

# Encoder: compress each (seq_length, n_features) window into a single
# 64-dimensional vector (only the final LSTM state is returned).
# NOTE(review): 'relu' overrides the LSTM default activation (tanh) — confirm
# this is intentional.
lstm_autoencoder.add(
    LSTM(
        64,
        activation='relu',
        input_shape=(seq_length, n_lstm_features),
        return_sequences=False,
    )
)

# Representation: repeat the encoded vector once per timestep
lstm_autoencoder.add(RepeatVector(seq_length))

# Decoder: unroll back to a sequence, then map every timestep to the
# original number of features
lstm_autoencoder.add(LSTM(64, activation='relu', return_sequences=True))
lstm_autoencoder.add(TimeDistributed(Dense(n_lstm_features)))

lstm_autoencoder.compile(optimizer='adam', loss='mse')

# Train the LSTM autoencoder to reproduce its input windows; stop when
# val_loss has not improved for 5 epochs and restore the best weights.
lstm_early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
lstm_history = lstm_autoencoder.fit(
    X_train_seq,
    X_train_seq,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_seq, X_test_seq),
    callbacks=[lstm_early_stopping],
    verbose=1,
)

# Reconstruct the held-out windows
X_test_pred = lstm_autoencoder.predict(X_test_seq)

# One MSE per window: average the squared residuals over time and features
squared_residuals = np.square(X_test_seq - X_test_pred)
lstm_mse = squared_residuals.mean(axis=(1, 2))

# Threshold: mean + 2 standard deviations of the held-out window errors
lstm_threshold = lstm_mse.mean() + 2 * lstm_mse.std()
print(f"LSTM Anomaly threshold: {lstm_threshold}")

# Function to detect anomalies in new data
def detect_anomalies_with_lstm(new_data, model, scaler, threshold, seq_length, features):
    """
    Detect anomalies in new time series data using the trained LSTM autoencoder

    The feature columns are scaled, cut into overlapping windows with
    ``create_sequences``, reconstructed by the model, and scored with the mean
    squared error over each window. Every score is attached to the LAST row
    of the window it was computed from, so a flagged row is always part of
    the data that produced its score.

    Parameters:
    - new_data: DataFrame with new system metrics; rows are used in the order
      given (presumably chronological — sort before calling if unsure)
    - model: Trained LSTM autoencoder model
    - scaler: Fitted scaler
    - threshold: Anomaly threshold
    - seq_length: Sequence length used for training
    - features: List of feature names

    Returns:
    - DataFrame with the columns of ``new_data`` plus
      ``lstm_reconstruction_error`` and ``lstm_is_anomaly``. Normally it holds
      one row per scored window. If there are too few rows to build any
      window, all input rows are returned with a NaN error and a False flag,
      so the result always has the same columns.
    """
    sequences = ()
    if len(new_data) > 0:  # an empty frame cannot be passed to the scaler
        # Prepare the data
        new_data_scaled = scaler.transform(new_data[features].values)

        # Create sequences
        sequences = create_sequences(new_data_scaled, seq_length)

    if len(sequences) == 0:
        print("Warning: Not enough data points for sequence creation")
        result = new_data.copy()
        result['lstm_reconstruction_error'] = np.nan
        result['lstm_is_anomaly'] = False
        return result

    # Get predictions
    reconstructions = model.predict(sequences)

    # One error per window: mean over timesteps and features
    mse = np.mean(np.square(sequences - reconstructions), axis=(1, 2))

    # Window i covers rows i .. i + seq_length - 1, so its score belongs to
    # row i + seq_length - 1. Slicing by len(mse) keeps the row count equal
    # to the number of windows actually produced.
    last_row_of_first_window = seq_length - 1
    result = new_data.iloc[
        last_row_of_first_window:last_row_of_first_window + len(mse)
    ].copy()
    result['lstm_reconstruction_error'] = mse
    result['lstm_is_anomaly'] = result['lstm_reconstruction_error'] > threshold

    return result

# Persist the trained LSTM autoencoder to Google Drive for later reuse
lstm_model_path = '/content/drive/MyDrive/Colab Notebooks/system_lstm_anomaly_detector.h5'
lstm_autoencoder.save(lstm_model_path)
print("LSTM model saved to Google Drive")
