In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

def extract_metric_values(df, metric_type):
    """Collapse one metric DataFrame into a single 1-D array of values.

    Every column except the bookkeeping ones ('Time', 'Minutes', 'source')
    is treated as a value column. How several value columns are combined
    depends on ``metric_type``:

    - 'tcp' / 'srtt': row-wise sum, with NaN replaced by 0 first.
    - 'memory': row-wise sum.
    - 'disk': the first value column when the frame looks like latency data
      (it has a 'BlockLatency' column, or a value column whose name contains
      'latency', case-insensitively); otherwise the row-wise sum.
    - 'cpu': row-wise mean.
    - anything else: the first value column.

    For 'tcp', 'srtt', 'memory' and 'cpu', a frame with exactly one value
    column returns that column as-is.

    Parameters:
    - df: DataFrame holding one metric for one experiment
    - metric_type: one of the type tags listed above

    Returns:
    - NumPy array with one entry per row of ``df``
    """
    bookkeeping = ['Time', 'Minutes', 'source']
    value_cols = [name for name in df.columns if name not in bookkeeping]
    has_single_column = len(value_cols) == 1

    if metric_type in ('tcp', 'srtt'):
        if has_single_column:
            return df[value_cols[0]].values
        # Several connection columns: missing readings count as zero.
        return df[value_cols].fillna(0).sum(axis=1).values

    if metric_type == 'memory':
        if has_single_column:
            return df[value_cols[0]].values
        return df[value_cols].sum(axis=1).values

    if metric_type == 'disk':
        looks_like_latency = 'BlockLatency' in df.columns or any(
            'latency' in name.lower() for name in value_cols
        )
        if looks_like_latency:
            # Latency: only the first device is used.
            return df[value_cols[0]].values
        # Throughput / byte counters: total across devices.
        return df[value_cols].sum(axis=1).values

    if metric_type == 'cpu':
        if has_single_column:
            return df[value_cols[0]].values
        return df[value_cols].mean(axis=1).values

    # Unknown metric type: fall back to the first value column.
    return df[value_cols[0]].values

def create_failure_labels(minutes, delay_minutes=30, duration_minutes=50, failure_threshold=0.3):
    """
    Create failure labels based on experiment timeline
    - 0: Normal (baseline + before stress)
    - 1: Failure (during stress + some buffer after)

    Parameters:
    - minutes: Array-like of elapsed minutes, one entry per sample
      (list, NumPy array or pandas Series)
    - delay_minutes: Minute at which the stress period starts
    - duration_minutes: Length of the stress period in minutes
    - failure_threshold: Fraction of ``duration_minutes`` appended after the
      stress period as a recovery buffer that is still labelled as failure
      (despite its name it is not a probability threshold)

    Returns:
    - float NumPy array of 0.0 / 1.0, same length as ``minutes``
    """
    # Coerce so that plain Python lists work too; comparing a list with a
    # number would raise TypeError.
    minutes = np.asarray(minutes)

    stress_start = delay_minutes
    stress_end = delay_minutes + duration_minutes
    failure_buffer = duration_minutes * failure_threshold

    # Both ends of the window are inclusive.
    failure_mask = (minutes >= stress_start) & (minutes <= stress_end + failure_buffer)

    return failure_mask.astype(float)

def prepare_dataset_for_rnn(all_datasets, tuna_results=None, use_cleaned=True,
                           delay_minutes=30, duration_minutes=50,
                           sequence_length=10, test_size=0.2):
    """
    Prepare comprehensive dataset for RNN training across all metrics

    Every (metric, experiment) pair contributes one column to the feature
    matrix. All columns are truncated to the shortest series, standardised,
    cut into sliding windows of ``sequence_length`` rows, and split into
    stratified train/test sets. The label of a window is the label of the
    row that immediately follows it.

    Parameters:
    - all_datasets: Dictionary {metric_name: {experiment_name: DataFrame}};
      each DataFrame must have a 'Minutes' column
    - tuna_results: Optional TUNA cleaned results, read as
      tuna_results[metric_name][experiment_name]['cleaned']
    - use_cleaned: Whether to use TUNA cleaned data when available
    - delay_minutes: When stress starts
    - duration_minutes: How long stress lasts
    - sequence_length: RNN sequence length (time steps)
    - test_size: Fraction for test set

    Returns:
    - dict with 'X_train', 'X_test', 'y_train', 'y_test', the fitted
      'scaler', 'feature_names' (one name per feature-matrix column, in
      column order) and a 'metadata' dict ('sequence_length', 'n_features',
      'total_samples', 'failure_rate')

    Raises:
    - ValueError: if no experiment is found, or if the shortest series is
      not longer than ``sequence_length``
    """

    print("🔄 Preparing comprehensive dataset for RNN training...")

    all_features = []
    all_labels = []
    feature_names = []

    # Define metric types for proper value extraction
    metric_types = {
        # TCP metrics
        'ApiGateway': 'tcp', 'CustomersService': 'tcp', 'VetsService': 'tcp',
        'VisitsService': 'tcp', 'SRTT': 'srtt', 'Network traffic': 'srtt',

        # Memory metrics
        'MemAvailable': 'memory', 'MemCache': 'memory', 'MemUtil': 'memory',
        'Memory Utilization': 'memory', 'Memory Cache': 'memory',
        'Memory Available': 'memory', 'Memory Usage': 'memory',

        # Disk metrics
        'BlockLatency': 'disk', 'ReadBytes': 'disk', 'WriteBytes': 'disk',
        'DiskUtil': 'disk', 'IOPS': 'disk', 'ThroughputRead': 'disk', 'ThroughputWrite': 'disk',

        # CPU metrics
        'CPUUtil': 'cpu', 'CPU Utilization': 'cpu', 'LoadAvg': 'cpu', 'CPU Usage': 'cpu'
    }

    for metric_name, experiments in all_datasets.items():
        metric_type = metric_types.get(metric_name, 'unknown')
        print(f"  📊 Processing {metric_name} ({metric_type})...")

        for experiment_name, experiment_df in experiments.items():
            print(f"    🔍 {experiment_name}...")

            # Extract values (use TUNA cleaned if available and requested)
            if (use_cleaned and tuna_results and metric_name in tuna_results and
                    experiment_name in tuna_results[metric_name]):
                # Coerce so lists / Series behave like the extracted arrays.
                values = np.asarray(tuna_results[metric_name][experiment_name]['cleaned'])
            else:
                values = extract_metric_values(experiment_df, metric_type)

            # Get minutes for labeling
            minutes = experiment_df['Minutes'].values

            # Ensure same length
            n_rows = min(len(values), len(minutes))
            values = values[:n_rows]
            minutes = minutes[:n_rows]

            # Create labels (0=normal, 1=failure)
            if experiment_name == 'baseline':
                labels = np.zeros(len(values))  # All normal for baseline
            else:
                labels = create_failure_labels(minutes, delay_minutes, duration_minutes)

            all_features.append(values)
            all_labels.append(labels)
            # One name per column, so names stay aligned with the feature
            # matrix even when a metric has several experiments.
            feature_names.append(f"{metric_name}_{metric_type}_{experiment_name}")

    print(f"✅ Collected data from {len(all_datasets)} metrics")

    if not all_features:
        raise ValueError("all_datasets contains no experiments; nothing to prepare")

    # Combine all features into sequences
    print("🔄 Creating sequences for RNN...")

    # Find minimum length across all features
    min_length = min(len(feat) for feat in all_features)
    print(f"  📏 Minimum sequence length: {min_length}")

    if min_length <= sequence_length:
        raise ValueError(
            f"Shortest series has {min_length} rows; need more than "
            f"sequence_length={sequence_length} to build any sequence"
        )

    # Truncate all to same length and create feature matrix
    feature_matrix = np.column_stack([feat[:min_length] for feat in all_features])

    # A row is labelled as failure when any column marks it as failure.
    # Taking only the first column's labels gave an all-normal target
    # whenever 'baseline' happened to be the first experiment iterated.
    label_vector = np.vstack([lab[:min_length] for lab in all_labels]).max(axis=0)

    # Normalize features
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split, so test statistics influence the scaling.
    scaler = StandardScaler()
    feature_matrix_scaled = scaler.fit_transform(feature_matrix)

    print(f"  📊 Feature matrix shape: {feature_matrix_scaled.shape}")
    print(f"  🎯 Label distribution: {np.bincount(label_vector.astype(int))}")

    # Create sequences for LSTM: the window covers rows [end - L, end) and
    # is paired with the label of row `end`.
    window_ends = range(sequence_length, len(feature_matrix_scaled))
    X_sequences = np.array(
        [feature_matrix_scaled[end - sequence_length:end] for end in window_ends]
    )
    y_sequences = np.array(label_vector[sequence_length:])

    print(f"  📦 Sequence shapes: X={X_sequences.shape}, y={y_sequences.shape}")

    # Split into train/test
    # NOTE(review): neighbouring windows overlap, so a shuffled split places
    # near-duplicate windows on both sides of the split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sequences, y_sequences, test_size=test_size,
        random_state=42, stratify=y_sequences
    )

    print(f"  🔄 Train set: {X_train.shape}, Test set: {X_test.shape}")
    print(f"  🎯 Train labels: {np.bincount(y_train.astype(int))}")
    print(f"  🎯 Test labels: {np.bincount(y_test.astype(int))}")

    return {
        'X_train': X_train, 'X_test': X_test,
        'y_train': y_train, 'y_test': y_test,
        'scaler': scaler,
        'feature_names': feature_names,
        'metadata': {
            'sequence_length': sequence_length,
            # Taken from the matrix itself so it always matches the model input.
            'n_features': feature_matrix.shape[1],
            'total_samples': len(X_sequences),
            'failure_rate': np.mean(y_sequences)
        }
    }

def build_failure_detection_rnn(input_shape, learning_rate=0.001):
    """
    Build RNN model for failure detection

    Two stacked LSTM layers (64 and 32 units), each followed by batch
    normalisation and dropout, then two small dense layers and a single
    sigmoid output for binary classification. The model is compiled with
    Adam and binary cross-entropy.

    Parameters:
    - input_shape: (sequence_length, n_features)
    - learning_rate: Learning rate for optimizer

    Returns:
    - Compiled Keras Sequential model tracking accuracy, precision and recall
    """
    # Imported here so the block is self-contained. Explicit metric objects
    # are used because the lowercase string aliases 'precision' / 'recall'
    # are not resolvable by every Keras version.
    from tensorflow.keras.metrics import Precision, Recall

    model = Sequential([
        # First LSTM layer
        LSTM(64, return_sequences=True, input_shape=input_shape),
        BatchNormalization(),
        Dropout(0.3),

        # Second LSTM layer
        LSTM(32, return_sequences=False),
        BatchNormalization(),
        Dropout(0.3),

        # Dense layers
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(8, activation='relu'),

        # Output layer (binary classification)
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            # Named explicitly so the history keys stay 'precision' / 'recall'.
            Precision(name='precision'),
            Recall(name='recall'),
        ]
    )

    return model

def train_failure_detection_model(data_dict, epochs=50, batch_size=32, verbose=1,
                                  validation_split=0.2):
    """
    Train the RNN failure detection model

    Parameters:
    - data_dict: dict with 'X_train', 'X_test', 'y_train', 'y_test';
      X arrays are indexed as (samples, sequence_length, n_features)
    - epochs: Maximum number of training epochs
    - batch_size: Mini-batch size
    - verbose: Keras verbosity level passed to ``fit``
    - validation_split: Fraction of the training data (taken from its tail)
      held out for early stopping and learning-rate scheduling. Pass None to
      validate on the test set instead; this makes later test metrics
      optimistic, because the checkpoint is then selected on the test set.

    Returns:
    - (model, history): the trained model and the Keras History object

    Raises:
    - ValueError: if ``validation_split`` is not None and not strictly
      between 0 and 1, or leaves either part empty
    """

    print("🚀 Training failure detection RNN...")

    # Extract data
    X_train, X_test = data_dict['X_train'], data_dict['X_test']
    y_train, y_test = data_dict['y_train'], data_dict['y_test']

    # Choose the data that drives EarlyStopping / ReduceLROnPlateau.
    if validation_split is None:
        X_fit, y_fit = X_train, y_train
        X_val, y_val = X_test, y_test
    else:
        if not 0.0 < validation_split < 1.0:
            raise ValueError("validation_split must be None or strictly between 0 and 1")
        split_at = int(len(X_train) * (1.0 - validation_split))
        if split_at == 0 or split_at == len(X_train):
            raise ValueError("Too few training samples for the requested validation_split")
        # Keep the test set untouched so the final evaluation is unbiased.
        X_fit, y_fit = X_train[:split_at], y_train[:split_at]
        X_val, y_val = X_train[split_at:], y_train[split_at:]

    # Build model
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = build_failure_detection_rnn(input_shape)

    print(f"📊 Model input shape: {input_shape}")
    print(f"🎯 Training on {len(X_fit)} samples, validating on {len(X_val)} samples")

    # Callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7)
    ]

    # Train model
    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=verbose
    )

    return model, history

def evaluate_failure_detection_model(model, data_dict, plot_results=True):
    """
    Evaluate the trained failure detection model

    Prints the AUC and a classification report for the test set and, when
    requested, draws a 2x2 figure: confusion matrix, ROC curve, predicted
    probability histograms per class, and a placeholder feature-importance
    chart (random values, not a real importance estimate).

    Parameters:
    - model: Trained model exposing ``predict(X, verbose=0)``
    - data_dict: dict with 'X_test', 'y_test' and 'feature_names'
    - plot_results: Whether to draw the evaluation figure

    Returns:
    - dict with 'auc_score' (NaN when the test labels contain a single
      class), 'predictions' (probabilities thresholded at 0.5),
      'probabilities' (raw model output) and 'true_labels'
    """

    print("📊 Evaluating failure detection model...")

    X_test, y_test = data_dict['X_test'], data_dict['y_test']

    # Predictions
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Metrics
    # ROC AUC is undefined with a single class in y_test; report NaN instead
    # of aborting the whole evaluation.
    both_classes_present = len(np.unique(y_test)) > 1
    if both_classes_present:
        auc_score = roc_auc_score(y_test, y_pred_proba)
    else:
        auc_score = float('nan')
        print("⚠️ Only one class present in test labels; AUC is undefined")

    print(f"🎯 AUC Score: {auc_score:.4f}")
    print("\n📈 Classification Report:")
    # labels=[0, 1] keeps the report aligned with the two target names even
    # when one class is absent from both y_test and y_pred.
    print(classification_report(y_test, y_pred, labels=[0, 1],
                                target_names=['Normal', 'Failure']))

    if plot_results:
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Confusion Matrix (always 2x2 thanks to the explicit labels)
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        sns.heatmap(cm, annot=True, fmt='d', ax=axes[0,0], cmap='Blues')
        axes[0,0].set_title('Confusion Matrix')
        axes[0,0].set_xlabel('Predicted')
        axes[0,0].set_ylabel('Actual')

        # ROC Curve
        if both_classes_present:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            axes[0,1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.4f})')
            axes[0,1].plot([0, 1], [0, 1], 'k--')
            axes[0,1].legend()
        else:
            axes[0,1].text(0.5, 0.5, 'ROC undefined (single class in test labels)',
                           ha='center', va='center')
        axes[0,1].set_xlabel('False Positive Rate')
        axes[0,1].set_ylabel('True Positive Rate')
        axes[0,1].set_title('ROC Curve')

        # Prediction Distribution
        axes[1,0].hist(y_pred_proba[y_test==0], alpha=0.7, label='Normal', bins=20)
        axes[1,0].hist(y_pred_proba[y_test==1], alpha=0.7, label='Failure', bins=20)
        axes[1,0].set_xlabel('Prediction Probability')
        axes[1,0].set_ylabel('Frequency')
        axes[1,0].set_title('Prediction Distribution')
        axes[1,0].legend()

        # Feature Importance (simplified)
        # These bars are random numbers; they only reserve the panel until a
        # real importance estimate is implemented.
        feature_names = data_dict['feature_names']
        feature_importance = np.random.rand(len(feature_names))  # Placeholder
        axes[1,1].barh(range(len(feature_names)), feature_importance)
        axes[1,1].set_yticks(range(len(feature_names)))
        axes[1,1].set_yticklabels(feature_names, fontsize=8)
        axes[1,1].set_xlabel('Relative Importance')
        axes[1,1].set_title('Feature Importance (Placeholder)')

        plt.tight_layout()
        plt.suptitle('Failure Detection Model Evaluation', fontsize=16, y=1.02)
        plt.show()

    return {
        'auc_score': auc_score,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'true_labels': y_test
    }

def predict_failures_realtime(model, scaler, new_data, sequence_length=10, feature_names=None):
    """
    Use trained model to predict failures on new data

    For every row index ``i >= sequence_length`` the ``sequence_length`` rows
    before ``i`` form one input window. All windows are scored in a single
    batched ``model.predict`` call.

    Parameters:
    - model: Trained RNN model
    - scaler: Fitted StandardScaler
    - new_data: New metric data (DataFrame or array). For a DataFrame, every
      column except 'Time', 'Minutes' and 'source' is used as a feature
    - sequence_length: Sequence length used in training
    - feature_names: Names of features in order (currently not used)

    Returns:
    - (predictions, probabilities): two 1-D arrays with one entry per window;
      a prediction is 1 when its probability is above 0.5. Both are empty
      when ``new_data`` has no more than ``sequence_length`` rows.
    """

    print("🔮 Predicting failures on new data...")

    # Prepare new data
    if isinstance(new_data, pd.DataFrame):
        # Extract values similar to training
        exclude_cols = ['Time', 'Minutes', 'source']
        value_cols = [col for col in new_data.columns if col not in exclude_cols]
        new_features = new_data[value_cols].values
    else:
        new_features = new_data

    # Scale features
    new_features_scaled = np.asarray(scaler.transform(new_features))

    window_ends = range(sequence_length, len(new_features_scaled))
    if not window_ends:
        # Not enough rows for even one window.
        return np.array([]), np.array([])

    # Build every window up front and score them in one call. Calling
    # predict once per window pays the framework's per-call overhead for
    # every row.
    batch = np.stack(
        [new_features_scaled[end - sequence_length:end] for end in window_ends]
    )
    probabilities = np.asarray(model.predict(batch, verbose=0))[:, 0]
    predictions = (probabilities > 0.5).astype(int)

    return predictions, probabilities

def run_comprehensive_failure_detection(all_datasets, tuna_results=None):
    """
    Run complete failure detection pipeline

    Prepares the dataset, trains the RNN, evaluates it on the test split and
    prints a short summary.

    Parameters:
    - all_datasets: Dictionary of all metric datasets
    - tuna_results: Optional TUNA cleaned results

    Returns:
    - dict with 'model', 'data_dict', 'results' (evaluation output) and
      'history' (training history)
    """

    print("🎯 COMPREHENSIVE FAILURE DETECTION SYSTEM")
    print("="*60)

    # Prepare dataset
    data_dict = prepare_dataset_for_rnn(all_datasets, tuna_results)

    # Train model
    model, history = train_failure_detection_model(data_dict)

    # Evaluate model
    results = evaluate_failure_detection_model(model, data_dict)

    print("\n✅ Model training complete!")
    print(f"📊 Final AUC Score: {results['auc_score']:.4f}")
    # 'failure_rate' is the share of samples labelled as failure in the
    # prepared dataset, not a measure of how many failures the model caught.
    print(f"🎯 Failure label rate in dataset: {data_dict['metadata']['failure_rate']:.2%}")

    return {
        'model': model,
        'data_dict': data_dict,
        'results': results,
        'history': history
    }

# Run the end-to-end pipeline. `all_datasets` and `tuna_results` are not
# defined in this cell; they are expected to exist already (presumably
# created by earlier notebook cells -- confirm before running standalone).
failure_detection_system = run_comprehensive_failure_detection(
    all_datasets=all_datasets,
    tuna_results=tuna_results,
)

# To persist the trained model for later use, uncomment:
# failure_detection_system['model'].save('failure_detection_rnn.h5')

print(
    "🎉 Failure detection system ready!",
    "💡 Use predict_failures_realtime() for real-time predictions on new data",
    sep="\n",
)