In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as scipy_stats
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load all memory datasets: the stress run lives next to the notebook,
# the baseline run lives in the shared baseline folder.
df_mem_util, df_mem_cache, df_mem_available = (
    pd.read_csv(csv_path)
    for csv_path in ("mem_util.csv", "mem_cache.csv", "mem_available.csv")
)

df_baseline_mem_util, df_baseline_mem_cache, df_baseline_mem_available = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../baseline/memory related/mem_util.csv",
        "../../baseline/memory related/mem_cache.csv",
        "../../baseline/memory related/mem_available.csv",
    )
)

# Every frame, stress run first, in the order the rest of the notebook expects.
all_dfs = [
    df_mem_util, df_mem_cache, df_mem_available,
    df_baseline_mem_util, df_baseline_mem_cache, df_baseline_mem_available
]

# Tag each frame with the run it came from and parse its timestamps.
for _label, _frames in (("CPU_STRESS", all_dfs[:3]), ("BASELINE", all_dfs[3:])):
    for _frame in _frames:
        _frame["source"] = _label
        _frame["Time"] = pd.to_datetime(_frame["Time"])

# Passed to the analysis helpers as the start and length of the stress
# window, on the same scale as the "Minutes" column.
delay = 30
duration = 50

# Synchronize test datasets with baseline timeline: one offset, derived from
# the utilization series, is applied to all three stress-run frames.
time_offset = df_baseline_mem_util["Time"].min() - df_mem_util["Time"].min()
for _frame in all_dfs[:3]:
    _frame["Time"] += time_offset

# Convert timeline to minutes elapsed since each frame's own first sample.
for df in all_dfs:
    df["Minutes"] = df["Time"].sub(df["Time"].min()).dt.total_seconds().div(60)

# STORE ORIGINAL DATASETS BEFORE CLEANING
# Copies are taken so later cleaning cannot alter these reference frames.
original_datasets = {
    metric_name: {'cpu_stress': stressed.copy(), 'baseline': reference.copy()}
    for metric_name, stressed, reference in (
        ('Memory Utilization', df_mem_util, df_baseline_mem_util),
        ('Memory Cache', df_mem_cache, df_baseline_mem_cache),
        ('Memory Available', df_mem_available, df_baseline_mem_available),
    )
}

# SIMPLE ISOLATION FOREST OUTLIER DETECTION FOR MEMORY METRICS
def remove_outliers_isolation_forest_memory(df, contamination=0.02):
    """Flag anomalous rows with an Isolation Forest and interpolate over them.

    Only the frame's own numeric columns (minus 'Time' and 'Minutes') are
    used as features; no derived features are built.

    Args:
        df: Input DataFrame. It is not modified.
        contamination: Expected outlier fraction, forwarded to IsolationForest.

    Returns:
        Tuple ``(df_clean, outlier_stats)``. ``df_clean`` is a copy of ``df``
        whose flagged rows were replaced by linear interpolation.
        ``outlier_stats`` maps each feature column to a
        ``{'count', 'percentage', 'method'}`` dict and holds an overall
        ``'_summary'`` entry.
    """
    df_clean = df.copy()
    outlier_stats = {}

    # Numeric feature columns; non-numeric ones such as 'source' are already
    # dropped by select_dtypes.
    numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                    if col not in ['Time', 'Minutes']]

    if len(numeric_cols) == 0:
        outlier_stats['_summary'] = {
            'total_outliers': 0,
            'contamination_rate': contamination,
            'features_used': 0,
            'percentage_removed': 0
        }
        return df_clean, outlier_stats

    # Use ONLY the original numeric columns - no feature engineering
    feature_matrix = df[numeric_cols].fillna(0).values

    # Without any spread in the data there is nothing to isolate, so the
    # model is skipped and zero outliers are reported.
    if not np.std(feature_matrix.flatten()) > 1e-10:
        for col in numeric_cols:
            outlier_stats[col] = {
                'count': 0,
                'percentage': 0,
                'method': 'isolation_forest_simple'
            }

        outlier_stats['_summary'] = {
            'total_outliers': 0,
            'contamination_rate': contamination,
            'features_used': len(numeric_cols),
            'percentage_removed': 0
        }
        return df_clean, outlier_stats

    # Scale features for better performance
    scaler = StandardScaler()
    feature_matrix_scaled = scaler.fit_transform(feature_matrix)

    iso_forest = IsolationForest(
        contamination=contamination,
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    )
    iso_forest.fit(feature_matrix_scaled)
    outlier_predictions = iso_forest.predict(feature_matrix_scaled)

    # predict() returns -1 for outliers and 1 for normal rows
    outlier_mask = outlier_predictions == -1
    n_outliers = int(outlier_mask.sum())
    outlier_pct = (n_outliers / len(df)) * 100

    for col in numeric_cols:
        outlier_stats[col] = {
            'count': n_outliers,
            'percentage': outlier_pct,
            'method': 'isolation_forest_simple'
        }

        # Blank the flagged rows and rebuild them from their neighbours.
        # Series.mask upcasts integer columns without writing NaN in place,
        # an implicit upcast that newer pandas versions deprecate.
        # limit_direction='both' lets a flagged first row take the nearest
        # valid value instead of falling through to the 0 fallback, which
        # would plant a false zero in the series. fillna(0) now only covers
        # a column with no valid value left at all.
        df_clean[col] = (
            df_clean[col]
            .mask(outlier_mask)
            .interpolate(method='linear', limit_direction='both')
            .fillna(0)
        )

    # Overall statistics
    outlier_stats['_summary'] = {
        'total_outliers': n_outliers,
        'contamination_rate': contamination,
        'features_used': len(numeric_cols),
        'percentage_removed': outlier_pct
    }

    return df_clean, outlier_stats

# OUTLIER CLEANING WITH TRACKING
# Both dictionaries are keyed by dataset name, then by source type
# ('cpu_stress' / 'baseline').
datasets_clean = {}
all_outlier_stats = {}

for dataset_name, dataset_pair in original_datasets.items():
    print(f"\n📊 Processing {dataset_name} with Simple Isolation Forest:")

    cleaned_by_source = datasets_clean.setdefault(dataset_name, {})
    stats_by_source = all_outlier_stats.setdefault(dataset_name, {})

    for source_type, df in dataset_pair.items():
        print(f"  🔍 {source_type.upper().replace('_', ' ')}:")

        # Clean one frame and keep both the result and its outlier report.
        df_clean, stats = remove_outliers_isolation_forest_memory(df, contamination=0.02)
        cleaned_by_source[source_type] = df_clean
        stats_by_source[source_type] = stats

        # One-line summary of what was cleaned.
        summary = stats['_summary']
        total_outliers = summary['total_outliers']
        features_used = summary['features_used']
        percentage_removed = summary['percentage_removed']
        print(f"    ✅ Cleaned {total_outliers} outliers ({percentage_removed:.1f}%) using {features_used} original features")

# ADD PHASE COLUMN
def add_phase_column(df, delay_minutes, duration_minutes):
    """Return a copy of ``df`` with a 'phase' label derived from 'Minutes'.

    Rows are 'during' from ``delay_minutes`` through
    ``delay_minutes + duration_minutes`` (both ends inclusive), 'after'
    beyond that, and 'before' otherwise. The input frame is left untouched.
    """
    labelled = df.copy()
    window_end = delay_minutes + duration_minutes

    started = labelled['Minutes'] >= delay_minutes
    finished = labelled['Minutes'] > window_end

    labelled['phase'] = 'before'
    labelled.loc[started & ~finished, 'phase'] = 'during'
    labelled.loc[finished, 'phase'] = 'after'
    return labelled

# FIXED CACHE-SPECIFIC ANALYSIS FUNCTION
def analyze_cache_properly(dataset_name, cpu_stress_df_clean, baseline_df_clean, delay_minutes, duration_minutes):
    """
    Compare cache usage of a CPU-stress run against a baseline in absolute units.

    The first numeric column (other than 'Time'/'Minutes') of the stress frame
    is taken as the cache metric. Values are treated as KB: they are divided by
    1024 for the MB plots and printed as KB. No normalization is applied.

    Args:
        dataset_name: Label used in the printed header and the figure title.
        cpu_stress_df_clean: Cleaned stress-run frame with a 'Minutes' column.
        baseline_df_clean: Cleaned baseline frame with the same metric column.
        delay_minutes: Start of the stress window on the 'Minutes' axis.
        duration_minutes: Length of the stress window.

    Returns:
        None when the stress frame has no numeric metric column. Otherwise a
        dict with per-phase statistics for both runs, per-phase t-test
        results, absolute/percentage differences and pressure-event counts.
        A six-panel figure is shown and a text report is printed.
    """

    print(f"🔍 PROPER CACHE ANALYSIS FOR {dataset_name}")
    print("=" * 60)

    # Get cache column (assuming it's the main numeric column)
    numeric_cols = [col for col in cpu_stress_df_clean.select_dtypes(include=[np.number]).columns
                    if col not in ['Time', 'Minutes']]

    if not numeric_cols:
        print("❌ No numeric cache columns found")
        return None

    cache_col = numeric_cols[0]  # Assuming main cache metric
    print(f"📊 Analyzing cache metric: {cache_col}")

    # Chronological phase order shared by the tests, plots and report.
    phases = ['before', 'during', 'after']

    # Add phase information WITHOUT normalization
    cpu_stress_df = add_phase_column(cpu_stress_df_clean.copy(), delay_minutes, duration_minutes)
    baseline_df = add_phase_column(baseline_df_clean.copy(), delay_minutes, duration_minutes)

    # Calculate cache statistics in ORIGINAL units
    cache_stats_baseline = baseline_df.groupby('phase')[cache_col].agg([
        'mean', 'std', 'min', 'max', 'count'
    ]).round(0)

    cache_stats_cpu_stress = cpu_stress_df.groupby('phase')[cache_col].agg([
        'mean', 'std', 'min', 'max', 'count'
    ]).round(0)

    # Calculate cache growth rates
    def calculate_growth_rate(df, col):
        """Calculate cache growth rate per minute"""
        df_sorted = df.sort_values('Minutes')
        df_sorted['cache_change'] = df_sorted[col].diff()
        df_sorted['time_change'] = df_sorted['Minutes'].diff()
        df_sorted['growth_rate'] = df_sorted['cache_change'] / df_sorted['time_change']
        return df_sorted

    baseline_with_growth = calculate_growth_rate(baseline_df, cache_col)
    cpu_stress_with_growth = calculate_growth_rate(cpu_stress_df, cache_col)

    # Cache pressure events (significant drops)
    def find_cache_pressure_events(df, col, threshold_kb=50000):
        """Find significant cache drops indicating memory pressure"""
        df_sorted = df.sort_values('Minutes')
        cache_drops = df_sorted[col].diff()
        pressure_events = df_sorted[cache_drops < -threshold_kb]
        return pressure_events

    baseline_pressure = find_cache_pressure_events(baseline_df, cache_col)
    cpu_stress_pressure = find_cache_pressure_events(cpu_stress_df, cache_col)

    # Statistical tests on ABSOLUTE values
    t_tests = {}
    absolute_differences = {}

    for phase in phases:
        baseline_data = baseline_df[baseline_df['phase'] == phase][cache_col].dropna()
        cpu_stress_data = cpu_stress_df[cpu_stress_df['phase'] == phase][cache_col].dropna()

        if len(baseline_data) > 1 and len(cpu_stress_data) > 1:
            t_stat, p_value = scipy_stats.ttest_ind(baseline_data, cpu_stress_data)
            t_tests[phase] = {'t': t_stat, 'p': p_value}

            # Calculate ABSOLUTE difference in KB
            baseline_mean = baseline_data.mean()
            cpu_stress_mean = cpu_stress_data.mean()
            absolute_differences[phase] = {
                'absolute_kb': cpu_stress_mean - baseline_mean,
                'percentage': ((cpu_stress_mean - baseline_mean) / baseline_mean) * 100 if baseline_mean != 0 else 0
            }
        else:
            # Too few samples in this phase: report a neutral result.
            t_tests[phase] = {'t': 0, 'p': 1.0}
            absolute_differences[phase] = {'absolute_kb': 0, 'percentage': 0}

    # PLOTTING - NO NORMALIZATION
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))

    # 1. Cache Timeline (Absolute Values)
    axes[0,0].plot(baseline_df['Minutes'], baseline_df[cache_col]/1024,
                   label='Baseline', linewidth=2, color='blue')
    axes[0,0].plot(cpu_stress_df['Minutes'], cpu_stress_df[cache_col]/1024,
                   label='CPU Stress', linewidth=2, color='red')
    axes[0,0].axvspan(delay_minutes, delay_minutes + duration_minutes,
                      color='red', alpha=0.2, label="CPU Stress Period")
    axes[0,0].set_title('Cache Usage Over Time (MB)', fontweight='bold')
    axes[0,0].set_xlabel('Minutes')
    axes[0,0].set_ylabel('Cache Size (MB)')
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3)

    # 2. Cache Growth Rates
    axes[0,1].plot(baseline_with_growth['Minutes'], baseline_with_growth['growth_rate'],
                   label='Baseline Growth', alpha=0.7, color='blue')
    axes[0,1].plot(cpu_stress_with_growth['Minutes'], cpu_stress_with_growth['growth_rate'],
                   label='CPU Stress Growth', alpha=0.7, color='red')
    axes[0,1].axhline(y=0, color='black', linestyle='-', alpha=0.5)
    axes[0,1].axvspan(delay_minutes, delay_minutes + duration_minutes,
                      color='red', alpha=0.2)
    axes[0,1].set_title('Cache Growth Rate (KB/min)', fontweight='bold')
    axes[0,1].set_xlabel('Minutes')
    axes[0,1].set_ylabel('Growth Rate (KB/min)')
    axes[0,1].legend()
    axes[0,1].grid(True, alpha=0.3)

    # 3. Cache Statistics by Phase
    x = np.arange(len(phases))
    width = 0.35

    # reindex keeps the bars in chronological order and yields NaN (an empty
    # bar) for a phase with no rows, where a .loc lookup would raise KeyError.
    baseline_means = (cache_stats_baseline['mean'].reindex(phases) / 1024).tolist()
    cpu_stress_means = (cache_stats_cpu_stress['mean'].reindex(phases) / 1024).tolist()

    axes[0,2].bar(x - width/2, baseline_means, width, label='Baseline', alpha=0.8, color='blue')
    axes[0,2].bar(x + width/2, cpu_stress_means, width, label='CPU Stress', alpha=0.8, color='red')
    axes[0,2].set_title('Average Cache Size by Phase (MB)', fontweight='bold')
    axes[0,2].set_xlabel('Phase')
    axes[0,2].set_ylabel('Average Cache (MB)')
    axes[0,2].set_xticks(x)
    axes[0,2].set_xticklabels(phases)
    axes[0,2].legend()
    axes[0,2].grid(axis='y', alpha=0.3)

    # 4. Cache Pressure Events
    if len(baseline_pressure) > 0 or len(cpu_stress_pressure) > 0:
        axes[1,0].hist([baseline_pressure['Minutes'], cpu_stress_pressure['Minutes']],
                       bins=20, alpha=0.7, label=['Baseline Pressure', 'CPU Stress Pressure'],
                       color=['blue', 'red'])
        # The legend is only meaningful when the histogram was drawn.
        axes[1,0].legend()
    axes[1,0].axvspan(delay_minutes, delay_minutes + duration_minutes,
                      color='red', alpha=0.2)
    axes[1,0].set_title('Cache Pressure Events', fontweight='bold')
    axes[1,0].set_xlabel('Minutes')
    axes[1,0].set_ylabel('Pressure Event Count')
    axes[1,0].grid(True, alpha=0.3)

    # 5. Absolute Differences
    abs_diffs = [absolute_differences[phase]['absolute_kb']/1024 for phase in phases]
    colors = ['green' if diff >= 0 else 'orange' for diff in abs_diffs]

    axes[1,1].bar(phases, abs_diffs, color=colors, alpha=0.7)
    axes[1,1].axhline(y=0, color='black', linestyle='-', alpha=0.5)
    axes[1,1].set_title('Cache Impact: CPU Stress - Baseline (MB)', fontweight='bold')
    axes[1,1].set_xlabel('Phase')
    axes[1,1].set_ylabel('Cache Difference (MB)')
    axes[1,1].grid(axis='y', alpha=0.3)

    # 6. Distribution Comparison
    baseline_all = baseline_df[cache_col] / 1024
    cpu_stress_all = cpu_stress_df[cache_col] / 1024

    axes[1,2].hist([baseline_all, cpu_stress_all], bins=30, alpha=0.7,
                   label=['Baseline', 'CPU Stress'], color=['blue', 'red'])
    axes[1,2].set_title('Cache Size Distribution (MB)', fontweight='bold')
    axes[1,2].set_xlabel('Cache Size (MB)')
    axes[1,2].set_ylabel('Frequency')
    axes[1,2].legend()
    axes[1,2].grid(True, alpha=0.3)

    plt.suptitle(f'{dataset_name}: Proper Cache Analysis (No Harmful Normalization)',
                 fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # RESULTS SUMMARY
    print("\n" + "="*80)
    print("CACHE ANALYSIS RESULTS (ABSOLUTE VALUES)")
    print("="*80)

    print(f"\n📊 BASELINE CACHE STATISTICS (KB):")
    print(cache_stats_baseline)

    print(f"\n📊 CPU STRESS CACHE STATISTICS (KB):")
    print(cache_stats_cpu_stress)

    print(f"\n🔬 STATISTICAL SIGNIFICANCE:")
    for phase, test in t_tests.items():
        sig = "✅ SIGNIFICANT" if test['p'] < 0.05 else "❌ NOT SIGNIFICANT"
        print(f"  {phase.upper():8}: t={test['t']:6.2f}, p={test['p']:8.4f} ({sig})")

    print(f"\n📈 ABSOLUTE CACHE IMPACT:")
    for phase in phases:
        abs_kb = absolute_differences[phase]['absolute_kb']
        pct = absolute_differences[phase]['percentage']
        direction = "↗️ INCREASE" if abs_kb > 0 else "↘️ DECREASE" if abs_kb < 0 else "→ NO CHANGE"
        print(f"  {phase.upper():8}: {abs_kb:+7.0f} KB ({pct:+5.1f}%) {direction}")

    print(f"\n⚠️  CACHE PRESSURE EVENTS:")
    print(f"  Baseline: {len(baseline_pressure)} events")
    print(f"  CPU Stress: {len(cpu_stress_pressure)} events")

    return {
        'cache_stats_baseline': cache_stats_baseline,
        'cache_stats_cpu_stress': cache_stats_cpu_stress,
        't_tests': t_tests,
        'absolute_differences': absolute_differences,
        'pressure_events': {
            'baseline': len(baseline_pressure),
            'cpu_stress': len(cpu_stress_pressure)
        }
    }

# NORMALIZATION FUNCTION (for non-cache metrics)
def normalize_df_memory(df, columns):
    """Return a copy of ``df`` with the given columns min-max scaled to [0, 1].

    A column whose maximum does not exceed its minimum is set to 0.
    Columns not listed are copied through unchanged, and ``df`` itself is
    not modified.
    """
    scaled = df.copy()
    for name in columns:
        series = df[name]
        lowest = series.min()
        highest = series.max()
        if not highest > lowest:
            # No spread to scale by, so the column collapses to zero.
            scaled[name] = 0
            continue
        scaled[name] = (series - lowest) / (highest - lowest)
    return scaled

# ORIGINAL ANALYSIS FUNCTION FOR NON-CACHE METRICS
def analyze_and_plot_memory_dataset_wide(dataset_name, cpu_stress_df_clean, baseline_df_clean, delay_minutes, duration_minutes):
    """Compare a min-max normalized memory metric between stress and baseline runs.

    The first numeric column (other than 'Time'/'Minutes') of the stress frame
    is used as the metric. Both cleaned frames are normalized independently,
    labelled by phase, compared with per-phase t-tests, plotted in a six-panel
    figure (including original-vs-cleaned views taken from the module-level
    ``original_datasets``) and summarized on stdout.

    Args:
        dataset_name: Key into ``original_datasets``; also used in titles.
        cpu_stress_df_clean: Cleaned stress-run frame with a 'Minutes' column.
        baseline_df_clean: Cleaned baseline frame with the same metric column.
        delay_minutes: Start of the stress window on the 'Minutes' axis.
        duration_minutes: Length of the stress window.

    Returns:
        None when no numeric metric column exists. Otherwise a dict with the
        per-phase statistics of both runs, t-test results, percentage impact,
        the combined comparison frame and the metric name.
    """

    # Get original datasets for comparison
    cpu_stress_df_original = original_datasets[dataset_name]['cpu_stress']
    baseline_df_original = original_datasets[dataset_name]['baseline']

    # Get numeric columns for memory metrics
    numeric_cols = [col for col in cpu_stress_df_clean.select_dtypes(include=[np.number]).columns
                    if col not in ['Time', 'Minutes']]

    # For memory metrics, we typically have single metrics, so we'll use the main metric
    main_metric = numeric_cols[0] if numeric_cols else None

    if main_metric is None:
        print(f"No numeric columns found for {dataset_name}")
        return None

    # Chronological phase order shared by the tests, plots and report.
    phases = ['before', 'during', 'after']

    cpu_stress_norm = normalize_df_memory(cpu_stress_df_clean, numeric_cols)
    baseline_norm = normalize_df_memory(baseline_df_clean, numeric_cols)

    # Add phase information
    cpu_stress_norm = add_phase_column(cpu_stress_norm, delay_minutes, duration_minutes)
    baseline_norm = add_phase_column(baseline_norm, delay_minutes, duration_minutes)

    # Statistical analysis (groupby returns the phases in alphabetical order)
    stats_baseline = baseline_norm.groupby('phase')[main_metric].agg(['mean', 'std', 'min', 'max'])
    stats_cpu_stress = cpu_stress_norm.groupby('phase')[main_metric].agg(['mean', 'std', 'min', 'max'])

    # Chronologically ordered views for lookups and the bar chart. A phase
    # with no rows becomes a NaN row, where a .loc lookup would raise KeyError.
    baseline_by_phase = stats_baseline.reindex(phases)
    cpu_stress_by_phase = stats_cpu_stress.reindex(phases)

    # Combined dataframe for analysis.
    # NOTE(review): the two runs are paired by row index, not by timestamp;
    # confirm both frames share the same sampling grid.
    df_combined = pd.DataFrame({
        'Baseline': baseline_norm[main_metric],
        'CPU_STRESS': cpu_stress_norm[main_metric],
        'Minutes': baseline_norm['Minutes'],
        'phase': baseline_norm['phase'],
        'difference': cpu_stress_norm[main_metric] - baseline_norm[main_metric]
    })

    # T-tests for statistical significance
    t_tests = {}
    for phase in phases:
        data = df_combined[df_combined['phase'] == phase]
        t_stat, p_value = scipy_stats.ttest_ind(data['Baseline'].dropna(), data['CPU_STRESS'].dropna())
        t_tests[phase] = {'t': t_stat, 'p': p_value}

    # Calculate percentage impact
    impact = {}
    for phase in phases:
        baseline_mean = baseline_by_phase.loc[phase, 'mean']
        cpu_stress_mean = cpu_stress_by_phase.loc[phase, 'mean']
        impact[phase] = ((cpu_stress_mean - baseline_mean) / baseline_mean) * 100 if baseline_mean != 0 else float('inf')

    # PLOTTING (FIXED TO SHOW ORIGINAL VS CLEANED)
    plt.figure(figsize=(24, 12))

    # 1. Baseline outlier cleaning effect
    plt.subplot(2, 3, 1)
    plt.plot(baseline_df_original['Minutes'], baseline_df_original[main_metric],
             label='Baseline (with outliers)', color='orange', alpha=0.7, linewidth=1)
    plt.plot(baseline_df_clean['Minutes'], baseline_df_clean[main_metric],
             label='Baseline (cleaned)', linewidth=3, color='blue')
    plt.title(f'{dataset_name}: Baseline Outlier Cleaning Effect\n(Simple Isolation Forest)', fontsize=14, fontweight='bold')
    plt.xlabel('Minutes', fontsize=12)
    plt.ylabel(f'{main_metric}', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)

    # 2. CPU STRESS outlier cleaning effect
    plt.subplot(2, 3, 2)
    plt.plot(cpu_stress_df_original['Minutes'], cpu_stress_df_original[main_metric],
             label='CPU STRESS (with outliers)', alpha=0.7, color='lightcoral', linewidth=1)
    plt.plot(cpu_stress_df_clean['Minutes'], cpu_stress_df_clean[main_metric],
             label='CPU STRESS (cleaned)', linewidth=3, color='darkred')
    plt.title(f'{dataset_name}: CPU Stress Outlier Cleaning Effect\n(Simple Isolation Forest)', fontsize=14, fontweight='bold')
    plt.xlabel('Minutes', fontsize=12)
    plt.ylabel(f'{main_metric}', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)

    # 3. Timeline comparison
    plt.subplot(2, 3, 3)
    plt.plot(baseline_norm['Minutes'], baseline_norm[main_metric],
             label='Baseline', linewidth=3, color='blue', alpha=0.9)
    plt.plot(cpu_stress_norm['Minutes'], cpu_stress_norm[main_metric],
             label='CPU Stress', linewidth=3, color='red', alpha=0.9)
    plt.axvspan(delay_minutes, delay_minutes + duration_minutes,
                color='red', alpha=0.2, label="CPU Stress Period")
    plt.title(f"{dataset_name}: Baseline vs CPU Stress", fontsize=14, fontweight='bold')
    plt.xlabel("Minutes", fontsize=12)
    plt.ylabel(f"Normalized {main_metric}", fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend(fontsize=10)

    # 4. Box plot by phase
    plt.subplot(2, 3, 4)
    box_data = pd.melt(df_combined[['Baseline', 'CPU_STRESS', 'phase']],
                       id_vars=['phase'], var_name='source', value_name='value')
    sns.boxplot(x='phase', y='value', hue='source', data=box_data, ax=plt.gca())
    plt.title(f'{dataset_name}: Distribution by Phase', fontsize=14, fontweight='bold')
    plt.xlabel('Phase', fontsize=12)
    plt.ylabel('Normalized Values', fontsize=12)
    plt.grid(axis='y', alpha=0.3)
    plt.legend(fontsize=10)

    # 5. Statistics by phase
    plt.subplot(2, 3, 5)
    x = np.arange(len(phases))
    width = 0.35

    # Bars are drawn positionally, so the values must follow the same
    # before/during/after order as the tick labels. The raw groupby output
    # is alphabetical and would put each bar under the wrong label.
    plt.bar(x - width/2, baseline_by_phase['mean'], width, label='Baseline', alpha=0.8, color='blue')
    plt.bar(x + width/2, cpu_stress_by_phase['mean'], width, label='CPU Stress', alpha=0.8, color='red')

    plt.errorbar(x - width/2, baseline_by_phase['mean'], yerr=baseline_by_phase['std'],
                 fmt='none', ecolor='black', capsize=5, linewidth=2)
    plt.errorbar(x + width/2, cpu_stress_by_phase['mean'], yerr=cpu_stress_by_phase['std'],
                 fmt='none', ecolor='black', capsize=5, linewidth=2)

    plt.title(f'{dataset_name}: Average by Phase', fontsize=14, fontweight='bold')
    plt.xlabel('Phase', fontsize=12)
    plt.ylabel('Mean Normalized Values', fontsize=12)
    plt.xticks(x, phases)
    plt.legend(fontsize=10)
    plt.grid(axis='y', alpha=0.3)

    # 6. Difference over time
    plt.subplot(2, 3, 6)
    plt.plot(df_combined['Minutes'], df_combined['difference'],
             color='purple', linewidth=2, alpha=0.8)
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.5, linewidth=1)
    plt.axvspan(delay_minutes, delay_minutes + duration_minutes,
                color='red', alpha=0.2)
    plt.title(f'{dataset_name}: CPU Stress - Baseline Difference', fontsize=14, fontweight='bold')
    plt.xlabel('Minutes', fontsize=12)
    plt.ylabel('Difference', fontsize=12)
    plt.grid(True, alpha=0.3)

    plt.suptitle(f'{dataset_name} Memory CPU Stress Analysis ',
                 fontsize=18, fontweight='bold', y=0.98)
    plt.tight_layout()
    plt.subplots_adjust(top=0.90)
    plt.show()

    # Print statistical results
    print(f"\n{'='*80}")
    print(f"{dataset_name.upper()} - CPU STRESS ANALYSIS RESULTS")
    print(f"{'='*80}")

    print(f"\n📊 BASELINE STATISTICS BY PHASE:")
    print(stats_baseline.round(4))

    print(f"\n📊 CPU STRESS STATISTICS BY PHASE:")
    print(stats_cpu_stress.round(4))

    print(f"\n🔬 STATISTICAL SIGNIFICANCE TESTS:")
    for phase, test in t_tests.items():
        sig = "✅ SIGNIFICANT" if test['p'] < 0.05 else "❌ NOT SIGNIFICANT"
        print(f"  {phase.upper():8}: t={test['t']:6.2f}, p={test['p']:8.4f} ({sig})")

    print(f"\n📈 CPU STRESS IMPACT ON MEMORY (% CHANGE):")
    for phase, change in impact.items():
        direction = "↗️ INCREASE" if change > 0 else "↘️ DECREASE" if change < 0 else "→ NO CHANGE"
        print(f"  {phase.upper():8}: {change:+7.2f}% ({direction})")

    return {
        'stats_baseline': stats_baseline,
        'stats_cpu_stress': stats_cpu_stress,
        't_tests': t_tests,
        'impact': impact,
        'combined_data': df_combined,
        'main_metric': main_metric
    }

# RUN ANALYSIS WITH CACHE-SPECIFIC HANDLING
# Print the stage banner, then start with an empty results container.
print("\n" + "=" * 80)
print("COMPREHENSIVE MEMORY CPU STRESS ANALYSIS (FIXED)")
print("=" * 80)

analysis_results = dict()


📊 Processing Memory Utilization with Simple Isolation Forest:
  🔍 CPU STRESS:
    ✅ Cleaned 10 outliers (2.1%) using 1 original features
  🔍 BASELINE:
    ✅ Cleaned 8 outliers (1.6%) using 1 original features

📊 Processing Memory Cache with Simple Isolation Forest:
  🔍 CPU STRESS:
    ✅ Cleaned 10 outliers (2.1%) using 1 original features
  🔍 BASELINE:
    ✅ Cleaned 8 outliers (1.6%) using 1 original features

📊 Processing Memory Available with Simple Isolation Forest:
  🔍 CPU STRESS:
    ✅ Cleaned 10 outliers (2.1%) using 1 original features
  🔍 BASELINE:
    ✅ Cleaned 8 outliers (1.6%) using 1 original features

COMPREHENSIVE MEMORY CPU STRESS ANALYSIS (FIXED)
