In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from scipy import stats as scipy_stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
from typing import Dict, List, Any 
from sklearn.model_selection import GridSearchCV
from pathlib import Path
import datetime
warnings.filterwarnings('ignore')

# Load BASELINE datasets
df_baseline_iowait = pd.read_csv("baseline/cpu_iowait.csv")
df_baseline_irq = pd.read_csv("baseline/cpu_irq.csv")
df_baseline_system_msec = pd.read_csv("baseline/cpu_system_msec.csv")
df_baseline_user_msec = pd.read_csv("baseline/cpu_user_msec.csv")
df_baseline_util_per = pd.read_csv("baseline/cpu_util_per.csv")

# Load CACHE FILLING datasets
df_cachefilling_iowait = pd.read_csv("cache filling/cpu_iowait.csv")
df_cachefilling_irq = pd.read_csv("cache filling/cpu_irq.csv")
df_cachefilling_system_msec = pd.read_csv("cache filling/cpu_system_msec.csv")
df_cachefilling_user_msec = pd.read_csv("cache filling/cpu_user_msec.csv")
df_cachefilling_util_per = pd.read_csv("cache filling/cpu_util_per.csv")

# Load GC STRESS datasets
df_gcstress_iowait = pd.read_csv("gc stress/cpu_iowait.csv")
df_gcstress_irq = pd.read_csv("gc stress/cpu_irq.csv")
df_gcstress_system_msec = pd.read_csv("gc stress/cpu_system_msec.csv")
df_gcstress_user_msec = pd.read_csv("gc stress/cpu_user_msec.csv")
df_gcstress_util_per = pd.read_csv("gc stress/cpu_util_per.csv")

# Load OOM datasets
df_oom_iowait = pd.read_csv("oom/cpu_iowait.csv")
df_oom_irq = pd.read_csv("oom/cpu_irq.csv")
df_oom_system_msec = pd.read_csv("oom/cpu_system_msec.csv")
df_oom_user_msec = pd.read_csv("oom/cpu_user_msec.csv")
df_oom_util_per = pd.read_csv("oom/cpu_util_per.csv")

# Load THREADFULL datasets
df_threadfull_iowait = pd.read_csv("threadfull/cpu_iowait.csv")
df_threadfull_irq = pd.read_csv("threadfull/cpu_irq.csv")
df_threadfull_system_msec = pd.read_csv("threadfull/cpu_system_msec.csv")
df_threadfull_user_msec = pd.read_csv("threadfull/cpu_user_msec.csv")
df_threadfull_util_per = pd.read_csv("threadfull/cpu_util_per.csv")


# Add source labels - BASELINE
df_baseline_iowait["source"] = "BASELINE"
df_baseline_irq["source"] = "BASELINE"
df_baseline_system_msec["source"] = "BASELINE"
df_baseline_user_msec["source"] = "BASELINE"
df_baseline_util_per["source"] = "BASELINE"

# Add source labels - CACHE FILLING
df_cachefilling_iowait["source"] = "CACHE_FILLING"
df_cachefilling_irq["source"] = "CACHE_FILLING"
df_cachefilling_system_msec["source"] = "CACHE_FILLING"
df_cachefilling_user_msec["source"] = "CACHE_FILLING"
df_cachefilling_util_per["source"] = "CACHE_FILLING"

# Add source labels - GC STRESS
df_gcstress_iowait["source"] = "GC_STRESS"
df_gcstress_irq["source"] = "GC_STRESS"
df_gcstress_system_msec["source"] = "GC_STRESS"
df_gcstress_user_msec["source"] = "GC_STRESS"
df_gcstress_util_per["source"] = "GC_STRESS"

# Add source labels - OOM
df_oom_iowait["source"] = "OOM"
df_oom_irq["source"] = "OOM"
df_oom_system_msec["source"] = "OOM"
df_oom_user_msec["source"] = "OOM"
df_oom_util_per["source"] = "OOM"

# Add source labels - THREADFULL
df_threadfull_iowait["source"] = "THREADFULL"
df_threadfull_irq["source"] = "THREADFULL"
df_threadfull_system_msec["source"] = "THREADFULL"
df_threadfull_user_msec["source"] = "THREADFULL"
df_threadfull_util_per["source"] = "THREADFULL"


# Convert to datetime - BASELINE
df_baseline_iowait["Time"] = pd.to_datetime(df_baseline_iowait["Time"])
df_baseline_irq["Time"] = pd.to_datetime(df_baseline_irq["Time"])
df_baseline_system_msec["Time"] = pd.to_datetime(df_baseline_system_msec["Time"])
df_baseline_user_msec["Time"] = pd.to_datetime(df_baseline_user_msec["Time"])
df_baseline_util_per["Time"] = pd.to_datetime(df_baseline_util_per["Time"])

# Convert to datetime - CACHE FILLING
df_cachefilling_iowait["Time"] = pd.to_datetime(df_cachefilling_iowait["Time"])
df_cachefilling_irq["Time"] = pd.to_datetime(df_cachefilling_irq["Time"])
df_cachefilling_system_msec["Time"] = pd.to_datetime(df_cachefilling_system_msec["Time"])
df_cachefilling_user_msec["Time"] = pd.to_datetime(df_cachefilling_user_msec["Time"])
df_cachefilling_util_per["Time"] = pd.to_datetime(df_cachefilling_util_per["Time"])

# Convert to datetime - GC STRESS
df_gcstress_iowait["Time"] = pd.to_datetime(df_gcstress_iowait["Time"])
df_gcstress_irq["Time"] = pd.to_datetime(df_gcstress_irq["Time"])
df_gcstress_system_msec["Time"] = pd.to_datetime(df_gcstress_system_msec["Time"])
df_gcstress_user_msec["Time"] = pd.to_datetime(df_gcstress_user_msec["Time"])
df_gcstress_util_per["Time"] = pd.to_datetime(df_gcstress_util_per["Time"])

# Convert to datetime - OOM
df_oom_iowait["Time"] = pd.to_datetime(df_oom_iowait["Time"])
df_oom_irq["Time"] = pd.to_datetime(df_oom_irq["Time"])
df_oom_system_msec["Time"] = pd.to_datetime(df_oom_system_msec["Time"])
df_oom_user_msec["Time"] = pd.to_datetime(df_oom_user_msec["Time"])
df_oom_util_per["Time"] = pd.to_datetime(df_oom_util_per["Time"])

# Convert to datetime - THREADFULL
df_threadfull_iowait["Time"] = pd.to_datetime(df_threadfull_iowait["Time"])
df_threadfull_irq["Time"] = pd.to_datetime(df_threadfull_irq["Time"])
df_threadfull_system_msec["Time"] = pd.to_datetime(df_threadfull_system_msec["Time"])
df_threadfull_user_msec["Time"] = pd.to_datetime(df_threadfull_user_msec["Time"])
df_threadfull_util_per["Time"] = pd.to_datetime(df_threadfull_util_per["Time"])


delay = 30
duration = 50

# Synchronize all datasets with baseline timeline
time_offset = df_baseline_iowait["Time"].min()

# Synchronize CACHE FILLING datasets
cachefilling_offset = time_offset - df_cachefilling_iowait["Time"].min()
df_cachefilling_iowait["Time"] += cachefilling_offset
df_cachefilling_irq["Time"] += cachefilling_offset
df_cachefilling_system_msec["Time"] += cachefilling_offset
df_cachefilling_user_msec["Time"] += cachefilling_offset
df_cachefilling_util_per["Time"] += cachefilling_offset

# Synchronize GC STRESS datasets
gcstress_offset = time_offset - df_gcstress_iowait["Time"].min()
df_gcstress_iowait["Time"] += gcstress_offset
df_gcstress_irq["Time"] += gcstress_offset
df_gcstress_system_msec["Time"] += gcstress_offset
df_gcstress_user_msec["Time"] += gcstress_offset
df_gcstress_util_per["Time"] += gcstress_offset

# Synchronize OOM datasets
oom_offset = time_offset - df_oom_iowait["Time"].min()
df_oom_iowait["Time"] += oom_offset
df_oom_irq["Time"] += oom_offset
df_oom_system_msec["Time"] += oom_offset
df_oom_user_msec["Time"] += oom_offset
df_oom_util_per["Time"] += oom_offset

# Synchronize THREADFULL datasets
threadfull_offset = time_offset - df_threadfull_iowait["Time"].min()
df_threadfull_iowait["Time"] += threadfull_offset
df_threadfull_irq["Time"] += threadfull_offset
df_threadfull_system_msec["Time"] += threadfull_offset
df_threadfull_user_msec["Time"] += threadfull_offset
df_threadfull_util_per["Time"] += threadfull_offset


# Convert timeline to minutes for ALL datasets
all_dfs = [
    df_baseline_iowait, df_baseline_irq, df_baseline_system_msec, df_baseline_user_msec, df_baseline_util_per,
    df_cachefilling_iowait, df_cachefilling_irq, df_cachefilling_system_msec, df_cachefilling_user_msec, df_cachefilling_util_per,
    df_gcstress_iowait, df_gcstress_irq, df_gcstress_system_msec, df_gcstress_user_msec, df_gcstress_util_per,
    df_oom_iowait, df_oom_irq, df_oom_system_msec, df_oom_user_msec, df_oom_util_per,
    df_threadfull_iowait, df_threadfull_irq, df_threadfull_system_msec, df_threadfull_user_msec, df_threadfull_util_per
]

for df in all_dfs:
    df["Minutes"] = (df["Time"] - df["Time"].min()).dt.total_seconds() / 60

# COMPLETE DATASETS DICTIONARY 
all_datasets = {
    'IOWait': {
        'baseline': df_baseline_iowait,
        'cache_filling': df_cachefilling_iowait,
        'gc_stress': df_gcstress_iowait,
        'oom': df_oom_iowait,
        'threadfull': df_threadfull_iowait,
    },
    'IRQ': {
        'baseline': df_baseline_irq,
        'cache_filling': df_cachefilling_irq,
        'gc_stress': df_gcstress_irq,
        'oom': df_oom_irq,
        'threadfull': df_threadfull_irq,
    },
    'System': {
        'baseline': df_baseline_system_msec,
        'cache_filling': df_cachefilling_system_msec,
        'gc_stress': df_gcstress_system_msec,
        'oom': df_oom_system_msec,
        'threadfull': df_threadfull_system_msec,
    },
    'User': {
        'baseline': df_baseline_user_msec,
        'cache_filling': df_cachefilling_user_msec,
        'gc_stress': df_gcstress_user_msec,
        'oom': df_oom_user_msec,
        'threadfull': df_threadfull_user_msec,
    },
    'Utilization': {
        'baseline': df_baseline_util_per,
        'cache_filling': df_cachefilling_util_per,
        'gc_stress': df_gcstress_util_per,
        'oom': df_oom_util_per,
        'threadfull': df_threadfull_util_per,
    }
}

print("✅ All datasets loaded successfully!")
print(f"📊 Loaded {len(all_datasets)} metrics across {len(all_datasets['IOWait'])} experiment types")
print("\nDataset structure:")
for metric, experiments in all_datasets.items():
    print(f"  {metric}: {list(experiments.keys())}")

FileNotFoundError: [Errno 2] No such file or directory: 'threadfull/cpu_irq.csv'