In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import numpy as np
from scipy import stats as scipy_stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore')

# Read the raw CSV exports, one group per experiment folder. Each group is
# unpacked in the same fixed order: the four per-service retrans_packets files
# (api_gateway, customers_service, vets_service, visits_service) and then the
# experiment's srtt.csv. Paths are relative to the current working directory.

# BASELINE
(
    df_baseline_api_gateway,
    df_baseline_customers_service,
    df_baseline_vets_service,
    df_baseline_visits_service,
    df_baseline_srtt,
) = map(
    pd.read_csv,
    (
        "baseline/retrans_packets/api_gateway.csv",
        "baseline/retrans_packets/customers_service.csv",
        "baseline/retrans_packets/vets_service.csv",
        "baseline/retrans_packets/visits_service.csv",
        "baseline/srtt.csv",
    ),
)

# CACHE FILLING (the folder name contains a space)
(
    df_cache_filling_api_gateway,
    df_cache_filling_customers_service,
    df_cache_filling_vets_service,
    df_cache_filling_visits_service,
    df_cache_filling_srtt,
) = map(
    pd.read_csv,
    (
        "cache filling/retrans_packets/api_gateway.csv",
        "cache filling/retrans_packets/customers_service.csv",
        "cache filling/retrans_packets/vets_service.csv",
        "cache filling/retrans_packets/visits_service.csv",
        "cache filling/srtt.csv",
    ),
)

# GC STRESS (the folder name contains a space)
(
    df_gc_stress_api_gateway,
    df_gc_stress_customers_service,
    df_gc_stress_vets_service,
    df_gc_stress_visits_service,
    df_gc_stress_srtt,
) = map(
    pd.read_csv,
    (
        "gc stress/retrans_packets/api_gateway.csv",
        "gc stress/retrans_packets/customers_service.csv",
        "gc stress/retrans_packets/vets_service.csv",
        "gc stress/retrans_packets/visits_service.csv",
        "gc stress/srtt.csv",
    ),
)

# OOM
(
    df_oom_api_gateway,
    df_oom_customers_service,
    df_oom_vets_service,
    df_oom_visits_service,
    df_oom_srtt,
) = map(
    pd.read_csv,
    (
        "oom/retrans_packets/api_gateway.csv",
        "oom/retrans_packets/customers_service.csv",
        "oom/retrans_packets/vets_service.csv",
        "oom/retrans_packets/visits_service.csv",
        "oom/srtt.csv",
    ),
)

# THREADFULL
(
    df_threadfull_api_gateway,
    df_threadfull_customers_service,
    df_threadfull_vets_service,
    df_threadfull_visits_service,
    df_threadfull_srtt,
) = map(
    pd.read_csv,
    (
        "threadfull/retrans_packets/api_gateway.csv",
        "threadfull/retrans_packets/customers_service.csv",
        "threadfull/retrans_packets/vets_service.csv",
        "threadfull/retrans_packets/visits_service.csv",
        "threadfull/srtt.csv",
    ),
)


# Tag every frame with a "source" column naming the experiment it belongs to.
# Each entry pairs the label with the five frames of that experiment.
for _label, _frames in (
    (
        "BASELINE",
        (
            df_baseline_api_gateway,
            df_baseline_customers_service,
            df_baseline_vets_service,
            df_baseline_visits_service,
            df_baseline_srtt,
        ),
    ),
    (
        "CACHE_FILLING",
        (
            df_cache_filling_api_gateway,
            df_cache_filling_customers_service,
            df_cache_filling_vets_service,
            df_cache_filling_visits_service,
            df_cache_filling_srtt,
        ),
    ),
    (
        "GC_STRESS",
        (
            df_gc_stress_api_gateway,
            df_gc_stress_customers_service,
            df_gc_stress_vets_service,
            df_gc_stress_visits_service,
            df_gc_stress_srtt,
        ),
    ),
    (
        "OOM",
        (
            df_oom_api_gateway,
            df_oom_customers_service,
            df_oom_vets_service,
            df_oom_visits_service,
            df_oom_srtt,
        ),
    ),
    (
        "THREADFULL",
        (
            df_threadfull_api_gateway,
            df_threadfull_customers_service,
            df_threadfull_vets_service,
            df_threadfull_visits_service,
            df_threadfull_srtt,
        ),
    ),
):
    for _frame in _frames:
        # Scalar assignment fills the whole column with the label.
        _frame["source"] = _label


# Parse the "Time" column of every frame with pd.to_datetime so the timestamps
# can be shifted and subtracted by the code that follows.
for _frame in (
    # Baseline
    df_baseline_api_gateway,
    df_baseline_customers_service,
    df_baseline_vets_service,
    df_baseline_visits_service,
    df_baseline_srtt,
    # Cache filling
    df_cache_filling_api_gateway,
    df_cache_filling_customers_service,
    df_cache_filling_vets_service,
    df_cache_filling_visits_service,
    df_cache_filling_srtt,
    # GC stress
    df_gc_stress_api_gateway,
    df_gc_stress_customers_service,
    df_gc_stress_vets_service,
    df_gc_stress_visits_service,
    df_gc_stress_srtt,
    # OOM
    df_oom_api_gateway,
    df_oom_customers_service,
    df_oom_vets_service,
    df_oom_visits_service,
    df_oom_srtt,
    # Threadfull
    df_threadfull_api_gateway,
    df_threadfull_customers_service,
    df_threadfull_vets_service,
    df_threadfull_visits_service,
    df_threadfull_srtt,
):
    _frame["Time"] = pd.to_datetime(_frame["Time"])


# NOTE(review): delay and duration are not read anywhere in this section;
# presumably later cells use them (their units are not stated here) - confirm.
delay = 30
duration = 50

# Align every experiment with the baseline timeline. Each experiment is shifted
# so that its api_gateway series starts at the same instant as the baseline
# api_gateway series; the same shift is applied to all five frames of that
# experiment, so their timing relative to each other is preserved.
time_offset = df_baseline_api_gateway["Time"].min()

# NOTE: the *_offset names below (cpustress / delay / memstress / netloss) do
# not match the experiments they are computed from (cache filling / gc stress /
# oom / threadfull). They are kept unchanged because code outside this section
# may reference them.

# CACHE FILLING
cpustress_offset = time_offset - df_cache_filling_api_gateway["Time"].min()
for _frame in (
    df_cache_filling_api_gateway,
    df_cache_filling_customers_service,
    df_cache_filling_vets_service,
    df_cache_filling_visits_service,
    df_cache_filling_srtt,
):
    _frame["Time"] += cpustress_offset

# GC STRESS
delay_offset = time_offset - df_gc_stress_api_gateway["Time"].min()
for _frame in (
    df_gc_stress_api_gateway,
    df_gc_stress_customers_service,
    df_gc_stress_vets_service,
    df_gc_stress_visits_service,
    df_gc_stress_srtt,
):
    _frame["Time"] += delay_offset

# OOM
memstress_offset = time_offset - df_oom_api_gateway["Time"].min()
for _frame in (
    df_oom_api_gateway,
    df_oom_customers_service,
    df_oom_vets_service,
    df_oom_visits_service,
    df_oom_srtt,
):
    _frame["Time"] += memstress_offset

# THREADFULL
netloss_offset = time_offset - df_threadfull_api_gateway["Time"].min()
for _frame in (
    df_threadfull_api_gateway,
    df_threadfull_customers_service,
    df_threadfull_vets_service,
    df_threadfull_visits_service,
    df_threadfull_srtt,
):
    _frame["Time"] += netloss_offset


# Flat list of every loaded frame, grouped by experiment and, within each
# experiment, ordered api_gateway, customers, vets, visits, srtt.
all_dfs = [
    # Baseline
    df_baseline_api_gateway,
    df_baseline_customers_service,
    df_baseline_vets_service,
    df_baseline_visits_service,
    df_baseline_srtt,
    # Cache filling
    df_cache_filling_api_gateway,
    df_cache_filling_customers_service,
    df_cache_filling_vets_service,
    df_cache_filling_visits_service,
    df_cache_filling_srtt,
    # GC stress
    df_gc_stress_api_gateway,
    df_gc_stress_customers_service,
    df_gc_stress_vets_service,
    df_gc_stress_visits_service,
    df_gc_stress_srtt,
    # OOM
    df_oom_api_gateway,
    df_oom_customers_service,
    df_oom_vets_service,
    df_oom_visits_service,
    df_oom_srtt,
    # Threadfull
    df_threadfull_api_gateway,
    df_threadfull_customers_service,
    df_threadfull_vets_service,
    df_threadfull_visits_service,
    df_threadfull_srtt,
]

# Add a "Minutes" column to every frame: elapsed time in minutes since that
# frame's own earliest timestamp (each frame is measured from its own start,
# not from a shared origin).
for df in all_dfs:
    _elapsed = df["Time"] - df["Time"].min()
    df["Minutes"] = _elapsed.dt.total_seconds() / 60

# Nested lookup table: all_datasets[metric][experiment] -> DataFrame.
# NOTE: the experiment keys do not match the variable names of the frames they
# hold. The mapping, as wired below, is:
#   'baseline'   -> df_baseline_*
#   'cpu_stress' -> df_cache_filling_*
#   'delay'      -> df_gc_stress_*
#   'mem_stress' -> df_oom_*
#   'net_loss'   -> df_threadfull_*
# The keys are left unchanged because code outside this section may look them up.
all_datasets = {
    name: dict(zip(('baseline', 'cpu_stress', 'delay', 'mem_stress', 'net_loss'), frames))
    for name, frames in (
        (
            'ApiGateway',
            (
                df_baseline_api_gateway,
                df_cache_filling_api_gateway,
                df_gc_stress_api_gateway,
                df_oom_api_gateway,
                df_threadfull_api_gateway,
            ),
        ),
        (
            'CustomersService',
            (
                df_baseline_customers_service,
                df_cache_filling_customers_service,
                df_gc_stress_customers_service,
                df_oom_customers_service,
                df_threadfull_customers_service,
            ),
        ),
        (
            'VetsService',
            (
                df_baseline_vets_service,
                df_cache_filling_vets_service,
                df_gc_stress_vets_service,
                df_oom_vets_service,
                df_threadfull_vets_service,
            ),
        ),
        (
            'VisitsService',
            (
                df_baseline_visits_service,
                df_cache_filling_visits_service,
                df_gc_stress_visits_service,
                df_oom_visits_service,
                df_threadfull_visits_service,
            ),
        ),
        (
            'SRTT',
            (
                df_baseline_srtt,
                df_cache_filling_srtt,
                df_gc_stress_srtt,
                df_oom_srtt,
                df_threadfull_srtt,
            ),
        ),
    )
}

# Summary of what was loaded: number of metrics, number of experiments per
# metric (taken from the 'ApiGateway' entry), then the keys of each metric.
print("✅ All TCP retransmission datasets loaded successfully!")
print(f"📊 Loaded {len(all_datasets)} TCP metrics across {len(all_datasets['ApiGateway'])} experiment types")
print("\nDataset structure:")
for metric, experiments in all_datasets.items():
    print(f"  {metric}: {list(experiments.keys())}")