In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as scipy_stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import warnings
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore')

# Read the raw CSV exports for every experiment.  Each experiment directory
# supplies four per-service files under retrans_packets/ plus one srtt.csv,
# and the five frames of a run are always unpacked in that same order:
# api_gateway, customers_service, vets_service, visits_service, srtt.

# Baseline run
(
    df_baseline_api_gateway,
    df_baseline_customers_service,
    df_baseline_vets_service,
    df_baseline_visits_service,
    df_baseline_srtt,
) = map(pd.read_csv, [
    "baseline/retrans_packets/api_gateway.csv",
    "baseline/retrans_packets/customers_service.csv",
    "baseline/retrans_packets/vets_service.csv",
    "baseline/retrans_packets/visits_service.csv",
    "baseline/srtt.csv",
])

# CPU stress run
(
    df_cpustress_api_gateway,
    df_cpustress_customers_service,
    df_cpustress_vets_service,
    df_cpustress_visits_service,
    df_cpustress_srtt,
) = map(pd.read_csv, [
    "cpu stress/retrans_packets/api_gateway.csv",
    "cpu stress/retrans_packets/customers_service.csv",
    "cpu stress/retrans_packets/vets_service.csv",
    "cpu stress/retrans_packets/visits_service.csv",
    "cpu stress/srtt.csv",
])

# "IO" run -- note that these frames are read from the "delay" directory.
(
    df_IO_api_gateway,
    df_IO_customers_service,
    df_IO_vets_service,
    df_IO_visits_service,
    df_IO_srtt,
) = map(pd.read_csv, [
    "delay/retrans_packets/api_gateway.csv",
    "delay/retrans_packets/customers_service.csv",
    "delay/retrans_packets/vets_service.csv",
    "delay/retrans_packets/visits_service.csv",
    "delay/srtt.csv",
])

# Memory stress run
(
    df_memstress_api_gateway,
    df_memstress_customers_service,
    df_memstress_vets_service,
    df_memstress_visits_service,
    df_memstress_srtt,
) = map(pd.read_csv, [
    "mem stress/retrans_packets/api_gateway.csv",
    "mem stress/retrans_packets/customers_service.csv",
    "mem stress/retrans_packets/vets_service.csv",
    "mem stress/retrans_packets/visits_service.csv",
    "mem stress/srtt.csv",
])

# Network loss run
(
    df_netloss_api_gateway,
    df_netloss_customers_service,
    df_netloss_vets_service,
    df_netloss_visits_service,
    df_netloss_srtt,
) = map(pd.read_csv, [
    "net loss/retrans_packets/api_gateway.csv",
    "net loss/retrans_packets/customers_service.csv",
    "net loss/retrans_packets/vets_service.csv",
    "net loss/retrans_packets/visits_service.csv",
    "net loss/srtt.csv",
])


# Tag every frame with the experiment it belongs to via a "source" column.
# Each table entry pairs one label with the five frames of that experiment.
# The frames loaded from the "delay" directory carry the label "IO".
source_label_groups = [
    ("BASELINE", [
        df_baseline_api_gateway,
        df_baseline_customers_service,
        df_baseline_vets_service,
        df_baseline_visits_service,
        df_baseline_srtt,
    ]),
    ("CPU_STRESS", [
        df_cpustress_api_gateway,
        df_cpustress_customers_service,
        df_cpustress_vets_service,
        df_cpustress_visits_service,
        df_cpustress_srtt,
    ]),
    ("IO", [
        df_IO_api_gateway,
        df_IO_customers_service,
        df_IO_vets_service,
        df_IO_visits_service,
        df_IO_srtt,
    ]),
    ("MEM_STRESS", [
        df_memstress_api_gateway,
        df_memstress_customers_service,
        df_memstress_vets_service,
        df_memstress_visits_service,
        df_memstress_srtt,
    ]),
    ("NET_LOSS", [
        df_netloss_api_gateway,
        df_netloss_customers_service,
        df_netloss_vets_service,
        df_netloss_visits_service,
        df_netloss_srtt,
    ]),
]

for source_label, labelled_frames in source_label_groups:
    for frame in labelled_frames:
        # Scalar assignment broadcasts the label to every row of the frame.
        frame["source"] = source_label


# Parse the "Time" column of every loaded frame with pd.to_datetime so that
# later steps can do timestamp arithmetic on it.  The frames are processed in
# the same order as they were loaded (experiment by experiment).
frames_with_time = [
    # Baseline
    df_baseline_api_gateway,
    df_baseline_customers_service,
    df_baseline_vets_service,
    df_baseline_visits_service,
    df_baseline_srtt,
    # CPU stress
    df_cpustress_api_gateway,
    df_cpustress_customers_service,
    df_cpustress_vets_service,
    df_cpustress_visits_service,
    df_cpustress_srtt,
    # IO (loaded from the "delay" directory)
    df_IO_api_gateway,
    df_IO_customers_service,
    df_IO_vets_service,
    df_IO_visits_service,
    df_IO_srtt,
    # Memory stress
    df_memstress_api_gateway,
    df_memstress_customers_service,
    df_memstress_vets_service,
    df_memstress_visits_service,
    df_memstress_srtt,
    # Network loss
    df_netloss_api_gateway,
    df_netloss_customers_service,
    df_netloss_vets_service,
    df_netloss_visits_service,
    df_netloss_srtt,
]

for frame in frames_with_time:
    frame["Time"] = pd.to_datetime(frame["Time"])


# NOTE(review): `delay` and `duration` are not used anywhere in this section;
# presumably they describe the fault-injection window and are consumed by
# later cells -- units are not shown here, confirm before relying on them.
delay = 30
duration = 50

# Reference instant for alignment: the earliest "Time" value of the baseline
# api_gateway frame.  (Despite its name, this is a timestamp, not an offset.)
time_offset = df_baseline_api_gateway["Time"].min()

# One shift per non-baseline experiment, derived from that experiment's own
# api_gateway frame.  All four are computed up front, before any frame is
# modified, so each one is based on the unshifted timestamps.
cpustress_offset = time_offset - df_cpustress_api_gateway["Time"].min()
delay_offset = time_offset - df_IO_api_gateway["Time"].min()
memstress_offset = time_offset - df_memstress_api_gateway["Time"].min()
netloss_offset = time_offset - df_netloss_api_gateway["Time"].min()

# Apply each shift to all five frames of its experiment.  Using one common
# shift per experiment keeps the relative timing between that experiment's
# frames intact.  Baseline frames are left as they are.
shift_plan = [
    (cpustress_offset, [
        df_cpustress_api_gateway,
        df_cpustress_customers_service,
        df_cpustress_vets_service,
        df_cpustress_visits_service,
        df_cpustress_srtt,
    ]),
    (delay_offset, [
        df_IO_api_gateway,
        df_IO_customers_service,
        df_IO_vets_service,
        df_IO_visits_service,
        df_IO_srtt,
    ]),
    (memstress_offset, [
        df_memstress_api_gateway,
        df_memstress_customers_service,
        df_memstress_vets_service,
        df_memstress_visits_service,
        df_memstress_srtt,
    ]),
    (netloss_offset, [
        df_netloss_api_gateway,
        df_netloss_customers_service,
        df_netloss_vets_service,
        df_netloss_visits_service,
        df_netloss_srtt,
    ]),
]

for shift, shifted_frames in shift_plan:
    for frame in shifted_frames:
        frame["Time"] += shift


# Gather every frame (five experiments x five files each) into one flat list,
# experiment by experiment, so per-frame steps can be applied in a single loop.
frames_by_experiment = (
    # Baseline
    (df_baseline_api_gateway, df_baseline_customers_service,
     df_baseline_vets_service, df_baseline_visits_service, df_baseline_srtt),
    # CPU stress
    (df_cpustress_api_gateway, df_cpustress_customers_service,
     df_cpustress_vets_service, df_cpustress_visits_service, df_cpustress_srtt),
    # IO (loaded from the "delay" directory)
    (df_IO_api_gateway, df_IO_customers_service,
     df_IO_vets_service, df_IO_visits_service, df_IO_srtt),
    # Memory stress
    (df_memstress_api_gateway, df_memstress_customers_service,
     df_memstress_vets_service, df_memstress_visits_service, df_memstress_srtt),
    # Network loss
    (df_netloss_api_gateway, df_netloss_customers_service,
     df_netloss_vets_service, df_netloss_visits_service, df_netloss_srtt),
)
all_dfs = [member for group in frames_by_experiment for member in group]

# Add a "Minutes" column: time elapsed since the frame's own earliest "Time"
# value, in minutes.  Because the origin is taken per frame, every frame's
# "Minutes" axis starts at 0 regardless of its absolute timestamps.
for df in all_dfs:
    start_time = df["Time"].min()
    df["Minutes"] = (df["Time"] - start_time).dt.total_seconds() / 60

# Nested lookup table: metric name -> experiment key -> DataFrame.
# Every metric lists its frames in the fixed experiment order given by
# `experiment_keys`, and the two are zipped together below.
experiment_keys = ('baseline', 'cpu_stress', 'IO', 'mem_stress', 'net_loss')

frames_by_metric = {
    'ApiGateway': (
        df_baseline_api_gateway,
        df_cpustress_api_gateway,
        df_IO_api_gateway,
        df_memstress_api_gateway,
        df_netloss_api_gateway,
    ),
    'CustomersService': (
        df_baseline_customers_service,
        df_cpustress_customers_service,
        df_IO_customers_service,
        df_memstress_customers_service,
        df_netloss_customers_service,
    ),
    'VetsService': (
        df_baseline_vets_service,
        df_cpustress_vets_service,
        df_IO_vets_service,
        df_memstress_vets_service,
        df_netloss_vets_service,
    ),
    'VisitsService': (
        df_baseline_visits_service,
        df_cpustress_visits_service,
        df_IO_visits_service,
        df_memstress_visits_service,
        df_netloss_visits_service,
    ),
    'SRTT': (
        df_baseline_srtt,
        df_cpustress_srtt,
        df_IO_srtt,
        df_memstress_srtt,
        df_netloss_srtt,
    ),
}

all_datasets = {
    metric_name: dict(zip(experiment_keys, metric_frames))
    for metric_name, metric_frames in frames_by_metric.items()
}

# Short summary of what was assembled: the number of metrics, the number of
# experiment types (taken from the 'ApiGateway' entry), and the keys per metric.
print("✅ All TCP retransmission datasets loaded successfully!")
print(f"📊 Loaded {len(all_datasets)} TCP metrics across {len(all_datasets['ApiGateway'])} experiment types")
print("\nDataset structure:")
for metric, experiments in all_datasets.items():
    print(f"  {metric}: {list(experiments.keys())}")