In [37]:
import pandas as pd
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [21]:
# Every workload below follows the same recipe: three Prometheus CSV exports
# (CPU-limit run, memory-limit run, both-limits run) are read and then stacked
# row-wise into one frame with a fresh 0..n-1 index (ignore_index=True).

# --- Service 1 ---
cpu_path_s1, memory_path_s1, both_path_s1 = (
    "../../results/prometheus_data/service1_cpu_limit_reduction.csv",
    "../../results/prometheus_data/new datasets/service1_memory_limit_reduction.csv",
    "../../results/prometheus_data/service1_both_limits_reduction.csv",
)
df_cpu_s1, df_memory_s1, df_both_s1 = map(
    pd.read_csv, (cpu_path_s1, memory_path_s1, both_path_s1)
)
df_all_s1 = pd.concat([df_cpu_s1, df_memory_s1, df_both_s1], ignore_index=True)
# Show the column names of the combined Service 1 frame.
print(df_all_s1.columns)

# --- Service 2 ---
cpu_path_s2, memory_path_s2, both_path_s2 = (
    "../../results/prometheus_data/service2_cpu_limit_reduction.csv",
    "../../results/prometheus_data/service2_memory_limit_reduction.csv",
    "../../results/prometheus_data/service2_both_limit_reduction.csv",
)
df_cpu_s2, df_memory_s2, df_both_s2 = map(
    pd.read_csv, (cpu_path_s2, memory_path_s2, both_path_s2)
)
df_all_s2 = pd.concat([df_cpu_s2, df_memory_s2, df_both_s2], ignore_index=True)

# --- HashGen ---
cpu_path_hg, memory_path_hg, both_path_hg = (
    "../../results/prometheus_data/hashgen_cpu_limit_reduction.csv",
    "../../results/prometheus_data/hashgen_memory_limit_reduction.csv",
    "../../results/prometheus_data/hashgen_both_limit_reduction.csv",
)
df_cpu_hg, df_memory_hg, df_both_hg = map(
    pd.read_csv, (cpu_path_hg, memory_path_hg, both_path_hg)
)
df_all_hg = pd.concat([df_cpu_hg, df_memory_hg, df_both_hg], ignore_index=True)

# --- RandPw ---
# NOTE(review): the CPU file is spelled "ranspw" while its two siblings use
# "randpw" -- confirm this matches the file name on disk.
cpu_path_rp, memory_path_rp, both_path_rp = (
    "../../results/prometheus_data/ranspw_cpu_limit_reduction.csv",
    "../../results/prometheus_data/randpw_memory_limit_reduction.csv",
    "../../results/prometheus_data/randpw_both_limits_reduction.csv",
)
df_cpu_rp, df_memory_rp, df_both_rp = map(
    pd.read_csv, (cpu_path_rp, memory_path_rp, both_path_rp)
)
df_all_rp = pd.concat([df_cpu_rp, df_memory_rp, df_both_rp], ignore_index=True)

Index(['Timestamp', 'Service', 'CPU Request', 'Memory Request', 'CPU Limit',
       'Memory Limit', 'Latency', 'CPU Usage', 'Memory Usage'],
      dtype='object')


In [22]:
# Combined per-workload frames, keyed by a human-readable label.
# Insertion order is the order in which the workloads are iterated.
configs = dict(
    [
        ("Service 1", df_all_s1),
        ("Service 2", df_all_s2),
        ("HashGen", df_all_hg),
        ("RandPw", df_all_rp),
    ]
)

# Hold-out fractions to evaluate, largest test split first.
test_sizes = [0.3, 0.2, 0.1]

In [54]:
from pmdarima import auto_arima
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

def train_auto_arima(df, feature_col, test_size=0.2, plot=True, seasonal=False, m=1):
    """Fit an auto-ARIMA model on the leading part of a series and score it.

    The frame is sorted by ``Timestamp`` when that column exists, the values
    of ``feature_col`` are split into a leading training part and a trailing
    test part, and ``auto_arima`` is run on the training part. RMSE and R²
    are printed for the in-sample predictions and for the forecast over the
    test horizon.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``feature_col``. The caller's frame is not
        modified (sorting produces a new frame).
    feature_col : str
        Name of the column to model.
    test_size : float, default 0.2
        Fraction of observations held out at the end of the series.
    plot : bool, default True
        Currently has no effect: the plotting code at the end of the function
        is commented out. The parameter is kept so existing calls stay valid.
    seasonal : bool, default False
        Forwarded to ``auto_arima`` as ``seasonal``.
    m : int, default 1
        Forwarded to ``auto_arima`` as ``m`` (the seasonal period).

    Returns
    -------
    The fitted model object returned by ``auto_arima``.

    Raises
    ------
    ValueError
        If the split leaves the training part or the test part empty.
    """
    # Sort by timestamp if present so the split below is chronological
    if 'Timestamp' in df.columns:
        df = df.sort_values('Timestamp')

    series = df[feature_col].values
    split_idx = int(len(series) * (1 - test_size))
    train, test = series[:split_idx], series[split_idx:]

    # Fail early with a clear message instead of after a costly model search.
    if len(train) == 0 or len(test) == 0:
        raise ValueError(
            f"test_size={test_size} splits {len(series)} observations into "
            f"{len(train)} train / {len(test)} test; both parts must be non-empty"
        )

    # Fit AutoARIMA model
    model = auto_arima(
        train,
        seasonal=seasonal,   # honour the caller's choice (was hard-coded to True)
        m=m,
        max_p=5,
        max_q=5,
        max_d=2,
        # trace=True,        # show progress for diagnostics
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
        information_criterion='aic',
        # NOTE(review): `boxcox` is not a documented auto_arima argument; it is
        # presumably forwarded through **fit_args -- confirm it has any effect.
        boxcox=True,
    )

    # Predict in-sample (train)
    train_pred = model.predict_in_sample()

    # Forecast (test)
    test_pred = model.predict(n_periods=len(test))

    # Evaluate train
    train_rmse = np.sqrt(mean_squared_error(train, train_pred))
    train_r2 = r2_score(train, train_pred)

    # Evaluate test
    test_rmse = np.sqrt(mean_squared_error(test, test_pred))
    test_r2 = r2_score(test, test_pred)

    # Print metrics
    print(f"{feature_col} - Train RMSE: {train_rmse:.4f}, R²: {train_r2:.4f}")
    print(f"{feature_col} - Test  RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")

    # Optional plot (disabled; `plot` is therefore unused for now)
    # if plot:
    #     plt.figure(figsize=(10, 4))
    #     plt.plot(np.arange(len(train)), train, label='Train')
    #     plt.plot(np.arange(len(train)), train_pred, label='Train Predicted')
    #     plt.plot(np.arange(len(train), len(train) + len(test)), test, label='Test')
    #     plt.plot(np.arange(len(train), len(train) + len(test)), test_pred, label='Test Predicted')
    #     plt.title(f"{feature_col} AutoARIMA Prediction")
    #     plt.legend()
    #     plt.grid(True)
    #     plt.tight_layout()
    #     plt.show()

    return model


In [55]:
# Train one CPU-usage model and one memory-usage model for every
# (workload, hold-out fraction) pair; workloads vary slowest.
for name, df, test_size in (
    (label, frame, fraction)
    for label, frame in configs.items()
    for fraction in test_sizes
):
    print(f"Training Model for {name} - CPU Usage with test size {test_size}")
    model_cpu = train_auto_arima(df, "CPU Usage", test_size=test_size)

    print(f"Training Model for {name} - Memory Usage with test size {test_size}")
    model_mem = train_auto_arima(df, "Memory Usage", test_size=test_size)

    # Blank line between runs keeps the log readable.
    print()

Training Model for Service 1 - CPU Usage with test size 0.3
CPU Usage - Train RMSE: 0.0034, R²: 0.9866
CPU Usage - Test  RMSE: 0.0559, R²: -3.3336
Training Model for Service 1 - Memory Usage with test size 0.3
Memory Usage - Train RMSE: 4787648.0455, R²: 0.8199
Memory Usage - Test  RMSE: 7150341.9013, R²: -0.0850

Training Model for Service 1 - CPU Usage with test size 0.2
CPU Usage - Train RMSE: 0.0032, R²: 0.9867
CPU Usage - Test  RMSE: 0.0384, R²: -3.3780
Training Model for Service 1 - Memory Usage with test size 0.2
Memory Usage - Train RMSE: 4486531.5814, R²: 0.8228
Memory Usage - Test  RMSE: 7924769.1846, R²: -0.0361

Training Model for Service 1 - CPU Usage with test size 0.1
CPU Usage - Train RMSE: 0.0033, R²: 0.9868
CPU Usage - Test  RMSE: 0.0050, R²: 0.6904
Training Model for Service 1 - Memory Usage with test size 0.1
Memory Usage - Train RMSE: 4237992.0142, R²: 0.8280
Memory Usage - Test  RMSE: 9907114.3315, R²: 0.0520

Training Model for Service 2 - CPU Usage with test siz