In [6]:
import pandas as pd
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import warnings
from sklearn.model_selection import TimeSeriesSplit
warnings.filterwarnings("ignore", category=FutureWarning)


In [7]:
def _load_limit_experiments(cpu_path, memory_path, both_path):
    """Read one service's three limit-reduction CSVs and stack them.

    Returns a 4-tuple ``(df_cpu, df_memory, df_both, df_all)`` where
    ``df_all`` is the row-wise concatenation of the three frames, in that
    order, with a fresh 0..n-1 index.
    """
    df_cpu = pd.read_csv(cpu_path)
    df_memory = pd.read_csv(memory_path)
    df_both = pd.read_csv(both_path)
    df_all = pd.concat([df_cpu, df_memory, df_both], ignore_index=True)
    return df_cpu, df_memory, df_both, df_all


# --- Service 1 ---------------------------------------------------------------
# NOTE(review): only this memory file lives under "new datasets/" — confirm
# that the path is intentional.
cpu_path_s1 = "../../results/prometheus_data/service1_cpu_limit_reduction.csv"
memory_path_s1 = "../../results/prometheus_data/new datasets/service1_memory_limit_reduction.csv"
both_path_s1 = "../../results/prometheus_data/service1_both_limits_reduction.csv"

df_cpu_s1, df_memory_s1, df_both_s1, df_all_s1 = _load_limit_experiments(
    cpu_path_s1, memory_path_s1, both_path_s1
)
print(df_all_s1.columns)

# --- Service 2 ---------------------------------------------------------------
# NOTE(review): file names alternate between "both_limit" and "both_limits"
# across services — verify against the files on disk.
cpu_path_s2 = "../../results/prometheus_data/service2_cpu_limit_reduction.csv"
memory_path_s2 = "../../results/prometheus_data/service2_memory_limit_reduction.csv"
both_path_s2 = "../../results/prometheus_data/service2_both_limit_reduction.csv"

df_cpu_s2, df_memory_s2, df_both_s2, df_all_s2 = _load_limit_experiments(
    cpu_path_s2, memory_path_s2, both_path_s2
)

# --- HashGen -----------------------------------------------------------------
cpu_path_hg = "../../results/prometheus_data/hashgen_cpu_limit_reduction.csv"
memory_path_hg = "../../results/prometheus_data/hashgen_memory_limit_reduction.csv"
both_path_hg = "../../results/prometheus_data/hashgen_both_limit_reduction.csv"

df_cpu_hg, df_memory_hg, df_both_hg, df_all_hg = _load_limit_experiments(
    cpu_path_hg, memory_path_hg, both_path_hg
)

# --- RandPw ------------------------------------------------------------------
# NOTE(review): the CPU file is spelled "ranspw" while the other two use
# "randpw" — possibly a typo in the file name; confirm before renaming.
cpu_path_rp = "../../results/prometheus_data/ranspw_cpu_limit_reduction.csv"
memory_path_rp = "../../results/prometheus_data/randpw_memory_limit_reduction.csv"
both_path_rp = "../../results/prometheus_data/randpw_both_limits_reduction.csv"

df_cpu_rp, df_memory_rp, df_both_rp, df_all_rp = _load_limit_experiments(
    cpu_path_rp, memory_path_rp, both_path_rp
)

Index(['Timestamp', 'Service', 'CPU Request', 'Memory Request', 'CPU Limit',
       'Memory Limit', 'Latency', 'CPU Usage', 'Memory Usage'],
      dtype='object')


In [8]:
# Combined per-service frames, keyed by the label that appears in the
# training log. Insertion order determines the order in which services run.
configs = dict(
    [
        ("Service 1", df_all_s1),
        ("Service 2", df_all_s2),
        ("HashGen", df_all_hg),
        ("RandPw", df_all_rp),
    ]
)

# Fractions of each series held out as the test tail, tried in this order.
test_sizes = [0.3, 0.2, 0.1]

In [9]:
def train_auto_arima(df, feature_col, test_size, plot=False, seasonal=False, m=1):
    """Fit an auto-ARIMA model on the head of one column and score it on the tail.

    The rows are ordered by ``Timestamp`` when that column exists, the values
    of ``feature_col`` are split chronologically into a leading training part
    and a trailing test part, and a model is selected with a stepwise
    ``auto_arima`` search ranked by AIC. RMSE and R² are printed for both the
    in-sample predictions and the out-of-sample forecast.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is never modified by this function.
    feature_col : str
        Column to model. If the name contains ``"Memory"`` the values are
        divided by ``1024 * 1024`` before fitting (presumably bytes -> MiB;
        confirm the unit of the raw column).
    test_size : float
        Fraction of the series held out at the end; must satisfy
        ``0 < test_size < 1``.
    plot : bool, default False
        When true, draw train/test values together with their predictions.
    seasonal : bool, default False
        Forwarded to ``auto_arima``.
    m : int, default 1
        Seasonal period forwarded to ``auto_arima``.

    Returns
    -------
    The fitted model object returned by ``auto_arima``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``(0, 1)`` or the split would leave the
        training or the test part empty.
    """
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be strictly between 0 and 1, got {test_size!r}"
        )

    if 'Timestamp' in df.columns:
        # sort_values returns a new frame, so the caller's row order is kept.
        # NOTE(review): the ordering follows the column's dtype; if timestamps
        # are read as plain strings this is a lexicographic sort — confirm.
        df = df.sort_values('Timestamp')

    series = df[feature_col].values
    if "Memory" in feature_col:
        # Rescale the extracted array rather than writing back into ``df``:
        # assigning to the frame would alter the caller's data whenever no
        # 'Timestamp' sort produced a copy, and repeated calls would then
        # divide the same column again and again.
        series = series / (1024 * 1024)

    split_idx = int(len(series) * (1 - test_size))
    if split_idx <= 0 or split_idx >= len(series):
        raise ValueError(
            f"Cannot split {len(series)} observations with test_size={test_size!r}: "
            "both the training and the test part need at least one value"
        )
    train, test = series[:split_idx], series[split_idx:]

    # NOTE(review): ``boxcox`` does not appear among auto_arima's documented
    # keyword arguments, so it is presumably forwarded as an extra fit argument
    # and may have no effect — confirm. It is kept to preserve current behaviour.
    model = auto_arima(
        train,
        seasonal=seasonal,
        m=m,
        max_p=5,
        max_q=5,
        max_d=2,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
        information_criterion='aic',
        boxcox=True,
    )

    train_pred = model.predict_in_sample()
    test_pred = model.predict(n_periods=len(test))

    train_rmse = np.sqrt(mean_squared_error(train, train_pred))
    train_r2 = r2_score(train, train_pred)
    test_rmse = np.sqrt(mean_squared_error(test, test_pred))
    test_r2 = r2_score(test, test_pred)

    print(f"\nTest size: {test_size*100:.0f}%")
    print(f"{feature_col} - Train RMSE: {train_rmse:.4f}, R²: {train_r2:.4f}")
    print(f"{feature_col} - Test  RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")

    if plot:
        train_x = np.arange(len(train))
        test_x = np.arange(len(train), len(series))

        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(train_x, train, label='Train')
        ax.plot(train_x, train_pred, label='Train Predicted')
        ax.plot(test_x, test, label='Test')
        ax.plot(test_x, test_pred, label='Test Predicted')
        ax.set_title(f"{feature_col} AutoARIMA Prediction (Test Size: {test_size*100:.0f}%)")
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        plt.show()

    # Hand the fitted model back so callers that store the result
    # (e.g. ``model_cpu = train_auto_arima(...)``) receive something usable.
    return model


In [10]:
# Walk the full (service, hold-out fraction) grid in a single loop; services
# vary slowest, so the log is grouped per service exactly as with nested loops.
runs = (
    (name, df, test_size)
    for name, df in configs.items()
    for test_size in test_sizes
)

for name, df, test_size in runs:
    # model_cpu / model_mem hold whatever train_auto_arima returned for the
    # most recent run only; each iteration overwrites the previous value.
    print(f"Training Model for {name} - CPU Usage with test size {test_size}")
    model_cpu = train_auto_arima(df, "CPU Usage", test_size)

    print(f"Training Model for {name} - Memory Usage with test size {test_size}")
    model_mem = train_auto_arima(df, "Memory Usage", test_size)

    # Blank line between runs keeps the log readable.
    print()

Training Model for Service 1 - CPU Usage with test size 0.3

Test size: 30%
CPU Usage - Train RMSE: 0.0034, R²: 0.9866
CPU Usage - Test  RMSE: 0.0559, R²: -3.3336
Training Model for Service 1 - Memory Usage with test size 0.3

Test size: 30%
Memory Usage - Train RMSE: 4.5158, R²: 0.8238
Memory Usage - Test  RMSE: 7.3307, R²: -0.2539

Training Model for Service 1 - CPU Usage with test size 0.2

Test size: 20%
CPU Usage - Train RMSE: 0.0032, R²: 0.9867
CPU Usage - Test  RMSE: 0.0384, R²: -3.3780
Training Model for Service 1 - Memory Usage with test size 0.2

Test size: 20%
Memory Usage - Train RMSE: 4.2309, R²: 0.8267
Memory Usage - Test  RMSE: 9.0655, R²: -0.4908

Training Model for Service 1 - CPU Usage with test size 0.1

Test size: 10%
CPU Usage - Train RMSE: 0.0033, R²: 0.9869
CPU Usage - Test  RMSE: 0.0044, R²: 0.7589
Training Model for Service 1 - Memory Usage with test size 0.1

Test size: 10%
Memory Usage - Train RMSE: 4.0055, R²: 0.8311
Memory Usage - Test  RMSE: 10.3196, R²: -

  return np.roots(self.polynomial_reduced_ar)**-1



Test size: 30%
CPU Usage - Train RMSE: 0.0036, R²: 0.9851
CPU Usage - Test  RMSE: 0.1920, R²: -72.5929
Training Model for Service 2 - Memory Usage with test size 0.3

Test size: 30%
Memory Usage - Train RMSE: 0.2369, R²: 0.2432
Memory Usage - Test  RMSE: 0.2502, R²: -0.0100

Training Model for Service 2 - CPU Usage with test size 0.2


  return np.roots(self.polynomial_reduced_ar)**-1



Test size: 20%
CPU Usage - Train RMSE: 0.0034, R²: 0.9851
CPU Usage - Test  RMSE: 0.0285, R²: -2.4273
Training Model for Service 2 - Memory Usage with test size 0.2

Test size: 20%
Memory Usage - Train RMSE: 0.2295, R²: 0.2998
Memory Usage - Test  RMSE: 0.2864, R²: -0.3692

Training Model for Service 2 - CPU Usage with test size 0.1


  return np.roots(self.polynomial_reduced_ar)**-1



Test size: 10%
CPU Usage - Train RMSE: 0.0032, R²: 0.9866
CPU Usage - Test  RMSE: 0.0169, R²: -3.8522
Training Model for Service 2 - Memory Usage with test size 0.1

Test size: 10%
Memory Usage - Train RMSE: 0.2249, R²: 0.3263
Memory Usage - Test  RMSE: 0.2357, R²: -0.1839

Training Model for HashGen - CPU Usage with test size 0.3

Test size: 30%
CPU Usage - Train RMSE: 0.0004, R²: 0.9985
CPU Usage - Test  RMSE: 0.0198, R²: -1.5941
Training Model for HashGen - Memory Usage with test size 0.3

Test size: 30%
Memory Usage - Train RMSE: 10.5485, R²: 0.6688
Memory Usage - Test  RMSE: 24.7381, R²: -0.0984

Training Model for HashGen - CPU Usage with test size 0.2

Test size: 20%
CPU Usage - Train RMSE: 0.0004, R²: 0.9984
CPU Usage - Test  RMSE: 0.0154, R²: -0.3764
Training Model for HashGen - Memory Usage with test size 0.2

Test size: 20%
Memory Usage - Train RMSE: 10.1833, R²: 0.6692
Memory Usage - Test  RMSE: 27.3470, R²: -0.0331

Training Model for HashGen - CPU Usage with test size 0.