In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [43]:
# Paths to service1 datasets
cpu_path_s1 = "../../results/prometheus_data/service1_cpu_limit_reduction.csv"
memory_path_s1 = "../../results/prometheus_data/new datasets/service1_memory_limit_reduction.csv"
both_path_s1 = "../../results/prometheus_data/service1_both_limits_reduction.csv"

# Import datasets
df_cpu_s1 = pd.read_csv(cpu_path_s1)
df_memory_s1 = pd.read_csv(memory_path_s1)
df_both_s1 = pd.read_csv(both_path_s1)

df_all_s1 = pd.concat([df_cpu_s1, df_memory_s1, df_both_s1], ignore_index=True)
print(df_all_s1.columns)

Index(['Timestamp', 'Service', 'CPU Request', 'Memory Request', 'CPU Limit',
       'Memory Limit', 'Latency', 'CPU Usage', 'Memory Usage'],
      dtype='object')


In [44]:
# Paths to service1 datasets
cpu_path_s2 = "../../results/prometheus_data/service2_cpu_limit_reduction.csv"
memory_path_s2 = "../../results/prometheus_data/service2_memory_limit_reduction.csv"
both_path_s2 = "../../results/prometheus_data/service2_both_limit_reduction.csv"

# Import datasets
df_cpu_s2 = pd.read_csv(cpu_path_s2)
df_memory_s2 = pd.read_csv(memory_path_s2)
df_both_s2 = pd.read_csv(both_path_s2)

# Combine all three DataFrames
df_all_s2 = pd.concat([df_cpu_s2, df_memory_s2, df_both_s2], ignore_index=True)

In [45]:
# Paths to datasets
cpu_path_hg = "../../results/prometheus_data/hashgen_cpu_limit_reduction.csv"
memory_path_hg = "../../results/prometheus_data/hashgen_memory_limit_reduction.csv"
both_path_hg = "../../results/prometheus_data/hashgen_both_limit_reduction.csv"

# Import datasets
df_cpu_hg = pd.read_csv(cpu_path_hg)
df_memory_hg = pd.read_csv(memory_path_hg)
df_both_hg = pd.read_csv(both_path_hg)

# Combine all three DataFrames
df_all_hg = pd.concat([df_cpu_hg, df_memory_hg, df_both_hg], ignore_index=True)

In [46]:
# Paths to datasets
cpu_path_rp = "../../results/prometheus_data/ranspw_cpu_limit_reduction.csv"
memory_path_rp = "../../results/prometheus_data/randpw_memory_limit_reduction.csv"
both_path_rp = "../../results/prometheus_data/randpw_both_limits_reduction.csv"

# Import datasets
df_cpu_rp = pd.read_csv(cpu_path_rp)
df_memory_rp = pd.read_csv(memory_path_rp)
df_both_rp = pd.read_csv(both_path_rp)

# Combine all three DataFrames
df_all_rp = pd.concat([df_cpu_rp, df_memory_rp, df_both_rp], ignore_index=True)

In [47]:
def evaluate_random_forest(df, target_column, test_size, random_state=42):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Select numeric features only (adjust if you want categorical encoding)
    X = X.select_dtypes(include=['number'])

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(n_estimators=100, random_state=random_state))
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    train_score = r2_score(y_train, y_train_pred)
    test_score = r2_score(y_test, y_test_pred)

    return train_score, test_score

In [48]:
configs = {
    "Service 1": df_all_s1,
    "Service 2": df_all_s2,
    "HashGen": df_all_hg,
    "RandPw": df_all_rp,
}

test_sizes = [0.3, 0.2, 0.1]

for test_size in test_sizes:
    print(f"\nTest size: {test_size}")
    for label, df in configs.items():
        cpu_train_score, cpu_test_score = evaluate_random_forest(df, 'CPU Usage', test_size)
        mem_train_score, mem_test_score = evaluate_random_forest(df, 'Memory Usage', test_size)

        print(f"CPU Usage R² Train - {label}: {cpu_train_score:.4f}")
        print(f"CPU Usage R² Test  - {label}: {cpu_test_score:.4f}")
        print(f"Memory Usage R² Train - {label}: {mem_train_score:.4f}")
        print(f"Memory Usage R² Test  - {label}: {mem_test_score:.4f}")



Test size: 0.3


CPU Usage R² Train - Service 1: 0.9992
CPU Usage R² Test  - Service 1: 0.9963
Memory Usage R² Train - Service 1: 0.9587
Memory Usage R² Test  - Service 1: 0.7567
CPU Usage R² Train - Service 2: 0.9988
CPU Usage R² Test  - Service 2: 0.9920
Memory Usage R² Train - Service 2: 0.9200
Memory Usage R² Test  - Service 2: 0.4245
CPU Usage R² Train - HashGen: 0.9992
CPU Usage R² Test  - HashGen: 0.9943
Memory Usage R² Train - HashGen: 0.9339
Memory Usage R² Test  - HashGen: 0.5385
CPU Usage R² Train - RandPw: 0.9934
CPU Usage R² Test  - RandPw: 0.9644
Memory Usage R² Train - RandPw: 0.9999
Memory Usage R² Test  - RandPw: 0.9993

Test size: 0.2
CPU Usage R² Train - Service 1: 0.9993
CPU Usage R² Test  - Service 1: 0.9961
Memory Usage R² Train - Service 1: 0.9660
Memory Usage R² Test  - Service 1: 0.7416
CPU Usage R² Train - Service 2: 0.9990
CPU Usage R² Test  - Service 2: 0.9935
Memory Usage R² Train - Service 2: 0.9244
Memory Usage R² Test  - Service 2: 0.4674
CPU Usage R² Train - HashGen: 0.

For Memory Usage, the training R² scores are consistently high (above 0.9), showing that the model fits the training data very closely.

However, the test R² scores drop significantly (sometimes below 0.6), indicating the model struggles to generalize to new data.

This large gap between training and test performance is a classic sign of overfitting. The model is too tailored to the training set specifics and cannot accurately predict unseen examples.


In [49]:
from sklearn.model_selection import cross_val_score

def evaluate_random_forest_new(df, target_column, test_size=0.2, random_state=42,
                           max_depth=5, min_samples_split=2, min_samples_leaf=2, cv=5):
    X = df.drop(columns=[target_column])
    y = df[target_column]

    X = X.select_dtypes(include=['number'])

    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state
    )

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', model)
    ])

    # Perform cross-validation on full data for better generalization estimate
    cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2')

    # Also, get train/test split score for comparison
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    train_score = r2_score(y_train, y_train_pred)
    test_score = r2_score(y_test, y_test_pred)

    return {
        "cv_mean_score": cv_scores.mean(),
        "cv_std_score": cv_scores.std(),
        "train_score": train_score,
        "test_score": test_score
    }


In [50]:
configs = {
    "Service 1": df_all_s1,
    "Service 2": df_all_s2,
    "HashGen": df_all_hg,
    "RandPw": df_all_rp,
}

test_sizes = [0.3, 0.2, 0.1]

for test_size in test_sizes:
    print(f"\nTest size: {test_size}")
    for label, df in configs.items():
        cpu_results = evaluate_random_forest_new(df, 'CPU Usage', test_size)
        mem_results = evaluate_random_forest_new(df, 'Memory Usage', test_size)

        print(f"CPU Usage R² Train - {label}: {cpu_results['train_score']:.4f}")
        print(f"CPU Usage R² Test  - {label}: {cpu_results['test_score']:.4f}")
        print(f"CPU Usage CV Mean R² - {label}: {cpu_results['cv_mean_score']:.4f} ± {cpu_results['cv_std_score']:.4f}")

        print(f"Memory Usage R² Train - {label}: {mem_results['train_score']:.4f}")
        print(f"Memory Usage R² Test  - {label}: {mem_results['test_score']:.4f}")
        print(f"Memory Usage CV Mean R² - {label}: {mem_results['cv_mean_score']:.4f} ± {mem_results['cv_std_score']:.4f}")
        print()  # Add a blank line between services for readability



Test size: 0.3
CPU Usage R² Train - Service 1: 0.9646
CPU Usage R² Test  - Service 1: 0.9540
CPU Usage CV Mean R² - Service 1: -12.1674 ± 24.0663
Memory Usage R² Train - Service 1: 0.7689
Memory Usage R² Test  - Service 1: 0.7662
Memory Usage CV Mean R² - Service 1: -3.4206 ± 5.7981

CPU Usage R² Train - Service 2: 0.9863
CPU Usage R² Test  - Service 2: 0.9851
CPU Usage CV Mean R² - Service 2: 0.5980 ± 0.3536
Memory Usage R² Train - Service 2: 0.2458
Memory Usage R² Test  - Service 2: 0.1808
Memory Usage CV Mean R² - Service 2: -0.5560 ± 0.2762

CPU Usage R² Train - HashGen: 0.9858
CPU Usage R² Test  - HashGen: 0.9841
CPU Usage CV Mean R² - HashGen: 0.7958 ± 0.0451
Memory Usage R² Train - HashGen: 0.4242
Memory Usage R² Test  - HashGen: 0.3307
Memory Usage CV Mean R² - HashGen: -0.8824 ± 1.2092

CPU Usage R² Train - RandPw: 0.7550
CPU Usage R² Test  - RandPw: 0.7153
CPU Usage CV Mean R² - RandPw: -0.1028 ± 0.2873
Memory Usage R² Train - RandPw: 0.9427
Memory Usage R² Test  - RandPw: 0

In [51]:
def add_rolling_features(df, window=3):
    df = df.copy()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='mixed')

    df = df.sort_values(['Service', 'Timestamp'])  # Service-wise time sorting
    df.set_index('Timestamp', inplace=True)

    # Rolling averages per service
    for col in ['CPU Usage', 'Memory Usage', 'Latency']:
        df[f'{col}_RollingMean'] = df.groupby('Service')[col].transform(lambda x: x.rolling(window, min_periods=1).mean())
        df[f'{col}_RollingSTD'] = df.groupby('Service')[col].transform(lambda x: x.rolling(window, min_periods=1).std())

    # Spike detection
    df["CPU_Spike"] = df["CPU Usage"] - df["CPU Usage_RollingMean"]
    df["Memory_Spike"] = df["Memory Usage"] - df["Memory Usage_RollingMean"]

    # Latency trend direction
    df["Latency_Trend"] = df.groupby("Service")["Latency"].transform(lambda x: x.diff().fillna(0).apply(lambda y: 1 if y > 0 else (-1 if y < 0 else 0)))

    df.reset_index(inplace=True)  # Reset index to include Timestamp again
    df.dropna(inplace=True)  # Optional: drop rows with NaNs from rolling
    return df

In [52]:
def evaluate_random_forest_with_grid_search(df, target_column, test_size=0.2, random_state=42, cv=5):
    df = add_rolling_features(df)

    X = df.drop(columns=[target_column, 'Timestamp', 'Service'])  # Drop non-numeric/time/grouping columns
    y = df[target_column]

    # Keep only numeric features
    X = X.select_dtypes(include=['number'])

    # Define pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(random_state=random_state))
    ])

    # Grid search parameters
    param_grid = {
        'rf__max_depth': [3, 5, 10],
        'rf__min_samples_split': [2, 5, 10],
        'rf__min_samples_leaf': [1, 2, 4]
    }

    # Grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        scoring='r2',
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    best_pipeline = grid_search.best_estimator_

    # Evaluate
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    best_pipeline.fit(X_train, y_train)

    y_train_pred = best_pipeline.predict(X_train)
    y_test_pred = best_pipeline.predict(X_test)

    train_score = r2_score(y_train, y_train_pred)
    test_score = r2_score(y_test, y_test_pred)

    return {
        "best_params": best_params,
        "cv_best_score": grid_search.best_score_,
        "train_score": train_score,
        "test_score": test_score
    }

In [53]:
test_sizes = [0.3, 0.2, 0.1]

for test_size in test_sizes:
    print(f"\nTest size: {test_size}")
    for label, df in configs.items():
        print(label)
        cpu_results = evaluate_random_forest_with_grid_search(df, 'CPU Usage', test_size)
        mem_results = evaluate_random_forest_with_grid_search(df, 'Memory Usage', test_size)

        print(f"CPU Usage R² Train - {label}: {cpu_results['train_score']:.4f}")
        print(f"CPU Usage R² Test  - {label}: {cpu_results['test_score']:.4f}")
        print(f"CPU Usage CV R² - {label}: {cpu_results['cv_best_score']:.4f}")
        print(f"Best Params - {label}: {cpu_results['best_params']}")


        print(f"Memory Usage R² Train - {label}: {mem_results['train_score']:.4f}")
        print(f"Memory Usage R² Test  - {label}: {mem_results['test_score']:.4f}")
        print(f"Memory Usage CV R² - {label}: {mem_results['cv_best_score']:.4f}")
        print(f"Best Params - {label}: {mem_results['best_params']}")

        print()  # Add a blank line between services for readability


Test size: 0.3
Service 1
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU Usage R² Train - Service 1: 0.9999
CPU Usage R² Test  - Service 1: 0.9998
CPU Usage CV R² - Service 1: 0.9838
Best Params - Service 1: {'rf__max_depth': 10, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 5}
Memory Usage R² Train - Service 1: 0.9895
Memory Usage R² Test  - Service 1: 0.5835
Memory Usage CV R² - Service 1: 0.8593
Best Params - Service 1: {'rf__max_depth': 10, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 10}

Service 2
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU Usage R² Train - Service 2: 1.0000
CPU Usage R² Test  - Service 2: 0.9999
CPU Usage CV R² - Service 2: 0.9763
Best Params - Service 2: {'rf__max_depth': 10, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2}
Memory Usage R² Train - Service 2: 0.9990
Memory Usage R²

Training with
Test Size = 0.1
Max Depth = 10
Min Samples Leaf = 2
Min Samples Split = 10
{'max_depth': 10, 'min_samples_leaf':2, 'min_samples_split':2}


In [57]:
def train_fixed_random_forest(df, target_column, test_size=0.1, random_state=42, cv=5):
    df = add_rolling_features(df)

    X = df.drop(columns=[target_column, 'Timestamp', 'Service'], errors='ignore')
    y = df[target_column]
    X = X.select_dtypes(include=['number'])

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(
            max_depth=10,
            min_samples_leaf=2,
            min_samples_split=2,
            random_state=random_state
        ))
    ])

    # Cross-validation (before train-test split)
    cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='r2', n_jobs=-1)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    pipeline.fit(X_train, y_train)

    # Predictions and R²
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    train_score = r2_score(y_train, y_train_pred)
    test_score = r2_score(y_test, y_test_pred)

    return {
        "pipeline": pipeline,
        "train_score": train_score,
        "test_score": test_score,
        "cv_mean_score": cv_mean,
        "cv_std_score": cv_std
    }

In [58]:
for label, df in configs.items():
    print(label)
    cpu_results = train_fixed_random_forest(df, 'CPU Usage', 0.1)
    mem_results = train_fixed_random_forest(df, 'Memory Usage', 0.1)

    print(f"CPU Usage R² Train - {label}: {cpu_results['train_score']:.4f}")
    print(f"CPU Usage R² Test  - {label}: {cpu_results['test_score']:.4f}")
    print(f"CPU Usage CV Mean R² - {label}: {cpu_results['cv_mean_score']:.4f} ± {cpu_results['cv_std_score']:.4f}")

    print(f"Memory Usage R² Train - {label}: {mem_results['train_score']:.4f}")
    print(f"Memory Usage R² Test  - {label}: {mem_results['test_score']:.4f}")
    print(f"Memory Usage CV Mean R² - {label}: {mem_results['cv_mean_score']:.4f} ± {mem_results['cv_std_score']:.4f}")
    print()


Service 1
CPU Usage R² Train - Service 1: 0.9999
CPU Usage R² Test  - Service 1: 0.9999
CPU Usage CV Mean R² - Service 1: 0.9835 ± 0.0175
Memory Usage R² Train - Service 1: 0.9935
Memory Usage R² Test  - Service 1: 0.9965
Memory Usage CV Mean R² - Service 1: 0.8474 ± 0.1926

Service 2
CPU Usage R² Train - Service 2: 1.0000
CPU Usage R² Test  - Service 2: 0.9999
CPU Usage CV Mean R² - Service 2: 0.9763 ± 0.0461
Memory Usage R² Train - Service 2: 0.9987
Memory Usage R² Test  - Service 2: 0.9953
Memory Usage CV Mean R² - Service 2: 0.9930 ± 0.0037

HashGen
CPU Usage R² Train - HashGen: 0.9999
CPU Usage R² Test  - HashGen: 0.9998
CPU Usage CV Mean R² - HashGen: 0.9990 ± 0.0003
Memory Usage R² Train - HashGen: 0.9988
Memory Usage R² Test  - HashGen: 0.9979
Memory Usage CV Mean R² - HashGen: 0.9779 ± 0.0167

RandPw
CPU Usage R² Train - RandPw: 0.9990
CPU Usage R² Test  - RandPw: 0.9965
CPU Usage CV Mean R² - RandPw: 0.9767 ± 0.0333
Memory Usage R² Train - RandPw: 0.9985
Memory Usage R² Test 