In [17]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.inspection import permutation_importance

# Regression

In [18]:
# Load the preprocessed dataset from the project's data directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index -- confirm whether it
# should be read with index_col=0 instead.

file_path = os.path.join("data", "processed_data.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the number of rows, then show the first rows as the cell output.
print(df.shape[0])
df.head(n=5)

364


Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF,Class,Service Name_0,Service Name_1,Service Name_2,Service Name_3
0,-0.562485,-0.16406,-0.211258,-0.682793,1.704262,0.607503,1.607536,-0.504205,0.288022,2.963956,1,-0.594273,-0.749821,-0.915633,-0.910574
1,-0.530722,0.753015,1.372452,1.114427,1.142165,-0.648275,-0.154461,-0.477097,1.058299,2.338,1,-0.594273,-0.749821,-0.915633,-0.910574
2,-0.476992,0.753015,2.920968,0.87795,1.260253,1.86328,0.96681,-0.418147,0.315532,2.069733,1,-0.594273,-0.749821,-0.915633,-0.910574
3,-0.5328,0.753015,-0.246452,0.87795,0.849309,1.86328,-0.955369,-0.478894,1.195848,2.069733,1,-0.594273,-0.749821,-0.915633,-0.910574
4,-0.491004,0.753015,2.005936,0.736065,1.45864,-0.648275,0.486265,-0.426246,1.195848,2.069733,1,-0.594273,-0.749821,-0.915633,-0.910574


- Predicting Latency from Response Time

In [19]:

# Candidate regressors compared in the cells below, keyed by a short name.
# Every estimator with a stochastic component gets a fixed seed so that
# "Restart Kernel -> Run All" reproduces the same scores.
regressors = {
    'lin_reg': LinearRegression(),
    # Decision trees permute the features at random at each split, so ties
    # between equally good splits depend on the RNG; seed it like the forest.
    'tree_reg': DecisionTreeRegressor(random_state=42),
    'rf_reg': RandomForestRegressor(random_state=42),
    'svr': SVR()
}

# Hyperparameter search spaces, keyed by the same names as `regressors`.
# Only the models listed here are tuned with RandomizedSearchCV; a model with
# no entry is evaluated with the settings it was constructed with.
param_distributions = {
    'rf_reg': {
        'n_estimators': np.arange(50, 300, 50),   # 50, 100, 150, 200, 250
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'svr': {
        'C': np.logspace(-2, 2, 5),               # 0.01, 0.1, 1, 10, 100
        'epsilon': [0.01, 0.1, 0.5, 1],
        'kernel': ['linear', 'rbf']
    }
}


In [20]:
# Single-feature regression: predict `target_col` from `feature_col`.
feature_col, target_col = 'Response Time', 'Latency'

# Hold out 15% of the rows as the final test set; the remaining rows are used
# for cross-validated model selection.
X_rest, X_test, y_rest, y_test = train_test_split(
    df[feature_col], df[target_col], test_size=0.15, random_state=42
)
X_rest = X_rest.to_frame()  # Estimators expect a 2D feature matrix

k_folds = KFold(n_splits=5)
reg_scores = {}   # model name -> mean cross-validated R²
best_models = {}  # model name -> estimator fitted on all of X_rest

# Score every candidate with the same 5-fold split.
for name, reg in regressors.items():
    if name in param_distributions:
        # Tuned models: sample 10 hyperparameter settings and keep the best.
        search = RandomizedSearchCV(
            reg,
            param_distributions=param_distributions[name],
            n_iter=10,
            scoring='r2',
            cv=k_folds,
            n_jobs=-1,
            random_state=42
        )
        search.fit(X_rest, y_rest)
        # With the default refit=True, best_estimator_ is a separate estimator
        # that has already been refitted on all of X_rest.
        best_models[name] = search.best_estimator_
        reg_scores[name] = search.best_score_
        print(f"{name}: Best CV R²={search.best_score_:.4f}, Params={search.best_params_}")
    else:
        # Untuned models: plain cross-validation. Fit a clone rather than the
        # shared instance in `regressors`, so that re-using `regressors` for
        # another target cannot overwrite the model stored here.
        model = clone(reg)
        scores = cross_val_score(model, X_rest, y_rest, cv=k_folds, scoring='r2')
        reg_scores[name] = scores.mean()
        best_models[name] = model.fit(X_rest, y_rest)
        print(f"{name}: CV R² mean={scores.mean():.4f}")

# Select the model with the highest cross-validated R². It is already fitted
# on the full training portion (see both branches above), so no extra fit is
# needed before predicting.
best_reg_name = max(reg_scores, key=reg_scores.get)
final_model = best_models[best_reg_name]

# Predict on the held-out test set (converted to 2D like X_rest)
y_pred = final_model.predict(X_test.to_frame())

# Evaluation metrics on the held-out test set
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"\nBest model: {best_reg_name}")
print(f"Test R²: {r2:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

lin_reg: CV R² mean=0.8231
tree_reg: CV R² mean=0.7735
rf_reg: Best CV R²=0.7696, Params={'n_estimators': np.int64(150), 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': None}
svr: Best CV R²=0.8093, Params={'kernel': 'linear', 'epsilon': 0.5, 'C': np.float64(0.1)}

Best model: lin_reg
Test R²: 0.9616
Test RMSE: 0.2832
Test MAE: 0.1380


- Predicting Availability from Successability

In [21]:
# Single-feature regression: predict `target_col` from `feature_col`.
feature_col, target_col = 'Successability', 'Availability'

# Hold out 15% of the rows as the final test set; the remaining rows are used
# for cross-validated model selection.
X_rest, X_test, y_rest, y_test = train_test_split(
    df[feature_col], df[target_col], test_size=0.15, random_state=42
)
X_rest = X_rest.to_frame()  # Estimators expect a 2D feature matrix

k_folds = KFold(n_splits=5)
reg_scores = {}   # model name -> mean cross-validated R²
best_models = {}  # model name -> estimator fitted on all of X_rest

# Score every candidate with the same 5-fold split.
for name, reg in regressors.items():
    if name in param_distributions:
        # Tuned models: sample 10 hyperparameter settings and keep the best.
        search = RandomizedSearchCV(
            reg,
            param_distributions=param_distributions[name],
            n_iter=10,
            scoring='r2',
            cv=k_folds,
            n_jobs=-1,
            random_state=42
        )
        search.fit(X_rest, y_rest)
        # With the default refit=True, best_estimator_ is a separate estimator
        # that has already been refitted on all of X_rest.
        best_models[name] = search.best_estimator_
        reg_scores[name] = search.best_score_
        print(f"{name}: Best CV R²={search.best_score_:.4f}, Params={search.best_params_}")
    else:
        # Untuned models: plain cross-validation. Fit a clone rather than the
        # shared instance in `regressors`, so this cell neither depends on nor
        # overwrites a fit performed on that instance by another cell.
        model = clone(reg)
        scores = cross_val_score(model, X_rest, y_rest, cv=k_folds, scoring='r2')
        reg_scores[name] = scores.mean()
        best_models[name] = model.fit(X_rest, y_rest)
        print(f"{name}: CV R² mean={scores.mean():.4f}")

# Select the model with the highest cross-validated R². It is already fitted
# on the full training portion (see both branches above), so no extra fit is
# needed before predicting.
best_reg_name = max(reg_scores, key=reg_scores.get)
final_model = best_models[best_reg_name]

# Predict on the held-out test set (converted to 2D like X_rest)
y_pred = final_model.predict(X_test.to_frame())

# Evaluation metrics on the held-out test set
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"\nBest model: {best_reg_name}")
print(f"Test R²: {r2:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

lin_reg: CV R² mean=0.5380
tree_reg: CV R² mean=0.6257
rf_reg: Best CV R²=0.6465, Params={'n_estimators': np.int64(50), 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}
svr: Best CV R²=0.5763, Params={'kernel': 'rbf', 'epsilon': 0.5, 'C': np.float64(100.0)}

Best model: rf_reg
Test R²: 0.7082
Test RMSE: 0.6351
Test MAE: 0.4009
