In [107]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score, classification_report

#import matplotlib.pyplot as plt
#import seaborn as sns

# Regression

In [89]:
# Load data

file_path = "processed_data.csv"

df = pd.read_csv(file_path)  

print(len(df))
df.head()

364


Unnamed: 0,Response Time,Availability,Throughput,Successability,Reliability,Compliance,Best Practices,Latency,Documentation,WsRF,Class,Service Name_0,Service Name_1,Service Name_2,Service Name_3,Service Name_4,Service Name_5,Service Name_6,Service Name_7,Service Name_8
0,-0.562485,-0.16406,-0.211258,-0.682793,1.704262,0.607503,1.607536,-0.504205,0.288022,2.963956,1,-0.594273,-0.749821,-0.915633,-0.910574,-0.967559,-0.989071,-0.98365,-1.01105,0.989071
1,-0.530722,0.753015,1.372452,1.114427,1.142165,-0.648275,-0.154461,-0.477097,1.058299,2.338,1,-0.594273,-0.749821,-0.915633,-0.910574,-0.967559,-0.989071,-0.98365,0.989071,-1.01105
2,-0.476992,0.753015,2.920968,0.87795,1.260253,1.86328,0.96681,-0.418147,0.315532,2.069733,1,-0.594273,-0.749821,-0.915633,-0.910574,-0.967559,-0.989071,-0.98365,0.989071,0.989071
3,-0.5328,0.753015,-0.246452,0.87795,0.849309,1.86328,-0.955369,-0.478894,1.195848,2.069733,1,-0.594273,-0.749821,-0.915633,-0.910574,-0.967559,-0.989071,1.016622,-1.01105,-1.01105
4,-0.491004,0.753015,2.005936,0.736065,1.45864,-0.648275,0.486265,-0.426246,1.195848,2.069733,1,-0.594273,-0.749821,-0.915633,-0.910574,-0.967559,-0.989071,1.016622,-1.01105,0.989071


- On Response Time / Latency

In [115]:

regressors = {
    'lin_reg': LinearRegression(),
    'tree_reg': DecisionTreeRegressor(),
    'rf_reg': RandomForestRegressor(random_state=42),
    'svr': SVR()
}

# Parameter distributions for RandomForest and SVR
param_distributions = {
    'rf_reg': {
        'n_estimators': np.arange(50, 300, 50),
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'svr': {
        'C': np.logspace(-2, 2, 5),
        'epsilon': [0.01, 0.1, 0.5, 1],
        'kernel': ['linear', 'rbf']
    }
}


In [116]:
X_rest, X_test, y_rest, y_test = train_test_split(
    df['Response Time'], df['Latency'], test_size=0.15, random_state=42
)
X_rest = X_rest.to_frame()  # Ensure X is 2D

k_folds = KFold(n_splits=5)
reg_scores = {}
best_models = {}

# Loop through regressors
for name, reg in regressors.items():
    if name in param_distributions:
        # Apply RandomizedSearchCV for models with hyperparameters
        search = RandomizedSearchCV(
            reg,
            param_distributions=param_distributions[name],
            n_iter=10,
            scoring='r2',
            cv=k_folds,
            n_jobs=-1,
            random_state=42
        )
        search.fit(X_rest, y_rest)
        best_models[name] = search.best_estimator_
        reg_scores[name] = search.best_score_
        print(f"{name}: Best CV R²={search.best_score_:.4f}, Params={search.best_params_}")
    else:
        # Simple cross-validation for models without tuning
        scores = cross_val_score(reg, X_rest, y_rest, cv=k_folds, scoring='r2')
        reg_scores[name] = scores.mean()
        reg.fit(X_rest, y_rest)
        best_models[name] = reg
        print(f"{name}: CV R² mean={scores.mean():.4f}")

# Select best model
best_reg_name = max(reg_scores, key=reg_scores.get)
final_model = best_models[best_reg_name]

# Train on full training set
final_model.fit(X_rest, y_rest)

# Predict on test set
y_pred = final_model.predict(X_test.to_frame())

# Evaluation metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"\nBest model: {best_reg_name}")
print(f"Test R²: {r2:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

lin_reg: CV R² mean=0.8231
tree_reg: CV R² mean=0.7735
rf_reg: Best CV R²=0.7696, Params={'n_estimators': np.int64(150), 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': None}
svr: Best CV R²=0.8093, Params={'kernel': 'linear', 'epsilon': 0.5, 'C': np.float64(0.1)}

Best model: lin_reg
Test R²: 0.9616
Test RMSE: 0.2832
Test MAE: 0.1380


- On Successability/Availability

In [117]:
X_rest, X_test, y_rest, y_test = train_test_split(
    df['Successability'], df['Availability'], test_size=0.15, random_state=42
)
X_rest = X_rest.to_frame()  # Ensure X is 2D

k_folds = KFold(n_splits=5)
reg_scores = {}
best_models = {}

# Loop through regressors
for name, reg in regressors.items():
    if name in param_distributions:
        # Apply RandomizedSearchCV for models with hyperparameters
        search = RandomizedSearchCV(
            reg,
            param_distributions=param_distributions[name],
            n_iter=10,
            scoring='r2',
            cv=k_folds,
            n_jobs=-1,
            random_state=42
        )
        search.fit(X_rest, y_rest)
        best_models[name] = search.best_estimator_
        reg_scores[name] = search.best_score_
        print(f"{name}: Best CV R²={search.best_score_:.4f}, Params={search.best_params_}")
    else:
        # Simple cross-validation for models without tuning
        scores = cross_val_score(reg, X_rest, y_rest, cv=k_folds, scoring='r2')
        reg_scores[name] = scores.mean()
        reg.fit(X_rest, y_rest)
        best_models[name] = reg
        print(f"{name}: CV R² mean={scores.mean():.4f}")

# Select best model
best_reg_name = max(reg_scores, key=reg_scores.get)
final_model = best_models[best_reg_name]

# Train on full training set
final_model.fit(X_rest, y_rest)

# Predict on test set
y_pred = final_model.predict(X_test.to_frame())

# Evaluation metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print(f"\nBest model: {best_reg_name}")
print(f"Test R²: {r2:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

lin_reg: CV R² mean=0.5380
tree_reg: CV R² mean=0.6257
rf_reg: Best CV R²=0.6465, Params={'n_estimators': np.int64(50), 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}
svr: Best CV R²=0.5763, Params={'kernel': 'rbf', 'epsilon': 0.5, 'C': np.float64(100.0)}

Best model: rf_reg
Test R²: 0.7082
Test RMSE: 0.6351
Test MAE: 0.4009


# Multi-class classification
- With dimentionality reduction

In [33]:
# Load data

file_path = "data_umap.csv"

df = pd.read_csv(file_path)  

print(len(df))
df.head()

364


Unnamed: 0,UMAP1,UMAP2,label
0,14.672715,-12.427366,1
1,14.359391,-12.942661,1
2,14.858253,-11.748547,1
3,14.077615,-12.789752,1
4,14.808619,-12.802444,1


In [None]:
X = df[['UMAP1','UMAP2']]
Y = df['label']

In [86]:
models = {
    'lr': LogisticRegression(max_iter=1000, random_state=42),
    'dt': DecisionTreeClassifier(),
    'rf': RandomForestClassifier(max_depth=2, random_state=42),
    'nb': GaussianNB(),
    'svm': svm.SVC(),
    'knc': KNeighborsClassifier(n_neighbors=3),
    'xgb': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

param_grids = {
    'lr': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'dt': {
        'max_depth': [None, 3, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'rf': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 5, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'svm': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'knc': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    },
    'xgb': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
}

k_folds = KFold(n_splits=5)
model_scores = {}


In [123]:
X_rest, X_test, Y_rest, Y_test = train_test_split(X,Y, test_size=0.15, random_state=42)

encoder = LabelEncoder()
Y_rest = encoder.fit_transform(Y_rest)
Y_test = encoder.transform(Y_test)

k_folds = KFold(n_splits = 5)

model_scores={}

model_scores = {}

for name, model in models.items():
    if name in param_grids:
        # Use GridSearchCV or RandomizedSearchCV
        search = RandomizedSearchCV(model, param_grids[name], cv=k_folds, scoring='accuracy', n_jobs=-1)
        search.fit(X_rest if name in ['lr', 'svm', 'knc'] else X_rest, Y_rest)
        best_score = search.best_score_
        best_model = search.best_estimator_
        print(f"{name}: Best Score={best_score:.4f}, Best Params={search.best_params_}")
    else:
        scores = cross_val_score(model, X_rest, Y_rest, cv=k_folds, scoring='accuracy')
        best_score = scores.mean()
        best_model = model.fit(X_rest, Y_rest)
        print(f"{name}: Score={best_score:.4f}")
    
    model_scores[name] = (best_score, best_model)

# Select best model
best_model_name = max(model_scores, key=lambda k: model_scores[k][0])
best_model = model_scores[best_model_name][1]
print(f"\nBest model: {best_model_name} with score {model_scores[best_model_name][0]:.4f}")



lr: Best Score=0.8865, Best Params={'solver': 'lbfgs', 'C': 1}
dt: Best Score=0.9060, Best Params={'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 5}
rf: Best Score=0.9059, Best Params={'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 10}
nb: Score=0.8897
svm: Best Score=0.8994, Best Params={'kernel': 'rbf', 'gamma': 'auto', 'C': 10}
knc: Best Score=0.8962, Best Params={'weights': 'uniform', 'n_neighbors': 9}




xgb: Best Score=0.8994, Best Params={'subsample': 0.8, 'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1.0}

Best model: dt with score 0.9060


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


- RandomizedSearchCV
Best Model: random forest with score 0.9026


- GridSearchCV
Best Model: decision tree with score 0.9092

In [87]:
best_model = search.best_estimator_  # from GridSearchCV or RandomizedSearchCV

best_model.fit(X_rest, Y_rest)

y_pred = best_model.predict(X_test)

print('Best Model ', best_model_name)
print(f"Test Accuracy: {accuracy_score(Y_test, y_pred):.4f}")
print(classification_report(Y_test, y_pred))

Best Model  df
Test Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.67      0.89      0.76         9
           1       0.80      0.50      0.62        16
           2       0.74      0.93      0.82        15
           3       1.00      0.93      0.97        15

    accuracy                           0.80        55
   macro avg       0.80      0.81      0.79        55
weighted avg       0.82      0.80      0.79        55



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


- Feature Importance

In [None]:
from sklearn.inspection import permutation_importance

result = permutation_importance(best_model, X_test, Y_test, n_repeats=10, random_state=42)
print(result.importances_mean)

#corr = pd.DataFrame(X_original).corrwith(pd.Series(X_umap[:,0]), method='spearman')
#print(corr.sort_values(ascending=False))


[0.20181818 0.43090909]
