<a href="https://colab.research.google.com/github/cepdnaclk/e19-4yp-AI-Dirven-Latency-Constrained-Resource-Management-In-Kubernetes/blob/main/Latency_Models/xg/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Service 1

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import optuna

# Load and preprocess data
df = pd.read_csv("/content/service-1-deployment_dataset.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format = 'mixed')
df = df.sort_values("Timestamp")

# Feature engineering
df["cpu_allocated"] = df["CPU Request"]
df["memory_allocated"] = df["Memory Request"]
df["cpu_usage_pct"] = df["CPU Usage"] / df["CPU Limit"]
df["memory_usage_pct"] = df["Memory Usage"] / df["Memory Limit"]
df["request_rate_rps"] = df["Request Rate"]
df["latency_p95_t"] = df["Latency"].shift(1)
df["burstiness_score"] = df["Request Rate"].rolling(5).max() - df["Request Rate"].rolling(5).mean()

# Drop rows with NaNs from shifting
df = df.dropna()

# Define features and target
features = [
    "cpu_allocated", "memory_allocated",
    "cpu_usage_pct", "memory_usage_pct",
    "request_rate_rps", "latency_p95_t",
    "burstiness_score"
]
X = df[features]
y = df["Latency"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Optuna objective function
def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-4, 1.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
        "objective": "reg:squarederror",
        "verbosity": 0,
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=params["n_estimators"]
    )

    preds = model.predict(dtest)
    mae = mean_absolute_error(y_test, preds)
    return mae

# Run Optuna tuning
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

# Best model
best_params = study.best_params
best_params["objective"] = "reg:squarederror"
best_params["verbosity"] = 0

# ✅ Train base model on initial batch
batch_size = 100
model = None
for i in range(0, len(X_train), batch_size):
    X_batch = X_train.iloc[i:i + batch_size]
    y_batch = y_train.iloc[i:i + batch_size]
    dtrain = xgb.DMatrix(X_batch, label=y_batch)

    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=best_params["n_estimators"] // 5,
        xgb_model=model,  # warm-start
        #process_type="update",
        #updater="refresh"
    )

# Evaluate on test set
dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Final MAE:", round(mae, 4))
print("Final R² Score:", round(r2, 4))
print("Best Parameters:", best_params)


[I 2025-06-30 05:28:26,463] A new study created in memory with name: no-name-3ac17339-ad84-4edd-b468-6fa94ff201a1
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.6, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-4, 1.0),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-4, 1.0),
[I 2025-06-30 05:28:26,532] Trial 0 finished with value: 8.802447880412743e-06 and parameters: {'max_depth': 4, 'learning_rate': 0.10992905205027517, 'n_estimators': 76, 'subsample': 0.9330886997874446, 'colsample_bytree': 0.7950763579175599, 'reg_alpha': 0.003076691921828455, 'reg_lambda': 0.07500021586378287}. Best is trial 0 with value: 8.802447880412743e-06.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
  "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
  "colsample_bytree": trial.suggest_uniform("c

Final MAE: 0.0006
Final R² Score: -4291.0824
Best Parameters: {'max_depth': 4, 'learning_rate': 0.09477044299797562, 'n_estimators': 194, 'subsample': 0.9496493331499487, 'colsample_bytree': 0.8286941927103819, 'reg_alpha': 0.0018315833934875285, 'reg_lambda': 0.0045766275367754945, 'objective': 'reg:squarederror', 'verbosity': 0}
