# Model Training and Inference Time Benchmarking (SimulatedQueryMetrics)

This notebook benchmarks training and inference time for ARIMA, Prophet, LSTM, Random Forest, and XGBoost using your provided CSV structure.

In [None]:
# !pip install numpy pandas scikit-learn xgboost statsmodels prophet tensorflow

In [2]:
import numpy as np
import pandas as pd
import time
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

## Load Data and Select Query/Metric
- You can change `target_col` and `query_name` below to benchmark other metrics/queries.

In [4]:
csv_file = 'SimulatedQueryMetrics.csv'  # Change path if needed
target_col = 'CPU'                      # Change to 'LatencyMs' or 'LogicalReads' if desired
query_name = 'Q1'                       # Change to other query names (e.g., 'Q2') as needed

df = pd.read_csv(csv_file)
df['MetricDate'] = pd.to_datetime(df['MetricDate'])
df = df.sort_values(['QueryName', 'MetricDate', 'QueryVariant']).reset_index(drop=True)
dfq = df[df['QueryName'] == query_name].copy().sort_values('MetricDate').reset_index(drop=True)
print("First rows of selected data:")
print(dfq[['MetricDate', 'QueryVariant', target_col]].head())

First rows of selected data:
  MetricDate  QueryVariant        CPU
0 2025-07-04             1  53.432663
1 2025-07-04             2  57.975274
2 2025-07-04             3  55.829855
3 2025-07-04             4  52.646475
4 2025-07-04             5  55.029501


## Prepare Data for Modeling
- Use `MetricDate` as the time column, `target_col` as value.
- Create lag features for tree/LSTM models.

In [6]:
def create_lag_features(df, lags=7, val_col='y'):
    df = df.copy()
    for lag in range(1, lags+1):
        df[f'lag_{lag}'] = df[val_col].shift(lag)
    df = df.dropna().reset_index(drop=True)
    return df

lags = 7
dfq = dfq[['MetricDate', target_col, 'QueryVariant']].rename(columns={'MetricDate':'ds', target_col:'y'})
dfq_lagged = create_lag_features(dfq, lags=lags, val_col='y')

# Train/test split (last 20% for testing)
split = int(len(dfq_lagged) * 0.8)
train_df = dfq_lagged.iloc[:split]
test_df = dfq_lagged.iloc[split:]

X_train = train_df[[f'lag_{i}' for i in range(1, lags+1)]]
y_train = train_df['y']
X_test = test_df[[f'lag_{i}' for i in range(1, lags+1)]]
y_test = test_df['y']

## Helper Functions for Timing

In [8]:
def time_fit(model, X, y):
    start = time.time()
    model_fit = model.fit(X, y)
    end = time.time()
    train_time = end - start
    return model_fit, train_time

def time_predict(model, X):
    start = time.time()
    y_pred = model.predict(X)
    end = time.time()
    pred_time = end - start
    return y_pred, pred_time

## Random Forest

In [10]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf_fit, rf_train_time = time_fit(rf, X_train, y_train)
rf_pred, rf_pred_time = time_predict(rf_fit, X_test)
rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)
print(f"Random Forest - Train time: {rf_train_time:.4f}s, Predict time: {rf_pred_time:.4f}s, RMSE: {rf_rmse:.4f}")

Random Forest - Train time: 1.4002s, Predict time: 0.0167s, RMSE: 8.3849


## XGBoost

In [12]:
xgb = XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
xgb_fit, xgb_train_time = time_fit(xgb, X_train, y_train)
xgb_pred, xgb_pred_time = time_predict(xgb_fit, X_test)
xgb_rmse = mean_squared_error(y_test, xgb_pred, squared=False)
print(f"XGBoost - Train time: {xgb_train_time:.4f}s, Predict time: {xgb_pred_time:.4f}s, RMSE: {xgb_rmse:.4f}")

XGBoost - Train time: 2.3536s, Predict time: 0.0091s, RMSE: 9.4040


## ARIMA

In [14]:
arima_train = train_df['y']
arima_test = test_df['y']
start = time.time()
arima_model = ARIMA(arima_train, order=(lags, 0, 0))
arima_fit = arima_model.fit()
arima_train_time = time.time() - start

start = time.time()
arima_pred = arima_fit.forecast(steps=len(arima_test))
arima_pred_time = time.time() - start
arima_rmse = mean_squared_error(arima_test, arima_pred, squared=False)
print(f"ARIMA - Train time: {arima_train_time:.4f}s, Predict time: {arima_pred_time:.4f}s, RMSE: {arima_rmse:.4f}")

ARIMA - Train time: 1.6758s, Predict time: 0.0220s, RMSE: 12.0738


## Prophet

In [16]:
prophet_train = train_df[['ds', 'y']]
prophet_test = test_df[['ds', 'y']]
start = time.time()
m = Prophet()
m.fit(prophet_train)
prophet_train_time = time.time() - start

future = prophet_test[['ds']]
start = time.time()
forecast = m.predict(future)
prophet_pred_time = time.time() - start
prophet_rmse = mean_squared_error(prophet_test['y'], forecast['yhat'], squared=False)
print(f"Prophet - Train time: {prophet_train_time:.4f}s, Predict time: {prophet_pred_time:.4f}s, RMSE: {prophet_rmse:.4f}")

04:18:33 - cmdstanpy - INFO - Chain [1] start processing
04:18:33 - cmdstanpy - INFO - Chain [1] done processing


Prophet - Train time: 0.9323s, Predict time: 0.1964s, RMSE: 13.2982


## LSTM (Keras)

In [18]:
X_train_lstm = X_train.values.reshape((-1, lags, 1))
X_test_lstm = X_test.values.reshape((-1, lags, 1))

model = Sequential()
model.add(LSTM(32, input_shape=(lags, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

start = time.time()
model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=0)
lstm_train_time = time.time() - start

start = time.time()
lstm_pred = model.predict(X_test_lstm).flatten()
lstm_pred_time = time.time() - start
lstm_rmse = mean_squared_error(y_test, lstm_pred, squared=False)
print(f"LSTM - Train time: {lstm_train_time:.4f}s, Predict time: {lstm_pred_time:.4f}s, RMSE: {lstm_rmse:.4f}")

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step
LSTM - Train time: 6.1479s, Predict time: 0.7455s, RMSE: 38.9456


## Summary Table

In [20]:
results = pd.DataFrame({
    'Model': ['Random Forest', 'XGBoost', 'ARIMA', 'Prophet', 'LSTM'],
    'Train Time (s)': [rf_train_time, xgb_train_time, arima_train_time, prophet_train_time, lstm_train_time],
    'Predict Time (s)': [rf_pred_time, xgb_pred_time, arima_pred_time, prophet_pred_time, lstm_pred_time],
    'RMSE': [rf_rmse, xgb_rmse, arima_rmse, prophet_rmse, lstm_rmse]
})
results

Unnamed: 0,Model,Train Time (s),Predict Time (s),RMSE
0,Random Forest,1.400186,0.016738,8.384908
1,XGBoost,2.353561,0.009125,9.404005
2,ARIMA,1.675765,0.022027,12.073801
3,Prophet,0.93231,0.19644,13.298219
4,LSTM,6.147939,0.745462,38.945615
