# Build a regression model to estimate energy consumption

After cleaning the SPEC dataset in the previous notebook, we use this notebook to experiment with several ML models for power estimation. The input is the cleaned dataset produced by the cleaning notebook.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the cleaned dataset from CSV and report its (rows, columns) shape
data = pd.read_csv("../data/spec_clean.csv")
print(data.shape)

(6809, 8)


In [3]:
# Preview the first rows to check the column layout
data.head()

Unnamed: 0,CPU_Model,RAM_Capacity_GB,CPU_Freq_MHz,Num_Cores,Total_Threads,Target_Load_Pct,Achieved_Load_Pct,Avg_Power_Watts
0,Intel Xeon X5670,12.0,2933,12,24,100,99.2,258.0
1,Intel Xeon X5670,12.0,2933,12,24,90,90.1,241.0
2,Intel Xeon X5670,12.0,2933,12,24,80,80.0,227.0
3,Intel Xeon X5670,12.0,2933,12,24,70,70.0,209.0
4,Intel Xeon X5670,12.0,2933,12,24,60,59.9,189.0


In [4]:
# Number of rows per distinct CPU_Model value
data.CPU_Model.value_counts()

CPU_Model
Intel Xeon X5670                    385
Intel Xeon E5-2660                  209
Intel Xeon X5675                    187
Intel Xeon Platinum 8180 2.50GHz    176
Intel Xeon L5430                    154
                                   ... 
AMD Opteron 2377 EE                  11
AMD EPYC 7742 2.25 GHz               11
Intel Xeon X3220                     11
AMD Opteron 8384                     11
Intel Xeon Processor E7330           11
Name: count, Length: 216, dtype: int64

In [5]:

# Distinct CPU_Freq_MHz values present in the dataset
data.CPU_Freq_MHz.unique()

array([2933, 2200, 2700, 2100, 2600, 2500, 3200, 3067, 2260, 2300, 2667,
       2450, 2000, 2250, 2400, 2333, 2830, 2900, 3100, 2330, 2266, 3700,
       3400, 2930, 3500, 3000, 2833, 2533, 2660, 1800, 3600, 2666, 2267,
       3066, 1900, 1860, 2133, 3800, 1600, 2800])

## Splitting into train/test

In [6]:
from sklearn.model_selection import train_test_split

# Features: every column except the target and the two columns left out of modelling.
excluded_columns = ['Total_Threads', 'Target_Load_Pct', 'Avg_Power_Watts']
X = data.drop(columns=excluded_columns)
# Target: the Avg_Power_Watts column.
y = data['Avg_Power_Watts']

In [7]:
# Confirm which feature columns remain after the drop
X.columns

Index(['CPU_Model', 'RAM_Capacity_GB', 'CPU_Freq_MHz', 'Num_Cores',
       'Achieved_Load_Pct'],
      dtype='object')

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [9]:
# Perform mean target encoding of CPU_Model
from sklearn.preprocessing import TargetEncoder

# fit_transform() cross-fits over shuffled folds, so without a seed the training
# encodings change on every run; pin random_state like the split and the models.
# The target is a power reading, so state the target type instead of relying on
# inference from the values.
encoder = TargetEncoder(target_type="continuous", random_state=13)
# Training rows get the cross-fitted encodings; test rows use the encodings
# learned on the full training split.
X_train['CPU_Model'] = encoder.fit_transform(X_train[['CPU_Model']], y_train)
X_test['CPU_Model'] = encoder.transform(X_test[['CPU_Model']])


In [10]:
# Export the fitted target encoding as JSON (category value -> encoded mean) so it can be consumed easily from Go.

import json
def save_target_encoding_model(encoder, filename):
    """Write a fitted target encoder's category -> encoding lookup to a JSON file.

    Only the first encoded feature is exported, as a flat JSON object whose keys
    are the category labels and whose values are the learned encodings.

    Args:
        encoder: Fitted encoder exposing ``categories_`` and ``encodings_``
            (for example scikit-learn's ``TargetEncoder``).
        filename: Path of the JSON file to create or overwrite.
    """
    # encodings_[0][i] is the value learned for categories_[0][i]. Reading the
    # fitted attributes directly avoids pushing every category back through
    # transform() as an unnamed nested list.
    # Casting to str/float keeps numpy scalar types out of the JSON encoder.
    mapping = {
        str(category): float(value)
        for category, value in zip(encoder.categories_[0], encoder.encodings_[0])
    }

    # Save the mapping to a JSON file
    with open(filename, 'w') as f:
        json.dump(mapping, f)
# Write the fitted CPU_Model encoder's mapping to target_encoding_model.json (relative to the working directory)
save_target_encoding_model(encoder, 'target_encoding_model.json')



## Train ML models


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Candidate regressors keyed by the short name used in logs and output files.
models = dict(
    lr=LinearRegression(),
    dtr=DecisionTreeRegressor(random_state=13),
    rf=RandomForestRegressor(random_state=13),
    lgbm=LGBMRegressor(random_state=13),
)

# Fit each candidate on the training split and report its metrics on the test split.
for name in models:
    model = models[name]
    print("----------------------------------")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f"{name}: MSE={mse:.4f}, "
          f"MAE={mae:.4f}, "
          f"R2={r2:.4f}")

----------------------------------
Training lr...
lr: MSE=7160.4214, MAE=52.7808, R2=0.7820
----------------------------------
Training dtr...
dtr: MSE=1940.1637, MAE=22.8682, R2=0.9409
----------------------------------
Training rf...
rf: MSE=876.6314, MAE=17.1511, R2=0.9733
----------------------------------
Training lgbm...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005025 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 480
[LightGBM] [Info] Number of data points in the train set: 5447, number of used features: 5
[LightGBM] [Info] Start training from score 209.160956
lgbm: MSE=1104.8427, MAE=16.5899, R2=0.9664


In [12]:
# Get LR coefficients
# Show the fitted linear model's intercept and per-feature coefficients.
lr_model = models['lr']
print("----------------------------------")
print("Linear Regression Coefficients:")
print("Intercept:", lr_model.intercept_)
print("Coefficients:", lr_model.coef_)


----------------------------------
Linear Regression Coefficients:
Intercept: -162.35484442181556
Coefficients: [0.89162966 0.14972551 0.01736159 0.49707893 2.24026731]


# Save models as joblib

We save the models using the joblib library so that they can be loaded in main.go; this is then loaded in the Go Scheduler via WASM.

In [13]:
# Save models via joblib
import joblib
from pathlib import Path

# Create the output directory up front so the dump does not fail when it is missing.
pickle_dir = Path("../pickle")
pickle_dir.mkdir(parents=True, exist_ok=True)

for name, model in models.items():
    model_path = pickle_dir / f"{name}_model.pkl"
    joblib.dump(model, model_path)
    # Report the real destination rather than only the bare file name.
    print(f"Saved {name} model to {model_path}")

Saved lr model to lr_model.pkl
Saved dtr model to dtr_model.pkl
Saved rf model to rf_model.pkl
Saved lgbm model to lgbm_model.pkl


# Save the models as PMML and ONNX

In [14]:
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from sklearn2pmml.pipeline import PMMLPipeline
# Wrap the fitted random forest in a single-step PMML pipeline and export it to rf.pmml
rf_steps = [("rf", models['rf'])]
pipeline = PMMLPipeline(rf_steps)
sklearn2pmml(pipeline, "rf.pmml", with_repr=True)



In [15]:
# Save the LGBM, DTR and LR models to PMML, one "<name>.pmml" file each.
# Each fitted model is wrapped in a single-step pipeline named after its key.
for pmml_name in ("lgbm", "dtr", "lr"):
    pipeline = PMMLPipeline([(pmml_name, models[pmml_name])])
    sklearn2pmml(pipeline, f"{pmml_name}.pmml", with_repr=True)

In [13]:
# Save the models to ONNX
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
# Every exported graph takes one float tensor: any batch size, one column per training feature.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Convert the LR, RF and DTR models and write each serialized graph to "<name>.onnx".
# The LightGBM model is not exported in this cell.
for onnx_name in ('lr', 'rf', 'dtr'):
    model_onnx = convert_sklearn(models[onnx_name], initial_types=initial_type)
    with open(f"{onnx_name}.onnx", "wb") as f:
        f.write(model_onnx.SerializeToString())

