# Build a regression model to estimate energy consumption

After cleaning the SPEC dataset in the previous notebook, we will use this notebook to experiment with a couple of ML models for power estimation. We will use the cleaned dataset directly obtained from the cleaning notebook.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
data = pd.read_csv("../data/spec_clean.csv")
print(data.shape)

(6809, 8)


In [3]:
data.head()

Unnamed: 0,CPU_Model,RAM_Capacity_GB,CPU_Freq_MHz,Num_Cores,Total_Threads,Target_Load_Pct,Achieved_Load_Pct,Avg_Power_Watts
0,Intel Xeon X5670,12.0,2933,12,24,100,99.2,258.0
1,Intel Xeon X5670,12.0,2933,12,24,90,90.1,241.0
2,Intel Xeon X5670,12.0,2933,12,24,80,80.0,227.0
3,Intel Xeon X5670,12.0,2933,12,24,70,70.0,209.0
4,Intel Xeon X5670,12.0,2933,12,24,60,59.9,189.0


In [4]:
# count of values per CPU
data.CPU_Model.value_counts()

CPU_Model
Intel Xeon X5670                    385
Intel Xeon E5-2660                  209
Intel Xeon X5675                    187
Intel Xeon Platinum 8180 2.50GHz    176
Intel Xeon L5430                    154
                                   ... 
Intel(R) Xeon(R) Gold 6226R          11
Intel Xeon Platinum 8160             11
AMD EPYC 7702 2.0Ghz                 11
AMD Opteron 2382                     11
Intel Xeon Processor E7330           11
Name: count, Length: 216, dtype: int64

In [7]:
data.describe()

Unnamed: 0,RAM_Capacity_GB,CPU_Freq_MHz,Num_Cores,Total_Threads,Target_Load_Pct,Achieved_Load_Pct,Avg_Power_Watts
count,6809.0,6809.0,6809.0,6809.0,6809.0,6809.0,6809.0
mean,96.594507,2539.840065,35.037157,66.497577,50.0,49.966397,209.038556
std,150.369565,386.773687,38.68188,78.105117,31.625099,31.569834,176.609696
min,4.0,1600.0,2.0,2.0,0.0,0.0,9.33
25%,12.0,2250.0,8.0,12.0,20.0,20.0,104.0
50%,24.0,2450.0,16.0,32.0,50.0,50.0,170.0
75%,128.0,2833.0,56.0,112.0,80.0,79.9,250.0
max,1536.0,3800.0,224.0,448.0,100.0,100.5,2148.0


## Splitting into train/test

In [5]:
from sklearn.model_selection import train_test_split

X = data.drop(['Target_Load_Pct', 'Avg_Power_Watts'], axis=1)
y = data['Avg_Power_Watts']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [7]:
# Perform mean target encoding
from sklearn.preprocessing import TargetEncoder

encoder = TargetEncoder()
X_train['CPU_Model'] = encoder.fit_transform(X_train[['CPU_Model']], y_train)
X_test['CPU_Model'] = encoder.transform(X_test[['CPU_Model']])


## Train ML models


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

models = {
    'lr': LinearRegression(),
    'dtr': DecisionTreeRegressor(random_state=13),
    'rf': RandomForestRegressor(random_state=13),
    'lgbm': LGBMRegressor(random_state=13)
}

for name, model in models.items():
    print("----------------------------------")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"{name}: MSE={mean_squared_error(y_test, preds):.4f}, "
          f"MAE={mean_absolute_error(y_test, preds):.4f}, "
          f"R2={r2_score(y_test, preds):.4f}")

----------------------------------
Training lr...
lr: MSE=6902.3608, MAE=52.1306, R2=0.7899
----------------------------------
Training dtr...
dtr: MSE=1290.3174, MAE=20.2807, R2=0.9607
----------------------------------
Training rf...
rf: MSE=751.9607, MAE=16.3024, R2=0.9771
----------------------------------
Training lgbm...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 504
[LightGBM] [Info] Number of data points in the train set: 5447, number of used features: 6
[LightGBM] [Info] Start training from score 209.160956
lgbm: MSE=988.6949, MAE=15.7950, R2=0.9699


# Save the models as PMML

In [None]:
from sklearn2pmml import sklearn2pmml, PMMLPipeline
from sklearn2pmml.pipeline import PMMLPipeline
# Save RF model to PMML
pipeline = PMMLPipeline([("rf", models['rf'])])
sklearn2pmml(pipeline, "rf.pmml", with_repr=True)

