# Electricity Usage Modeling

## Set-up

In [None]:
## Installing any missing packages
#!pip install interpret

## Loading necessary packages
import numpy as np
import pandas as pd
from google.colab import files
import io
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
import xgboost
import matplotlib.pyplot as plt
from interpret import glassbox
from interpret import show
import statsmodels.api as sm
import shap

In [None]:
## Upload relevant files (the inputs CSV and the electric usage interval CSV)
## NOTE(review): uploads is later indexed by filename and wrapped in io.BytesIO,
## so it is presumably a mapping of filename -> raw file bytes - confirm
uploads = files.upload()

Saving pep_electric_usage_interval_data_Service 1_1_2025-03-13_to_2025-04-24.csv to pep_electric_usage_interval_data_Service 1_1_2025-03-13_to_2025-04-24.csv
Saving Electricity_Usage_20250425.csv to Electricity_Usage_20250425.csv


In [None]:
## Names of the uploaded files holding the inputs data and the outputs data
inputs_file = 'Electricity_Usage_20250425.csv'
outputs_file = 'pep_electric_usage_interval_data_Service 1_1_2025-03-13_to_2025-04-24.csv'

## Parse each uploaded file's bytes into a df
inputs_df = pd.read_csv(io.BytesIO(uploads[inputs_file]))
outputs_df = pd.read_csv(io.BytesIO(uploads[outputs_file]))

## Report the (rows, columns) shape of both dfs
for label, frame in (('inputs_df', inputs_df), ('outputs_df', outputs_df)):
    print(f'{label} shape:', frame.shape)

inputs_df shape: (38, 9)
outputs_df shape: (1032, 7)


In [None]:
## Converting cost column to numeric values without the dollar sign.
## regex=False makes pandas treat '$' as a literal character: as a regular
## expression '$' is the end-of-string anchor, and the default value of `regex`
## is not the same across pandas versions, so the intent is stated explicitly.
outputs_df['COST'] = outputs_df['COST'].str.replace('$', '', regex = False).astype(float)

In [None]:
## Collapse the outputs to one row per DATE, totalling usage (kwh) and cost
rows_by_date = outputs_df.groupby('DATE')
outputs_df_by_day = rows_by_date[['USAGE (kWh)', 'COST']].sum()

In [None]:
## Left-join the per-day output totals onto the inputs:
## the inputs' Date column is matched against the DATE index of the totals
df = inputs_df.merge(
    outputs_df_by_day,
    how = 'left',
    left_on = 'Date',
    right_index = True,
)

In [None]:
## Keep only the rows that have both an input (Temp_Low) and an output (COST)
df = df.dropna(subset = ['Temp_Low', 'COST'])

In [None]:
## Preview the first five rows of the merged data (five is head's default)
df.head()

Unnamed: 0,Date,Temp_Low,Temp_High,Heating,Cooling,Thermostat_Day,Thermostat_Night,Laundry,Dishwasher,USAGE (kWh),COST
0,3/13/25,43.0,57.0,1.0,0.0,68.0,68.0,0.0,0.0,27.53,5.3
1,3/14/25,42.0,60.0,1.0,0.0,68.0,68.0,1.0,0.0,33.19,6.42
2,3/15/25,45.0,61.0,1.0,0.0,68.0,68.0,0.0,0.0,23.68,4.53
3,3/16/25,53.0,73.0,1.0,0.0,68.0,68.0,1.0,1.0,29.72,5.73
4,3/17/25,42.0,59.0,1.0,0.0,68.0,68.0,1.0,0.0,22.53,4.35


In [None]:
## Predictor columns (x) and the target column (y)
feature_cols = ['Temp_Low', 'Temp_High', 'Heating', 'Laundry', 'Dishwasher']
x = df[feature_cols]
y = df['COST']

In [None]:
## Report the shape of the merged df and of the x and y objects built from it
for obj_name, obj in (('df', df), ('x', x), ('y', y)):
    print(f'{obj_name} shape:', obj.shape)

df shape: (33, 11)
x shape: (33, 5)
y shape: (33,)


## Explainable Boosting Machine (EBM)

In [None]:
## Creating the model object (no arguments are passed, so the library defaults apply)
ebm = glassbox.ExplainableBoostingRegressor()

## Fitting the model on every row of x and y (no train/test split is made here)
ebm.fit(x, y)

In [None]:
## Build the global explanation of the fitted EBM, then render it
ebm_global = ebm.explain_global()
show(ebm_global)

## Linear Regression

In [None]:
## Bin edges in steps of 5, from 0 up to and including 110
bins = np.arange(0, 115, 5)

## Work on a deep copy so the original x df is left untouched
x_linreg = x.copy(deep = True)

## Add a binned, string-labelled version of each temperature var
for temp_col in ('Temp_Low', 'Temp_High'):
    x_linreg[temp_col + '_Binned'] = pd.cut(df[temp_col], bins).astype(str)

In [None]:
## One-hot encode both binned temp columns in a single pass, keeping every
## category for now (Temp_Low dummies come first, then Temp_High dummies)
binned_cols = ['Temp_Low_Binned', 'Temp_High_Binned']
x_linreg = pd.get_dummies(x_linreg, columns = binned_cols, drop_first = False, dtype = float)

In [None]:
## Dropping the highest temperature bin of each binned var as the reference
## category. The column is looked up from the data instead of being hardcoded,
## so this cell keeps working when the observed temperature range changes.
def _highest_bin_dummy(columns, prefix):
    """Return the dummy column under `prefix` whose interval has the largest left edge.

    `columns` is an iterable of column names and `prefix` is the dummy prefix
    including its trailing underscore, e.g. 'Temp_Low_Binned_'.
    Raises ValueError when no interval-labelled dummy column has that prefix.
    """
    ## Only interval-labelled dummies such as 'Temp_Low_Binned_(65, 70]' are candidates
    interval_cols = [col for col in columns if col.startswith(prefix + '(')]
    ## Compare the left edges as numbers: as strings '(100, 105]' sorts before '(65, 70]'
    return max(interval_cols, key = lambda col: float(col[len(prefix) + 1:].split(',')[0]))

x_linreg = x_linreg.drop(columns = [
    _highest_bin_dummy(x_linreg.columns, 'Temp_Low_Binned_'),
    _highest_bin_dummy(x_linreg.columns, 'Temp_High_Binned_'),
])

In [None]:
## The binned dummies replace the raw temp vars, so remove the originals
raw_temp_cols = ['Temp_Low', 'Temp_High']
x_linreg = x_linreg.drop(raw_temp_cols, axis = 1)

In [None]:
## Put a constant column in front of the predictors
x_linreg = sm.add_constant(x_linreg, prepend = True)

In [None]:
## Preview the first five rows of the regression inputs (five is head's default)
x_linreg.head()

Unnamed: 0,const,Heating,Laundry,Dishwasher,"Temp_Low_Binned_(30, 35]","Temp_Low_Binned_(35, 40]","Temp_Low_Binned_(40, 45]","Temp_Low_Binned_(45, 50]","Temp_Low_Binned_(50, 55]","Temp_Low_Binned_(55, 60]","Temp_Low_Binned_(60, 65]","Temp_High_Binned_(45, 50]","Temp_High_Binned_(50, 55]","Temp_High_Binned_(55, 60]","Temp_High_Binned_(60, 65]","Temp_High_Binned_(65, 70]","Temp_High_Binned_(70, 75]","Temp_High_Binned_(75, 80]"
0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
## Report the (rows, columns) shape of the regression inputs
print(f'x_linreg shape: {x_linreg.shape}')

x_linreg shape: (33, 18)


In [None]:
## Set up ordinary least squares with y as the response and x_linreg as the predictors
ols_model = sm.OLS(endog = y, exog = x_linreg)

## Fit the model and print the regression summary
lin_reg = ols_model.fit()
print(lin_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                   COST   R-squared:                       0.825
Model:                            OLS   Adj. R-squared:                  0.628
Method:                 Least Squares   F-statistic:                     4.172
Date:                Sat, 26 Apr 2025   Prob (F-statistic):            0.00396
Time:                        02:31:04   Log-Likelihood:                -33.047
No. Observations:                  33   AIC:                             102.1
Df Residuals:                      15   BIC:                             129.0
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 