Machine Learning Project: **Machine Learning Model to Optimise Energy in Smart Homes**

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files stored there (such as the
# dataset read in the next step) are reachable through a normal file path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


step1: data reading and importing files

In [5]:
import pandas as pd

# Absolute path of the raw CSV on the mounted Google Drive.
file_path = "/content/drive/MyDrive/smart_home_energy_consumption_large.csv"

# Read the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Preview the first five rows to check the column names and typical values.
df.head()

Unnamed: 0,Home ID,Appliance Type,Energy Consumption (kWh),Time,Date,Outdoor Temperature (°C),Season,Household Size
0,94,Fridge,0.2,21:12,2023-12-02,-1.0,Fall,2
1,435,Oven,0.23,20:11,2023-08-06,31.1,Summer,5
2,466,Dishwasher,0.32,06:39,2023-11-21,21.3,Fall,3
3,496,Heater,3.92,21:56,2023-01-21,-4.2,Winter,1
4,137,Microwave,0.44,04:31,2023-08-26,34.5,Summer,5


In [7]:
# Preview the last five rows of the frame.
df.tail()

Unnamed: 0,Home ID,Appliance Type,Energy Consumption (kWh),Time,Date,Outdoor Temperature (°C),Season,Household Size
99995,124,Microwave,0.42,09:56,2023-09-28,20.5,Summer,1
99996,184,Computer,0.71,12:48,2023-05-27,-5.4,Spring,2
99997,101,Dishwasher,0.25,05:45,2023-02-18,35.6,Winter,3
99998,423,Air Conditioning,2.69,12:39,2023-04-20,3.7,Spring,1
99999,429,Fridge,0.37,18:46,2023-02-27,36.0,Winter,5


In [8]:
# Print each column's name, non-null count and dtype, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Home ID                   100000 non-null  int64  
 1   Appliance Type            100000 non-null  object 
 2   Energy Consumption (kWh)  100000 non-null  float64
 3   Time                      100000 non-null  object 
 4   Date                      100000 non-null  object 
 5   Outdoor Temperature (°C)  100000 non-null  float64
 6   Season                    100000 non-null  object 
 7   Household Size            100000 non-null  int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 6.1+ MB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Home ID,Energy Consumption (kWh),Outdoor Temperature (°C),Household Size
count,100000.0,100000.0,100000.0,100000.0
mean,250.37498,1.499952,14.950135,3.00177
std,144.435367,1.181176,14.438755,1.417077
min,1.0,0.1,-10.0,1.0
25%,125.0,0.59,2.4,2.0
50%,250.0,1.23,14.9,3.0
75%,375.0,1.87,27.4,4.0
max,500.0,5.0,40.0,5.0


In [10]:
# Tally the missing entries in every column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Home ID                     0
Appliance Type              0
Energy Consumption (kWh)    0
Time                        0
Date                        0
Outdoor Temperature (°C)    0
Season                      0
Household Size              0
dtype: int64


step2: data cleaning and preprocessing

In [11]:
# Show the dtype pandas assigned to each column; the date, time and categorical
# columns are still plain objects (strings) at this point.
column_dtypes = df.dtypes
print(column_dtypes)

Home ID                       int64
Appliance Type               object
Energy Consumption (kWh)    float64
Time                         object
Date                         object
Outdoor Temperature (°C)    float64
Season                       object
Household Size                int64
dtype: object


In [12]:
# Build a single datetime column from the separate date and time strings and
# derive calendar features (hour, day of month, month) from it.
#
# The guard makes the cell safe to re-run: once 'Date' and 'Time' have been
# consumed and dropped, a second execution leaves the frame unchanged instead
# of raising KeyError on the missing columns.
if {'Date', 'Time'}.issubset(df.columns):
    # An explicit format skips format inference and makes any value that does
    # not look like "2023-12-02 21:12" fail loudly instead of being guessed.
    df['timestamp'] = pd.to_datetime(
        df['Date'] + ' ' + df['Time'],
        format='%Y-%m-%d %H:%M',
    )

    df['hour'] = df['timestamp'].dt.hour
    df['day'] = df['timestamp'].dt.day
    df['month'] = df['timestamp'].dt.month

    # The raw string columns are redundant now that 'timestamp' exists.
    df = df.drop(columns=['Date', 'Time'])

print(df.head())
print(df.dtypes)

   Home ID Appliance Type  Energy Consumption (kWh)  Outdoor Temperature (°C)  \
0       94         Fridge                      0.20                      -1.0   
1      435           Oven                      0.23                      31.1   
2      466     Dishwasher                      0.32                      21.3   
3      496         Heater                      3.92                      -4.2   
4      137      Microwave                      0.44                      34.5   

   Season  Household Size           timestamp  hour  day  month  
0    Fall               2 2023-12-02 21:12:00    21    2     12  
1  Summer               5 2023-08-06 20:11:00    20    6      8  
2    Fall               3 2023-11-21 06:39:00     6   21     11  
3  Winter               1 2023-01-21 21:56:00    21   21      1  
4  Summer               5 2023-08-26 04:31:00     4   26      8  
Home ID                              int64
Appliance Type                      object
Energy Consumption (kWh)       

step3: Handle Missing Values

In [13]:
# Re-check for missing values now that the timestamp-derived columns exist.
# sep="" stops print() from inserting a space after the newline, which would
# otherwise shift the first row of the report one character to the right.
print("Missing Values in Dataset:\n", df.isnull().sum(), sep="")

Missing Values in Dataset:
 Home ID                     0
Appliance Type              0
Energy Consumption (kWh)    0
Outdoor Temperature (°C)    0
Season                      0
Household Size              0
timestamp                   0
hour                        0
day                         0
month                       0
dtype: int64


step4: Feature Selection & Data Preprocessing

In [14]:
from sklearn.preprocessing import LabelEncoder

# Text-valued columns that are converted to integer codes.
categorical_cols = ['Appliance Type', 'Season']

# One fitted encoder per column is kept so the integer codes can be mapped
# back to the original labels later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, encoder in label_encoders.items():
    # Overwrite the string column with its integer codes.
    df[col] = encoder.fit_transform(df[col])

print(df.head())
print(df.dtypes)

   Home ID  Appliance Type  Energy Consumption (kWh)  \
0       94               3                      0.20   
1      435               7                      0.23   
2      466               2                      0.32   
3      496               4                      3.92   
4      137               6                      0.44   

   Outdoor Temperature (°C)  Season  Household Size           timestamp  hour  \
0                      -1.0       0               2 2023-12-02 21:12:00    21   
1                      31.1       2               5 2023-08-06 20:11:00    20   
2                      21.3       0               3 2023-11-21 06:39:00     6   
3                      -4.2       3               1 2023-01-21 21:56:00    21   
4                      34.5       2               5 2023-08-26 04:31:00     4   

   day  month  
0    2     12  
1    6      8  
2   21     11  
3   21      1  
4   26      8  
Home ID                              int64
Appliance Type                       

step5: splitting data into training and testing sets

In [15]:
from sklearn.model_selection import train_test_split

# Column the model will learn to predict.
target_column = 'Energy Consumption (kWh)'

# The target is pulled out on its own; every remaining column is a feature.
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (80000, 9) (80000,)
Testing set shape: (20000, 9) (20000,)


step6: training the ML model using a Random Forest Regressor, because the target (energy consumption) is a continuous value

In [16]:
from sklearn.ensemble import RandomForestRegressor

# The datetime column is not a numeric feature; the hour/day/month columns
# derived from it stay in X. errors='ignore' keeps the cell re-runnable once
# the column has already been removed.
X = X.drop(columns=['timestamp'], errors='ignore')

# Split again so the train/test frames reflect the reduced feature set. The
# seed and ratio match the earlier split, so the same rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# n_jobs=-1 builds the trees on all available CPU cores; random_state stays
# fixed so the fit remains reproducible, it just finishes sooner.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

print("Model training completed successfully")

Model training completed successfully


step7: model evaluation

In [17]:
# Predict the target for the held-out test rows; y_pred is scored against y_test in the next step.
y_pred = model.predict(X_test)

step8: calculate performance metrics

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Score the test-set predictions. The three sklearn metrics are independent of
# one another; RMSE is derived from MSE so it is expressed in the target's units.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report every metric rounded to four decimal places.
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

Mean Absolute Error (MAE): 0.4812
Mean Squared Error (MSE): 0.3544
Root Mean Squared Error (RMSE): 0.5953
R² Score: 0.7480


step9: save model and download

In [19]:
import pickle

# File that receives the fitted estimator (written to the current working directory).
model_filename = "energy_optimizer_model.pkl"
# Companion file holding everything needed to turn raw rows into model input.
preprocessing_filename = "energy_optimizer_preprocessing.pkl"

with open(model_filename, "wb") as file:
    pickle.dump(model, file)

# The model was trained on integer-encoded categories and a specific column
# order. Without the fitted encoders and that order, the pickled model cannot
# be applied to raw data, so both are persisted alongside it.
preprocessing = {
    "label_encoders": label_encoders,
    "feature_columns": list(X_train.columns),
}
with open(preprocessing_filename, "wb") as file:
    pickle.dump(preprocessing, file)

# NOTE: only unpickle these files from a trusted source; pickle.load can
# execute arbitrary code.
print(f"Model saved as {model_filename}")
print(f"Preprocessing artefacts saved as {preprocessing_filename}")

Model saved as energy_optimizer_model.pkl
