In [None]:
import pandas as pd
path = r"D:\BONEYS\WEB\PYTHON\Project\HouseHold_energy_usage\household_power_consumption.txt"
df_raw = pd.read_csv(path, header=None)

#Split the single column into multiple columns using ';' as delimiter
df_split = df_raw[0].str.split(';', expand=True)

#Set the first row as header
df_split.columns = df_split.iloc[0]
df_split = df_split.drop(index=0).reset_index(drop=True)

#Convert Date and Time columns to proper formats and seperating Datetime to Date and Time Individually
df_split['Date'] = pd.to_datetime(df_split['Date'], format='%d/%m/%Y').dt.date
df_split['Time'] = pd.to_datetime(df_split['Time'], format='%H:%M:%S').dt.time

df_split.to_csv('household_power_consumption.csv', index=False)

In [None]:
import pandas as pd
path = r"D:\BONEYS\WEB\PYTHON\Project\HouseHold_energy_usage\household_power_consumption.csv"
df = pd.read_csv(path,low_memory=False)
df = df.ffill()
df = df.bfill()

# Combine 'Date' and 'Time' into a single datetime column
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%Y-%m-%d %H:%M:%S')

# Convert Global_active_power to numeric (handle non-numeric values)
df['Global_active_power'] = pd.to_numeric(df['Global_active_power'],errors='coerce')

# Set DateTime as index
df.set_index('DateTime', inplace=True)

# Daily Average
df['Daily_Average'] = df['Global_active_power'].resample('D').transform('mean')

# Peak Hour during the day
df['Hour'] = df.index.hour
daily_peak_hours = (
    df.groupby(['Date', 'Hour'])['Global_active_power']
    .sum()
    .reset_index()
    .sort_values(['Date', 'Global_active_power'], ascending=[True, False])
    .drop_duplicates(subset='Date')  # keep only the top hour per date
    .rename(columns={'Hour': 'Peak_Hour'})
)

if 'Peak_Hour' in df.columns:
    df = df.drop(columns=['Peak_Hour'])

# merge to get Peak Hour during the day
df = df.merge(daily_peak_hours[['Date', 'Peak_Hour']], on='Date', how='left')
df = df.drop(columns=['Hour'],axis=1)

# Convert object columns to numeric, excluding 'Date' and 'Time'
cols_to_convert = df.select_dtypes(include='object').columns.drop(['Date', 'Time'])

# Apply conversion
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

#removing outliners we have 2075259 rows
df['Voltage_normalized'] = (df['Voltage']-df['Voltage'].mean())/df['Voltage'].std()
df['Global_intensity_normalized'] = (df['Global_intensity']-df['Global_intensity'].mean())/df['Global_intensity'].std()
condition1 = df['Voltage_normalized']<=3
condition2 = df['Voltage_normalized']>=-3
condition3 = df['Global_intensity_normalized']<=3
condition4 = df['Global_intensity_normalized']>=-3
df = df[(condition1 & condition2 & condition3 & condition4)]
print(df)

               Date      Time  Global_active_power  Global_reactive_power  \
4        2006-12-16  17:28:00                3.666                  0.528   
5        2006-12-16  17:29:00                3.520                  0.522   
6        2006-12-16  17:30:00                3.702                  0.520   
7        2006-12-16  17:31:00                3.700                  0.520   
8        2006-12-16  17:32:00                3.668                  0.510   
...             ...       ...                  ...                    ...   
2075254  2010-11-26  20:58:00                0.946                  0.000   
2075255  2010-11-26  20:59:00                0.944                  0.000   
2075256  2010-11-26  21:00:00                0.938                  0.000   
2075257  2010-11-26  21:01:00                0.934                  0.000   
2075258  2010-11-26  21:02:00                0.932                  0.000   

         Voltage  Global_intensity  Sub_metering_1  Sub_metering_2  \
4    

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2001563 entries, 4 to 2075258
Data columns (total 13 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Date                         object 
 1   Time                         object 
 2   Global_active_power          float64
 3   Global_reactive_power        float64
 4   Voltage                      float64
 5   Global_intensity             float64
 6   Sub_metering_1               float64
 7   Sub_metering_2               float64
 8   Sub_metering_3               float64
 9   Daily_Average                float64
 10  Peak_Hour                    int32  
 11  Voltage_normalized           float64
 12  Global_intensity_normalized  float64
dtypes: float64(10), int32(1), object(2)
memory usage: 206.2+ MB


In [None]:
#deep learning model evauation
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
#getting memory allocation error so reducing size
df =df.sample(200000)
x = df.drop(['Date', 'Time', 'Global_active_power','Peak_Hour','Voltage_normalized','Global_intensity_normalized'], axis=1)
y = df['Global_active_power']

models = [
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    LinearRegression(),
    KNeighborsRegressor(),
]

evaluation = {
    'DecisionTreeRegressor': {'rmse': [], 'mae': [], 'r2': []},
    'RandomForestRegressor': {'rmse': [], 'mae': [], 'r2': []},
    'LinearRegression': {'rmse': [], 'mae': [], 'r2': []},
    'KNeighborsRegressor': {'rmse': [], 'mae': [], 'r2': []}
}

for i in range(10):
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25)
    for model in models:
        model.fit(xtrain, ytrain)
        predictions = model.predict(xtest)

        rmse = root_mean_squared_error(ytest, predictions)
        mae = mean_absolute_error(ytest, predictions)
        r2 = r2_score(ytest, predictions)

        evaluation[type(model).__name__]['rmse'].append(rmse)
        evaluation[type(model).__name__]['mae'].append(mae)
        evaluation[type(model).__name__]['r2'].append(r2)

# Print average metrics
for model_name, metrics in evaluation.items():
    print(f"\nModel: {model_name}")
    print(f"Average RMSE: {np.mean(metrics['rmse']):.4f}")
    print(f"Average MAE: {np.mean(metrics['mae']):.4f}")
    print(f"Average R2 Score: {np.mean(metrics['r2']):.4f}")


Model: DecisionTreeRegressor
Average RMSE: 0.0410
Average MAE: 0.0199
Average R2 Score: 0.9979

Model: RandomForestRegressor
Average RMSE: 0.0303
Average MAE: 0.0156
Average R2 Score: 0.9989

Model: LinearRegression
Average RMSE: 0.0394
Average MAE: 0.0249
Average R2 Score: 0.9981

Model: KNeighborsRegressor
Average RMSE: 0.0404
Average MAE: 0.0237
Average R2 Score: 0.9980


In [None]:
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# 1. Sample and split data
df = df.sample(200000)
x = df.drop(['Date', 'Time', 'Global_active_power', 'Peak_Hour', 'Voltage_normalized', 'Global_intensity_normalized'], axis=1)
y = df['Global_active_power']

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)

# 2. Convert to PyTorch tensors
xtrain = torch.tensor(xtrain.values, dtype=torch.float32)
ytrain = torch.tensor(ytrain.values, dtype=torch.float32)
xtest = torch.tensor(xtest.values, dtype=torch.float32)
ytest = torch.tensor(ytest.values, dtype=torch.float32)

# 3. Create Dataset and DataLoader
train_dataset = TensorDataset(xtrain, ytrain)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

# 4. Define Neural Network
class DeepNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepNN, self).__init__()
        self.input = nn.Linear(input_dim, 64)
        self.hidden1 = nn.Linear(64, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.hidden3 = nn.Linear(16, 8)
        self.output = nn.Linear(8, output_dim)

    def forward(self, x):
        x = torch.relu(self.input(x))
        x = torch.relu(self.hidden1(x))
        x = torch.relu(self.hidden2(x))
        x = torch.relu(self.hidden3(x))
        return self.output(x)

# 5. Model, loss, optimizer
model = DeepNN(xtrain.shape[1], 1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 6. Training
epochs = 100
for epoch in range(epochs):
    for x_train, y_train in train_loader:
        y_train = y_train.view(-1, 1)
        pred = model(x_train)
        loss = criterion(pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# 7. Evaluation
with torch.no_grad():
    train_preds = model(xtrain).squeeze()
    test_preds = model(xtest).squeeze()

    print("\nTrain Performance:")
    print(f"R2: {r2_score(ytrain, train_preds):.4f}")
    print(f"RMSE: {root_mean_squared_error(ytrain, train_preds):.4f}")
    print(f"MAE: {mean_absolute_error(ytrain, train_preds):.4f}")

    print("\nTest Performance:")
    print(f"R2: {r2_score(ytest, test_preds):.4f}")
    print(f"RMSE: {root_mean_squared_error(ytest, test_preds):.4f}")
    print(f"MAE: {mean_absolute_error(ytest, test_preds):.4f}")



Train Performance:
R2: 0.9987
RMSE: 0.0325
MAE: 0.0201

Test Performance:
R2: 0.9987
RMSE: 0.0324
MAE: 0.0200
