**New Method**

**next session**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
# from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load and preprocess the dataset
df = pd.read_csv(
    "/content/drive/MyDrive/new_household_power_consumption.txt",
    sep=';',
    parse_dates={'timestamp': ['Date', 'Time']},
    dayfirst=True,
    low_memory=False,
    on_bad_lines='skip'
)

df.replace('?', pd.NA, inplace=True)
df.dropna(inplace=True)


df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df['hour'] = df['timestamp'].dt.hour


def assign_time_range(hour):
    if 6 <= hour < 10:
        return '06:00:00-10:00:00'
    elif 10 <= hour < 12:
        return '10:00:00-12:00:00'
    elif 12 <= hour < 15:
        return '12:00:00-15:00:00'
    elif 15 <= hour < 17:
        return '15:00:00-17:00:00'
    elif 17 <= hour < 23:
        return '17:00:00-23:00:00'
    else:
        return '23:00:00-06:00:00'

df['time_range'] = df['hour'].apply(assign_time_range)
df = df.drop(columns=['timestamp'])
# df['time_range']


Unnamed: 0,time_range
0,17:00:00-23:00:00
1,17:00:00-23:00:00
2,17:00:00-23:00:00
3,17:00:00-23:00:00
4,17:00:00-23:00:00
...,...
2075254,17:00:00-23:00:00
2075255,17:00:00-23:00:00
2075256,17:00:00-23:00:00
2075257,17:00:00-23:00:00


In [2]:
df['Global_active_power'] = df['Global_active_power'].astype('float')
df['Global_reactive_power'] = df['Global_reactive_power'].astype('float')
df['Voltage'] = df['Voltage'].astype('float')
df['Global_intensity'] = df['Global_intensity'].astype('float')
df['Sub_metering_1'] = df['Sub_metering_1'].astype('float')
df['Sub_metering_2'] = df['Sub_metering_2'].astype('float')
df['Sub_metering_3'] = df['Sub_metering_3'].astype('float')

df['Submeter_metering_avg'] = (df['Sub_metering_1'] + df['Sub_metering_2'] + df['Sub_metering_3']) // 3
df.drop(columns=['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'], inplace=True)

le = LabelEncoder()
df['time_range_encoded'] = le.fit_transform(df['time_range'])

# df['time_range_encoded']

Unnamed: 0,time_range_encoded
0,4
1,4
2,4
3,4
4,4
...,...
2075254,4
2075255,4
2075256,4
2075257,4


In [3]:
# from sklearn.multioutput import MultiOutputRegressor

X = df[['Global_reactive_power', 'Voltage', 'Global_intensity', 'time_range_encoded', 'Submeter_metering_avg']]
y = df[['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity', 'Submeter_metering_avg']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, min_samples_split=5, min_samples_leaf=2))
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

r2_global_active_power = r2_score(y_test['Global_active_power'], y_pred_rf[:, 0])
r2_global_reactive_power = r2_score(y_test['Global_reactive_power'], y_pred_rf[:, 1])
r2_voltage = r2_score(y_test['Voltage'], y_pred_rf[:, 2])
r2_global_intensity = r2_score(y_test['Global_intensity'], y_pred_rf[:, 3])
r2_submeter_metering_avg = r2_score(y_test['Submeter_metering_avg'], y_pred_rf[:, 4])


print(f"R² score for Global_active_power: {r2_global_active_power}")
print(f"R² score for Global_reactive_power: {r2_global_reactive_power}")
print(f"R² score for Voltage: {r2_voltage}")
print(f"R² score for Global_intensity: {r2_global_intensity}")
print(f"R² score for Submeter_metering_avg: {r2_submeter_metering_avg}")


mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Mean Squared Error (Random Forest): {mse_rf}")


df[['predicted_Global_active_power', 'predicted_Global_reactive_power', 'predicted_Voltage',
    'predicted_Global_intensity', 'predicted_Submeter_metering_avg']] = rf_model.predict(X)

aggregated_predictions_rf = df.groupby('time_range')[['predicted_Global_active_power',
                                                      'predicted_Global_reactive_power',
                                                      'predicted_Voltage',
                                                      'predicted_Global_intensity',
                                                      'predicted_Submeter_metering_avg']].mean().reset_index()


print("Aggregated Predictions by Time Range (Random Forest):")
print(aggregated_predictions_rf)

R² score for Global_active_power: 0.9988330134635928
R² score for Global_reactive_power: 0.9999999368548379
R² score for Voltage: 0.99999955706708
R² score for Global_intensity: 0.9999999413843319
R² score for Submeter_metering_avg: 0.9999999694612788
Mean Squared Error (Random Forest): 0.0002636018975797555
Aggregated Predictions by Time Range (Random Forest):
          time_range  predicted_Global_active_power  \
0  06:00:00-10:00:00                       1.270864   
1  10:00:00-12:00:00                       1.257415   
2  12:00:00-15:00:00                       1.148845   
3  15:00:00-17:00:00                       0.971005   
4  17:00:00-23:00:00                       1.548844   
5  23:00:00-06:00:00                       0.559755   

   predicted_Global_reactive_power  predicted_Voltage  \
0                         0.107485         240.197418   
1                         0.122382         239.391888   
2                         0.135074         240.794112   
3                     