# Machine Learning for Power Disaggregation

### Import Libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

### Mount Drive

In [2]:
# Mount Google Drive (if using Colab)
# google.colab can only be imported inside the Colab runtime, so the import is
# guarded: in any other environment the mount is skipped with a message instead
# of stopping the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

Mounted at /content/drive


### Load Data & preprocess time

In [3]:
# List of CSV files to use for training
csv_files = ['/content/drive/MyDrive/50_ResidentialPowerDisaggregation_SD_Fall23/1.2 Software/Colab Notebooks/ML Models/Andrew/MLData_long.csv']  # Add more file names as needed

# Load every CSV and stack the frames into one, renumbering the rows 0..N-1
data_list = [pd.read_csv(csv_file) for csv_file in csv_files]
data = pd.concat(data_list, ignore_index=True)

# Preprocess the timestamp column to extract relevant information
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['Hour'] = data['timestamp'].dt.hour
data['DayOfWeek'] = data['timestamp'].dt.dayofweek
data['Month'] = data['timestamp'].dt.month

# Replace negative readings with 0. Only numeric columns are clipped: calling
# clip(lower=0) on a text column raises TypeError, and the timestamp column is
# not numeric so it is left untouched, exactly as before.
numeric_columns = data.select_dtypes(include='number').columns
data[numeric_columns] = data[numeric_columns].clip(lower=0)

# Remove rows with null values (clipping keeps NaN, so they are still dropped here)
data = data.dropna()

### Features, split, and sequences

In [4]:
# Define features and targets
X = data[['Hour', 'DayOfWeek', 'Month', 'Total']]
y = data[['Washer', 'BlowerGH', 'Lights', 'BlowerBed', 'CompGH', 'CompBed', 'Dryer', 'Recs1', 'Recs2', 'WaterHeater']]

# Normalize features and targets
# The scalers are fitted on the leading rows only. The train/test split made
# further down is chronological (first 80% of the sequences train, the rest
# test), so fitting on every row would leak the test period's min/max into
# training. The fitted scalers are then applied to all rows.
SCALER_FIT_FRACTION = 0.80  # keep at or below the split ratio used below
n_fit_rows = max(1, int(len(X) * SCALER_FIT_FRACTION))

scaler_X = MinMaxScaler()
scaler_X.fit(X.iloc[:n_fit_rows])
X = scaler_X.transform(X)  # values from later rows may fall outside [0, 1]

scaler_y = MinMaxScaler()
scaler_y.fit(y.iloc[:n_fit_rows])
y = scaler_y.transform(y)

# Build sliding windows over the rows of X, each paired with one target row
def create_dataset(X, y, time_steps=1):
    """Turn row-aligned X and y into overlapping windows.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with the ``y`` entry at the window's last row.

    Parameters
    ----------
    X : sliceable sequence of feature rows
    y : indexable sequence of targets, aligned row-for-row with ``X``
    time_steps : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    tuple of np.ndarray
        ``(windows, targets)``, one entry per window position. Both are empty
        arrays when ``time_steps`` exceeds ``len(X)``.
    """
    window_starts = range(len(X) - time_steps + 1)
    windows = [X[start:(start + time_steps)] for start in window_starts]
    targets = [y[start + time_steps - 1] for start in window_starts]
    return np.array(windows), np.array(targets)

TIME_STEPS = 10  # rows per input window

# Build the windowed inputs and their matching targets
X_seq, y_seq = create_dataset(X, y, time_steps=TIME_STEPS)

# Chronological split: the leading share of windows trains, the remainder tests
split_ratio = 0.80
split_index = int(split_ratio * len(X_seq))

X_train, X_test = X_seq[:split_index], X_seq[split_index:]
y_train, y_test = y_seq[:split_index], y_seq[split_index:]

## Machine Learning Models

### CNN-LSTM model

In [5]:
# Define a function to create the CNN-LSTM model
def create_cnn_lstm_model(input_shape, n_outputs=10):
    """Build and compile the Conv1D -> LSTM regression network.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, ``(time_steps, n_features)``. With two
        kernel-size-3 convolutions, each followed by pool-size-2 max pooling,
        ``time_steps`` must be at least 10 for the second pooling layer to
        receive a full window.
    n_outputs : int, default 10
        Width of the final Dense layer, i.e. the number of values predicted
        per sample. The default matches the previously hard-coded width.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer and mean-squared-error loss.
    """
    last_conv_filters = 64

    model = Sequential()
    # Declare the input explicitly rather than passing input_shape= to the first
    # layer, which newer Keras releases warn about.
    model.add(tf.keras.Input(shape=input_shape))
    # Add convolutional layers
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=last_conv_filters, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    # Reshape the data for LSTM: (remaining steps, channels of the last Conv1D layer)
    model.add(tf.keras.layers.Reshape((-1, last_conv_filters)))
    # Add LSTM layers; the first returns the full sequence so the second can consume it
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    # Add dense output layer (linear activation, one unit per predicted value)
    model.add(Dense(n_outputs))
    model.compile(optimizer='adam', loss='mse')
    return model

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation and batch shuffling repeat from run to run
# (the tree models already pin random_state=42).
tf.keras.utils.set_random_seed(42)

# Create and compile the CNN-LSTM model
cnn_lstm_model = create_cnn_lstm_model((X_train.shape[1], X_train.shape[2]))

# Train the CNN-LSTM model on your data
# NOTE(review): the test split doubles as validation data here; it is only used
# to report val_loss per epoch, not for early stopping or model selection.
cnn_lstm_model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f1e745ead10>

### Random Forest Model

In [6]:
# Define function to create the Random Forest model
def create_random_forest_model(n_estimators=100, random_state=42):
    """Return an unfitted RandomForestRegressor.

    Parameters
    ----------
    n_estimators : int, default 100
        Number of trees in the forest; adjust as needed.
    random_state : int, default 42
        Seed passed to the regressor so repeated fits give the same forest.

    Returns
    -------
    RandomForestRegressor
    """
    return RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)

# Instantiate the (still unfitted) Random Forest
random_forest_model = create_random_forest_model()

# The tree models need 2-D input, so each (time_steps, n_features) window is
# flattened into a single row of length time_steps * n_features.
X_train_rf = X_train.reshape(len(X_train), -1)
X_test_rf = X_test.reshape(len(X_test), -1)

# Fit the forest on the flattened training windows
random_forest_model.fit(X=X_train_rf, y=y_train)

### Decision Trees model

In [7]:
# Factory for the Decision Trees model
def create_decision_trees_model():
    """Return an unfitted DecisionTreeRegressor seeded with random_state=42."""
    return DecisionTreeRegressor(random_state=42)

# Instantiate the (still unfitted) Decision Trees model
decision_trees_model = create_decision_trees_model()

# Fit it on the flattened training windows prepared for the Random Forest
decision_trees_model.fit(X=X_train_rf, y=y_train)

### Making Predictions

In [8]:
# The CNN-LSTM consumes the 3-D test windows directly
cnn_lstm_predictions = cnn_lstm_model.predict(X_test)

# Random Forest and Decision Trees expect one flattened row per window
X_test_rf = X_test.reshape(len(X_test), -1)
random_forest_predictions, decision_trees_predictions = (
    tree_model.predict(X_test_rf)
    for tree_model in (random_forest_model, decision_trees_model)
)

# Undo the target scaling to get the actual values back in Watts
cnn_lstm_actual_values = scaler_y.inverse_transform(y_test)

# Same inverse scaling for each model's predictions
(
    cnn_lstm_predicted_values,
    random_forest_predicted_values,
    decision_trees_predicted_values,
) = (
    scaler_y.inverse_transform(scaled_predictions)
    for scaled_predictions in (
        cnn_lstm_predictions,
        random_forest_predictions,
        decision_trees_predictions,
    )
)



### Testing and plotting each model

In [9]:
# Extract timestamps for test
# The test windows are the last len(X_test) windows and each window is labelled
# with its final row, so they line up with the last len(X_test) rows of `data`.
# An explicit start offset is used because iloc[-0:] would return every row if
# the test set were empty.
test_timestamps = data['timestamp'].iloc[len(data) - len(X_test):]

appliance_names = ['Washer', 'BlowerGH', 'Lights', 'BlowerBed', 'CompGH', 'CompBed', 'Dryer', 'Recs1', 'Recs2', 'WaterHeater']

# Column order of the final results table (and of the exported CSV)
result_columns = ['Model', 'Appliance', 'Actual Total (Watts)', 'Predicted Total (Watts)', 'MAPE', 'RMSE', 'MAE', 'MAPE Agg']

# List of model names
model_names = ['CNN-LSTM', 'Random Forest', 'Decision Trees']

# List of models
models = [cnn_lstm_model, random_forest_model, decision_trees_model]

# Number of consecutive samples summed into one block for the lower-resolution MAPE
block_size = 15


def _mape(actual, predicted):
    """Mean absolute percentage error, in percent, over points with non-zero actual.

    Points whose actual value is (numerically) zero are skipped: the percentage
    error is undefined there and would turn the mean into inf/nan. Returns NaN
    when no point has a non-zero actual value.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    usable = ~np.isclose(actual, 0.0)
    if not usable.any():
        return np.nan
    return np.mean(np.abs((actual[usable] - predicted[usable]) / actual[usable])) * 100


def _block_sums(values, size):
    """Sum a 1-D array over consecutive blocks of `size` samples, dropping the incomplete tail."""
    n_complete_blocks = len(values) // size
    return values[:n_complete_blocks * size].reshape(n_complete_blocks, size).sum(axis=1)


# The actual values do not depend on the model, so inverse-transform them once
actual_values = scaler_y.inverse_transform(y_test)

# One dict per (model, appliance); turned into a DataFrame once at the end.
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row is quadratic anyway.
result_records = []

# Loop through each model
for model_name, model in zip(model_names, models):
    if model_name in ('Random Forest', 'Decision Trees'):
        predictions = model.predict(X_test_rf)  # Use reshaped data for Random Forest and Decision Trees
    else:
        predictions = model.predict(X_test)  # Use original data for CNN-LSTM

    # Inverse transform the predictions to get them back in Watts
    predicted_values = scaler_y.inverse_transform(predictions)

    for idx, appliance in enumerate(appliance_names):
        actual_series = actual_values[:, idx]
        predicted_series = predicted_values[:, idx]

        fig = plt.figure(figsize=(15, 7))

        # Plot actual usage
        plt.plot(test_timestamps, actual_series, label='Actual', color='C' + str(idx))

        # Plot predicted usage (offset by 7 in the colour cycle so it never matches the actual line)
        plt.plot(test_timestamps, predicted_series, label='Predicted', color='C' + str(idx + 7))

        plt.title(f"{model_name} - {appliance} - Actual vs Predicted")
        plt.xlabel('Timestamp')
        plt.ylabel('Power Consumption (Watts)')
        plt.legend()
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure; 30 figures are created in this loop

        # Calculate actual and predicted total power in watts for the current appliance
        actual_total = actual_series.sum()
        predicted_total = predicted_series.sum()

        # Mean Absolute Percentage Error (MAPE) at full resolution
        mape = _mape(actual_series, predicted_series)

        # MAPE at lower resolution: compare sums over complete blocks of block_size samples
        mape_agg = _mape(_block_sums(actual_series, block_size),
                         _block_sums(predicted_series, block_size))

        # Calculate the Root Mean Square Error (RMSE) for the current appliance
        rmse = np.sqrt(np.mean((actual_series - predicted_series) ** 2))

        # Calculate the Mean Absolute Error (MAE) for the current appliance
        mae = np.mean(np.abs(actual_series - predicted_series))

        result_records.append({'Model': model_name,
                               'Appliance': appliance,
                               'Actual Total (Watts)': actual_total,
                               'Predicted Total (Watts)': predicted_total,
                               'MAPE': mape,
                               'RMSE': rmse,
                               'MAE': mae,
                               'MAPE Agg': mape_agg})

    # Calculate and display the overall total power usage comparison
    actual_total_combined = actual_values.sum()
    predicted_total_combined = predicted_values.sum()
    if actual_total_combined != 0:
        combined_percentage_error = ((actual_total_combined - predicted_total_combined) / actual_total_combined) * 100
    else:
        combined_percentage_error = np.nan  # undefined when nothing was consumed

    print(f"Total Power Usage for All Appliances Combined (Actual): {actual_total_combined:.2f} Watts")
    print(f"Total Power Usage for All Appliances Combined (Predicted): {predicted_total_combined:.2f} Watts")
    print(f"Percentage Error for All Appliances Combined: {combined_percentage_error:.2f}%")
    print()

# Build the results DataFrame for all models in one go
results_df = pd.DataFrame(result_records, columns=result_columns)

# Display the results DataFrame for all models
print(results_df)


Output hidden; open in https://colab.research.google.com to view.

#### Export Error Values Calculated to CSV

In [10]:
# Write the per-model, per-appliance error table to the working directory, without the index column
results_df.to_csv(path_or_buf='error_values.csv', index=False)