In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the data
FLOOR = 7
file_path = f'../Dataset/Sort/Building_energy_consumption/filtered_merged_data_Floor{FLOOR}.csv'
data = pd.read_csv(file_path)

# Convert the date column to datetime format and sort chronologically.
# sort_values keeps every row's original index label, so rows can still be
# matched against the raw file by label afterwards.
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# Preliminary gap filling. Forward fill carries the last observation forward;
# it cannot fill rows that precede a column's first observation, so those are
# back-filled. Without this, leading NaNs pass through MinMaxScaler unchanged
# and end up inside the training windows, which turns the loss into NaN.
data = data.ffill().bfill()

# Select numerical columns for normalization (float64 columns only)
numerical_columns = data.select_dtypes(include=['float64']).columns
scaler = MinMaxScaler()
scaled_data = data.copy()
scaled_data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Convert the data to the format suitable for LSTM input
def create_dataset(dataset, time_step=1):
    """Slice a time-ordered array into (window, next-row) training pairs.

    Sample ``i`` uses rows ``i .. i + time_step - 1`` as input and row
    ``i + time_step`` as target, so ``len(dataset) - time_step`` samples are
    produced (including the one whose target is the final row).

    Args:
        dataset: Array-like whose first axis is time; any trailing axes are
            treated as the per-step feature layout.
        time_step: Number of consecutive rows in each input window.

    Returns:
        Tuple ``(dataX, dataY)`` of NumPy arrays with shapes
        ``(n_samples, time_step, *row_shape)`` and ``(n_samples, *row_shape)``.
        When the input is too short to yield a single sample, both arrays are
        empty but keep those trailing dimensions.

    Raises:
        ValueError: If ``time_step`` is negative.
    """
    if time_step < 0:
        raise ValueError("time_step must be non-negative")

    dataset = np.asarray(dataset)
    row_shape = dataset.shape[1:]
    n_samples = len(dataset) - time_step

    if n_samples <= 0:
        # Keep the rank stable so callers can still index .shape[1] / .shape[2].
        empty_x = np.empty((0, time_step) + row_shape, dtype=dataset.dtype)
        empty_y = np.empty((0,) + row_shape, dtype=dataset.dtype)
        return empty_x, empty_y

    dataX = np.stack([dataset[start:start + time_step] for start in range(n_samples)])
    # Targets are simply the rows from index time_step onwards; copy so the
    # result does not alias the caller's array.
    dataY = dataset[time_step:].copy()
    return dataX, dataY

# Number of consecutive rows handed to the model for each sample.
time_step = 10

# From here on `data` is the plain matrix of scaled numerical columns.
data = scaled_data[numerical_columns].to_numpy()
trainX, trainY = create_dataset(data, time_step)

# Spell out the LSTM input layout [samples, time steps, features]; the array
# already has this shape, so the reshape only documents/asserts it.
samples, steps, features = trainX.shape
trainX = trainX.reshape(samples, steps, features)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [None]:
# Convert the windowed NumPy arrays to float32 tensors (the dtype of the
# default nn.LSTM / nn.Linear parameters).
trainX_tensor = torch.as_tensor(trainX, dtype=torch.float32)
trainY_tensor = torch.as_tensor(trainY, dtype=torch.float32)

# Pair each input window with its target row and serve them in shuffled
# mini-batches of 64.
train_dataset = TensorDataset(trainX_tensor, trainY_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Define LSTM model
class LSTMModel(nn.Module):
    """Single-layer LSTM followed by a linear read-out of the last time step.

    Args:
        input_size: Number of features per time step.
        output_size: Number of values predicted for each sequence.
        hidden_layer_size: Width of the LSTM hidden state.
    """

    def __init__(self, input_size, output_size, hidden_layer_size=50):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_layer_size,
            batch_first=True,
        )
        self.linear = nn.Linear(
            in_features=hidden_layer_size,
            out_features=output_size,
        )

    def forward(self, input_seq):
        """Return one prediction vector per sequence in ``input_seq``."""
        sequence_output, _state = self.lstm(input_seq)
        # Only the output at the final time step feeds the linear head.
        final_step = sequence_output[:, -1]
        return self.linear(final_step)

# Initialize the model, loss function, and optimizer.
# The network sees as many features per step as the windows' last axis holds
# and predicts one value per target column.
input_size = trainX.shape[2]
output_size = trainY.shape[1]

loss_function = nn.MSELoss()
model = LSTMModel(input_size, output_size, hidden_layer_size=50)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Train the model
epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch + 1}") as tepoch:
        for batches_seen, (seq, labels) in enumerate(tepoch, start=1):
            optimizer.zero_grad()

            # nn.LSTM starts from a zero hidden/cell state whenever none is
            # passed in, so no per-batch state needs to be created here.
            y_pred = model(seq)

            single_loss = loss_function(y_pred, labels)
            single_loss.backward()
            optimizer.step()

            total_loss += single_loss.item()
            # Running mean over the batches processed so far; dividing by the
            # full epoch length would under-report until the last batch.
            tepoch.set_postfix(loss=total_loss / batches_seen)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    print(f'Epoch {epoch + 1} loss: {total_loss / max(len(train_loader), 1)}')

In [None]:
# Use the trained model for prediction and fill missing values
model.eval()

# One-step-ahead prediction for every window. Batched in chunks instead of one
# sample at a time; the result always stays 2-D (samples, columns), even when
# there is a single sample.
with torch.no_grad():
    predictions = torch.cat(
        [model(chunk) for chunk in trainX_tensor.split(1024)]
    ).numpy()

# Ensure that the scaled predictions are not less than 0. With the default
# MinMaxScaler range this bounds each column below at the minimum the scaler
# was fitted on.
predictions = np.maximum(predictions, 0)

# Inverse transform the predictions to the original scale
predictions = scaler.inverse_transform(predictions)

# Start from the preliminarily filled data, converted back to original units.
filled_data = scaled_data.copy()
filled_values = scaler.inverse_transform(scaled_data[numerical_columns])

# The in-memory frames no longer contain gaps (they were filled before
# scaling), so the positions that were missing are recovered from the raw
# file. Rows are matched by index label because sorting by date reordered them.
raw_missing = (
    pd.read_csv(file_path, usecols=list(numerical_columns))
    .isna()
    .loc[scaled_data.index, numerical_columns]
    .to_numpy(dtype=bool)
)

# predictions[k] is the model's estimate for sorted row k + time_step, so the
# first time_step rows have no prediction and keep their preliminary fill.
first_row = time_step
last_row = first_row + len(predictions)
use_prediction = raw_missing[first_row:last_row]
filled_values[first_row:last_row] = np.where(
    use_prediction, predictions, filled_values[first_row:last_row]
)
filled_data[numerical_columns] = filled_values

# Round the numerical values to two decimal places
filled_data[numerical_columns] = filled_data[numerical_columns].round(2)

# Save the filled data
filled_data.to_csv(f'../Dataset/Sort/Building_energy_consumption/filled_filtered_merged_data_Floor{FLOOR}.csv', index=False)