In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

In [None]:
# Load the hourly weather features and the per-rate-class load series, join the
# residential ('Res') load with the weather on the timestamp, and min-max scale
# the model columns. Produces: weather_df, rate_classes, feature_matrix, scaler.
Xpath = '../Data/X/'
Ypath = "../Data/Y/"
LOAD_FILE_SUFFIX = "_cleaned.csv"
# Share of the (chronological) rows used to fit the scaler. Keep this no larger
# than the training share implied by test_size=0.2 in the train/test split so
# the scaler never sees rows from the test period.
TRAIN_FRACTION = 0.8

weather_df = pd.read_csv(Xpath + "hourly_weather_newark.csv")
weather_df["Date"] = pd.to_datetime(weather_df["Date"]).dt.round('h')
# Rounding can map several observations onto the same hour. Keep one row per
# hour, otherwise the merge below would emit the same load row several times.
weather_df = weather_df.drop_duplicates(subset="Date", keep="first")

# Only pick up the "<name>_cleaned.csv" load files. A bare os.listdir() also
# returns stray entries (e.g. .ipynb_checkpoints, .DS_Store) that would break
# read_csv or the "Load" lookup. Sorting makes the load order deterministic.
files = sorted(f for f in os.listdir(Ypath) if f.endswith(LOAD_FILE_SUFFIX))

rate_classes = {}

for file in files:
    name = file.removesuffix(LOAD_FILE_SUFFIX)
    df = pd.read_csv(os.path.join(Ypath, file))

    # The load column is named after the rate class; normalise it to "Load".
    df = df.rename(columns={name: "Load"})
    df["Load"] = df["Load"] / 1000  # source values are assumed to be kW; MW = kW / 1000
    df["Date"] = pd.to_datetime(df["Date"])

    rate_classes[name] = df

# Inner join on the timestamp, then enforce chronological order because the
# windowing step relies on row order being time order.
feature_matrix = (
    rate_classes['Res']
    .merge(weather_df, on='Date')
    .sort_values('Date')
    .reset_index(drop=True)
)

# Convert to POSIX timestamps (seconds since January 1, 1970).
feature_matrix['Date'] = feature_matrix['Date'].apply(lambda t: int(t.timestamp()))

# NOTE: Load is deliberately NOT multiplied by a constant before scaling.
# Min-max scaling is invariant to such a factor, so the scaled values would be
# the same, but scaler.inverse_transform would then no longer return MW.
scale_columns = ['Load', 'Temperature (F)', 'Humidity (%)']
n_fit_rows = int(len(feature_matrix) * TRAIN_FRACTION)

# Fit on the training period only, then apply the same transform to every row.
# Values from the later period may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(feature_matrix.iloc[:n_fit_rows][scale_columns])
feature_matrix[scale_columns] = scaler.transform(feature_matrix[scale_columns])
print(feature_matrix.head())

In [None]:
# Turn the scaled feature table into supervised samples: each sample is the
# previous `window_size` rows of the feature columns, and its target is the
# 'Load' value of the row that immediately follows the window.
window_size = 12  # Adjust this as needed
feature_columns = ['Load', 'Temperature (F)', 'Humidity (%)']

# Extract the model columns once as a float array. Indexing whole rows with
# .iloc[i] can produce object-dtype values when the frame has mixed columns.
feature_values = feature_matrix[feature_columns].to_numpy(dtype='float64')
if len(feature_values) <= window_size:
    raise ValueError(
        f"Need more than {window_size} rows to build one window, got {len(feature_values)}"
    )

# sliding_window_view yields shape (n_rows - window_size + 1, n_features, window_size)
# where entry k covers rows k .. k + window_size - 1. The last window has no
# following row to predict, so it is dropped; the axes are then swapped to the
# (samples, timesteps, features) layout the LSTM expects.
windows = np.lib.stride_tricks.sliding_window_view(feature_values, window_size, axis=0)
X = np.ascontiguousarray(windows[:-1].transpose(0, 2, 1))
y = feature_values[window_size:, :1].copy()  # shape (samples, 1): 'Load' is column 0
target_dates = feature_matrix['Date'].iloc[window_size:].reset_index(drop=True)

# Train-test split (without shuffling, so the test set is the latest samples)
X_train, X_test, y_train, y_test, dates_train, dates_test = train_test_split(
    X, y, target_dates, test_size=0.2, shuffle=False
)

# Print shapes for verification
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Show the dtypes that will be handed to Keras (as they are before the cast).
print(X_train.dtype)
print(y_train.dtype)

# Cast both target arrays, not just the training one, so that the evaluation
# code works on a numeric array as well.
y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

In [None]:
# Build, train and evaluate a two-layer LSTM that predicts the next scaled
# 'Load' value from a window of (Load, Temperature, Humidity) rows.
SEED = 42
LOAD_COLUMN_INDEX = 0  # position of 'Load' in the columns the scaler was fitted on


def inverse_scale_load(scaled_load):
    """Map scaled 'Load' values back to the units the scaler was fitted on.

    MinMaxScaler transforms each column as ``x * scale_ + min_``, so a single
    column is inverted with ``(x - min_) / scale_``. This avoids padding the
    values with dummy columns just to call ``inverse_transform``.

    Parameters
    ----------
    scaled_load : array-like
        Scaled load values of any shape; flattened to 1-D.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of un-scaled load values.
    """
    flat = np.asarray(scaled_load, dtype='float64').ravel()
    return (flat - scaler.min_[LOAD_COLUMN_INDEX]) / scaler.scale_[LOAD_COLUMN_INDEX]


# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable from one run to the next.
set_random_seed(SEED)

model = Sequential()
model.add(LSTM(units=128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=128))
model.add(Dropout(0.2))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')

# validation_split holds out the last 10% of the training samples.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

predictions = model.predict(X_test)

# Undo the min-max scaling on both predictions and targets before scoring.
predictions_rescaled = inverse_scale_load(predictions)
y_test_rescaled = inverse_scale_load(y_test)

# Calculate RMSE
rmse = np.sqrt(np.mean((y_test_rescaled - predictions_rescaled) ** 2))
print(f'RMSE: {rmse:.2f}')

In [None]:
# Learning curves: per-epoch loss on the training samples and on the samples
# held out by validation_split. The test set is not involved here, so the
# title names the validation loss rather than "Test".
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Training vs Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()  # the line labels are only visible with a legend
plt.show()

In [None]:
# Compare the un-scaled test targets with the model's predictions over time.
# dates_test holds POSIX seconds for each test target; convert them back to
# datetimes so the x-axis really is time rather than a sample index.
test_times = pd.to_datetime(np.asarray(dates_test), unit='s')

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(test_times, y_test_rescaled, label='Actual Res_cleaned (Load)', marker='o')
ax.plot(test_times, predictions_rescaled, label='Predicted Load', marker='x')
ax.set_title('Actual vs Predicted Load')
ax.set_xlabel('Time')
ax.set_ylabel('Load (MW)')
ax.legend()
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()