<a href="https://colab.research.google.com/github/brockgion/brockgion.github.io/blob/master/comparing_LSTM_vs_Linear_Regression_performance_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Master's Thesis: Household Load Forecasting with LSTM
# Data Loading and Setup

import pandas as pd
import numpy as np
import os
import glob
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries loaded successfully!")
print(f"Pandas version: {pd.__version__}")

✓ Libraries loaded successfully!
Pandas version: 2.2.2


In [13]:
# Define base path and month folders (NOTE: THIS WILL ONLY WORK IF LOGGED INTO GOOGLE DRIVE ACCT)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
base_path = "/content/drive/MyDrive/pt1"

month_folders = [
    "November 2024",
    "December 2024",
    "January 2025",
    "February 2025",
    "March 2025",
    "April 2025",
    "May 2025",
    "June 2025",
    "July 2025",
    "August 2025",
    "September 2025",
    "October 2025"
]

# Function to load a single month's 15MIN data
def load_month_data(month_folder):
    month_path = os.path.join(base_path, month_folder)
    search_pattern = os.path.join(month_path, "*", "*15MIN.csv")
    files = glob.glob(search_pattern)

    if len(files) == 0:
        print(f"⚠️  WARNING: No 15MIN file found in {month_folder}")
        return None

    filepath = files[0]
    print(f"✓ Loading: {month_folder}")
    df = pd.read_csv(filepath)
    df['source_month'] = month_folder
    return df

# Load all months
print("="*60)
print("LOADING DATA FROM ALL MONTHS")
print("="*60)

all_data = []
for month in month_folders:
    month_df = load_month_data(month)
    if month_df is not None:
        all_data.append(month_df)
        print(f"  → Loaded {len(month_df):,} rows")

print(f"\n✓ Successfully loaded {len(all_data)} months of data")


LOADING DATA FROM ALL MONTHS
✓ Loading: November 2024
  → Loaded 2,411 rows
✓ Loading: December 2024
  → Loaded 2,977 rows
✓ Loading: January 2025
  → Loaded 2,977 rows
✓ Loading: February 2025
  → Loaded 2,689 rows
✓ Loading: March 2025
  → Loaded 2,973 rows
✓ Loading: April 2025
  → Loaded 2,881 rows
✓ Loading: May 2025
  → Loaded 2,977 rows
✓ Loading: June 2025
  → Loaded 2,881 rows
✓ Loading: July 2025
  → Loaded 2,977 rows
✓ Loading: August 2025
  → Loaded 2,977 rows
✓ Loading: September 2025
  → Loaded 2,881 rows
✓ Loading: October 2025
  → Loaded 2,977 rows

✓ Successfully loaded 12 months of data


In [15]:
# Combine all months
print("Combining all monthly data...")
df_combined = pd.concat(all_data, ignore_index=True)
print(f"✓ Combined shape: {df_combined.shape}")

# Parse timestamp and sort
print("\nCreating features...")
df_combined['timestamp'] = pd.to_datetime(df_combined['Time Bucket (America/Chicago)'])
df_combined = df_combined.sort_values('timestamp').reset_index(drop=True)

# Create total household power (TARGET variable)
df_combined['total_power_kw'] = (
    df_combined['SWORDFISH VUE-Mains_A (kWatts)'] +
    df_combined['SWORDFISH VUE-Mains_B (kWatts)']
)

# Create time-based features
df_combined['hour'] = df_combined['timestamp'].dt.hour
df_combined['day_of_week'] = df_combined['timestamp'].dt.dayofweek
df_combined['month'] = df_combined['timestamp'].dt.month
df_combined['is_weekend'] = (df_combined['day_of_week'] >= 5).astype(int)

# Set timestamp as index
df_combined = df_combined.set_index('timestamp')

print(f"✓ Date range: {df_combined.index[0]} to {df_combined.index[-1]}")
print(f"✓ Power statistics:")
print(f"   Mean: {df_combined['total_power_kw'].mean():.3f} kW")
print(f"   Min:  {df_combined['total_power_kw'].min():.3f} kW")
print(f"   Max:  {df_combined['total_power_kw'].max():.3f} kW")
print(f"\n✓ Features created: total_power_kw, hour, day_of_week, month, is_weekend")
print(f"✓ Final shape: {df_combined.shape}")

Combining all monthly data...
✓ Combined shape: (34578, 23)

Creating features...
✓ Date range: 2024-11-07 13:30:00 to 2025-11-01 00:00:00
✓ Power statistics:
   Mean: 1.033 kW
   Min:  0.079 kW
   Max:  13.322 kW

✓ Features created: total_power_kw, hour, day_of_week, month, is_weekend
✓ Final shape: (34578, 28)


In [16]:
# Create train/test split (80% train, 20% test)
print("Creating train/test split...")

split_idx = int(len(df_combined) * 0.8)
train_data = df_combined.iloc[:split_idx].copy()
test_data = df_combined.iloc[split_idx:].copy()

print(f"\n✓ Training Set: {len(train_data):,} rows")
print(f"   Date range: {train_data.index[0]} to {train_data.index[-1]}")
print(f"\n✓ Test Set: {len(test_data):,} rows")
print(f"   Date range: {test_data.index[0]} to {test_data.index[-1]}")
print(f"\n✓ Split verification: {'PASS' if train_data.index[-1] < test_data.index[0] else 'FAIL'}")

Creating train/test split...

✓ Training Set: 27,662 rows
   Date range: 2024-11-07 13:30:00 to 2025-08-20 23:30:00

✓ Test Set: 6,916 rows
   Date range: 2025-08-20 23:45:00 to 2025-11-01 00:00:00

✓ Split verification: PASS


In [17]:
# Build Linear Regression Baseline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print("Building Linear Regression Baseline...\n")

# Define features and target
feature_cols = ['hour', 'day_of_week', 'month', 'is_weekend']
target_col = 'total_power_kw'

X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Train model
print("Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print("✓ Model trained!\n")

# Make predictions
y_test_pred = lr_model.predict(X_test)

# Calculate metrics
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mape = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
test_r2 = r2_score(y_test, y_test_pred)

print("="*60)
print("LINEAR REGRESSION BASELINE RESULTS (Table 2)")
print("="*60)
print(f"   RMSE: {test_rmse:.4f} kW")
print(f"   MAE:  {test_mae:.4f} kW")
print(f"   MAPE: {test_mape:.2f}%")
print(f"   R²:   {test_r2:.4f}")
print("="*60)

# Store for comparison
baseline_results = {
    'Model': 'Linear Regression',
    'RMSE': test_rmse,
    'MAE': test_mae,
    'MAPE': test_mape,
    'R2': test_r2
}

print("\n✓ Baseline complete! Ready for LSTM comparison.")

Building Linear Regression Baseline...

Training Linear Regression...
✓ Model trained!

LINEAR REGRESSION BASELINE RESULTS (Table 2)
   RMSE: 0.9376 kW
   MAE:  0.5777 kW
   MAPE: 65.30%
   R²:   0.0439

✓ Baseline complete! Ready for LSTM comparison.


In [None]:
# ============================================================
# Long Short-Term Memory (LSTM) MODEL
# ============================================================

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

print("Building LSTM Model...\n")

# Step 1: Create sequences (use past 24 hours to predict next value)
print("Step 1: Creating sequences...")
lookback = 96  # 24 hours * 4 (15-min intervals)

scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_data[['total_power_kw']])
test_scaled = scaler.transform(test_data[['total_power_kw']])

def create_sequences(data, lookback):
    X, y = [], []
    for i in range(len(data) - lookback):
        X.append(data[i:i+lookback])
        y.append(data[i+lookback])
    return np.array(X), np.array(y)

X_train, y_train = create_sequences(train_scaled, lookback)
X_test, y_test = create_sequences(test_scaled, lookback)

print(f"   Training sequences: {X_train.shape}")
print(f"   Test sequences: {X_test.shape}")

# Step 2: Build LSTM architecture
print("\nStep 2: Building LSTM model...")
model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, input_shape=(lookback, 1)),
    Dropout(0.2),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
print("   ✓ Model architecture created")

# Step 3: Train model
print("\nStep 3: Training LSTM (this takes 5-10 minutes)...\n")
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

print("\n✓ Training complete!")

Building LSTM Model...

Step 1: Creating sequences...
   Training sequences: (27566, 96, 1)
   Test sequences: (6820, 96, 1)

Step 2: Building LSTM model...
   ✓ Model architecture created

Step 3: Training LSTM (this takes 5-10 minutes)...

Epoch 1/20
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 106ms/step - loss: 0.0095 - mae: 0.0532 - val_loss: 0.0052 - val_mae: 0.0399
Epoch 2/20
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 102ms/step - loss: 0.0078 - mae: 0.0480 - val_loss: 0.0049 - val_mae: 0.0355
Epoch 3/20
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 101ms/step - loss: 0.0069 - mae: 0.0443 - val_loss: 0.0045 - val_mae: 0.0351
Epoch 4/20
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 101ms/step - loss: 0.0071 - mae: 0.0447 - val_loss: 0.0046 - val_mae: 0.0339
Epoch 5/20
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 98ms/step - loss: 0.0069 - mae: 0.0441 - val_loss: 0.0049 - 