<a href="https://colab.research.google.com/github/brockgion/brockgion.github.io/blob/master/ITERATION2_(SEASONAL_SUMMER_3_MONTHS)_comparing_LSTM_vs_Linear_Regression_performance_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Electricity Household Load Forecasting with LSTM
# Data Loading and Setup

import pandas as pd
import numpy as np
import os
import glob
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries loaded successfully!")
print(f"Pandas version: {pd.__version__}")

✓ Libraries loaded successfully!
Pandas version: 2.2.2


In [2]:
# Define base path and month folders (NOTE: THIS WILL ONLY WORK IF LOGGED INTO GOOGLE DRIVE ACCT)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
base_path = "/content/drive/MyDrive/pt1"

month_folders = [
    # "November 2024",
    # "December 2024",
    # "January 2025",
    # "February 2025",
    # "March 2025",
    # "April 2025",
    # "May 2025",
    "June 2025",
    "July 2025",
    "August 2025",
    # "September 2025",
    # "October 2025"
]

# Function to load a single month's 15MIN data
def load_month_data(month_folder):
    month_path = os.path.join(base_path, month_folder)
    search_pattern = os.path.join(month_path, "*", "*15MIN.csv")
    files = glob.glob(search_pattern)

    if len(files) == 0:
        print(f"⚠️  WARNING: No 15MIN file found in {month_folder}")
        return None

    filepath = files[0]
    print(f"✓ Loading: {month_folder}")
    df = pd.read_csv(filepath)
    df['source_month'] = month_folder
    return df

# Load all months
print("="*60)
print("LOADING DATA FROM ALL MONTHS")
print("="*60)

all_data = []
for month in month_folders:
    month_df = load_month_data(month)
    if month_df is not None:
        all_data.append(month_df)
        print(f"  → Loaded {len(month_df):,} rows")

print(f"\n✓ Successfully loaded {len(all_data)} months of data")


LOADING DATA FROM ALL MONTHS
✓ Loading: June 2025
  → Loaded 2,881 rows
✓ Loading: July 2025
  → Loaded 2,977 rows
✓ Loading: August 2025
  → Loaded 2,977 rows

✓ Successfully loaded 3 months of data


In [4]:
# Combine all months
print("Combining all monthly data...")
df_combined = pd.concat(all_data, ignore_index=True)
print(f"✓ Combined shape: {df_combined.shape}")

# Parse timestamp and sort
print("\nCreating features...")
df_combined['timestamp'] = pd.to_datetime(df_combined['Time Bucket (America/Chicago)'])
df_combined = df_combined.sort_values('timestamp').reset_index(drop=True)

# Create total household power (TARGET variable)
df_combined['total_power_kw'] = (
    df_combined['SWORDFISH VUE-Mains_A (kWatts)'] +
    df_combined['SWORDFISH VUE-Mains_B (kWatts)']
)

# Create time-based features
df_combined['hour'] = df_combined['timestamp'].dt.hour
df_combined['day_of_week'] = df_combined['timestamp'].dt.dayofweek
df_combined['month'] = df_combined['timestamp'].dt.month
df_combined['is_weekend'] = (df_combined['day_of_week'] >= 5).astype(int)

# Set timestamp as index
df_combined = df_combined.set_index('timestamp')

print(f"✓ Date range: {df_combined.index[0]} to {df_combined.index[-1]}")
print(f"✓ Power statistics:")
print(f"   Mean: {df_combined['total_power_kw'].mean():.3f} kW")
print(f"   Min:  {df_combined['total_power_kw'].min():.3f} kW")
print(f"   Max:  {df_combined['total_power_kw'].max():.3f} kW")
print(f"\n✓ Features created: total_power_kw, hour, day_of_week, month, is_weekend")
print(f"✓ Final shape: {df_combined.shape}")

Combining all monthly data...
✓ Combined shape: (8835, 21)

Creating features...
✓ Date range: 2025-06-01 00:00:00 to 2025-09-01 00:00:00
✓ Power statistics:
   Mean: 1.185 kW
   Min:  0.285 kW
   Max:  10.781 kW

✓ Features created: total_power_kw, hour, day_of_week, month, is_weekend
✓ Final shape: (8835, 26)


In [5]:
# Create train/test split (80% train, 20% test)
print("Creating train/test split...")

split_idx = int(len(df_combined) * 0.8)
train_data = df_combined.iloc[:split_idx].copy()
test_data = df_combined.iloc[split_idx:].copy()

print(f"\n✓ Training Set: {len(train_data):,} rows")
print(f"   Date range: {train_data.index[0]} to {train_data.index[-1]}")
print(f"\n✓ Test Set: {len(test_data):,} rows")
print(f"   Date range: {test_data.index[0]} to {test_data.index[-1]}")
print(f"\n✓ Split verification: {'PASS' if train_data.index[-1] < test_data.index[0] else 'FAIL'}")

Creating train/test split...

✓ Training Set: 7,068 rows
   Date range: 2025-06-01 00:00:00 to 2025-08-13 14:15:00

✓ Test Set: 1,767 rows
   Date range: 2025-08-13 14:30:00 to 2025-09-01 00:00:00

✓ Split verification: PASS


In [6]:
# ============================================================
# Linear Regression MODEL
# ============================================================
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

print("Building Linear Regression Baseline...\n")

# Define features and target
feature_cols = ['hour', 'day_of_week', 'month', 'is_weekend']
target_col = 'total_power_kw'

X_train = train_data[feature_cols]
y_train = train_data[target_col]
X_test = test_data[feature_cols]
y_test = test_data[target_col]

# Train model
print("Training Linear Regression...")
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
print("✓ Model trained!\n")

# Make predictions
y_test_pred = lr_model.predict(X_test)

# Calculate metrics
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mape = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100
test_r2 = r2_score(y_test, y_test_pred)

print("="*60)
print("LINEAR REGRESSION BASELINE RESULTS (Table 2)")
print("="*60)
print(f"   RMSE: {test_rmse:.4f} kW")
print(f"   MAE:  {test_mae:.4f} kW")
print(f"   MAPE: {test_mape:.2f}%")
print(f"   R²:   {test_r2:.4f}")
print("="*60)

# Store for comparison
baseline_results = {
    'Model': 'Linear Regression',
    'RMSE': test_rmse,
    'MAE': test_mae,
    'MAPE': test_mape,
    'R2': test_r2
}

print("\n✓ Baseline complete! Ready for LSTM comparison.")

Building Linear Regression Baseline...

Training Linear Regression...
✓ Model trained!

LINEAR REGRESSION BASELINE RESULTS (Table 2)
   RMSE: 0.9669 kW
   MAE:  0.6542 kW
   MAPE: 58.58%
   R²:   0.0625

✓ Baseline complete! Ready for LSTM comparison.


In [7]:
# ============================================================
# Long Short-Term Memory (LSTM) MODEL
# ============================================================

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

print("Building LSTM Model...\n")

# Step 1: Create sequences (use past 24 hours to predict next value)
print("Step 1: Creating sequences...")
lookback = 96  # 24 hours * 4 (15-min intervals)

scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_data[['total_power_kw']])
test_scaled = scaler.transform(test_data[['total_power_kw']])

def create_sequences(data, lookback):
    X, y = [], []
    for i in range(len(data) - lookback):
        X.append(data[i:i+lookback])
        y.append(data[i+lookback])
    return np.array(X), np.array(y)

X_train, y_train = create_sequences(train_scaled, lookback)
X_test, y_test = create_sequences(test_scaled, lookback)

print(f"   Training sequences: {X_train.shape}")
print(f"   Test sequences: {X_test.shape}")

# Step 2: Build LSTM architecture
print("\nStep 2: Building LSTM model...")
model = Sequential([
    LSTM(50, activation='relu', return_sequences=True, input_shape=(lookback, 1)),
    Dropout(0.2),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
print("   ✓ Model architecture created")

# Step 3: Train model
print("\nStep 3: Training LSTM (this takes 5-10 minutes)...\n")
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

print("\n✓ Training complete!")

Building LSTM Model...

Step 1: Creating sequences...
   Training sequences: (6972, 96, 1)
   Test sequences: (1671, 96, 1)

Step 2: Building LSTM model...
   ✓ Model architecture created

Step 3: Training LSTM (this takes 5-10 minutes)...

Epoch 1/20
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 51ms/step - loss: 0.0102 - mae: 0.0614 - val_loss: 0.0065 - val_mae: 0.0407
Epoch 2/20
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 53ms/step - loss: 0.0082 - mae: 0.0535 - val_loss: 0.0057 - val_mae: 0.0482
Epoch 3/20
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 57ms/step - loss: 0.0076 - mae: 0.0506 - val_loss: 0.0053 - val_mae: 0.0424
Epoch 4/20
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 52ms/step - loss: 0.0066 - mae: 0.0464 - val_loss: 0.0052 - val_mae: 0.0450
Epoch 5/20
[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 50ms/step - loss: 0.0078 - mae: 0.0521 - val_loss: 0.0050 - val_mae:

In [8]:
# Step 4: Generate predictions
print("Step 4: Generating predictions...")
y_test_pred_scaled = model.predict(X_test, verbose=0)

# Inverse transform
y_test_actual = scaler.inverse_transform(y_test)
y_test_pred = scaler.inverse_transform(y_test_pred_scaled)

print("✓ Predictions generated!")

# ============================================================
# Save the trained model
print("\nSaving model + scaler to Google Drive...")

# Save model
model.save('/content/drive/MyDrive/lstm_model_summer_UNIVARIATE.h5')

# Save scaler params (just like your original method)
scaler_params = {
    'min': scaler.min_,
    'scale': scaler.scale_
}

import pickle
with open('/content/drive/MyDrive/scaler_params_summer_UNIVARIATE.pkl', 'wb') as f:
    pickle.dump(scaler_params, f)

print("✓ Model and scaler saved successfully!")
# ============================================================

# Step 5: Calculate metrics
lstm_rmse = np.sqrt(mean_squared_error(y_test_actual, y_test_pred))
lstm_mae = mean_absolute_error(y_test_actual, y_test_pred)
lstm_mape = np.mean(np.abs((y_test_actual - y_test_pred) / y_test_actual)) * 100
lstm_r2 = r2_score(y_test_actual, y_test_pred)

print("\n" + "="*60)
print("LSTM MODEL RESULTS (Table 2)")
print("="*60)
print(f"   RMSE: {lstm_rmse:.4f} kW")
print(f"   MAE:  {lstm_mae:.4f} kW")
print(f"   MAPE: {lstm_mape:.2f}%")
print(f"   R²:   {lstm_r2:.4f}")
print("="*60)

# Comparison with baseline
print("\n" + "="*60)
print("COMPARISON: LSTM vs BASELINE")
print("="*60)
print(f"Linear Regression: RMSE={baseline_results['RMSE']:.4f}, R²={baseline_results['R2']:.4f}")
print(f"LSTM:              RMSE={lstm_rmse:.4f}, R²={lstm_r2:.4f}")
print(f"\nImprovement:")
print(f"   RMSE: {((baseline_results['RMSE']-lstm_rmse)/baseline_results['RMSE']*100):.1f}% better")
print(f"   R²:   {lstm_r2:.4f} (baseline was {baseline_results['R2']:.4f})")
print("="*60)

lstm_results = {
    'Model': 'LSTM',
    'RMSE': lstm_rmse,
    'MAE': lstm_mae,
    'MAPE': lstm_mape,
    'R2': lstm_r2
}

print("\n✓ LSTM MODEL COMPLETE! Table 2 ready.")

Step 4: Generating predictions...




✓ Predictions generated!

Saving model + scaler to Google Drive...
✓ Model and scaler saved successfully!

LSTM MODEL RESULTS (Table 2)
   RMSE: 0.8021 kW
   MAE:  0.3930 kW
   MAPE: 26.12%
   R²:   0.3600

COMPARISON: LSTM vs BASELINE
Linear Regression: RMSE=0.9669, R²=0.0625
LSTM:              RMSE=0.8021, R²=0.3600

Improvement:
   RMSE: 17.0% better
   R²:   0.3600 (baseline was 0.0625)

✓ LSTM MODEL COMPLETE! Table 2 ready.


In [9]:
# Create Table 2 results
import pandas as pd

table_2 = pd.DataFrame([
    {
        'Model': 'Linear Regression',
        'RMSE (kW)': f"{baseline_results['RMSE']:.4f}",
        'MAE (kW)': f"{baseline_results['MAE']:.4f}",
        'MAPE (%)': f"{baseline_results['MAPE']:.2f}",
        'R²': f"{baseline_results['R2']:.4f}"
    },
    {
        'Model': 'LSTM',
        'RMSE (kW)': f"{lstm_results['RMSE']:.4f}",
        'MAE (kW)': f"{lstm_results['MAE']:.4f}",
        'MAPE (%)': f"{lstm_results['MAPE']:.2f}",
        'R²': f"{lstm_results['R2']:.4f}"
    }
])

print("\n" + "="*70)
print("TABLE 2: Load Forecasting Model Performance Comparison")
print("="*70)
print(table_2.to_string(index=False))
print("="*70)


TABLE 2: Load Forecasting Model Performance Comparison
            Model RMSE (kW) MAE (kW) MAPE (%)     R²
Linear Regression    0.9669   0.6542    58.58 0.0625
             LSTM    0.8021   0.3930    26.12 0.3600
