# LSTM Model Tutorial

This notebook demonstrates how to use the **LSTMModel** from OpenAD-lib for predicting biogas production using Long Short-Term Memory neural networks.

## Overview

LSTM networks are excellent for:
- Learning temporal patterns in time series data
- Fast real-time predictions once trained
- Capturing complex non-linear relationships between feedstock inputs and biogas outputs

## 1. Setup and Imports

In [None]:
import sys
import os

# Add library to path if not installed
sys.path.insert(0, os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import LSTM Model
from openad_lib.models.ml import LSTMModel
from openad_lib.models.ml.lstm_model import series_to_supervised

# Seed the random number generators once, up front, so that stochastic steps
# are repeatable under Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

# Probe for PyTorch instead of printing a hard-coded "True".
# NOTE(review): LSTMModel is presumably PyTorch-backed (the model is saved to a
# .pt file later in this notebook) -- confirm it honours torch's global seed.
try:
    import torch
except ImportError:
    torch_available = False
else:
    torch.manual_seed(SEED)
    torch_available = True

print("Imports successful!")
print(f"PyTorch available: {torch_available}")

## 2. Load and Explore Data

In [None]:
# Read the bundled sample time series. The data directory is resolved
# relative to the parent of the current working directory.
_parent_dir = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(_parent_dir, 'src', 'openad_lib', 'data')
data_path = os.path.join(DATA_DIR, 'sample_feedstock_timeseries.csv')

# Rows containing any missing value are discarded right after loading.
_raw_frame = pd.read_csv(data_path)
data = _raw_frame.dropna()

print(f"Dataset shape: {data.shape}")
print("\nColumns:", data.columns.tolist(), sep="\n")

# Last expression of the cell: rendered as a preview of the first rows.
data.head()

In [None]:
# Per-column summary statistics of the cleaned frame.
print("=== Data Statistics ===")
summary_stats = data.describe()

# Bare expression so the notebook renders the table.
summary_stats

In [None]:
# Four-panel overview of the dataset:
#   top-left     target time series
#   top-right    target distribution
#   bottom-left  feedstock inputs over time
#   bottom-right correlation matrix of feedstocks and target
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
ax_series, ax_hist = axes[0, 0], axes[0, 1]
ax_feed, ax_corr = axes[1, 0], axes[1, 1]

# --- Target over time ---
ax_series.plot(data.index, data['Total_Biogas'], 'b-', linewidth=0.8)
ax_series.set(
    xlabel='Sample Index',
    ylabel='Total Biogas (m³/day)',
    title='Biogas Production Time Series',
)
ax_series.grid(True, alpha=0.3)

# --- Target histogram ---
ax_hist.hist(data['Total_Biogas'], bins=30, edgecolor='black', alpha=0.7)
ax_hist.set(
    xlabel='Total Biogas (m³/day)',
    ylabel='Frequency',
    title='Biogas Distribution',
)

# --- Feedstock series (only columns that are actually in the frame) ---
feedstock_cols = ['Maize', 'Wholecrop', 'Chicken Litter', 'Lactose', 'Apple Pomace']
for feed_name in [c for c in feedstock_cols if c in data.columns]:
    ax_feed.plot(data.index, data[feed_name], label=feed_name, alpha=0.7)
ax_feed.set(
    xlabel='Sample Index',
    ylabel='Feedstock Amount',
    title='Feedstock Inputs Over Time',
)
ax_feed.legend(loc='upper right')
ax_feed.grid(True, alpha=0.3)

# --- Correlation matrix, colour scale pinned to [-1, 1] ---
corr_cols = feedstock_cols + ['Total_Biogas']
present_cols = [c for c in corr_cols if c in data.columns]
corr_data = data[present_cols].corr()
im = ax_corr.imshow(corr_data, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
tick_positions = range(len(corr_data.columns))
ax_corr.set_xticks(tick_positions)
ax_corr.set_yticks(tick_positions)
ax_corr.set_xticklabels(corr_data.columns, rotation=45, ha='right')
ax_corr.set_yticklabels(corr_data.columns)
ax_corr.set_title('Feature Correlations')
plt.colorbar(im, ax=ax_corr)

plt.tight_layout()
plt.show()

## 3. Prepare Data for LSTM

In [None]:
# Candidate input columns and the column to predict.
feature_columns = ['Maize', 'Wholecrop', 'Chicken Litter', 'Lactose', 'Apple Pomace']
target_column = 'Total_Biogas'

# Keep only the candidate features that are present in the frame, and say so
# explicitly when some are missing instead of dropping them silently.
available_features = [c for c in feature_columns if c in data.columns]
missing_features = [c for c in feature_columns if c not in data.columns]
if missing_features:
    print(f"Warning: feature columns not found and skipped: {missing_features}")

# Fail fast with a clear message rather than building a zero-width X (or
# hitting a bare KeyError on the target) that only breaks further down.
if not available_features:
    raise ValueError(
        f"None of the expected feature columns are present: {feature_columns}"
    )
if target_column not in data.columns:
    raise KeyError(f"Target column {target_column!r} not found in the dataset.")

print(f"Using features: {available_features}")
print(f"Target: {target_column}")

# Plain NumPy arrays: one row per sample, one column per selected feature.
X = data[available_features].values
y = data[target_column].values

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Chronological 80/20 split: the first 80% of rows are used for training and
# the remaining rows for testing. No shuffling, so temporal order is kept.
train_size = int(len(X) * 0.8)

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

## 4. Create and Configure LSTM Model

In [None]:
# Collect the architecture and optimiser settings in one place, then build the
# model from them.
lstm_config = {
    'input_dim': len(available_features),  # one input per selected feature
    'hidden_dim': 32,                      # hidden layer size
    'output_dim': 1,                       # single output (biogas)
    'num_layers': 2,                       # number of LSTM layers
    'dropout': 0.1,                        # dropout for regularization
    'learning_rate': 0.001,                # learning rate
}
lstm = LSTMModel(**lstm_config)

# Echo the settings as stored on the model instance.
print("LSTM Model Configuration:")
reported_attributes = (
    ('Input dimension', 'input_dim'),
    ('Hidden dimension', 'hidden_dim'),
    ('Output dimension', 'output_dim'),
    ('Number of layers', 'num_layers'),
    ('Device', 'device'),
)
for label, attr_name in reported_attributes:
    print(f"  {label}: {getattr(lstm, attr_name)}")

## 5. Train the Model

In [None]:
# Fit the model on the training split with progress output enabled.
print("Training LSTM model...\n")

fit_options = dict(epochs=100, batch_size=8, verbose=True)
lstm.fit(X_train, y_train, **fit_options)

print("\nTraining complete!")

In [None]:
# Loss curve recorded by the model during fit().
# NOTE(review): the axis label says MAE -- confirm this matches the loss that
# LSTMModel.fit actually optimises.
loss_fig, loss_ax = plt.subplots(figsize=(10, 5))
loss_ax.plot(lstm.training_history, 'b-', linewidth=1)
loss_ax.set(
    xlabel='Epoch',
    ylabel='Loss (MAE)',
    title='Training Loss Over Epochs',
)
loss_ax.grid(True, alpha=0.3)
plt.show()

## 6. Make Predictions

In [None]:
# Run the fitted model over both splits (training first, then test).
y_train_pred, y_test_pred = (
    lstm.predict(features) for features in (X_train, X_test)
)

print(f"Training predictions shape: {y_train_pred.shape}")
print(f"Test predictions shape: {y_test_pred.shape}")

In [None]:
# Score both splits; evaluate() returns a mapping with 'rmse', 'mae' and 'r2'.
train_metrics = lstm.evaluate(X_train, y_train)
test_metrics = lstm.evaluate(X_test, y_test)

# Print the same three-line report for each split.
metric_reports = (
    ("=== Training Metrics ===", train_metrics),
    ("\n=== Test Metrics ===", test_metrics),
)
for header, scores in metric_reports:
    print(header)
    print(f"  RMSE: {scores['rmse']:.2f}")
    print(f"  MAE:  {scores['mae']:.2f}")
    print(f"  R²:   {scores['r2']:.4f}")

## 7. Visualize Results

In [None]:
# 2x2 grid: top row overlays actual and predicted series, bottom row plots
# actual against predicted with a y = x reference line. The left column is the
# training split, the right column the test split.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

panels = (
    # (grid column, split name, actual, predictions, metrics, extra scatter kwargs)
    (0, 'Training', y_train, y_train_pred, train_metrics, {}),
    (1, 'Test', y_test, y_test_pred, test_metrics, {'color': 'green'}),
)

for grid_col, split_name, actual, predicted, scores, scatter_extra in panels:
    # Series overlay
    line_ax = axes[0, grid_col]
    line_ax.plot(range(len(actual)), actual, 'b-', label='Actual', alpha=0.7)
    line_ax.plot(range(len(predicted)), predicted.flatten(), 'r-', label='Predicted', alpha=0.7)
    line_ax.set_xlabel('Sample Index')
    line_ax.set_ylabel('Biogas (m³/day)')
    line_ax.set_title(f'{split_name} Set (R² = {scores["r2"]:.3f})')
    line_ax.legend()
    line_ax.grid(True, alpha=0.3)

    # Actual vs predicted; the dashed diagonal marks a perfect prediction
    scatter_ax = axes[1, grid_col]
    scatter_ax.scatter(actual, predicted.flatten(), alpha=0.5, s=20, **scatter_extra)
    diagonal = [actual.min(), actual.max()]
    scatter_ax.plot(diagonal, diagonal, 'r--', linewidth=2)
    scatter_ax.set_xlabel('Actual Biogas')
    scatter_ax.set_ylabel('Predicted Biogas')
    scatter_ax.set_title(f'{split_name}: Actual vs Predicted')
    scatter_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Test-set residuals: actual minus predicted.
residuals = y_test - y_test_pred.flatten()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, trend_ax = axes

# Left: how the residuals are distributed around zero
hist_ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='red', linestyle='--')
hist_ax.set(
    xlabel='Residual',
    ylabel='Frequency',
    title='Residual Distribution',
)

# Right: residuals in sample order
trend_ax.plot(range(len(residuals)), residuals, 'o-', markersize=3, alpha=0.7)
trend_ax.axhline(y=0, color='red', linestyle='--')
trend_ax.set(
    xlabel='Sample Index',
    ylabel='Residual',
    title='Residuals Over Time',
)

plt.tight_layout()
plt.show()

## 8. Cross-Validation

In [None]:
# Cross-validate on the full dataset with a fresh, smaller model so the
# already-trained `lstm` instance is left untouched.
print("Performing Time Series Cross-Validation...\n")

lstm_cv = LSTMModel(input_dim=len(available_features), hidden_dim=24, output_dim=1)

cv_options = {
    'n_splits': 5,
    'epochs': 50,
    'batch_size': 8,
    'verbose': True,
}
cv_results = lstm_cv.cross_validate(X, y, **cv_options)

In [None]:
# Report mean ± standard deviation of each test metric in cv_results.
print("\n=== Cross-Validation Summary ===")

summary_rows = (
    # (line prefix, cv_results key, float format)
    ("\nTest RMSE: ", 'test_rmse', '.2f'),
    ("Test MAE:  ", 'test_mae', '.2f'),
    ("Test R²:   ", 'test_r2', '.3f'),
)
for prefix, result_key, float_spec in summary_rows:
    fold_scores = cv_results[result_key]
    print(f"{prefix}{np.mean(fold_scores):{float_spec}} ± {np.std(fold_scores):{float_spec}}")

## 9. Save and Load Model

In [None]:
# Persist the trained model; the relative path resolves against the current
# working directory.
model_path = 'lstm_biogas_model.pt'
lstm.save(model_path)
print("Model saved to", model_path)

In [None]:
# Restore the model from disk.
loaded_lstm = LSTMModel.load(model_path)
print("Model loaded successfully!")

# Run the original and the restored model on the same few test rows.
n_check = 5
X_check = X_test[:n_check]
loaded_pred = loaded_lstm.predict(X_check)
original_pred = lstm.predict(X_check)

# Show as many values as the heading announces (previously the heading said 5
# while only 3 were printed).
print(f"\nVerification (first {n_check} predictions):")
print(f"Original model: {original_pred.flatten()[:n_check]}")
print(f"Loaded model:   {loaded_pred.flatten()[:n_check]}")

# Compare the two outputs numerically instead of leaving it to the eye.
# NOTE(review): assumes predict() returns NumPy-compatible arrays -- confirm.
print(f"Predictions match: {np.allclose(original_pred, loaded_pred)}")

## 10. Hyperparameter Tuning Tips

Key hyperparameters to tune:

| Parameter | Description | Good Starting Range |
|-----------|-------------|--------------------|
| `hidden_dim` | LSTM hidden units | 16-128 |
| `num_layers` | LSTM layers | 1-3 |
| `dropout` | Regularization | 0.1-0.3 |
| `learning_rate` | Optimizer LR | 0.0001-0.01 |
| `batch_size` | Training batch | 4-32 |
| `epochs` | Full passes over the training data | 50-200 |

In [None]:
# Small sweep over the hidden size: train a fresh model per setting and collect
# its evaluation metrics.
# NOTE(review): candidates are scored on the test split, so these numbers are
# optimistic if they are used to pick a final model.
hidden_dims = [16, 32, 64]
results = []
n_features = len(available_features)

for hdim in hidden_dims:
    print(f"\nTesting hidden_dim={hdim}...")
    candidate = LSTMModel(input_dim=n_features, hidden_dim=hdim)
    candidate.fit(X_train, y_train, epochs=50, verbose=False)
    scores = candidate.evaluate(X_test, y_test)

    # One row per setting: the hidden size first, then every reported metric.
    row = {'hidden_dim': hdim}
    row.update(scores)
    results.append(row)
    print(f"  R² = {scores['r2']:.3f}")

# Tabulate the sweep.
results_df = pd.DataFrame(results)
print("\n=== Comparison ===")
print(results_df.to_string(index=False))

## Summary

In this notebook, you learned how to:

1. **Load and explore** time series data for biogas prediction
2. **Configure** LSTM model architecture
3. **Train** the model with appropriate parameters
4. **Evaluate** performance using RMSE, MAE, and R²
5. **Visualize** predictions and residuals
6. **Cross-validate** with time series splits
7. **Save and load** trained models

### Next Steps

- Experiment with different architectures
- Try feature engineering (lags, rolling windows)
- Compare with the multi-task Gaussian process (MTGP) model for uncertainty quantification
- Deploy trained model for real-time prediction