# Hybrid Prophet + LSTM Training

Notebook này huấn luyện mô hình lai giữa Facebook Prophet và LSTM để dự báo `memory_usage_pct`, `cpu_total_usage`, `system_load`. Quy trình:

1. Prophet mô hình hóa xu hướng + seasonality trên chuỗi train.
2. Sai số (residuals) trên phần train được dùng để huấn luyện LSTM nhằm học các mẫu phi tuyến phức tạp mà Prophet chưa giải thích được.
3. Trong giai đoạn dự báo, Prophet dự đoán trước, LSTM dự báo residual tương lai theo kiểu autoregressive, cuối cùng cộng lại tạo thành dự báo hybrid.

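> **Lưu ý**: Cần cài `prophet` và `torch` trước khi chạy notebook này. Notebook import `from prophet import Prophet`; nếu môi trường chỉ có gói cũ `fbprophet` thì phải sửa lại dòng import tương ứng. Prophet cũng cần backend CmdStan được cài đặt đúng; nếu gặp lỗi về CmdStan, hãy cài lại bằng `import cmdstanpy; cmdstanpy.install_cmdstan(overwrite=True)`.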


In [12]:
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from prophet import Prophet

from model_utils import (
    calculate_metrics,
    print_metrics,
    save_results,
    save_model,
)

# Seed both RNGs up front so repeated runs start from the same random state.
torch.manual_seed(42)
np.random.seed(42)

# Display settings: never truncate columns, and render floats with a
# thousands separator and six decimals.
pd.set_option('display.float_format', "{:,.6f}".format)
pd.set_option('display.max_columns', None)

print("✓ Libraries imported")


✓ Libraries imported


In [13]:
# Configuration
DATA_DIR = Path('processed_data')
MODELS_DIR = Path('models')
RESULTS_FILE = Path('results_hybrid_prophet_lstm.json')
TARGETS = ['memory_usage_pct', 'cpu_total_usage', 'system_load']

# Time axis assumptions: timestamps are synthesized on a 30-second grid
# starting at START_TIMESTAMP (same cadence as the other notebooks).
# The lowercase 's' alias is used because the uppercase 'S' seconds alias is
# deprecated in recent pandas releases; both describe the same frequency.
SERIES_FREQ = '30s'
START_TIMESTAMP = pd.Timestamp('2024-01-01 00:00:00')

# Prophet config (passed verbatim as keyword arguments to Prophet(...))
PROPHET_CONFIG = {
    'seasonality_mode': 'additive',
    'weekly_seasonality': True,
    'daily_seasonality': True,
    'yearly_seasonality': False,
    'changepoint_prior_scale': 0.05,
}

# LSTM residual config
WINDOW_SIZE = 48  # 24 minutes of history at one sample every 30 seconds
HIDDEN_SIZE = 64
NUM_LAYERS = 2
DROPOUT = 0.1
EPOCHS = 20
BATCH_SIZE = 256
LEARNING_RATE = 1e-3

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# parents=True keeps this working if MODELS_DIR is later changed to a nested path.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
print(f"Models dir: {MODELS_DIR.resolve()}")
print(f"Device: {DEVICE}")


Models dir: C:\Users\AnPham\Documents\PROJECTS\Intern1\models
Device: cpu


In [14]:
# Load datasets

def load_target_series(target: str, data_dir=None):
    """Load the train and test series for one target.

    Reads ``<data_dir>/<target>/y_train.csv`` and ``<data_dir>/<target>/y_test.csv``.

    Args:
        target: Name of the sub-directory that holds the target's CSV files.
        data_dir: Root directory of the processed data. When omitted, the
            module-level ``DATA_DIR`` is used.

    Returns:
        Tuple ``(y_train, y_test)``. A single-column CSV comes back as a
        ``pandas.Series`` regardless of how many rows it has.
    """
    root = DATA_DIR if data_dir is None else Path(data_dir)
    target_dir = root / target
    # squeeze("columns") collapses only the column axis. A bare squeeze() would
    # also collapse a one-row file into a scalar, which has no len() or .values.
    y_train = pd.read_csv(target_dir / 'y_train.csv').squeeze("columns")
    y_test = pd.read_csv(target_dir / 'y_test.csv').squeeze("columns")
    return y_train, y_test


# Read every target once and keep the series together with their lengths.
datasets = {}
for target in TARGETS:
    y_train, y_test = load_target_series(target)
    n_train, n_test = len(y_train), len(y_test)
    datasets[target] = dict(
        y_train=y_train,
        y_test=y_test,
        n_train=n_train,
        n_test=n_test,
    )
    print(f"{target}: train={n_train:,} | test={n_test:,}")

print("\n✓ Series loaded")


memory_usage_pct: train=68,599 | test=17,150
cpu_total_usage: train=68,599 | test=17,150
system_load: train=68,599 | test=17,150

✓ Series loaded


In [15]:
# Helper functions

class ResidualDataset(Dataset):
    """Map-style dataset pairing each input window with its target value.

    Both arrays are converted to float32 tensors and given a trailing axis of
    size 1, which is the single input feature expected by the LSTM defined in
    this file (``input_size=1``).
    """

    def __init__(self, sequences, targets):
        windows = torch.tensor(sequences, dtype=torch.float32)
        labels = torch.tensor(targets, dtype=torch.float32)
        # Indexing with [..., None] appends the trailing axis of size 1.
        self.X = windows[..., None]
        self.y = labels[..., None]

    def __len__(self):
        # Number of samples is the size of the leading axis.
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label


class LSTMResidualModel(nn.Module):
    """Stacked LSTM followed by a linear head that emits one value per sequence.

    Args:
        hidden_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; forced to 0.0 when there is only
            one layer.
    """

    def __init__(self, hidden_size=64, num_layers=2, dropout=0.1):
        super().__init__()
        # Dropout is only passed through when the LSTM is actually stacked.
        if num_layers > 1:
            layer_dropout = dropout
        else:
            layer_dropout = 0.0
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=layer_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        sequence_out, _state = self.lstm(x)
        # Only the output at the final time step feeds the linear head.
        last_step = sequence_out[:, -1]
        return self.fc(last_step)


def build_time_index(length: int, freq: str = SERIES_FREQ, start_ts: pd.Timestamp = START_TIMESTAMP):
    """Return a ``DatetimeIndex`` of ``length`` timestamps starting at ``start_ts``.

    Args:
        length: Number of timestamps to generate.
        freq: Pandas frequency string giving the spacing between timestamps.
        start_ts: First timestamp of the index.
    """
    index_spec = {'start': start_ts, 'periods': length, 'freq': freq}
    return pd.date_range(**index_spec)


def create_sequences(values: np.ndarray, window_size: int):
    """Slice a series into sliding input windows and their next-step targets.

    Window ``i`` is ``values[i:i + window_size]`` and its target is
    ``values[i + window_size]``.

    Args:
        values: Series to slice; windows are taken along the first axis.
        window_size: Number of consecutive samples in each window (>= 1).

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_windows, window_size, ...)``
        and ``y`` has shape ``(n_windows, ...)``, with
        ``n_windows = max(len(values) - window_size, 0)``. When the series is
        no longer than the window, both arrays are empty but keep their rank.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    values = np.asarray(values)
    n_windows = max(len(values) - window_size, 0)
    # Row i of the index grid is [i, i + 1, ..., i + window_size - 1]; a single
    # fancy-index lookup then builds every window without a Python-level loop.
    window_idx = np.arange(n_windows)[:, None] + np.arange(window_size)[None, :]
    X = values[window_idx]
    # Copy so the targets never alias the caller's array.
    y = values[window_size:].copy()
    return X, y


def train_lstm(model, dataloader, epochs, lr):
    """Train ``model`` with Adam on mean-squared error.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level device constant.

    Args:
        model: Module to optimise in place; it is left in training mode.
        dataloader: Yields ``(inputs, targets)`` batches; ``dataloader.dataset``
            must support ``len()``.
        epochs: Number of passes over the data.
        lr: Adam learning rate.

    Returns:
        List with the sample-weighted mean loss of every epoch, in order.

    Raises:
        ValueError: If the dataset is empty.
    """
    n_samples = len(dataloader.dataset)
    if n_samples == 0:
        # Without this guard the per-epoch average would divide by zero.
        raise ValueError("Cannot train on an empty dataset")

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    device = next(model.parameters()).device
    history = []

    model.train()
    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch is averaged correctly.
            running_loss += loss.item() * len(X_batch)
        epoch_loss = running_loss / n_samples
        history.append(epoch_loss)
        # Report the first epoch, every fifth epoch, and the last epoch.
        if epoch % 5 == 0 or epoch == 1 or epoch == epochs:
            print(f"    Epoch {epoch:02d}/{epochs} - loss: {epoch_loss:.6f}")
    return history


def predict_sequences(model, sequences, batch_size=4096):
    """Run ``model`` over every window and return the flattened predictions.

    Inference is done in chunks of ``batch_size`` windows so that a long series
    is not pushed through the model as one huge batch. Each chunk is moved to
    the device the model's parameters live on.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        sequences: Array-like of shape ``(n_windows, window_size)``.
        batch_size: Maximum number of windows per forward pass (>= 1).

    Returns:
        1-D NumPy array with the model outputs for all windows, in order.
        Empty input yields an empty float32 array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # The trailing axis of size 1 is the single input feature per time step.
    inputs = torch.tensor(sequences, dtype=torch.float32).unsqueeze(-1)
    chunks = []
    with torch.no_grad():
        for start in range(0, len(inputs), batch_size):
            batch = inputs[start:start + batch_size].to(device)
            chunks.append(model(batch).cpu().numpy().flatten())

    if not chunks:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(chunks)


def forecast_residuals(model, seed_sequence, n_steps, window_size=None):
    """Autoregressive residual forecast seeded with the latest scaled residuals.

    At every step the last ``window_size`` values (seed plus earlier
    predictions) are fed to the model, and the prediction is appended to the
    history that feeds the next step.

    Args:
        model: Module mapping a ``(1, window_size, 1)`` tensor to one value; it
            is switched to eval mode.
        seed_sequence: 1-D array-like holding at least ``window_size`` values;
            only its tail is used. It is not modified.
        n_steps: Number of future values to generate.
        window_size: Length of the model input window. When omitted, the
            module-level ``WINDOW_SIZE`` is used.

    Returns:
        NumPy array with ``n_steps`` predictions, in the same scaled units as
        the seed.

    Raises:
        ValueError: If ``window_size`` is smaller than 1 or the seed holds
            fewer than ``window_size`` values.
    """
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    seed = np.asarray(seed_sequence, dtype=np.float64)
    if len(seed) < window_size:
        # Fail with a readable message instead of an opaque tensor .view() error.
        raise ValueError(
            f"seed_sequence has {len(seed)} values but window_size is {window_size}"
        )

    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    history = seed[-window_size:].tolist()
    preds = []
    with torch.no_grad():
        for _ in range(n_steps):
            window = torch.tensor(
                history[-window_size:], dtype=torch.float32, device=device
            ).view(1, window_size, 1)
            pred = model(window).cpu().item()
            preds.append(pred)
            # Feed the prediction back in as the newest observation.
            history.append(pred)
    return np.array(preds)


def inverse_scale(values, mean, std):
    """Undo standardisation: multiply by ``std`` and then add ``mean``."""
    rescaled = values * std
    return rescaled + mean


In [18]:
# Smoke test: construct a Prophet model with the cmdstanpy backend requested
# explicitly and print it.
# NOTE(review): the recorded output of this cell is a ValueError reporting an
# invalid CmdStan installation; its message suggests reinstalling with
# `cmdstanpy.install_cmdstan(overwrite=True)`. Presumably the training cell
# cannot fit Prophet until that is fixed - confirm after reinstalling.
m = Prophet(**PROPHET_CONFIG, stan_backend='CMDSTANPY')
print("OK", m)

ValueError: CmdStan installataion missing makefile, path c:\Users\AnPham\Documents\PROJECTS\Intern1\.venv\Lib\site-packages\prophet\stan_model\cmdstan-2.33.1 is invalid. You may wish to re-install cmdstan by running command "install_cmdstan --overwrite", or Python code "import cmdstanpy; cmdstanpy.install_cmdstan(overwrite=True)"

In [16]:
# Per-target containers filled by the loop below.
# NOTE: hybrid_models is initialised here but not populated in this cell.
hybrid_models = {}
training_metadata = {}
summary_records = []
detailed_results = {}
prediction_store = {}

# LSTM hyper-parameters, built once and reused for every saved package/config.
lstm_hparams = {
    'hidden_size': HIDDEN_SIZE,
    'num_layers': NUM_LAYERS,
    'dropout': DROPOUT,
    'epochs': EPOCHS,
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
}

for target in TARGETS:
    print("\n" + "=" * 80)
    print(f"Target: {target}")
    print("=" * 80)

    y_train = datasets[target]['y_train'].reset_index(drop=True)
    y_test = datasets[target]['y_test'].reset_index(drop=True)
    n_train = len(y_train)
    n_test = len(y_test)
    total_len = n_train + n_test

    # Fail fast: at least one full window plus one target is needed for the
    # LSTM, so check this before spending time on the Prophet fit.
    if n_train <= WINDOW_SIZE:
        raise ValueError(f"WINDOW_SIZE ({WINDOW_SIZE}) phải nhỏ hơn số mẫu train ({n_train})")

    # Synthetic timestamps: the first n_train entries label the training rows.
    time_index = build_time_index(total_len)
    train_df = pd.DataFrame({'ds': time_index[:n_train], 'y': y_train.values})

    # ---- Prophet training ----
    print("Training Prophet...")
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    prophet_start = time.perf_counter()
    prophet = Prophet(**PROPHET_CONFIG)
    prophet.fit(train_df)
    prophet_time = time.perf_counter() - prophet_start

    # The future frame covers the training rows followed by n_test new steps,
    # so one predict() call yields both in-sample and test forecasts.
    future_df = prophet.make_future_dataframe(periods=n_test, freq=SERIES_FREQ)
    forecast_df = prophet.predict(future_df)
    prophet_preds_all = forecast_df['yhat'].values
    prophet_train_pred = prophet_preds_all[:n_train]
    prophet_test_pred = prophet_preds_all[n_train:]

    # ---- Residual prep ----
    train_residuals = y_train.values - prophet_train_pred
    residual_mean = train_residuals.mean()
    raw_std = train_residuals.std()
    # Guard against a zero std (constant residuals) to avoid dividing by zero.
    residual_std = raw_std if raw_std > 0 else 1e-6
    residuals_scaled = (train_residuals - residual_mean) / residual_std

    seq_X, seq_y = create_sequences(residuals_scaled, WINDOW_SIZE)
    residual_dataset = ResidualDataset(seq_X, seq_y)
    dataloader = DataLoader(residual_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # ---- LSTM training ----
    print("Training LSTM on residuals...")
    lstm_model = LSTMResidualModel(hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=DROPOUT).to(DEVICE)
    lstm_start = time.perf_counter()
    train_lstm(lstm_model, dataloader, epochs=EPOCHS, lr=LEARNING_RATE)
    lstm_time = time.perf_counter() - lstm_start

    # ---- Residual predictions ----
    train_pred_scaled = predict_sequences(lstm_model, seq_X)
    residual_train_pred = inverse_scale(train_pred_scaled, residual_mean, residual_std)

    # The first WINDOW_SIZE training points have no full input window, so the
    # hybrid prediction there is the Prophet prediction alone.
    hybrid_train_pred = prophet_train_pred.copy()
    hybrid_train_pred[WINDOW_SIZE:] = prophet_train_pred[WINDOW_SIZE:] + residual_train_pred

    # Test residuals are rolled forward autoregressively from the last
    # WINDOW_SIZE scaled training residuals.
    seed_sequence = residuals_scaled[-WINDOW_SIZE:]
    residual_test_scaled = forecast_residuals(lstm_model, seed_sequence, n_test)
    residual_test_pred = inverse_scale(residual_test_scaled, residual_mean, residual_std)
    hybrid_test_pred = prophet_test_pred + residual_test_pred

    # ---- Metrics ----
    prophet_metrics = calculate_metrics(y_test.values, prophet_test_pred)
    hybrid_metrics = calculate_metrics(y_test.values, hybrid_test_pred)

    print("Prophet only metrics:")
    print_metrics(prophet_metrics, target + ' (Prophet)')
    print("Hybrid metrics:")
    print_metrics(hybrid_metrics, target + ' (Hybrid)')

    # ---- Save model package ----
    # Weights are moved to the CPU so the package can be loaded on a machine
    # without a GPU even when training ran on CUDA.
    cpu_state_dict = {
        name: tensor.detach().cpu()
        for name, tensor in lstm_model.state_dict().items()
    }
    hybrid_package = {
        'prophet': prophet,
        'lstm_state_dict': cpu_state_dict,
        'residual_mean': residual_mean,
        'residual_std': residual_std,
        'window_size': WINDOW_SIZE,
        'freq': SERIES_FREQ,
        'start_timestamp': START_TIMESTAMP.isoformat(),
        'config': {
            'prophet': PROPHET_CONFIG,
            'lstm': dict(lstm_hparams),
        }
    }

    model_path = save_model(
        hybrid_package,
        model_name='hybrid_prophet_lstm',
        target=target,
        config={
            'prophet': PROPHET_CONFIG,
            'lstm': {**lstm_hparams, 'window_size': WINDOW_SIZE},
        },
        models_dir=str(MODELS_DIR)
    )

    training_metadata[target] = {
        'prophet_time_s': prophet_time,
        'lstm_time_s': lstm_time,
        'model_path': model_path,
    }

    # The metric dicts are read by the 'mae', 'rmse' and 'r2' keys.
    summary_records.append({
        'target': target,
        'prophet_mae': prophet_metrics['mae'],
        'prophet_rmse': prophet_metrics['rmse'],
        'prophet_r2': prophet_metrics['r2'],
        'hybrid_mae': hybrid_metrics['mae'],
        'hybrid_rmse': hybrid_metrics['rmse'],
        'hybrid_r2': hybrid_metrics['r2'],
        'prophet_time_s': prophet_time,
        'lstm_time_s': lstm_time,
        'model_path': model_path,
    })

    detailed_results[target] = {
        'prophet_metrics': prophet_metrics,
        'hybrid_metrics': hybrid_metrics,
        'prophet_time_s': prophet_time,
        'lstm_time_s': lstm_time,
        'model_path': model_path,
    }

    # Raw arrays kept for the plotting cells.
    prediction_store[target] = {
        'y_train': y_train.values,
        'y_test': y_test.values,
        'prophet_train': prophet_train_pred,
        'prophet_test': prophet_test_pred,
        'hybrid_train': hybrid_train_pred,
        'hybrid_test': hybrid_test_pred,
    }

print("\n✓ Training completed for all targets")



Target: memory_usage_pct
Training Prophet...


AttributeError: 'Prophet' object has no attribute 'stan_backend'

In [None]:
# One row per target: build the table from the collected records, then index
# it by target name.
summary_df = pd.DataFrame(summary_records)
summary_df = summary_df.set_index('target')
display(summary_df)


## Visualization


In [None]:
# One stacked panel per target: actual test values against the Prophet-only
# and hybrid forecasts.
# squeeze=False keeps `axes` two-dimensional even when TARGETS holds a single
# entry; without it plt.subplots returns a bare Axes and indexing fails.
fig, axes = plt.subplots(len(TARGETS), 1, figsize=(16, 12), sharex=False, squeeze=False)
fig.suptitle('Hybrid Prophet + LSTM vs Actual (Test Set)', fontsize=16, fontweight='bold')

for idx, target in enumerate(TARGETS):
    ax = axes[idx, 0]
    data = prediction_store[target]
    ax.plot(data['y_test'], label='Actual', alpha=0.7)
    ax.plot(data['prophet_test'], label='Prophet', alpha=0.7)
    ax.plot(data['hybrid_test'], label='Hybrid', linewidth=2)
    ax.set_title(target)
    ax.set_ylabel('Normalized value')
    ax.grid(True, alpha=0.3)
    # Only the bottom panel carries the x-axis label.
    if idx == len(TARGETS) - 1:
        ax.set_xlabel('Time step (test)')
    # The legend is identical for every panel, so it is drawn once on the top one.
    if idx == 0:
        ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# One scatter panel per target: hybrid prediction against the actual test value.
# squeeze=False keeps `axes` two-dimensional even when TARGETS holds a single
# entry; without it plt.subplots returns a bare Axes and indexing fails.
fig, axes = plt.subplots(1, len(TARGETS), figsize=(18, 5), squeeze=False)
fig.suptitle('Predicted vs Actual (Hybrid)', fontsize=16, fontweight='bold')

for idx, target in enumerate(TARGETS):
    ax = axes[0, idx]
    data = prediction_store[target]
    y_true = data['y_test']
    y_pred = data['hybrid_test']
    ax.scatter(y_true, y_pred, alpha=0.3, s=10)
    # Dashed diagonal y = x spanning the combined range of both series; points
    # on it are perfect predictions.
    min_v = min(y_true.min(), y_pred.min())
    max_v = max(y_true.max(), y_pred.max())
    ax.plot([min_v, max_v], [min_v, max_v], 'r--', linewidth=1)
    ax.set_title(f"{target}")
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Bar charts comparing Prophet-only and hybrid metrics, one panel per metric.
metric_columns = [
    'prophet_mae', 'prophet_rmse', 'prophet_r2',
    'hybrid_mae', 'hybrid_rmse', 'hybrid_r2',
]
metrics_df = summary_df[metric_columns].copy()

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('Metrics Comparison (Prophet vs Hybrid)', fontsize=16, fontweight='bold')

# (panel, columns, bar colours, title, draw a zero reference line?)
metric_panels = [
    (axes[0, 0], ['prophet_mae', 'hybrid_mae'], ['#90caf9', '#1e88e5'], 'MAE (lower better)', False),
    (axes[0, 1], ['prophet_rmse', 'hybrid_rmse'], ['#ffab91', '#d84315'], 'RMSE (lower better)', False),
    (axes[1, 0], ['prophet_r2', 'hybrid_r2'], ['#c5e1a5', '#7cb342'], 'R² (higher better)', True),
]
for panel_ax, panel_cols, panel_colors, panel_title, with_zero_line in metric_panels:
    metrics_df[panel_cols].plot(kind='bar', ax=panel_ax, color=panel_colors)
    panel_ax.set_title(panel_title)
    if with_zero_line:
        # R² can be negative, so mark zero as the reference level.
        panel_ax.axhline(0, color='k', linestyle='--', linewidth=1)
    panel_ax.grid(True, alpha=0.3)

# The fourth panel of the 2x2 grid is unused.
axes[1, 1].axis('off')
plt.tight_layout()
plt.show()


In [None]:
# Assemble the run summary (metrics, timings, configuration) and persist it.
lstm_settings = {
    'hidden_size': HIDDEN_SIZE,
    'num_layers': NUM_LAYERS,
    'dropout': DROPOUT,
    'epochs': EPOCHS,
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
    'window_size': WINDOW_SIZE,
}
run_config = {
    'prophet': PROPHET_CONFIG,
    'lstm': lstm_settings,
    'series_freq': SERIES_FREQ,
    'start_timestamp': START_TIMESTAMP.isoformat(),
}
# Wall-clock time at which the payload was built.
created_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

results_payload = {
    'model': 'hybrid_prophet_lstm',
    'timestamp': created_at,
    'targets': detailed_results,
    'training': training_metadata,
    'config': run_config,
}

save_results(results_payload, str(RESULTS_FILE))
