In [3]:
# === Multi-horizon AQI Forecasting with LSTM (mirrors your LGBM pipeline) ===
from pathlib import Path
import json, warnings
warnings.filterwarnings("ignore")

# ----------------- Imports -----------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

import matplotlib.pyplot as plt

# Torch
try:
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset


In [4]:

# ----------------- CONFIG -----------------
PROJECT_ROOT  = Path.cwd().parent        # notebook inside Model_training/
DATA_PATH     = PROJECT_ROOT / "preprocessed_aqi_data (3).csv"
FEATURES_JSON = PROJECT_ROOT / "final_feature_list.json"

OUT_DIR       = PROJECT_ROOT / "predictions"
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_FILE   = OUT_DIR / "lstm_predicted_aqi_72hrs.csv"

WINDOW_SIZE     = 24       # past hours per sample
PREDICT_HORIZON = 72       # next 72 hours (3 days)
TARGET_COL      = "us_aqi"
TIME_COL_CANDS  = ["time", "datetime"]

# Training
EPOCHS          = 40
BATCH_SIZE      = 256
LR              = 1e-3
WD              = 1e-4
PATIENCE        = 8
HIDDEN_SIZE     = 128
NUM_LAYERS      = 2
DROPOUT         = 0.2
RANDOM_SEED     = 42


In [5]:

# ----------------- Reproducibility -----------------
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"

# ------------------- LOAD & PARSE TIME (DAY-FIRST) -------------------
df = pd.read_csv(DATA_PATH)

# pick time column name automatically
for c in TIME_COL_CANDS:
    if c in df.columns:
        TIME_COL = c
        break
else:
    raise ValueError(f"No datetime column found. Expected one of: {TIME_COL_CANDS}")

# clean & parse as day-first to match your “4/8/25 = 4 Aug 2025”
raw_time = (
    df[TIME_COL].astype(str)
      .str.strip()
      .str.replace("\u00A0", " ", regex=False)
      .str.replace("\u202F", " ", regex=False)
)
df[TIME_COL] = pd.to_datetime(raw_time, dayfirst=True, errors="coerce")

# sort chronologically
df = df.sort_values(TIME_COL).reset_index(drop=True)


In [6]:

# ------------------- FEATURES -------------------
feat_cols = json.loads(FEATURES_JSON.read_text())
missing = [c for c in feat_cols + [TARGET_COL, TIME_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in data: {missing}")

# ------------------- Helper: build windows -------------------
def build_sequences_limit(frame, features, target, window, horizon):
    """Just to figure out how many usable windows exist (like your LGBM first pass)."""
    n = len(frame)
    limit = n - window - horizon
    return limit

def build_sequences_seq(frame, features, target, window, horizon):
    """
    Returns:
      X: (N, T, F) with oldest->newest order (T==window)
      Y: (N, H)
    """
    Xs, Ys = [], []
    Xmat = frame[features].values
    yvec = frame[target].values
    n = len(frame)
    limit = n - window - horizon
    for i in range(limit):
        Xs.append(Xmat[i:i+window])                        # (T, F)
        Ys.append(yvec[i+window:i+window+horizon])         # (H,)
    return np.asarray(Xs), np.asarray(Ys), limit


In [7]:

# ------------------- TRAIN-ONLY WINSORIZATION + IMPUTE + SCALE -------------------
# First pass to know how many windows we’ll have
limit = build_sequences_limit(df, feat_cols, TARGET_COL, WINDOW_SIZE, PREDICT_HORIZON)
if limit <= 0:
    raise ValueError("Not enough rows to make sliding windows. Add more data.")

# Map “train windows” back to raw rows used by their inputs (same logic you used)
train_windows = int(limit * 0.8)
raw_end_for_train_inputs = (train_windows - 1) + WINDOW_SIZE
raw_end_for_train_inputs = max(raw_end_for_train_inputs, WINDOW_SIZE)

# Winsorize on TRAIN INPUT rows only
numeric_feats = [c for c in feat_cols if pd.api.types.is_numeric_dtype(df[c])]
low = df.loc[:raw_end_for_train_inputs, numeric_feats].quantile(0.01).to_dict()
high = df.loc[:raw_end_for_train_inputs, numeric_feats].quantile(0.99).to_dict()
for c in numeric_feats:
    df[c] = df[c].clip(lower=low[c], upper=high[c])

# Median impute using TRAIN INPUT rows only, then apply to all
train_medians = df.loc[:raw_end_for_train_inputs, feat_cols].median(numeric_only=True).to_dict()
df[feat_cols] = df[feat_cols].fillna(train_medians)

# Scale (LSTM benefits from scaling). Fit on TRAIN INPUT rows only; apply to all.
scaler = StandardScaler()
scaler.fit(df.loc[:raw_end_for_train_inputs, feat_cols].values)
df[feat_cols] = scaler.transform(df[feat_cols].values)


In [8]:

# ------------------- BUILD SEQUENCES AFTER CLEANING/SCALING -------------------
X_seq, y_seq, limit = build_sequences_seq(df, feat_cols, TARGET_COL, WINDOW_SIZE, PREDICT_HORIZON)
N, T, F = X_seq.shape
H = y_seq.shape[1]
assert T == WINDOW_SIZE and H == PREDICT_HORIZON

# ------------------- TRAIN/VAL SPLIT (chronological 80/20) -------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_seq, y_seq, test_size=0.2, shuffle=False
)


In [9]:

# ------------------- LSTM MODEL -------------------
class LSTMForecast(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, horizon):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True
        )
        self.head = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, horizon)
        )

    def forward(self, x):               # x: (B, T, F)
        out, _ = self.lstm(x)           # (B, T, HIDDEN)
        last = out[:, -1, :]            # last time step
        return self.head(last)          # (B, horizon)

model = LSTMForecast(F, HIDDEN_SIZE, NUM_LAYERS, DROPOUT, H).to(device)
criterion = nn.SmoothL1Loss()   # Huber
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)

# DataLoaders
train_dl = DataLoader(TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
), batch_size=BATCH_SIZE, shuffle=False)

val_dl = DataLoader(TensorDataset(
    torch.tensor(X_val, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.float32)
), batch_size=BATCH_SIZE, shuffle=False)


In [10]:

# ------------------- TRAIN -------------------
best_val_mae = float("inf")
best_state = None
wait = 0

def eval_loader(dataloader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dataloader:
            xb = xb.to(device); yb = yb.to(device)
            pred = model(xb)
            preds.append(pred.cpu().numpy())
            trues.append(yb.cpu().numpy())
    return np.vstack(preds), np.vstack(trues)

for ep in range(1, EPOCHS+1):
    model.train()
    for xb, yb in train_dl:
        xb = xb.to(device); yb = yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
        optimizer.step()

    # Validate
    val_pred, val_true = eval_loader(val_dl)
    val_mae  = mean_absolute_error(val_true, val_pred)
    scheduler.step(val_mae)
    print(f"Epoch {ep:02d} | Val MAE: {val_mae:.4f}")

    if val_mae < best_val_mae - 1e-4:
        best_val_mae = val_mae
        best_state = {k: v.cpu() for k, v in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= PATIENCE:
            print("Early stopping.")
            break

if best_state:
    model.load_state_dict(best_state)


Epoch 01 | Val MAE: 76.3324
Epoch 02 | Val MAE: 75.1567
Epoch 03 | Val MAE: 70.4450
Epoch 04 | Val MAE: 62.2746
Epoch 05 | Val MAE: 51.3796
Epoch 06 | Val MAE: 37.4538
Epoch 07 | Val MAE: 20.6191
Epoch 08 | Val MAE: 5.3782
Epoch 09 | Val MAE: 11.8420
Epoch 10 | Val MAE: 11.0473
Epoch 11 | Val MAE: 5.6196
Epoch 12 | Val MAE: 4.4388
Epoch 13 | Val MAE: 4.5521
Epoch 14 | Val MAE: 5.5322
Epoch 15 | Val MAE: 6.4659
Epoch 16 | Val MAE: 6.2944
Epoch 17 | Val MAE: 6.0180
Epoch 18 | Val MAE: 5.7940
Epoch 19 | Val MAE: 5.6868
Epoch 20 | Val MAE: 5.6766
Early stopping.


In [11]:

# ------------------- FINAL METRICS (Train + Val) -------------------
train_pred, train_true = eval_loader(train_dl)
val_pred,   val_true   = eval_loader(val_dl)

def metrics_block(y_t, y_p, label):
    mae  = mean_absolute_error(y_t, y_p)
    rmse = mean_squared_error(y_t, y_p, squared=False)
    # Per-horizon R² then average (like your LGBM approach)
    r2s = [r2_score(y_t[:, h], y_p[:, h]) for h in range(H)]
    print(f"\n=== {label} Metrics (averaged over {H} horizons) ===")
    print(f"MAE:  {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R²:   {np.mean(r2s):.3f}")
    # First 12 + specific horizons
    mae_list  = [mean_absolute_error(y_t[:, h], y_p[:, h]) for h in range(H)]
    if H >= 12: print(f"First 12 horizons MAE: {[round(m,3) for m in mae_list[:12]]}")
    if H >= 24: print(f"24h MAE: {mae_list[23]:.3f}")
    if H >= 48: print(f"48h MAE: {mae_list[47]:.3f}")
    if H >= 72: print(f"72h MAE: {mae_list[71]:.3f}")
    return mae_list, r2s

train_mae_list, train_r2_list = metrics_block(train_true, train_pred, "Train")
val_mae_list,   val_r2_list   = metrics_block(val_true,   val_pred,   "Validation")

# Save per-horizon validation metrics
val_report = pd.DataFrame({
    "horizon": np.arange(1, H+1),
    "MAE": val_mae_list,
    "R2":  val_r2_list
})
val_report.to_csv(OUT_DIR / "lstm_val_per_horizon_metrics.csv", index=False)



=== Train Metrics (averaged over 72 horizons) ===
MAE:  10.937
RMSE: 13.375
R²:   -0.001
First 12 horizons MAE: [11.205, 11.201, 11.195, 11.187, 11.176, 11.167, 11.161, 11.149, 11.14, 11.14, 11.123, 11.115]
24h MAE: 11.046
48h MAE: 10.841
72h MAE: 10.667

=== Validation Metrics (averaged over 72 horizons) ===
MAE:  5.677
RMSE: 7.230
R²:   -0.719
First 12 horizons MAE: [4.489, 4.529, 4.567, 4.599, 4.613, 4.648, 4.7, 4.717, 4.753, 4.849, 4.841, 4.879]
24h MAE: 5.361
48h MAE: 5.982
72h MAE: 6.737


In [13]:

# ------------------- FORECAST NEXT 72 HOURS (ONE CSV) -------------------
# last window of *scaled* features (after cleaning)
last_window = df[feat_cols].values[-WINDOW_SIZE:]              # (T, F)
last_window_t = torch.tensor(last_window, dtype=torch.float32).unsqueeze(0).to(device)  # (1, T, F)

model.eval()
with torch.no_grad():
    future_pred = model(last_window_t).cpu().numpy().reshape(-1)   # (72,)

# anchor = last non-NaT time; start = +1 hour
last_valid_time = df.loc[df[TIME_COL].notna(), TIME_COL].iloc[-1]
start = last_valid_time.floor("H") + pd.Timedelta(hours=1)
future_times = pd.date_range(start=start, periods=PREDICT_HORIZON, freq="h")

# format as d/m/yy HH:MM (so 9/8/25 = 9 Aug 2025)
ft = pd.Series(future_times)
formatted_dt = (
    ft.dt.day.astype(str) + "/" +
    ft.dt.month.astype(str) + "/" +
    ft.dt.strftime("%y") + " " +
    ft.dt.strftime("%H:%M")
)

forecast_df = pd.DataFrame({
    "datetime": formatted_dt,
    "predicted_aqi_us": future_pred
})
forecast_df.to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved forecast → {OUTPUT_FILE}")
print("First/last timestamps:", forecast_df['datetime'].iloc[0], "→", forecast_df['datetime'].iloc[-1])



Saved forecast → d:\Desktop\AlinasPrograms\myenv\10Pearls2\predictions\lstm_predicted_aqi_72hrs.csv
First/last timestamps: 10/8/25 00:00 → 12/8/25 23:00


In [14]:

# ------------------- SAVE MODEL + METADATA + SCALER -------------------
SAVE_DIR = PROJECT_ROOT / "models" / "current"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

MODEL_PATH   = SAVE_DIR / "lstm_multioutput_72h.pt"
SCALER_PATH  = SAVE_DIR / "scaler.joblib"
META_PATH    = SAVE_DIR / "metadata.json"

torch.save(model.state_dict(), MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

meta = {
    "features": feat_cols,
    "target_col": TARGET_COL,
    "time_col": TIME_COL,
    "window_size": WINDOW_SIZE,
    "horizon": PREDICT_HORIZON,
    "dayfirst": True,
    "winsor_low": low,
    "winsor_high": high,
    "train_medians": train_medians,
    "scaler_path": str(SCALER_PATH)
}
META_PATH.write_text(json.dumps(meta, indent=2))
print("Saved model →", MODEL_PATH)
print("Saved scaler →", SCALER_PATH)
print("Saved metadata →", META_PATH)


Saved model → d:\Desktop\AlinasPrograms\myenv\10Pearls2\models\current\lstm_multioutput_72h.pt
Saved scaler → d:\Desktop\AlinasPrograms\myenv\10Pearls2\models\current\scaler.joblib
Saved metadata → d:\Desktop\AlinasPrograms\myenv\10Pearls2\models\current\metadata.json
