# Forecasting Air Pollution with Informer

*MSML 612 - Interim Project*  
Rohan, Shubhang, Adi, Swati, Josh

In [None]:
import os, warnings, datetime, math, json, random
from pathlib import Path
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# PyTorch stack
import torch
from torch import nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from pytorch_forecasting import (
    TimeSeriesDataSet, TemporalFusionTransformer, Baseline,
    QuantileLoss, NaNLabelEncoder,
)
# NeuralForecast Informer
from neuralforecast import NeuralForecast
from neuralforecast.models import Informer as NFInformer
from neuralforecast.losses.pytorch import MSE     # loss instance
# Seed the random number generators through Lightning so reruns are repeatable.
pl.seed_everything(seed=42, workers=True)
# Silence every Python warning for the remainder of the notebook.
warnings.filterwarnings(action="ignore")

In [None]:
# hyper‑parameters and file locations 
# Central notebook settings: input file, output folder, split dates,
# window lengths and training sizes.
# NOTE(review): SAVE_DIR, BATCH_SIZE and MAX_EPOCHS are not read by any later
# cell shown in this notebook -- confirm whether they are still needed.
CONFIG = {
    'RAW_FILE': 'pollution_2000_2023.csv',
    'SAVE_DIR': 'data/curated',   # output folder, created just below
    'TRAIN_END': '2016-12-31',    # last day of the training period
    'VAL_END': '2020-12-31',      # last day of the validation period
    'INPUT_WINDOW': 30,           # number of past days the model sees
    'PRED_WINDOW': 3,             # number of future days it predicts
    'BATCH_SIZE': 128,
    'MAX_EPOCHS': 30,             # training epochs for Lightning baselines
}

# Create SAVE_DIR (and any missing parents); do nothing if it already exists.
os.makedirs(CONFIG['SAVE_DIR'], exist_ok=True)

In [None]:
#loading and formatting raw data
def load_raw(path: str) -> pd.DataFrame:
    """Read the raw CSV at *path* and normalise it for the rest of the notebook.

    Column names are stripped of surrounding whitespace, have spaces replaced
    with underscores and are lower-cased. The ``date`` column is then parsed
    with ``pd.to_datetime``.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The loaded frame with normalised column names and a parsed ``date``
        column.

    Raises:
        KeyError: If no column is named ``date`` after normalisation.
    """
    frame = pd.read_csv(path, low_memory=False)

    # Normalise the header one step at a time: trim, underscore, lower-case.
    names = frame.columns.str.strip()
    names = names.str.replace(' ', '_')
    frame.columns = names.str.lower()

    # Parse the date column so later cells can compare it with Timestamps.
    frame['date'] = pd.to_datetime(frame['date'])
    return frame

# Read the raw CSV named in CONFIG and report how many rows it holds
df_raw = load_raw(CONFIG['RAW_FILE'])
print(f'Raw rows loaded: {len(df_raw):,}')

# Show three randomly sampled rows, transposed so each column becomes a line,
# limited to the first 20 columns
display(df_raw.sample(n=3).transpose().head(n=20))

# 1. ETL - reshape to NeuralForecast Format

In [None]:
# Columns carried into the long table: identifiers plus the four pollutant means
value_cols = ['o3_mean', 'co_mean', 'so2_mean', 'no2_mean']
id_cols = ['date', 'state', 'city']

# Keep only rows in which every selected column is present
# NOTE(review): if the raw file holds several rows per (date, city), the long
# table will contain repeated (unique_id, ds) pairs -- confirm, and aggregate
# them if so.
df_nf = df_raw[id_cols + value_cols].dropna().copy()

# Wide -> long: one row per (date, state, city, pollutant), reading stored in 'y'
df_nf = pd.melt(
    df_nf,
    id_vars=['date', 'state', 'city'],
    value_vars=value_cols,
    var_name='pollutant',
    value_name='y',
)

# Series key = city_state_pollutant, spaces removed and lower-cased.
# The pollutant part keeps its '_mean' suffix, e.g. 'denver_colorado_o3_mean'.
df_nf['unique_id'] = (
    df_nf['city'].str.replace(' ', '')
    .str.cat(
        [df_nf['state'].str.replace(' ', ''), df_nf['pollutant']],
        sep='_',
    )
    .str.lower()
)

# NeuralForecast column convention: unique_id / ds / y
df_nf = df_nf.rename(columns={'date': 'ds'})
df_nf = df_nf[['unique_id', 'ds', 'y']]

print('long‑format preview:')
display(df_nf.head())

## 1.2 Train / Val / Test split + standard scaling

In [None]:
# Cut-off dates from CONFIG, as Timestamps comparable with the 'ds' column
train_cut = pd.Timestamp(CONFIG['TRAIN_END'])
val_cut = pd.Timestamp(CONFIG['VAL_END'])

# Period masks: train <= TRAIN_END < val <= VAL_END < test
train_mask = df_nf['ds'].le(train_cut)
val_mask = df_nf['ds'].gt(train_cut) & df_nf['ds'].le(val_cut)
test_mask = df_nf['ds'].gt(val_cut)

# Independent copies, so scaling below leaves df_nf itself untouched
train_df = df_nf.loc[train_mask].copy()
val_df = df_nf.loc[val_mask].copy()
test_df = df_nf.loc[test_mask].copy()

print('Rows [train / val / test] =', [len(train_df), len(val_df), len(test_df)])

# Standardise the target with statistics learned from the training rows only.
# NOTE(review): a single scaler is fitted on all pollutants pooled together --
# confirm that one shared mean/std is intended.
scaler = StandardScaler()
scaler.fit(train_df[['y']])

train_df = train_df.assign(y=scaler.transform(train_df[['y']]))
val_df = val_df.assign(y=scaler.transform(val_df[['y']]))
test_df = test_df.assign(y=scaler.transform(test_df[['y']]))

# Persist the three scaled partitions as parquet for reproducibility.
# NOTE(review): the folder is hard-coded here; CONFIG['SAVE_DIR'] is not used.
Path('data/neuralforecast').mkdir(exist_ok=True, parents=True)
train_df.to_parquet('data/neuralforecast/train.parquet')
val_df.to_parquet('data/neuralforecast/val.parquet')
test_df.to_parquet('data/neuralforecast/test.parquet')

# 2. Train the Informer model

In [None]:
# Training frame = train rows followed by validation rows, re-indexed from 0.
# NOTE(review): no early-stopping or validation-size argument is passed to the
# model or to fit() below, so the validation rows are presumably used as plain
# training data and training runs for max_steps -- confirm against the
# NeuralForecast defaults.
df_train_val = pd.concat([train_df, val_df], ignore_index=True)

# Informer configuration; horizon and look-back come from CONFIG.
# NOTE(review): batch_size is fixed at 64 here; CONFIG['BATCH_SIZE'] is not used.
nf_informer = NFInformer(
    h=CONFIG['PRED_WINDOW'],            # forecast horizon
    input_size=CONFIG['INPUT_WINDOW'],  # look-back length
    hidden_size=64,                     # embedding dimension
    n_head=4,                           # attention heads
    factor=5,                           # ProbSparse attention factor
    dropout=0.1,
    learning_rate=1e-3,
    loss=MSE(),                         # a loss *instance*, not the class
    batch_size=64,
    max_steps=1000,                     # cap on training steps
)

# NeuralForecast wrapper around the single model, daily frequency
nf = NeuralForecast(models=[nf_informer], freq='D')

print('Training Informer …')
nf.fit(df=df_train_val)
print('Training complete')

# 3. Evaluate on the test set

In [None]:
# Forecast the steps that follow the data the model was fitted on
print("Generating 3‑day forecasts …")
test_forecasts = nf.predict() # horizon already = 3
print("Forecasts generated:")
display(test_forecasts.head())

# Restrict the (scaled) test truth to the date range covered by the forecasts
horizon_start = test_forecasts["ds"].min()
horizon_end = test_forecasts["ds"].max()

truth_slice = test_df[
    (test_df["ds"] >= horizon_start) &
    (test_df["ds"] <= horizon_end)
].copy()

# The first column that is neither key is taken as the model's prediction
pred_col = [c for c in test_forecasts.columns if c != "ds" and c != "unique_id"][0]
eval_df = (
    truth_slice.merge(test_forecasts, on=["unique_id", "ds"], how="left")
               .rename(columns={pred_col: "y_pred"})
)

# The left merge keeps every truth row, so a row whose (unique_id, ds) has no
# forecast carries NaN in y_pred. The metric functions below cannot score NaN,
# so report and drop those rows instead of failing.
missing_pred = eval_df["y_pred"].isna()
if missing_pred.any():
    print(f"Dropping {int(missing_pred.sum()):,} truth rows without a matching forecast")
    eval_df = eval_df[~missing_pred].copy()

# Recover the pollutant label from the '<city>_<state>_<pollutant>_mean' id
eval_df["pollutant"] = (
    eval_df["unique_id"]
    .str.extract(r"_(o3|co|so2|no2)_mean$")[0]
    .str.upper()
)

# Undo the standardisation to get values in the raw file's units.
# NOTE(review): the 'ppb' naming assumes every pollutant column in the raw
# file is expressed in ppb -- confirm against the dataset documentation.
eval_df["y_ppb"] = scaler.inverse_transform(eval_df[["y"]])
eval_df["y_pred_ppb"] = scaler.inverse_transform(eval_df[["y_pred"]])

# Per-pollutant RMSE / MAE / R2, both on the scaled and on the original scale
metrics = {}
for pol, grp in eval_df.groupby("pollutant"):
    metrics[pol] = dict(
        RMSE_scaled = math.sqrt(mean_squared_error(grp["y"], grp["y_pred"])),
        MAE_scaled = mean_absolute_error(grp["y"], grp["y_pred"]),
        R2_scaled = r2_score(grp["y"], grp["y_pred"]),
        RMSE_ppb = math.sqrt(mean_squared_error(grp["y_ppb"], grp["y_pred_ppb"])),
        MAE_ppb = mean_absolute_error(grp["y_ppb"], grp["y_pred_ppb"]),
        R2_ppb = r2_score(grp["y_ppb"], grp["y_pred_ppb"]),
    )

print("Test metrics")
print("```")
print(json.dumps(metrics, indent=2))
print("```")

# 4. Qualitative plot for one example series

In [None]:
# Pick one series to illustrate (the first one present in the evaluation frame)
sample_id = eval_df['unique_id'].iloc[0] # selecting first for submission

# Full history of that series, in date order so the line is drawn left to right.
# df_nf was never standardised (only the train/val/test copies were), so its
# 'y' is already in the original units and must NOT be inverse-transformed.
hist = df_nf[df_nf['unique_id'] == sample_id].sort_values('ds').copy()
hist['y_ppb'] = hist['y']

# Truth and prediction for the forecast horizon, already inverse-scaled in eval_df
fut  = eval_df[eval_df['unique_id'] == sample_id]
plt.figure(figsize=(10,4))
plt.plot(hist['ds'], hist['y_ppb'], label='History (ppb)')
plt.scatter(fut['ds'], fut['y_ppb'], label='Truth', marker='o')
plt.scatter(fut['ds'], fut['y_pred_ppb'], label='Pred', marker='x')
plt.title(f'Informer 3‑day forecast – {sample_id}')
plt.ylabel('Concentration (ppb)')
plt.legend(); plt.tight_layout(); plt.show()

# 5. Persist artefacts

In [None]:
# Turn the nested metrics dict (pollutant -> metric -> value) into a table
# with one row per pollutant, rounded to four decimals
metrics_df = pd.DataFrame(metrics).T
metrics_df = metrics_df.reset_index()
metrics_df = metrics_df.rename(columns={'index': 'Pollutant'})
metrics_df = metrics_df.round(4)

# Write the metrics table into ./results (created if missing)
Path('results').mkdir(exist_ok=True)
metrics_df.to_csv('results/test_metrics_informer.csv', index=False)

# joblib is only needed here, so it is imported at the point of use
import joblib
import json

# Store the fitted scaler and the run configuration next to the metrics
joblib.dump({'scaler': scaler}, 'results/scaler.joblib')
with open('results/config.json', 'w') as fp:
    json.dump(CONFIG, fp, indent=2)

print('Artefacts saved in ./results/')
