In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessed-aqi-data-3/preprocessed_aqi_data (3).csv


In [2]:
# === Multi-horizon AQI Forecasting with TCN (Kaggle-ready, start→finish) ===
from pathlib import Path
import json, warnings
warnings.filterwarnings("ignore")

# ----------------- Imports -----------------
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import matplotlib.pyplot as plt

# Torch (Kaggle already has torch)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils import weight_norm

In [3]:
# ----------------- CONFIG (Kaggle-aware) -----------------
WINDOW_SIZE     = 24       # past hours per sample
PREDICT_HORIZON = 72       # next 72 hours (3 days)
TARGET_COL      = "us_aqi"
TIME_COL_CANDS  = ["time", "datetime"]

# Training
EPOCHS          = 50
BATCH_SIZE      = 256
LR              = 2e-3
WD              = 1e-4
PATIENCE        = 8
DROPOUT         = 0.25
CHANNELS        = [128, 128, 128, 128]   # dilations 1,2,4,8 -> receptive field 31 (>24)
KERNEL_SIZE     = 3
RANDOM_SEED     = 42

# Paths (auto-detect Kaggle)
IS_KAGGLE = Path("/kaggle").exists()
if IS_KAGGLE:
    INPUT_ROOT = Path("/kaggle/input/preprocessed-aqi-data-3")
    DATA_PATH = INPUT_ROOT / "preprocessed_aqi_data (3).csv"     # <— your file
    FEATURES_JSON_PATH = INPUT_ROOT / "final_feature_list.json"  # add to dataset if you have it
    OUT_DIR  = Path("/kaggle/working/predictions")
    SAVE_DIR = Path("/kaggle/working/models/current")
else:
    PROJECT_ROOT = Path.cwd().parent
    DATA_PATH = PROJECT_ROOT / "preprocessed_aqi_data (3).csv"
    FEATURES_JSON_PATH = PROJECT_ROOT / "final_feature_list.json"
    OUT_DIR  = PROJECT_ROOT / "predictions"
    SAVE_DIR = PROJECT_ROOT / "models" / "current"

OUT_DIR.mkdir(parents=True, exist_ok=True)
SAVE_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# ----------------- Reproducibility & Device -----------------
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ------------------- LOAD & PARSE TIME (DAY-FIRST) -------------------
df = pd.read_csv(DATA_PATH)

# pick time column name automatically
for c in TIME_COL_CANDS:
    if c in df.columns:
        TIME_COL = c
        break
else:
    raise ValueError(f"No datetime column found. Expected one of: {TIME_COL_CANDS}")

# clean & parse as day-first (e.g., 4/8/25 -> 4 Aug 2025)
raw_time = (
    df[TIME_COL].astype(str)
      .str.strip()
      .str.replace("\u00A0", " ", regex=False)
      .str.replace("\u202F", " ", regex=False)
)
df[TIME_COL] = pd.to_datetime(raw_time, dayfirst=True, errors="coerce")

# sort chronologically
df = df.sort_values(TIME_COL).reset_index(drop=True)

Device: cuda


In [5]:
# ------------------- FEATURE LIST -------------------
if FEATURES_JSON_PATH.exists():
    feat_cols = json.loads(FEATURES_JSON_PATH.read_text())
    print(f"Loaded {len(feat_cols)} features from final_feature_list.json")
else:
    print("⚠️ final_feature_list.json not found; inferring numeric features automatically.")
    feat_cols = [c for c in df.columns if c not in [TARGET_COL, TIME_COL] and pd.api.types.is_numeric_dtype(df[c])]
    print(f"Inferred {len(feat_cols)} numeric features.")

missing = [c for c in feat_cols + [TARGET_COL, TIME_COL] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in data: {missing}")

# ------------------- Helpers: build windows -------------------
def build_sequences_limit(frame, features, target, window, horizon):
    n = len(frame)
    return n - window - horizon

def build_sequences_seq(frame, features, target, window, horizon):
    """
    Returns:
      X: (N, T, F) with oldest->newest order (T==window)
      Y: (N, H)
    """
    Xs, Ys = [], []
    Xmat = frame[features].values
    yvec = frame[target].values
    n = len(frame)
    limit = n - window - horizon
    for i in range(limit):
        Xs.append(Xmat[i:i+window])                        # (T, F)
        Ys.append(yvec[i+window:i+window+horizon])         # (H,)
    return np.asarray(Xs), np.asarray(Ys), limit

⚠️ final_feature_list.json not found; inferring numeric features automatically.
Inferred 52 numeric features.


In [6]:
# ------------------- TRAIN-ONLY WINSORIZATION + IMPUTE + SCALE -------------------
limit = build_sequences_limit(df, feat_cols, TARGET_COL, WINDOW_SIZE, PREDICT_HORIZON)
if limit <= 0:
    raise ValueError("Not enough rows to make sliding windows. Add more data.")

train_windows = int(limit * 0.8)
raw_end_for_train_inputs = (train_windows - 1) + WINDOW_SIZE
raw_end_for_train_inputs = max(raw_end_for_train_inputs, WINDOW_SIZE)

# Winsorize on TRAIN INPUT rows only (1%/99%)
numeric_feats = [c for c in feat_cols if pd.api.types.is_numeric_dtype(df[c])]
low = df.loc[:raw_end_for_train_inputs, numeric_feats].quantile(0.01).to_dict()
high = df.loc[:raw_end_for_train_inputs, numeric_feats].quantile(0.99).to_dict()
for c in numeric_feats:
    df[c] = df[c].clip(lower=low[c], upper=high[c])

# Median impute using TRAIN INPUT rows only
train_medians = df.loc[:raw_end_for_train_inputs, feat_cols].median(numeric_only=True).to_dict()
df[feat_cols] = df[feat_cols].fillna(train_medians)

# Scale (fit on TRAIN INPUT rows only)
scaler = StandardScaler()
scaler.fit(df.loc[:raw_end_for_train_inputs, feat_cols].values)
df[feat_cols] = scaler.transform(df[feat_cols].values)

In [7]:
# ------------------- BUILD SEQUENCES AFTER CLEANING/SCALING -------------------
X_seq, y_seq, limit = build_sequences_seq(df, feat_cols, TARGET_COL, WINDOW_SIZE, PREDICT_HORIZON)
N, T, F = X_seq.shape
H = y_seq.shape[1]
assert T == WINDOW_SIZE and H == PREDICT_HORIZON
print(f"Windows: {N}, Window={T}, Features={F}, Horizon={H}")

# ------------------- TRAIN/VAL SPLIT (chronological 80/20) -------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_seq, y_seq, test_size=0.2, shuffle=False
)

# ------------------- TCN MODEL -------------------
class CausalConv1d(nn.Module):
    """Left-pad only, to keep causality."""
    def __init__(self, in_ch, out_ch, kernel_size, dilation):
        super().__init__()
        pad = (kernel_size - 1) * dilation
        self.pad = nn.ConstantPad1d((pad, 0), 0.0)
        self.conv = weight_norm(nn.Conv1d(in_ch, out_ch, kernel_size,
                                          dilation=dilation, padding=0))
    def forward(self, x):  # x: (B, C, L)
        return self.conv(self.pad(x))

class TemporalBlock(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size, dilation, dropout):
        super().__init__()
        self.net = nn.Sequential(
            CausalConv1d(in_ch, out_ch, kernel_size, dilation),
            nn.ReLU(),
            nn.Dropout(dropout),
            CausalConv1d(out_ch, out_ch, kernel_size, dilation),
            nn.ReLU(),
            nn.Dropout(dropout),
        )
        self.downsample = nn.Conv1d(in_ch, out_ch, 1) if in_ch != out_ch else None
    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return torch.relu(out + res)

class TCN(nn.Module):
    def __init__(self, in_feats, channels, kernel_size, dropout, horizon):
        super().__init__()
        layers = []
        prev_c = in_feats
        for i, c in enumerate(channels):
            dilation = 2 ** i
            layers.append(TemporalBlock(prev_c, c, kernel_size, dilation, dropout))
            prev_c = c
        self.tcn = nn.Sequential(*layers)
        self.head = nn.Sequential(
            nn.Linear(channels[-1], 256),
            nn.ReLU(),
            nn.Linear(256, horizon)
        )
    def forward(self, x):              # x: (B, T, F)
        x = x.permute(0, 2, 1)         # -> (B, F, T)
        y = self.tcn(x)                # (B, C, T)
        last = y[:, :, -1]             # last time step (B, C)
        return self.head(last)         # (B, horizon)

model = TCN(F, CHANNELS, KERNEL_SIZE, DROPOUT, H).to(device)
criterion = nn.SmoothL1Loss()   # Huber
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)

# DataLoaders
train_dl = DataLoader(TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
), batch_size=BATCH_SIZE, shuffle=False)

val_dl = DataLoader(TensorDataset(
    torch.tensor(X_val, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.float32)
), batch_size=BATCH_SIZE, shuffle=False)

Windows: 2208, Window=24, Features=52, Horizon=72


In [9]:
best_val_mae = float("inf")
best_state = None
wait = 0

def eval_loader(dataloader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dataloader:
            xb = xb.to(device); yb = yb.to(device)
            pred = model(xb)
            preds.append(pred.cpu().numpy())
            trues.append(yb.cpu().numpy())
    return np.vstack(preds), np.vstack(trues)

for ep in range(1, EPOCHS+1):
    model.train()
    for xb, yb in train_dl:
        xb = xb.to(device); yb = yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)  # ← fixed
        optimizer.step()

    # Validate
    val_pred, val_true = eval_loader(val_dl)
    val_mae  = mean_absolute_error(val_true, val_pred)
    scheduler.step(val_mae)
    print(f"Epoch {ep:02d} | Val MAE: {val_mae:.4f}")

    if val_mae < best_val_mae - 1e-4:
        best_val_mae = val_mae
        best_state = {k: v.cpu() for k, v in model.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= PATIENCE:
            print("Early stopping.")
            break

if best_state:
    model.load_state_dict(best_state)


Epoch 01 | Val MAE: 334.6885
Epoch 02 | Val MAE: 39.8368
Epoch 03 | Val MAE: 8.9431
Epoch 04 | Val MAE: 15.9521
Epoch 05 | Val MAE: 10.8660
Epoch 06 | Val MAE: 9.1476
Epoch 07 | Val MAE: 35.3650
Epoch 08 | Val MAE: 9.4651
Epoch 09 | Val MAE: 7.0653
Epoch 10 | Val MAE: 10.0670
Epoch 11 | Val MAE: 9.5027
Epoch 12 | Val MAE: 6.2276
Epoch 13 | Val MAE: 7.1481
Epoch 14 | Val MAE: 6.0640
Epoch 15 | Val MAE: 15.6402
Epoch 16 | Val MAE: 19.2960
Epoch 17 | Val MAE: 12.6974
Epoch 18 | Val MAE: 6.5028
Epoch 19 | Val MAE: 9.5930
Epoch 20 | Val MAE: 5.7691
Epoch 21 | Val MAE: 6.2426
Epoch 22 | Val MAE: 9.0018
Epoch 23 | Val MAE: 8.5239
Epoch 24 | Val MAE: 6.8508
Epoch 25 | Val MAE: 6.1187
Epoch 26 | Val MAE: 5.9358
Epoch 27 | Val MAE: 6.2435
Epoch 28 | Val MAE: 6.1734
Early stopping.


In [12]:
# ------------------- FINAL METRICS (Train + Val) -------------------
train_pred, train_true = eval_loader(train_dl)
val_pred,   val_true   = eval_loader(val_dl)

def metrics_block(y_t, y_p, label):
    mae  = mean_absolute_error(y_t, y_p)
    rmse = mean_squared_error(y_t, y_p, squared=False)
    r2s = [r2_score(y_t[:, h], y_p[:, h]) for h in range(H)]
    print(f"\n=== {label} Metrics (averaged over {H} horizons) ===")
    print(f"MAE:  {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R²:   {np.mean(r2s):.3f}")
    mae_list  = [mean_absolute_error(y_t[:, h], y_p[:, h]) for h in range(H)]
    if H >= 12: print(f"First 12 horizons MAE: {[round(m,3) for m in mae_list[:12]]}")
    if H >= 24: print(f"24h MAE: {mae_list[23]:.3f}")
    if H >= 48: print(f"48h MAE: {mae_list[47]:.3f}")
    if H >= 72: print(f"72h MAE: {mae_list[71]:.3f}")
    return mae_list, r2s

train_mae_list, train_r2_list = metrics_block(train_true, train_pred, "Train")
val_mae_list,   val_r2_list   = metrics_block(val_true,   val_pred,   "Validation")

# Save per-horizon validation metrics
val_report = pd.DataFrame({
    "horizon": np.arange(1, H+1),
    "MAE": val_mae_list,
    "R2":  val_r2_list
})
val_report.to_csv(OUT_DIR / "tcn_val_per_horizon_metrics.csv", index=False)
print("Saved:", OUT_DIR / "tcn_val_per_horizon_metrics.csv")


=== Train Metrics (averaged over 72 horizons) ===
MAE:  8.113
RMSE: 10.728
R²:   0.339
First 12 horizons MAE: [7.718, 7.597, 7.384, 7.338, 7.176, 7.201, 7.039, 7.04, 6.945, 6.911, 6.865, 6.914]
24h MAE: 7.138
48h MAE: 8.535
72h MAE: 10.118

=== Validation Metrics (averaged over 72 horizons) ===
MAE:  5.769
RMSE: 7.229
R²:   -0.720
First 12 horizons MAE: [5.188, 5.12, 5.105, 5.181, 5.161, 5.186, 5.216, 5.223, 5.157, 5.211, 5.174, 5.18]
24h MAE: 5.652
48h MAE: 6.036
72h MAE: 6.267
Saved: /kaggle/working/predictions/tcn_val_per_horizon_metrics.csv


In [13]:
# ------------------- FORECAST NEXT 72 HOURS (ONE CSV) -------------------
last_window = df[feat_cols].values[-WINDOW_SIZE:]                     # (T, F)
last_window_t = torch.tensor(last_window, dtype=torch.float32).unsqueeze(0).to(device)  # (1, T, F)

model.eval()
with torch.no_grad():
    future_pred = model(last_window_t).cpu().numpy().reshape(-1)      # (72,)

# anchor and format times: d/m/yy HH:MM
last_valid_time = df.loc[df[TIME_COL].notna(), TIME_COL].iloc[-1]
start = last_valid_time.floor("H") + pd.Timedelta(hours=1)
future_times = pd.date_range(start=start, periods=PREDICT_HORIZON, freq="h")
ft = pd.Series(future_times)
formatted_dt = (
    ft.dt.day.astype(str) + "/" +
    ft.dt.month.astype(str) + "/" +
    ft.dt.strftime("%y") + " " +
    ft.dt.strftime("%H:%M")
)

forecast_df = pd.DataFrame({
    "datetime": formatted_dt,
    "predicted_aqi_us": future_pred
})
pred_csv = OUT_DIR / "tcn_predicted_aqi_72hrs.csv"
forecast_df.to_csv(pred_csv, index=False)
print(f"\nSaved forecast → {pred_csv}")
print("First/last timestamps:", forecast_df['datetime'].iloc[0], "→", forecast_df['datetime'].iloc[-1])


Saved forecast → /kaggle/working/predictions/tcn_predicted_aqi_72hrs.csv
First/last timestamps: 10/8/25 00:00 → 12/8/25 23:00


In [14]:
# ------------------- SAVE MODEL + METADATA + SCALER -------------------
MODEL_PATH   = SAVE_DIR / "tcn_multioutput_72h.pt"
SCALER_PATH  = SAVE_DIR / "scaler.joblib"
META_PATH    = SAVE_DIR / "metadata.json"

torch.save(model.state_dict(), MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

meta = {
    "features": feat_cols,
    "target_col": TARGET_COL,
    "time_col": TIME_COL,
    "window_size": WINDOW_SIZE,
    "horizon": PREDICT_HORIZON,
    "dayfirst": True,
    "winsor_low": low,
    "winsor_high": high,
    "train_medians": train_medians,
    "scaler_path": str(SCALER_PATH),
    "model": "TCN",
    "channels": CHANNELS,
    "kernel_size": KERNEL_SIZE,
    "dropout": DROPOUT
}
META_PATH.write_text(json.dumps(meta, indent=2))
print("Saved model →", MODEL_PATH)
print("Saved scaler →", SCALER_PATH)
print("Saved metadata →", META_PATH)

Saved model → /kaggle/working/models/current/tcn_multioutput_72h.pt
Saved scaler → /kaggle/working/models/current/scaler.joblib
Saved metadata → /kaggle/working/models/current/metadata.json
