# Air Quality Dashboard Pipeline Notebook

This notebook runs the **full ingestion and cleaning pipeline** for the project, then displays the first few rows of the cleaned data and a quick NO₂ time-series plot.



In [None]:
# Pipeline wrapper: run ETL/cleaning from the notebook

import duckdb
import matplotlib.pyplot as plt
import pandas as pd

from prototype.cleaning.clean import clean
from prototype.ingestion.ingest import ingest


In [None]:
# Ingestion step.
#   raw_dir -> path to the raw data
#   db_path -> DuckDB file the pipeline outputs to
ingest(raw_dir="data/raw", db_path="data/airquality.duckdb")


In [None]:
# Cleaning step.
#   db_path       -> DuckDB file to clean
#   max_gap_hours -> example value: max gap (in hours) for forward fill
clean(db_path="data/airquality.duckdb", max_gap_hours=2)


In [None]:
# Connect to DuckDB, list its tables, and preview five rows from each cleaned table.
con = duckdb.connect("data/airquality.duckdb")
try:
    # Print the table list first so it is still visible if one of the
    # sample queries below fails (e.g. a table has not been created).
    print("Tables:", con.execute("SHOW TABLES").fetchall())

    # .df() returns each result as a pandas DataFrame, so the samples can be
    # displayed after the connection has been closed.
    df_aurn = con.execute("SELECT * FROM clean_aurn LIMIT 5").df()
    df_weather = con.execute("SELECT * FROM clean_weather LIMIT 5").df()
finally:
    # Always close the connection, even when a query raises; otherwise the
    # open connection lingers in the kernel.
    con.close()

print("AURN sample:")
display(df_aurn)

print("Weather sample:")
display(df_weather)


In [None]:
# Quick plot: NO2 time series from clean_aurn
# (matplotlib.pyplot is imported as `plt` in the notebook's import cell.)

# Load the first 100 rows in datetime order. The try/finally guarantees the
# connection is closed even if the query fails.
con = duckdb.connect("data/airquality.duckdb")
try:
    df = con.execute("SELECT datetime, no2 FROM clean_aurn ORDER BY datetime LIMIT 100").df()
finally:
    con.close()

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df["datetime"], df["no2"], label="NO₂ (clean)")
ax.set_xlabel("Datetime")
ax.set_ylabel("NO₂ (µg/m³)")
ax.set_title("NO₂ Time Series (Sample)")
ax.legend()
fig.tight_layout()
plt.show()
