# Bronze - Locations Ingestion

**Purpose**: Fetch all Canadian air quality monitoring stations from the OpenAQ API\
**Schedule**: Daily\
**Source**: OpenAQ API v3 `/locations` endpoint\
**Target**: `airquality.bronze.locations`

### 1.0 - Configuration

In [0]:
%run ./00_bronze_utils

### 2.0 - API Call Functions

In [0]:
import time

def fetch_all_locations(countries_id, *, page_size=1000, delay_seconds=1):
    """
    Fetch all locations for a country with automatic pagination.

    Pages are requested from the "locations" endpoint through
    ``fetch_openaq`` until a page comes back with fewer than ``page_size``
    results, which is treated as the last page.

    Args:
        countries_id: OpenAQ country ID (e.g., 156 for Canada)
        page_size: Number of locations requested per page. The same value is
            used as the "short page" threshold that ends pagination, so the
            request limit and the stop condition cannot drift apart.
        delay_seconds: Pause, in seconds, between consecutive page requests.

    Returns:
        List of location dictionaries

    Raises:
        ValueError: If ``page_size`` is less than 1.
        KeyError: If a response does not contain a "results" key.

    Note:
        The default 1 second delay between pages is there to respect the
        API rate limit (60/min)
    """
    # A non-positive page size could never satisfy the "short page" stop
    # condition below, so the loop would never terminate.
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size!r}")

    all_locations = []
    page = 1

    while True:
        data = fetch_openaq("locations", params={
            "countries_id": countries_id,
            "limit": page_size,
            "page": page,
        })

        results = data["results"]
        all_locations.extend(results)

        print(f"Page {page}: fetched {len(results)} locations (total: {len(all_locations)})")

        # A page that is not full means there is nothing left to fetch.
        if len(results) < page_size:
            break

        page += 1
        # Only sleep when another request is actually going to be made.
        time.sleep(delay_seconds)

    return all_locations

### 3.0 - Run Ingestion

In [0]:
# OpenAQ country ID for Canada
CANADA_COUNTRIES_ID = 156

canada_locations = fetch_all_locations(CANADA_COUNTRIES_ID)
print(f"\nTotal: {len(canada_locations)} locations in Canada")


### 4.0 - Clean Data

In [0]:
import json

# Build a flat copy of every location: simple fields are copied as-is and
# nested fields are serialized to JSON strings.
# "distance" is intentionally left out (always null).
passthrough_fields = ("name", "locality", "timezone", "isMobile", "isMonitor")
json_fields = (
    "country",
    "owner",
    "provider",
    "instruments",
    "sensors",
    "coordinates",
    "licenses",
    "bounds",
    "datetimeFirst",
    "datetimeLast",
)

locations_clean = [
    {
        # "id" is the only field accessed without a fallback.
        "id": record["id"],
        **{field: record.get(field) for field in passthrough_fields},
        **{field: json.dumps(record.get(field)) for field in json_fields},
    }
    for record in canada_locations
]

print(f"Cleaned {len(locations_clean)} locations")

### 5.0 - Save Locations to Bronze

In [0]:
import pandas as pd
from pyspark.sql import functions as F
from datetime import datetime, timezone

pdf = pd.DataFrame(locations_clean)

# An empty frame has no columns for Spark to infer a schema from, so fail
# early with a clear message instead of an opaque schema-inference error.
if pdf.empty:
    raise ValueError("No locations to save: locations_clean is empty")

df = spark.createDataFrame(pdf)

# Add metadata: UTC ingestion time (ISO-8601 string) and the data source tag.
ingested_at = datetime.now(timezone.utc).isoformat()
df_bronze = (
    df.withColumn("ingested_at", F.lit(ingested_at))
      .withColumn("source", F.lit("openaq_api_v3"))
)

# Append to bronze table
target_table = f"{CATALOG}.{SCHEMA}.locations"
df_bronze.write.mode("append").saveAsTable(target_table)

# Report the row count from the pandas frame: withColumn does not change the
# number of rows, and this avoids running a second Spark job just to count.
print(f"Saved {len(pdf)} locations to {target_table}")

### 6.0 - Check Locations Table

In [0]:
# List the tables in the same catalog/schema the ingestion step writes to.
spark.sql(f"SHOW TABLES IN {CATALOG}.{SCHEMA}").display()

In [0]:
# Inspect the table that the ingestion step appends to.
spark.sql(f"DESCRIBE TABLE EXTENDED {CATALOG}.{SCHEMA}.locations").display()