In [0]:
import json
import logging
import time
from datetime import datetime, timezone

import requests
from azure.storage.blob import BlobServiceClient

In [0]:
'''
Logging setup: INFO-level output for this notebook, with the loggers of
external libraries turned down to WARNING.
'''

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()],
)

# Quieten third-party loggers so only warnings and errors from them are shown.
for lib in ("azure", "urllib3", "py4j"):
    logging.getLogger(lib).setLevel(logging.WARNING)

# The root logger is used directly. Its level is set explicitly because
# basicConfig() does nothing when the root logger already has handlers.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [0]:
'''
Fetching access data from Databricks secrets.

Both values are read through dbutils from the "dev_secrets" scope, so no
credential is hard-coded in the notebook.
'''

# Connection string used to build the Azure Blob Storage client.
AZURE_CONNECTION_STRING = dbutils.secrets.get("dev_secrets", "storageconnection")
# API key sent to OpenAQ in the X-API-Key request header.
API_KEY_OPENAQ = dbutils.secrets.get("dev_secrets", "openaq")

In [0]:
'''
Initializing the client for handling Azure Blob Storage.
'''

# Name of the blob container that every read and write in this notebook targets.
CONTAINER_NAME = "data"
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
# Container-scoped client shared by the helper functions defined in later cells.
container_client = blob_service.get_container_client(CONTAINER_NAME)

In [0]:
'''
Definition of paths and API endpoints.
'''

# Base URL of the OpenAQ v3 sensors endpoint; the sensor id and
# "/measurements/hourly" are appended when measurements are requested.
API_SENSORS_URL = "https://api.openaq.org/v3/sensors"
# Blob name prefixes inside the container.
# LOCATIONS_PATH is not referenced by the code visible in this notebook.
LOCATIONS_PATH = "openaq-locations-data"
# Prefix under which measurement files are listed and written.
MEASUREMENTS_PATH = "openaq-measurements-data"
# Prefix under which sensor definition blobs are listed.
SENSORS_PATH = "openaq-sensors-data"

In [0]:
'''
Definition of the date range for data retrieval.

end_date_iso is the current UTC time truncated to the start of the hour.
start_date_iso is the default lower bound; main() overwrites it when an
earlier run has already stored measurements.
'''

# datetime.utcnow() is deprecated since Python 3.12. An aware UTC "now"
# formats to exactly the same string.
end_date_iso = (
    datetime.now(timezone.utc)
    .replace(minute=0, second=0, microsecond=0)
    .strftime("%Y-%m-%dT%H:%M:%SZ")
)
start_date_iso = "2025-01-01T00:00:00Z"

In [0]:
def get_latest_measurements_file():
    '''
    Looks up the most recently modified blob under the measurements prefix.

    Returns:
        str or None: Name of that blob, or None when no blob matches the prefix.
    '''
    logger.info("Searching for the latest measurement file in Azure Blob Storage.")
    candidates = list(container_client.list_blobs(name_starts_with=MEASUREMENTS_PATH))

    if candidates:
        # "Latest" means the blob with the greatest last_modified value.
        newest = max(candidates, key=lambda blob: blob.last_modified)
        logger.info(f"Found file: {newest.name}")
        return newest.name

    logger.warning("No previous measurement files found. Fetching data from the beginning.")
    return None

In [0]:
def get_latest_measurement_timestamp(blob_name):
    '''
    Retrieves the latest timestamp from a measurement file in Azure Blob Storage.

    The file is downloaded, parsed as JSON and scanned for the greatest
    coverage.datetimeTo.utc value. Values are compared as strings.

    Args:
        blob_name (str): Name of the file in Azure Blob Storage.

    Returns:
        str or None: Latest timestamp in UTC format, or None if the file is
        empty, contains no record with coverage.datetimeTo, or cannot be read.
    '''
    try:
        logger.info(f"Reading latest timestamp from file {blob_name}.")
        blob_client = container_client.get_blob_client(blob_name)
        blob_data = blob_client.download_blob().readall().decode("utf-8")
        measurements = json.loads(blob_data)

        if not measurements:
            logger.warning("File is empty. Fetching data from the beginning.")
            return None

        # default=None keeps max() from raising ValueError when no record
        # carries coverage.datetimeTo, so that case is reported accurately
        # instead of as a generic read error.
        latest_timestamp = max(
            (
                m["coverage"]["datetimeTo"]["utc"]
                for m in measurements
                if "coverage" in m and "datetimeTo" in m["coverage"]
            ),
            default=None,
        )

        if latest_timestamp is None:
            logger.warning(
                "No record with coverage.datetimeTo found in file. Fetching data from the beginning."
            )
            return None

        logger.info(f"Latest coverage_datetimeTo_utc: {latest_timestamp}")
        return latest_timestamp

    except Exception as e:
        # Best effort: any download or parse problem falls back to a full fetch.
        logger.error(f"Error reading timestamp: {e}")
        return None

In [0]:
def load_active_sensors_from_blob():
    '''
    Reads every blob under the sensors prefix and keeps the active sensors.

    A sensor counts as active when its "is_active" field is truthy or missing.

    Returns:
        list: Active sensors as dictionaries; an empty list when no sensor
        files exist, none are active, or reading fails.
    '''
    try:
        logger.info("Fetching all sensors from Azure Blob Storage.")

        sensor_blobs = list(container_client.list_blobs(name_starts_with=SENSORS_PATH))
        if not sensor_blobs:
            logger.warning("No sensor files found.")
            return []

        active_sensors = []
        for sensor_blob in sensor_blobs:
            # Each blob is parsed as one JSON document describing a sensor.
            raw_payload = container_client.get_blob_client(sensor_blob.name).download_blob().readall()
            sensor_record = json.loads(raw_payload)
            if sensor_record.get("is_active", True):
                active_sensors.append(sensor_record)

        if active_sensors:
            logger.info(f"Loaded {len(active_sensors)} active sensors.")
        else:
            logger.warning("No active sensors found.")
        return active_sensors

    except Exception as e:
        logger.error(f"Error reading sensors: {e}")
        return []

In [0]:
def fetch_measurements(sensor):
    '''
    Fetches hourly measurement data for a given sensor from the OpenAQ API.

    Pages through the sensor's hourly measurements endpoint for the range
    between the module-level start_date_iso and end_date_iso, and tags every
    record with the sensor's location_id, sensor_id and parameter name.

    Args:
        sensor (dict): Dictionary containing sensor_id, location_id and parameter.

    Returns:
        list: List of measurement records. If a request fails, or the API keeps
        answering 429 after the retry budget is used up, the records collected
        so far are returned.
    '''
    page = 1
    limit = 1000
    request_timeout = 30          # seconds; without it requests.get can block forever
    max_rate_limit_retries = 5    # consecutive 429 responses tolerated per page
    rate_limit_retries = 0
    measurements = []

    url = f"{API_SENSORS_URL}/{sensor['sensor_id']}/measurements/hourly"

    while True:
        try:
            # Passing the query via params lets requests URL-encode the values.
            response = requests.get(
                url,
                headers={"X-API-Key": API_KEY_OPENAQ},
                params={
                    "datetime_from": start_date_iso,
                    "datetime_to": end_date_iso,
                    "limit": limit,
                    "page": page,
                },
                timeout=request_timeout,
            )

            if response.status_code == 429:
                rate_limit_retries += 1
                if rate_limit_retries > max_rate_limit_retries:
                    # Give up instead of sleeping in an endless loop.
                    logger.error(
                        f"API rate limit still exceeded after {max_rate_limit_retries} retries "
                        f"for sensor {sensor['sensor_id']}. Giving up."
                    )
                    break
                logger.warning("API rate limit exceeded. Retrying in 60 seconds.")
                time.sleep(60)
                continue

            # A non-429 response resets the retry budget for the next page.
            rate_limit_retries = 0

            response.raise_for_status()
            data = response.json().get("results", [])

            if not data:
                break

            for record in data:
                record["location_id"] = sensor["location_id"]
                record["sensor_id"] = sensor["sensor_id"]
                record["parameter_name"] = sensor["parameter"]

            measurements.extend(data)
            logger.info(f"Page {page}: Retrieved {len(data)} records for sensor {sensor['sensor_id']} ({sensor['parameter']})")

            # A short page means this was the last one.
            if len(data) < limit:
                break

            page += 1
            time.sleep(1)  # pause between pages

        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching data for sensor {sensor['sensor_id']}: {e}")
            break

    return measurements

In [0]:
def save_to_blob(data, path):
    '''
    Saves measurement data to Azure Blob Storage as a timestamped JSON file.

    Nothing is uploaded when data is empty. Upload errors are logged and not
    re-raised.

    Args:
        data (list): List of measurement records to be saved.
        path (str): Azure Blob Storage path, used as the blob name prefix.
    '''
    if not data:
        logger.warning("No new data to save.")
        return

    # File name format: measurements_YYYYMMDDHHMMSS.json, in UTC.
    # datetime.now(timezone.utc) replaces datetime.utcnow(), which is
    # deprecated since Python 3.12; the formatted string is the same.
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')
    blob_name = f"{path}/measurements_{timestamp}.json"

    try:
        container_client.upload_blob(blob_name, json.dumps(data, indent=2), overwrite=True)
        logger.info(f"Data successfully saved to {blob_name}.")

    except Exception as e:
        logger.error(f"Error saving data: {e}")

In [0]:
def main():
    '''
    Runs one incremental load of OpenAQ measurements into Azure Blob Storage.

    Steps:
        1. If a previous measurement file exists, move the module-level
           start_date_iso to the latest timestamp found in it.
        2. Load the active sensors.
        3. Fetch measurements for each sensor and save them as one file.

    Raises:
        SystemExit: With exit code 1 when no active sensors could be loaded.
    '''
    # fetch_measurements() reads start_date_iso from module scope, so the
    # resume point has to be written to the global.
    global start_date_iso

    latest_measurements_file = get_latest_measurements_file()
    if latest_measurements_file:
        latest_timestamp = get_latest_measurement_timestamp(latest_measurements_file)
        if latest_timestamp:
            start_date_iso = latest_timestamp

    sensors = load_active_sensors_from_blob()
    if not sensors:
        # exit() is a helper meant for interactive sessions and is not reliable
        # inside a notebook kernel. Raising SystemExit stops the run here.
        logger.error("No active sensors available. Aborting.")
        raise SystemExit(1)

    all_measurements = []
    for sensor in sensors:
        all_measurements.extend(fetch_measurements(sensor))

    if all_measurements:
        save_to_blob(all_measurements, MEASUREMENTS_PATH)

In [0]:
# Entry point: run the incremental measurement load when this cell executes.
main()