In [0]:
import json
import requests
import time
import logging
from datetime import datetime, timedelta
from azure.storage.blob import BlobServiceClient

In [0]:
'''
Logging setup: INFO-level output to a stream handler for the notebook,
with chatty third-party libraries limited to WARNING and above.
'''

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    handlers=[logging.StreamHandler()],
)

# The notebook logs through the root logger; its level is set explicitly
# in addition to the basicConfig() call above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Only warnings and errors are wanted from these libraries.
for lib in ("azure", "urllib3", "py4j"):
    logging.getLogger(lib).setLevel(logging.WARNING)

In [0]:
'''
Fetches the Azure Storage connection string from the Databricks secret
scope "dev_secrets" (key "storageconnection").
'''

# NOTE(review): `dbutils` is not imported anywhere in this notebook; it is
# presumably the global provided by the Databricks runtime - confirm before
# running this cell in any other environment.
AZURE_CONNECTION_STRING = dbutils.secrets.get("dev_secrets", "storageconnection")

In [0]:
'''
Initializes the Azure Blob Storage clients used by every function below.
'''

# Name of the container that holds both the location and the weather blobs.
CONTAINER_NAME = "data"
# Service-level client built from the connection string fetched above.
blob_service = BlobServiceClient.from_connection_string(AZURE_CONNECTION_STRING)
# Container-scoped client; the functions in this notebook use it as a global
# for listing, downloading and uploading blobs.
container_client = blob_service.get_container_client(CONTAINER_NAME)

In [0]:
'''
Definition of blob paths and API endpoints.
'''

# Blob name prefix ("folder") under which the location JSON files are listed.
LOCATIONS_PATH = "openaq-locations-data"
# Blob name prefix ("folder") under which the weather JSON files are written.
WEATHER_PATH = "openmeteo-weather-data"
# Open-Meteo archive endpoint queried by fetch_weather_data().
OPEN_METEO_URL = "https://archive-api.open-meteo.com/v1/archive"

In [0]:
def list_location_blobs():
    '''
    Collects every location blob stored under the locations path.

    Only blobs whose name starts with "<LOCATIONS_PATH>/location_" are
    considered. A warning is logged when nothing matches.

    Returns:
        list: The matching blob entries (BlobProperties objects), or an
        empty list when no location file exists.
    '''
    location_prefix = f"{LOCATIONS_PATH}/location_"
    found = list(container_client.list_blobs(name_starts_with=location_prefix))

    if found:
        return found

    logger.warning("No location files found in Azure Blob Storage.")
    return []

In [0]:
def load_all_locations():
    '''
    Downloads and parses every location file, keeping the active ones.

    A location counts as active unless its JSON explicitly carries a falsy
    "is_active" value. Files that cannot be downloaded or parsed are logged
    as errors and skipped; they never abort the whole load.

    Returns:
        list: One dictionary per active location.
    '''
    active_locations = []

    for blob in list_location_blobs():
        try:
            payload = container_client.get_blob_client(blob.name).download_blob().readall()
            location = json.loads(payload.decode("utf-8"))

            # A missing "is_active" flag defaults to active.
            is_active = location.get("is_active", True)
            if is_active:
                active_locations.append(location)
        except Exception as exc:
            logger.error(f"Error reading location file '{blob.name}': {exc}")

    logger.info(f"Loaded {len(active_locations)} active locations from blob storage.")
    return active_locations

In [0]:
def get_latest_weather_file_for_location(location_id):
    '''
    Finds the most recently modified weather blob of a single location.

    Candidates are the blobs whose name starts with
    "<WEATHER_PATH>/weather_<location_id>_"; "most recent" is decided by the
    blob's last_modified attribute, not by the timestamp in its name.

    Args:
        location_id (int or str): The ID of the location.

    Returns:
        str or None: Name of the newest weather blob, or None when the
        location has no weather file yet.
    '''
    blob_prefix = f"{WEATHER_PATH}/weather_{location_id}_"
    candidates = container_client.list_blobs(name_starts_with=blob_prefix)

    # default=None covers the "no blobs at all" case without a separate check.
    newest = max(candidates, key=lambda candidate: candidate.last_modified, default=None)
    return None if newest is None else newest.name

In [0]:
def get_latest_weather_timestamp_for_location(location_id):
    '''
    Reads the newest weather file of a location and returns the greatest
    timestamp listed under its 'hourly.time' entry.

    Args:
        location_id (int or str): The ID of the location.

    Returns:
        datetime or None: The latest 'hourly.time' value parsed with the
        "%Y-%m-%dT%H:%M" format. None when the location has no weather
        file, when the file has no (or an empty) 'hourly.time' list, or
        when downloading/parsing fails (the failure is logged as an error).
    '''
    newest_blob_name = get_latest_weather_file_for_location(location_id)
    if not newest_blob_name:
        return None

    try:
        raw_bytes = container_client.get_blob_client(newest_blob_name).download_blob().readall()
        payload = json.loads(raw_bytes.decode("utf-8"))

        has_times = "hourly" in payload and "time" in payload["hourly"]
        hourly_times = payload["hourly"]["time"] if has_times else None
        if not hourly_times:
            return None

        # The timestamps are strings in a fixed zero-padded format, so the
        # lexicographic maximum is also the chronological maximum.
        return datetime.strptime(max(hourly_times), "%Y-%m-%dT%H:%M")

    except Exception as exc:
        logger.error(f"Error getting timestamp from file '{newest_blob_name}': {exc}")
        return None

In [0]:
def fetch_weather_data(latitude, longitude, location_id, start_datetime):
    '''
    Fetches hourly weather data from the Open-Meteo Archive API for one
    latitude/longitude pair, covering the date of start_datetime up to the
    current UTC date. The 'location_id' is attached to the returned JSON.

    Only the date part of start_datetime is sent to the API, so the response
    starts at the beginning of that day.

    The request is tried up to three times. An HTTP 429 response waits
    60 seconds before the next attempt; any other request failure (or an
    undecodable response body) waits 5 seconds.

    Args:
        latitude (float): The latitude of the location.
        longitude (float): The longitude of the location.
        location_id (int or str): The ID of the location.
        start_datetime (datetime): The starting timestamp for the weather data query.

    Returns:
        dict or None: A JSON-like dictionary with the weather data, or None
        when the start date lies after the current UTC date or when every
        attempt fails.
    '''
    retry_attempts = 3

    start_date_str = start_datetime.strftime("%Y-%m-%d")
    end_date_str = datetime.utcnow().strftime("%Y-%m-%d")

    # ISO "YYYY-MM-DD" strings sort chronologically, so a plain string
    # comparison is enough. Nothing can be requested for a future start date.
    if start_date_str > end_date_str:
        logger.error(
            f"Skipping location {location_id} - start_date ({start_date_str}) > end_date ({end_date_str})"
        )
        return None

    # The query is identical for every attempt, so it is built once.
    # NOTE(review): "timezone": "auto" presumably makes the API return
    # timestamps in the location's local time zone, while the rest of this
    # notebook compares them against UTC - confirm this is intended.
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date_str,
        "end_date": end_date_str,
        "timezone": "auto",
        "hourly": [
            "temperature_2m", "relative_humidity_2m", "dew_point_2m", "apparent_temperature", "pressure_msl",
            "surface_pressure", "precipitation", "rain", "snowfall", "cloud_cover", "cloud_cover_low",
            "cloud_cover_mid", "cloud_cover_high", "shortwave_radiation", "direct_radiation",
            "direct_normal_irradiance", "diffuse_radiation", "global_tilted_irradiance",
            "sunshine_duration", "wind_speed_10m", "wind_speed_100m", "wind_direction_10m",
            "wind_direction_100m", "wind_gusts_10m", "et0_fao_evapotranspiration", "weather_code",
            "snow_depth", "vapour_pressure_deficit", "soil_temperature_0_to_7cm", "soil_temperature_7_to_28cm",
            "soil_temperature_28_to_100cm", "soil_temperature_100_to_255cm", "soil_moisture_0_to_7cm",
            "soil_moisture_7_to_28cm", "soil_moisture_28_to_100cm", "soil_moisture_100_to_255cm"
        ]
    }

    for _ in range(retry_attempts):
        try:
            response = requests.get(OPEN_METEO_URL, params=params, timeout=200)

            if response.status_code == 429:
                logger.warning("API rate limit exceeded. Retrying in 60 seconds.")
                time.sleep(60)
                continue

            response.raise_for_status()
            data = response.json()
            data["location_id"] = location_id
            return data

        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error does not derive from RequestException.
        except (requests.exceptions.RequestException, ValueError) as e:
            logger.error(f"Error fetching weather data for location {location_id}: {e}")
            time.sleep(5)

    return None

In [0]:
def save_weather_for_location(weather_dict):
    '''
    Saves the weather data for a specific location to a distinct JSON blob
    named "<WEATHER_PATH>/weather_<location_id>_<UTC timestamp>.json",
    keeping only timestamps up to the current hour (UTC clock).

    The hourly series of weather_dict are trimmed in place before upload.
    Nothing is uploaded when the trimming leaves no timestamp at all: the
    reader of these files treats a newest file without timestamps as
    "no data yet", which would restart the backfill from the default date.

    Upload failures are logged as errors and not re-raised.

    Args:
        weather_dict (dict): The weather data dictionary to be saved.
                             Must contain 'location_id'.
    '''
    if not weather_dict:
        logger.warning("No weather data to save.")
        return

    # One clock reading serves both the hour cut-off and the blob name.
    now_utc = datetime.utcnow()
    current_utc_hour = now_utc.replace(minute=0, second=0, microsecond=0)
    location_id = weather_dict.get("location_id", "unknown")

    hourly = weather_dict.get("hourly")
    if hourly and "time" in hourly:
        # Positions of the timestamps that are not in the future.
        valid_indices = [
            i for i, t in enumerate(hourly["time"])
            if datetime.strptime(t, "%Y-%m-%dT%H:%M") <= current_utc_hour
        ]

        if not valid_indices:
            logger.warning(
                f"No timestamps up to the current hour for location {location_id}; nothing saved."
            )
            return

        # Apply the same selection to 'time' and to every data series so
        # that all lists stay aligned index by index.
        for key in list(hourly):
            series = hourly[key]
            hourly[key] = [series[i] for i in valid_indices]

    timestamp = now_utc.strftime("%Y%m%d%H%M%S")
    blob_name = f"{WEATHER_PATH}/weather_{location_id}_{timestamp}.json"

    try:
        blob_content = json.dumps(weather_dict, indent=2)
        container_client.upload_blob(blob_name, blob_content, overwrite=True)
        logger.debug(f"Weather data saved to: {blob_name}")
    except Exception as e:
        logger.error(f"Error saving weather data for location {location_id}: {e}")

In [0]:
def main():
    '''
    Runs the weather ingestion for every active location.

    For each location the newest stored weather timestamp is looked up and
    data is requested from one hour after it; locations without any stored
    weather data start at 2025-01-01. Responses that contain at least one
    hourly timestamp are handed to save_weather_for_location().

    Locations without a latitude or longitude are skipped with a warning.
    A coordinate of 0.0 is valid and is processed normally.
    '''
    logger.info("Starting weather data processing for all active locations.")

    locations = load_all_locations()
    if not locations:
        logger.warning("No active locations found. Exiting.")
        return

    # Backfill start for locations that have no weather file yet.
    default_start = datetime(2025, 1, 1)

    processed_count = 0
    for loc in locations:
        loc_id = loc.get("id")
        # "coordinates" may be absent or explicitly null in the location JSON.
        coordinates = loc.get("coordinates") or {}
        lat = coordinates.get("latitude")
        lon = coordinates.get("longitude")

        # Compare against None: a truthiness test would wrongly reject 0.0
        # (equator / prime meridian).
        if lat is None or lon is None:
            logger.warning(f"Skipping location {loc_id}, missing coordinates.")
            continue

        latest_weather_dt = get_latest_weather_timestamp_for_location(loc_id)
        if latest_weather_dt:
            start_datetime = latest_weather_dt + timedelta(hours=1)
        else:
            start_datetime = default_start

        weather_data = fetch_weather_data(lat, lon, loc_id, start_datetime)
        if (
            weather_data
            and "hourly" in weather_data
            and "time" in weather_data["hourly"]
            and weather_data["hourly"]["time"]
        ):
            save_weather_for_location(weather_data)
            processed_count += 1
        else:
            logger.info(f"No new weather data for location {loc_id} (start: {start_datetime}).")

    logger.info(f"Weather data saved for {processed_count} locations.")

In [0]:
# Entry point of the notebook: executing this cell runs the weather
# ingestion for all active locations.
main()