In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
SCaMPR Data Downloader

This script finds and downloads the latest 'GLB-5' data files from a public
S3 bucket. It maintains a fixed number of the most recent files in a local
directory, cleaning up older files as new ones are downloaded.

The configuration is managed through a 'configSCaMPR.yaml' file.
"""

# --- Standard Library Imports ---
import logging
import os
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path

# --- Third-Party Imports ---
import boto3
import requests
import yaml
from botocore import UNSIGNED
from botocore.config import Config
from botocore.exceptions import ClientError

# --- Constants ---
# Paths are resolved against the current working directory, which is the
# safer choice when the code runs interactively (e.g. in a notebook cell).
SCRIPT_DIR = Path.cwd()
CONFIG_FILE = SCRIPT_DIR.joinpath("configSCaMPR.yaml")

def setup_logging():
    """Set up root logging: INFO level, timestamped '[LEVEL] message' lines."""
    log_options = {
        "level": logging.INFO,
        "format": "%(asctime)s [%(levelname)s] %(message)s",
        "datefmt": "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**log_options)

def load_config(config_path):
    """
    Load and validate the YAML configuration file.

    Args:
        config_path (str | Path): Location of the YAML configuration file.

    Returns:
        dict: The parsed configuration; guaranteed to contain every
        required key.

    Exits the process with status 1 (after logging the reason) when the file
    is missing, is not valid YAML, is not a mapping, or lacks required keys.
    """
    required_keys = (
        'bucket_name', 'root_prefix', 'search_hours_back',
        'output_directory', 'max_files_to_keep',
    )
    try:
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)

        # safe_load returns None for an empty file and may return a list or a
        # scalar; only a mapping can be checked for keys.
        if not isinstance(config, dict):
            raise ValueError("Config file must contain a mapping of settings.")

        missing = [key for key in required_keys if key not in config]
        if missing:
            raise ValueError(
                "Config file is missing one or more required keys: "
                + ", ".join(missing)
            )
        return config
    except FileNotFoundError:
        logging.error(f"Config file not found at: {config_path}")
        sys.exit(1)
    except (yaml.YAMLError, ValueError) as e:
        logging.error(f"Error processing config file: {e}")
        sys.exit(1)

def find_latest_s3_links(config, num_files=12):
    """
    Find the latest N file links from the S3 bucket that contain 'GLB-5'.

    The bucket is searched one hourly prefix at a time
    ('<root_prefix>/YYYY/MM/DD/HH/'), starting at the current UTC hour and
    walking back 'search_hours_back' hours. The search stops as soon as
    num_files matching keys have been collected.

    Args:
        config (dict): The loaded configuration dictionary. Reads
            'bucket_name', 'root_prefix' and 'search_hours_back'.
        num_files (int): The number of latest file links to find. A value
            of zero or less yields an empty list.

    Returns:
        list: Download URLs sorted lexicographically. Because each key
        starts with its date/hour prefix, this orders the links oldest to
        newest across hours; within one hour the order follows the file
        names.
    """
    if num_files <= 0:
        return []

    found_links = []
    # Timezone-aware equivalent of the deprecated datetime.utcnow().
    utc_now = datetime.now(timezone.utc)
    s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so no keys are silently dropped.
    paginator = s3_client.get_paginator('list_objects_v2')

    logging.info(f"Searching for the latest {num_files} 'GLB-5' files...")

    for i in range(config['search_hours_back']):
        search_time = utc_now - timedelta(hours=i)
        prefix = f"{config['root_prefix']}/{search_time.strftime('%Y/%m/%d/%H')}/"

        keys_in_prefix = []
        try:
            pages = paginator.paginate(Bucket=config['bucket_name'], Prefix=prefix)
            for page in pages:
                for obj in page.get('Contents', []):
                    # Keys ending in '/' are folder placeholders, not files.
                    if not obj['Key'].endswith('/'):
                        keys_in_prefix.append(obj['Key'])
        except ClientError as e:
            logging.warning(f"S3 client error for prefix '{prefix}': {e}")
            continue

        # Walk this hour's keys newest first (reverse key order).
        for key in sorted(keys_in_prefix, reverse=True):
            if "GLB-5" not in key:
                continue
            found_links.append(f"https://{config['bucket_name']}.s3.amazonaws.com/{key}")
            if len(found_links) >= num_files:
                logging.info(f"Found {len(found_links)} file links. Target met.")
                # Return sorted oldest to newest for sequential download
                return sorted(found_links)

    logging.info(f"Search complete. Found a total of {len(found_links)} 'GLB-5' files.")
    return sorted(found_links)  # Return what was found, sorted oldest to newest

def download_file(url, local_path, config):
    """
    Download a single file from a URL with a progress bar.

    The response is streamed into a sibling '<name>.part' file and moved to
    local_path only after the transfer completes, so an interrupted or failed
    download never leaves a truncated file under the final name (which a
    later run would otherwise treat as already downloaded).

    Args:
        url (str): The URL of the file to download.
        local_path (Path): The local path to save the file to.
        config (dict): The loaded configuration dictionary. The optional
            'request_timeout_seconds' key sets the request timeout
            (default 30).

    Returns:
        bool: True if download was successful, False otherwise.
    """
    logging.info(f"Downloading: {local_path.name}")
    timeout = config.get('request_timeout_seconds', 30)
    temp_path = local_path.with_name(local_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            with open(temp_path, 'wb') as f:
                dl = 0
                for chunk in r.iter_content(chunk_size=8192):
                    dl += len(chunk)
                    f.write(chunk)
                    # The bar can only be drawn when the server reported a size.
                    if total_size > 0:
                        done = int(50 * dl / total_size)
                        sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl/1024/1024:.2f} MB")
                        sys.stdout.flush()
        sys.stdout.write("\n")  # Move to next line after progress bar
        # Publish the finished file under its final name in one step.
        os.replace(temp_path, local_path)
        return True
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError covers local write failures (disk full, permissions).
        logging.error(f"Failed to download {url}: {e}")
        return False
    finally:
        # Remove a leftover partial file; after a successful os.replace the
        # temp file no longer exists and this is a no-op.
        try:
            os.remove(temp_path)
        except OSError:
            pass

def cleanup_old_files(directory, max_files):
    """
    Ensure the directory contains at most max_files, deleting the oldest ones.

    Age is judged by file modification time. Only regular files directly
    inside the directory are considered; subdirectories are left alone.

    Args:
        directory (Path): The directory to clean.
        max_files (int): The maximum number of files to keep. A negative or
            non-numeric value is rejected and nothing is deleted.
    """
    try:
        keep_count = int(max_files)
    except (TypeError, ValueError, OverflowError):
        keep_count = -1
    if keep_count < 0:
        # Refuse rather than wipe the directory on a nonsensical limit.
        logging.error(f"Invalid max_files value {max_files!r}; skipping cleanup.")
        return

    logging.info(f"Cleaning directory '{directory}', keeping the newest {max_files} files.")
    try:
        # Get all files and sort them by modification time (oldest first)
        files = sorted(
            (p for p in directory.iterdir() if p.is_file()),
            key=os.path.getmtime,
        )
        # Everything before the newest keep_count entries is surplus.
        surplus = max(len(files) - keep_count, 0)
        for file_to_delete in files[:surplus]:
            logging.info(f" -> Deleting old file: {file_to_delete.name}")
            os.remove(file_to_delete)
    except FileNotFoundError:
        logging.warning(f"Cleanup directory '{directory}' not found. Skipping.")
    except Exception as e:
        logging.error(f"An error occurred during file cleanup: {e}")

def main():
    """
    Main execution function.

    Loads the configuration, finds the latest 'GLB-5' links, downloads the
    ones not yet present in the output directory, then trims that directory
    to 'max_files_to_keep' files.
    """
    setup_logging()
    config = load_config(CONFIG_FILE)

    output_dir = SCRIPT_DIR / config['output_directory']
    # parents=True: the configured directory may be nested (e.g. 'input/nc'),
    # and its parent directories may not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Find the latest file links from the S3 source
    latest_links = find_latest_s3_links(config, num_files=12)
    if not latest_links:
        logging.info("No 'GLB-5' files found to download.")
        return

    # 2. Determine which files are new and need to be downloaded
    new_files_to_download = []
    for link in latest_links:
        local_path = output_dir / link.rsplit('/', 1)[-1]
        if not local_path.exists():
            new_files_to_download.append((link, local_path))

    # 3. Download only the new files
    if new_files_to_download:
        logging.info(f"Found {len(new_files_to_download)} new files to download.")
        failed_count = 0
        for link, path in new_files_to_download:
            if not download_file(link, path, config):
                failed_count += 1
        if failed_count:
            # Each failure is logged by download_file; this is the summary.
            logging.warning(
                f"{failed_count} of {len(new_files_to_download)} downloads failed."
            )
    else:
        logging.info("All latest files already exist locally. No downloads needed.")

    # 4. Clean up the output directory once after all operations
    cleanup_old_files(output_dir, config['max_files_to_keep'])
    logging.info("--- Process finished ---")

if __name__ == "__main__":
    main()



2025-09-29 09:31:30 [INFO] Searching for the latest 12 'GLB-5' files...
2025-09-29 09:31:31 [INFO] Found 12 file links. Target met.
2025-09-29 09:31:31 [INFO] Found 12 new files to download.
2025-09-29 09:31:31 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290720000_e202509290729599_c202509290740324.nc




2025-09-29 09:31:34 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290730000_e202509290739599_c202509290756455.nc




2025-09-29 09:31:38 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290740000_e202509290749599_c202509290801041.nc




2025-09-29 09:31:42 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290750000_e202509290759599_c202509290811102.nc




2025-09-29 09:31:45 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290800000_e202509290809599_c202509290821500.nc





2025-09-29 09:31:48 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290810000_e202509290819599_c202509290831212.nc




2025-09-29 09:31:51 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290820000_e202509290829599_c202509290839205.nc





2025-09-29 09:31:55 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290830000_e202509290839599_c202509290851552.nc




2025-09-29 09:31:58 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290840000_e202509290849599_c202509290859309.nc





2025-09-29 09:32:01 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290850000_e202509290859599_c202509290909325.nc




2025-09-29 09:32:05 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290900000_e202509290909599_c202509290921524.nc




2025-09-29 09:32:08 [INFO] Downloading: RRQPE-INST-GLB-5_v1r1_blend_s202509290910000_e202509290919599_c202509290930032.nc





2025-09-29 09:32:11 [INFO] Cleaning directory '/cipslab_shared/cipslab/home/nugrahab/scampr/input/nc', keeping the newest 12 files.
2025-09-29 09:32:11 [INFO] --- Process finished ---
