In [2]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Directory listing on the NCEI server that holds the monthly ERSST v5 netCDF files.
base_url = "https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/"

# Local directory holding the files that have already been downloaded.
local_archive = Path("/Users/brianpm/Dropbox/Data/ersst")
current_files = sorted(local_archive.glob("ersst.v5.*.nc"))
if current_files:
    print(f"Current ERSST v5 files: n={len(current_files)}, first one is {current_files[0].name}, last one is {current_files[-1].name}")
else:
    # No matching files yet: indexing current_files[0] would raise IndexError,
    # so report the empty state instead of crashing before the remote listing.
    print(f"Current ERSST v5 files: n=0 (no ersst.v5.*.nc files found in {local_archive})")

# Get the webpage content.
# The timeout keeps a stalled server from hanging the kernel; raise_for_status()
# stops here on an HTTP error instead of parsing an error page as a listing.
response = requests.get(base_url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find all links ending in .nc (anchors without an href are treated as "").
remote_files = [link.get('href') for link in soup.find_all('a')
                if link.get('href', '').endswith('.nc')]

print(f"Found {len(remote_files)} netCDF files:")
for file in sorted(remote_files):
    print(f"  {file}")

Current ERSST v5 files: n=2022, first one is ersst.v5.185401.nc, last one is ersst.v5.202206.nc
Found 2053 netCDF files:
  ersst.v5.185401.nc
  ersst.v5.185402.nc
  ersst.v5.185403.nc
  ersst.v5.185404.nc
  ersst.v5.185405.nc
  ersst.v5.185406.nc
  ersst.v5.185407.nc
  ersst.v5.185408.nc
  ersst.v5.185409.nc
  ersst.v5.185410.nc
  ersst.v5.185411.nc
  ersst.v5.185412.nc
  ersst.v5.185501.nc
  ersst.v5.185502.nc
  ersst.v5.185503.nc
  ersst.v5.185504.nc
  ersst.v5.185505.nc
  ersst.v5.185506.nc
  ersst.v5.185507.nc
  ersst.v5.185508.nc
  ersst.v5.185509.nc
  ersst.v5.185510.nc
  ersst.v5.185511.nc
  ersst.v5.185512.nc
  ersst.v5.185601.nc
  ersst.v5.185602.nc
  ersst.v5.185603.nc
  ersst.v5.185604.nc
  ersst.v5.185605.nc
  ersst.v5.185606.nc
  ersst.v5.185607.nc
  ersst.v5.185608.nc
  ersst.v5.185609.nc
  ersst.v5.185610.nc
  ersst.v5.185611.nc
  ersst.v5.185612.nc
  ersst.v5.185701.nc
  ersst.v5.185702.nc
  ersst.v5.185703.nc
  ersst.v5.185704.nc
  ersst.v5.185705.nc
  ersst.v5.185706.

In [4]:
# Get base filenames for comparison
local_names = {f.name for f in current_files}
remote_names = set(remote_files)

# Find files that need downloading: present on the server, absent locally.
# The comparison is by filename only, so a local file is assumed complete.
files_to_download = remote_names - local_names

# Create full URLs for downloading
download_urls = [base_url + fname for fname in files_to_download]

print(f"Found {len(files_to_download)} files to download:")
for url in sorted(download_urls):
    print(f"  {url}")


# Download missing files
print(f"Starting download of {len(files_to_download)} files...")

for filename in sorted(files_to_download):
    url = base_url + filename
    output_path = local_archive / filename
    # Stream into a sibling ".part" file first. Because presence is decided by
    # filename alone, a truncated file written directly to output_path would be
    # treated as already downloaded on the next run and never repaired.
    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        print(f"Downloading {filename}...", end=" ", flush=True)
        # The context manager releases the streamed connection on every path.
        with requests.get(url, timeout=300, verify=True, stream=True) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Only a fully written file is moved to its final name.
        partial_path.replace(output_path)
        print("Done!")

    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Failed! Error: {e}")

    finally:
        # After a successful replace() the ".part" file no longer exists; on any
        # failure (including non-request errors that propagate) remove the leftover.
        if partial_path.exists():
            partial_path.unlink()


Found 31 files to download:
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202207.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202208.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202209.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202210.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202211.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202212.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202301.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202302.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202303.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202304.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202305.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ersst.v5.202306.nc
  https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/ers