In [14]:
!pip install xarray netCDF4 h5netcdf h5py requests dask --upgrade




In [16]:
import os

# --- Remote source ---------------------------------------------------------
# Root of the NOAA OISST v2.1 (AVHRR) archive; the download cell appends
# "<YYYYMM>/<file name>" to this URL.
BASE_URL = "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr"

# --- Local layout ----------------------------------------------------------
RAW_DIR = "/content/oisst_raw"                          # downloaded files
SUBSET_DIR = "/content/oisst_california_subset"         # per-file regional subsets
MERGED_FILE = "/content/oisst_california_1981_2025.nc"  # merged product

# Both working directories must exist before later cells write into them.
for _workdir in (RAW_DIR, SUBSET_DIR):
    os.makedirs(_workdir, exist_ok=True)

# --- Temporal coverage (range end values are exclusive) --------------------
YEARS = range(1981, 2026)   # 1981 .. 2025
MONTHS = range(1, 13)       # 1 .. 12

# --- California Coast bounding box -----------------------------------------
# Longitudes are expressed on a 0-360 degrees-east axis, so 130°W..115°W
# becomes 230..245.
LAT_RANGE = slice(30, 42)    # 30°N to 42°N
LON_RANGE = slice(230, 245)  # 130°W to 115°W

# --- Download validation ---------------------------------------------------
# Size threshold in bytes (100 KB) used to decide whether a download is valid.
MIN_VALID_SIZE = 100_000


In [3]:
import requests

# Download the first-of-month OISST file for every (year, month) pair.
#
# The transfer is streamed with `requests` into a temporary "<name>.part"
# file that is renamed to its final name only after it passes the size
# check. An interrupted run therefore never leaves a truncated ".nc" file
# that a later run (or the subsetting cell) would mistake for a valid one.
REQUEST_TIMEOUT = 60            # seconds; prevents a stalled connection from hanging the cell
DOWNLOAD_CHUNK_SIZE = 1 << 20   # stream in 1 MiB pieces to keep memory use flat

for year in YEARS:
    for month in MONTHS:
        date_str = f"{year}{month:02d}01"
        month_str = f"{year}{month:02d}"

        file_url = f"{BASE_URL}/{month_str}/oisst-avhrr-v02r01.{date_str}.nc"
        output_file = os.path.join(RAW_DIR, f"oisst-avhrr-v02r01.{date_str}.nc")
        partial_file = output_file + ".part"

        # Skip existing valid files
        if os.path.exists(output_file) and os.path.getsize(output_file) > MIN_VALID_SIZE:
            print(f"✅ Already exists: {output_file}")
            continue

        # Anything still present under the final name failed the size check
        # above, so it is a stale leftover and must not survive a failed retry.
        if os.path.exists(output_file):
            os.remove(output_file)

        try:
            # stream=True fetches only the headers here; the body is pulled
            # lazily by iter_content() below, so a 404 costs no download.
            with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
                if response.status_code == 404:
                    print(f"❌ File not found: {file_url}")
                    continue
                # Treat every other HTTP error (403, 5xx, ...) as a failed download.
                response.raise_for_status()

                print(f"🔽 Downloading {file_url} ...")
                with open(partial_file, "wb") as partial:
                    for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                        partial.write(chunk)
        except (requests.RequestException, OSError) as exc:
            print(f"❌ Download failed: {file_url} | Error: {exc}")
            if os.path.exists(partial_file):
                os.remove(partial_file)
            continue

        # Validate download size (same threshold as the skip check above, so a
        # file is either accepted by both checks or rejected by both).
        if os.path.getsize(partial_file) <= MIN_VALID_SIZE:
            print(f"❌ File too small, deleting: {output_file}")
            os.remove(partial_file)
            continue

        # Atomic rename: the final name only ever refers to a complete file.
        os.replace(partial_file, output_file)


❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198101/oisst-avhrr-v02r01.19810101.nc
❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198102/oisst-avhrr-v02r01.19810201.nc
❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198103/oisst-avhrr-v02r01.19810301.nc
❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198104/oisst-avhrr-v02r01.19810401.nc
❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198105/oisst-avhrr-v02r01.19810501.nc
❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/198106/oisst-avhrr-v02r01.19810601.nc
❌ File not found: https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation

In [17]:
import xarray as xr
import os
import gc

# Cut every raw global file down to the California Coast box, keep only the
# "sst" variable, and write the result under the same file name in SUBSET_DIR.
subset_files = []
batch_size = 5  # Files processed between explicit garbage-collection passes

file_list = sorted(f for f in os.listdir(RAW_DIR) if f.endswith(".nc"))

# Ceiling division: `len // batch_size + 1` over-counts by one whenever the
# number of files is an exact multiple of the batch size.
total_batches = (len(file_list) + batch_size - 1) // batch_size

for i in range(0, len(file_list), batch_size):
    batch = file_list[i:i+batch_size]
    print(f"\n🔹 Processing batch {i//batch_size + 1}/{total_batches}...")

    for file in batch:
        file_path = os.path.join(RAW_DIR, file)
        subset_file_path = os.path.join(SUBSET_DIR, file)

        try:
            # The context manager closes the file handle even when selection
            # or writing raises, so a bad file cannot leak an open handle.
            # Chunking along time keeps the read lazy to reduce memory use.
            with xr.open_dataset(file_path, engine="netcdf4", chunks={"time": 1}) as ds:
                # Subset region (California Coast), then keep only SST
                ds_subset = ds.sel(lat=LAT_RANGE, lon=LON_RANGE)[["sst"]]

                # Must be written while the source dataset is still open,
                # because the data are only read from disk at this point.
                ds_subset.to_netcdf(subset_file_path)

        except Exception as e:
            print(f"❌ Skipping corrupt file: {file_path} | Error: {e}")
            continue

        subset_files.append(subset_file_path)
        print(f"✅ Processed: {file}")

    # Manually clear memory after each batch
    gc.collect()

print(f"\n✅ All {len(subset_files)} files saved in {SUBSET_DIR}")



🔹 Processing batch 1/105...
✅ Processed: oisst-avhrr-v02r01.19810901.nc
✅ Processed: oisst-avhrr-v02r01.19811001.nc
✅ Processed: oisst-avhrr-v02r01.19811101.nc
✅ Processed: oisst-avhrr-v02r01.19811201.nc
✅ Processed: oisst-avhrr-v02r01.19820101.nc

🔹 Processing batch 2/105...
✅ Processed: oisst-avhrr-v02r01.19820201.nc
✅ Processed: oisst-avhrr-v02r01.19820301.nc
✅ Processed: oisst-avhrr-v02r01.19820401.nc
✅ Processed: oisst-avhrr-v02r01.19820501.nc
✅ Processed: oisst-avhrr-v02r01.19820601.nc

🔹 Processing batch 3/105...
✅ Processed: oisst-avhrr-v02r01.19820701.nc
✅ Processed: oisst-avhrr-v02r01.19820801.nc
✅ Processed: oisst-avhrr-v02r01.19820901.nc
✅ Processed: oisst-avhrr-v02r01.19821001.nc
✅ Processed: oisst-avhrr-v02r01.19821101.nc

🔹 Processing batch 4/105...
✅ Processed: oisst-avhrr-v02r01.19821201.nc
✅ Processed: oisst-avhrr-v02r01.19830101.nc
✅ Processed: oisst-avhrr-v02r01.19830201.nc
✅ Processed: oisst-avhrr-v02r01.19830301.nc
✅ Processed: oisst-avhrr-v02r01.19830401.nc

🔹 P

In [18]:
import os
import xarray as xr

subset_output_dir = "/content/oisst_california_subset"

# Every NetCDF file in the subset directory, visited in file-name order.
candidate_paths = [
    os.path.join(subset_output_dir, name)
    for name in sorted(os.listdir(subset_output_dir))
    if name.endswith(".nc")
]

# A file counts as corrupt when it cannot be opened (and closed) by xarray.
corrupt_files = []
for file_path in candidate_paths:
    try:
        xr.open_dataset(file_path, engine="netcdf4").close()
    except Exception as e:
        print(f"❌ Corrupt file detected: {file_path} | Error: {e}")
        corrupt_files.append(file_path)

# Delete whatever failed to open, or report a clean directory.
if not corrupt_files:
    print("✅ No corrupt files found.")
else:
    print(f"\n🚨 Removing {len(corrupt_files)} corrupt files...")
    for bad_file in corrupt_files:
        os.remove(bad_file)
    print("✅ Corrupt files deleted.")


✅ No corrupt files found.


In [19]:
import xarray as xr

# Two subset files whose metadata are printed side by side.
file1 = "/content/oisst_california_subset/oisst-avhrr-v02r01.20100501.nc"
file2 = "/content/oisst_california_subset/oisst-avhrr-v02r01.20100601.nc"

ds1 = xr.open_dataset(file1, engine="netcdf4")
ds2 = xr.open_dataset(file2, engine="netcdf4")

# (heading, value) pairs in the order they are reported: dimensions first,
# then coordinate variables, then global attributes - File 1 before File 2.
report = (
    ("\n🔹 Dimensions in File 1:", ds1.dims),
    ("\n🔹 Dimensions in File 2:", ds2.dims),
    ("\n🔹 Coordinates in File 1:", ds1.coords),
    ("\n🔹 Coordinates in File 2:", ds2.coords),
    ("\n🔹 Global attributes in File 1:", ds1.attrs),
    ("\n🔹 Global attributes in File 2:", ds2.attrs),
)
for heading, value in report:
    print(heading)
    print(value)

# Release both file handles once everything has been printed.
for opened in (ds1, ds2):
    opened.close()



🔹 Dimensions in File 1:

🔹 Dimensions in File 2:

🔹 Coordinates in File 1:
Coordinates:
  * lat      (lat) float32 192B 30.12 30.38 30.62 30.88 ... 41.38 41.62 41.88
  * lon      (lon) float32 240B 230.1 230.4 230.6 230.9 ... 244.4 244.6 244.9
  * time     (time) datetime64[ns] 8B 2010-05-01T12:00:00
  * zlev     (zlev) float32 4B 0.0

🔹 Coordinates in File 2:
Coordinates:
  * lat      (lat) float32 192B 30.12 30.38 30.62 30.88 ... 41.38 41.62 41.88
  * lon      (lon) float32 240B 230.1 230.4 230.6 230.9 ... 244.4 244.6 244.9
  * time     (time) datetime64[ns] 8B 2010-06-01T12:00:00
  * zlev     (zlev) float32 4B 0.0

🔹 Global attributes in File 1:
{'title': 'NOAA/NCEI 1/4 Degree Daily Optimum Interpolation Sea Surface Temperature (OISST) Analysis, Version 2.1 - Final', 'Description': 'Reynolds, et al.(2007) Daily High-resolution Blended Analyses. Available at ftp://eclipse.ncdc.noaa.gov/pub/OI-daily/daily-sst.pdf  Climatology is based on 1971-2000 OI.v2 SST, Satellite data: Navy  NOA

In [20]:
import xarray as xr
import os

subset_output_dir = "/content/oisst_california_subset"

# Full paths of all subset files, sorted by name.
nc_files = sorted(
    os.path.join(subset_output_dir, name)
    for name in os.listdir(subset_output_dir)
    if name.endswith(".nc")
)

# Spot check: print the time coordinate of the first ten files.
for path in nc_files[:10]:
    ds = xr.open_dataset(path, engine="netcdf4")
    print(f"\n🔹 {path}")
    print(ds.time)
    ds.close()



🔹 /content/oisst_california_subset/oisst-avhrr-v02r01.19810901.nc
<xarray.DataArray 'time' (time: 1)> Size: 8B
array(['1981-09-01T12:00:00.000000000'], dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 8B 1981-09-01T12:00:00
Attributes:
    long_name:  Center time of the day

🔹 /content/oisst_california_subset/oisst-avhrr-v02r01.19811001.nc
<xarray.DataArray 'time' (time: 1)> Size: 8B
array(['1981-10-01T12:00:00.000000000'], dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 8B 1981-10-01T12:00:00
Attributes:
    long_name:  Center time of the day

🔹 /content/oisst_california_subset/oisst-avhrr-v02r01.19811101.nc
<xarray.DataArray 'time' (time: 1)> Size: 8B
array(['1981-11-01T12:00:00.000000000'], dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 8B 1981-11-01T12:00:00
Attributes:
    long_name:  Center time of the day

🔹 /content/oisst_california_subset/oisst-avhrr-v02r01.19811201.nc
<xarray.DataArray 'time' (time: 1)> 

In [21]:
import xarray as xr

# Trial merge: concatenate a handful of consecutive monthly subset files
# along "time" to confirm they are compatible before merging everything.
test_files = [
    "/content/oisst_california_subset/oisst-avhrr-v02r01.20100501.nc",
    "/content/oisst_california_subset/oisst-avhrr-v02r01.20100601.nc",
    "/content/oisst_california_subset/oisst-avhrr-v02r01.20100701.nc",
    "/content/oisst_california_subset/oisst-avhrr-v02r01.20100801.nc",
    "/content/oisst_california_subset/oisst-avhrr-v02r01.20100901.nc"
]

# Filled incrementally so that, if a later open or the concat fails, the
# datasets that were already opened can still be closed in `finally`.
ds_list = []
try:
    for f in test_files:
        ds_list.append(xr.open_dataset(f, engine="netcdf4"))
    ds_combined = xr.concat(ds_list, dim="time")  # Merge along time axis

    print("✅ Test merge successful!")
except Exception as e:
    print(f"❌ Merge failed! Error: {e}")
finally:
    # Close datasets on both the success and the failure path
    for ds in ds_list:
        ds.close()


✅ Test merge successful!


In [22]:
# Install NCO (NetCDF Operators); it provides the `ncrcat` command that a
# later cell uses to merge the subset files.
!apt-get install -y nco


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
nco is already the newest version (5.0.6-1).
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [23]:
import os
import subprocess

# Concatenate every subset file into one NetCDF file with NCO's `ncrcat`.
subset_output_dir = "/content/oisst_california_subset"
final_output_file = "/content/oisst_california_1981_2025.nc"

# Get sorted list of NetCDF files; name order is the order they are passed
# to ncrcat.
valid_files = sorted(
    os.path.join(subset_output_dir, f) for f in os.listdir(subset_output_dir) if f.endswith(".nc")
)

# ncrcat cannot run without inputs; fail with a clear message instead.
if not valid_files:
    raise RuntimeError(f"No NetCDF files found in {subset_output_dir}; nothing to merge.")

print(f"\n🔹 Merging {len(valid_files)} NetCDF files using `ncrcat`...")

# Merge using NCO (NetCDF Operators).
# - Passed as an argument list (no shell), so paths are never re-parsed.
# - `-O` overwrites an existing output file; without it ncrcat asks for
#   confirmation, which cannot be answered from a notebook cell.
merge_command = ["ncrcat", "-O", *valid_files, final_output_file]
result = subprocess.run(merge_command)

# Only report success when ncrcat actually succeeded; raising here also stops
# a "Run All" before the following cell moves a missing or broken file.
if result.returncode != 0:
    raise RuntimeError(
        f"ncrcat failed with exit code {result.returncode}; "
        f"{final_output_file} was not written correctly."
    )

print(f"✅ Final dataset saved as {final_output_file}")



🔹 Merging 521 NetCDF files using `ncrcat`...
✅ Final dataset saved as /content/oisst_california_1981_2025.nc


In [24]:
from google.colab import drive
# Mount Google Drive at /content/drive so it is reachable as a normal path.
drive.mount('/content/drive')

# Move the final merged dataset from local storage into the mounted Drive
# folder (MyDrive); after this the file no longer exists under /content.
!mv /content/oisst_california_1981_2025.nc /content/drive/MyDrive/oisst_california_1981_2025.nc


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
