In [1]:
import glob
import time

import numcodecs
import numpy as np
import pandas as pd
import xarray as xr
import zarr
from dask.distributed import Client, progress, LocalCluster

In [5]:
# Open the Zarr store that the timed subset/mean cell below reads from.
ds_zarr = xr.open_zarr(
    '/Volumes/Elements/zarr/test-500x500x5-2/'
)

In [8]:
%%time

subset = ds_zarr.analysed_sst.sel(lat=slice(10,50),lon=slice(-180,-100))
#subset.load()
subset.mean({'lat','lon'}).values

CPU times: user 24.4 s, sys: 9.81 s, total: 34.2 s
Wall time: 10.3 s


array([292.99274, 293.016  , 293.04068, 293.04245, 293.0007 , 292.993  ,
       293.00204, 293.02985, 293.06314, 293.08994, 293.1151 , 293.13104,
       293.18292, 293.2693 , 293.3431 , 293.4406 , 293.54028, 293.61066,
       293.67648, 293.72345, 293.74954, 293.76962, 293.7842 , 293.79865,
       293.8131 , 293.85007, 293.89017, 293.98636, 294.0708 , 294.14474,
       294.18906, 294.23914, 294.2966 , 294.31686, 294.33334, 294.3644 ,
       294.40582, 294.43735, 294.49307, 294.55234, 294.596  , 294.65213,
       294.6818 , 294.69547, 294.6951 , 294.7071 , 294.7373 , 294.93655,
       294.87177, 294.8785 ], dtype=float32)

In [16]:
%%time

subset = ds_500.analysed_sst.sel(lat=slice(10,50),lon=slice(-180,-100))
subset.load()
subset.mean({'lat','lon'})

CPU times: user 8.87 s, sys: 4.92 s, total: 13.8 s
Wall time: 12.3 s


<xarray.DataArray 'analysed_sst' (time: 20)>
array([292.99307, 293.01593, 293.0404 , 293.0424 , 293.00082, 292.99283,
       293.00195, 293.02985, 293.0633 , 293.08972, 293.1153 , 293.13074,
       293.18292, 293.26917, 293.34296, 293.44092, 293.54065, 293.6105 ,
       293.6764 , 293.72372], dtype=float32)
Coordinates:
  * time     (time) datetime64[ns] 2002-06-01T09:00:00 ... 2002-06-20T09:00:00

## Connecting to the WebDAV server

In [2]:
# Notebook-wide settings: `prefix` and `year` are read by generate_file_list
# to build the per-day glob pattern; `chunks` is the chunking passed to
# xr.open_mfdataset.
prefix = '/Volumes/files/allData/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1'
year = 2002
chunks = dict(time=10, lat=1800, lon=3600)

In [3]:
def generate_file_list(start_doy, end_doy, base_dir=None, data_year=None):
    """
    Build the list of NetCDF file paths for a range of days of the year.

    For every ``doy`` in ``range(start_doy, end_doy)`` the pattern
    ``{base_dir}/{data_year}/{doy}/*.nc`` is globbed and the first match
    (in sorted order) is kept.

    Parameters
    ----------
    start_doy : int
        First day of year to include.
    end_doy : int
        Day of year to stop at. Exclusive, exactly like ``range``: the last
        file returned belongs to day ``end_doy - 1``.
    base_dir : str, optional
        Root directory of the data tree. Must be a path on a local or
        mounted filesystem, because ``glob`` cannot list HTTP URLs.
        Defaults to the notebook-level ``prefix`` variable, read at call time.
    data_year : int or str, optional
        Year sub-directory. Defaults to the notebook-level ``year`` variable,
        read at call time.

    Returns
    -------
    list of str
        One file path per day, in day order. Empty if the range is empty.

    Raises
    ------
    IndexError
        If a day's directory contains no ``*.nc`` file. The exception type is
        the one the original bare ``[0]`` lookup raised; the message now
        names the pattern that found nothing.
    """
    root = prefix if base_dir is None else base_dir
    selected_year = year if data_year is None else data_year

    file_paths = []
    for doy in range(start_doy, end_doy):
        pattern = f"{root}/{selected_year}/{doy}/*.nc"
        # glob returns matches in arbitrary order; sort so the choice of
        # "first" file is reproducible when a directory holds several.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise IndexError(f"no NetCDF file matches {pattern!r}")
        file_paths.append(matches[0])
    return file_paths

In [4]:
%%time
start_doy = 152
end_doy = 157

fileObjs = generate_file_list(start_doy, end_doy)
print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")          
ds = xr.open_mfdataset(fileObjs, chunks=chunks, combine='by_coords')

start doy: 152, file: 20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 157, file: 20020605090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
CPU times: user 166 ms, sys: 75.9 ms, total: 242 ms
Wall time: 4.52 s


In [15]:
%%time
start_doy = 152
end_doy = 157
prefix = '/Volumes/Elements/netcdf'

fileObjs = generate_file_list(start_doy, end_doy)
print(f"start doy: {start_doy}, file: {fileObjs[0].split('/')[-1]}")
print(f"end doy: {end_doy}, file: {fileObjs[-1].split('/')[-1]}")          
ds = xr.open_mfdataset(fileObjs, chunks=chunks, combine='by_coords')

start doy: 152, file: 20020601090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
end doy: 157, file: 20020605090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc
CPU times: user 134 ms, sys: 51.7 ms, total: 185 ms
Wall time: 4.55 s


In [None]:
# Benchmark: for each candidate chunking, time (1) opening the files,
# (2) a spatial mean over a time slice and (3) a spatial mean over a
# lat/lon box. `time` is imported in the notebook's import cell.
chunk_opts = [
    {'time': 1, 'lat': 4500, 'lon': 9000},
    {'time': 4, 'lat': 2000, 'lon': 4000},
    {'time': 100, 'lat': 450, 'lon': 900},
]

# Date strings are used as slice bounds so the end day is covered in full.
# The files are stamped 09:00, so a datetime bound of 2002-06-05 00:00 would
# stop before the fifth day's timestamp and select only four days.
time_slice = slice('2002-06-01', '2002-06-05')
lat_slice = slice(10, 50)
lon_slice = slice(-150, -100)

# The loop variable is deliberately not called `chunks`, so the notebook-level
# `chunks` setting used by the earlier open_mfdataset cells is left intact.
for chunk_opt in chunk_opts:
    # perf_counter is monotonic, so the intervals cannot be skewed by
    # system clock adjustments.
    t0 = time.perf_counter()
    ds_netcdf = xr.open_mfdataset(fileObjs, chunks=chunk_opt, parallel=True, combine='by_coords')
    elapsed = time.perf_counter() - t0
    print(f"time to open files for chunking {chunk_opt}: {elapsed} seconds")

    t0 = time.perf_counter()
    # .values forces the result to be computed; the array itself is discarded.
    ds_netcdf.analysed_sst.sel(time=time_slice).mean(['lat', 'lon']).values
    elapsed = time.perf_counter() - t0
    print(f"time to calculate mean for 5 day time slice for chunking {chunk_opt}: {elapsed} seconds")

    t0 = time.perf_counter()
    ds_netcdf.analysed_sst.sel(lat=lat_slice, lon=lon_slice).mean(['lat', 'lon']).values
    elapsed = time.perf_counter() - t0
    print(f"time to calculate mean for lat/lon slice for chunking {chunk_opt}: {elapsed} seconds")