In [1]:
import s3fs
from glob import glob
import xarray as xr

## Specify the S3 bucket where the NPS assets are stored

In [2]:
# URI of the S3 bucket that holds the NPS assets; every glob pattern below is built from it.
bucket = 's3://npwbanalres'

## Create a connection to the S3 bucket and list all of the assets 

In [3]:
# Filesystem handle used for all listing and reading below.
# anon=False means the connection is made with credentials rather than anonymously.
s3 = s3fs.S3FileSystem(anon=False)

In [4]:
# List every object in the bucket whose key ends in '.nc4'.
s3.glob(f'{bucket}/*.nc4')

['npwbanalres/v_1_5_1980_gridmet_historical_accumswe.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_accumswe_monthly.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_aet.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_aet_monthly.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_deficit.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_deficit_monthly.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_pet.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_pet_monthly.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_rain.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_rain_monthly.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_runoff_monthly.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_soilwater.nc4',
 'npwbanalres/v_1_5_1980_gridmet_historical_soilwater_monthly.nc4',
 'npwbanalres/v_1_5_1981_gridmet_historical_accumswe.nc4',
 'npwbanalres/v_1_5_1981_gridmet_historical_accumswe_monthly.nc4',
 'npwbanalres/v_1_5_1981

In [5]:
# Distinct years covered by the daily (non-monthly) assets, in ascending order.
# The year is the 4th underscore-separated token of the *file name*
# (e.g. 'v_1_5_1980_gridmet_historical_runoff.nc4' -> 1980). Parsing the
# basename instead of the full 'bucket/key' path keeps that token index
# correct even when the bucket name or a key prefix contains an underscore.
years = sorted({
    int(path.rsplit('/', 1)[-1].split('_')[3].split('.')[0])
    for path in s3.glob(f'{bucket}/*.nc4')
    if 'monthly' not in path
})
print(f'Number of Years: {len(years)}')
print(f'Year Range: {min(years)} to {max(years)}')

Number of Years: 42
Year Range: 1980 to 2021


## List the variables for both daily and monthly assets

In [6]:
# Names of the daily variables: for every asset without 'monthly' in its path,
# take the last underscore-separated token and strip the file extension.
# The values are de-duplicated through a set, so the list order is arbitrary.
wb_variables = list({
    path.split('_')[-1].split('.')[0]
    for path in s3.glob(f'{bucket}/*.nc4')
    if 'monthly' not in path
})
wb_variables

['aet', 'pet', 'deficit', 'runoff', 'soilwater', 'accumswe', 'rain']

In [7]:
# Names of the monthly variables, e.g. 'runoff_monthly': for every asset with
# 'monthly' in its path, join the last two underscore-separated tokens after
# stripping the file extension from the final one.
# The values are de-duplicated through a set, so the list order is arbitrary.
wb_m_variables = list({
    '_'.join((tokens[-2], tokens[-1].split('.')[0]))
    for tokens in (
        path.split('_')
        for path in s3.glob(f'{bucket}/*.nc4')
        if 'monthly' in path
    )
})
wb_m_variables

['accumswe_monthly',
 'soilwater_monthly',
 'runoff_monthly',
 'pet_monthly',
 'aet_monthly',
 'rain_monthly',
 'deficit_monthly']

## Connect to an NPS asset in S3

In [8]:
# Pick the variable by name, not by position: wb_variables is built from a
# set of strings, so its ordering is not guaranteed to be the same from one
# kernel session to the next and a positional index could select a different
# variable on re-run.
var = 'runoff'
if var not in wb_variables:
    raise ValueError(f'{var!r} is not one of the available variables: {sorted(wb_variables)}')
var

'runoff'

In [9]:
# All assets in the bucket whose key ends in '<var>.nc4'. Monthly assets end in
# '_monthly.nc4', so only the daily files for the selected variable match.
urls = s3.glob(f'{bucket}/*{var}.nc4')
urls

['npwbanalres/v_1_5_1980_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1981_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1982_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1983_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1984_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1985_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1986_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1987_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1988_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1989_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1990_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1991_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1992_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1993_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1994_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1995_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1996_gridmet_historical_runoff.nc4',
 'npwbanalres/v_1_5_1997_gridme

### Read in a single asset

In [10]:
# Use the first asset in the listing as the single-file example.
url = urls[0]
url

'npwbanalres/v_1_5_1980_gridmet_historical_runoff.nc4'

In [11]:
# Open the asset as a binary file-like object; it is handed to xarray in the next cell.
# NOTE(review): the handle is never closed in this notebook — presumably it has to stay
# open while the lazily loaded dataset is in use; confirm.
s3_file_obj = s3.open(url, mode='rb')

In [12]:
%%time
# Open the single asset with the h5netcdf engine. chunks='auto' makes the
# variables dask-backed, so the values are not read until they are computed.
xr_ds = xr.open_dataset(s3_file_obj, chunks='auto', engine='h5netcdf')
xr_ds

CPU times: user 543 ms, sys: 122 ms, total: 665 ms
Wall time: 1.65 s


Unnamed: 0,Array,Chunk
Bytes,59.17 MiB,59.17 MiB
Shape,"(3300, 4700)","(3300, 4700)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 59.17 MiB 59.17 MiB Shape (3300, 4700) (3300, 4700) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",4700  3300,

Unnamed: 0,Array,Chunk
Bytes,59.17 MiB,59.17 MiB
Shape,"(3300, 4700)","(3300, 4700)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,59.17 MiB,59.17 MiB
Shape,"(3300, 4700)","(3300, 4700)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 59.17 MiB 59.17 MiB Shape (3300, 4700) (3300, 4700) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",4700  3300,

Unnamed: 0,Array,Chunk
Bytes,59.17 MiB,59.17 MiB
Shape,"(3300, 4700)","(3300, 4700)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.72 kiB,5.72 kiB
Shape,"(366, 2)","(366, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 5.72 kiB 5.72 kiB Shape (366, 2) (366, 2) Dask graph 1 chunks in 2 graph layers Data type datetime64[ns] numpy.ndarray",2  366,

Unnamed: 0,Array,Chunk
Bytes,5.72 kiB,5.72 kiB
Shape,"(366, 2)","(366, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,732 B,732 B
Shape,"(366,)","(366,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 732 B 732 B Shape (366,) (366,) Dask graph 1 chunks in 2 graph layers Data type int16 numpy.ndarray",366  1,

Unnamed: 0,Array,Chunk
Bytes,732 B,732 B
Shape,"(366,)","(366,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,21.15 GiB,127.70 MiB
Shape,"(366, 3300, 4700)","(66, 596, 851)"
Dask graph,216 chunks in 2 graph layers,216 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 21.15 GiB 127.70 MiB Shape (366, 3300, 4700) (66, 596, 851) Dask graph 216 chunks in 2 graph layers Data type float32 numpy.ndarray",4700  3300  366,

Unnamed: 0,Array,Chunk
Bytes,21.15 GiB,127.70 MiB
Shape,"(366, 3300, 4700)","(66, 596, 851)"
Dask graph,216 chunks in 2 graph layers,216 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Read in multiple assets

In [13]:
%%time
ds_all = []
for url in urls:
    #s3_file_obj = s3.open(url, mode='rb')
    ds_all.append(xr.open_dataset(s3.open(url, mode='rb'), chunks='auto', engine='h5netcdf'))

CPU times: user 17 s, sys: 3.98 s, total: 21 s
Wall time: 1min 19s


In [14]:
%%time
ds_ts = xr.concat(ds_all, dim='time')
ds_ts

CPU times: user 30.6 s, sys: 5.26 s, total: 35.8 s
Wall time: 2min 1s


Unnamed: 0,Array,Chunk
Bytes,239.70 kiB,5.72 kiB
Shape,"(15341, 2)","(366, 2)"
Dask graph,42 chunks in 85 graph layers,42 chunks in 85 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 239.70 kiB 5.72 kiB Shape (15341, 2) (366, 2) Dask graph 42 chunks in 85 graph layers Data type datetime64[ns] numpy.ndarray",2  15341,

Unnamed: 0,Array,Chunk
Bytes,239.70 kiB,5.72 kiB
Shape,"(15341, 2)","(366, 2)"
Dask graph,42 chunks in 85 graph layers,42 chunks in 85 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,29.96 kiB,732 B
Shape,"(15341,)","(366,)"
Dask graph,42 chunks in 85 graph layers,42 chunks in 85 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 29.96 kiB 732 B Shape (15341,) (366,) Dask graph 42 chunks in 85 graph layers Data type int16 numpy.ndarray",15341  1,

Unnamed: 0,Array,Chunk
Bytes,29.96 kiB,732 B
Shape,"(15341,)","(366,)"
Dask graph,42 chunks in 85 graph layers,42 chunks in 85 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,886.39 GiB,127.55 MiB
Shape,"(15341, 3300, 4700)","(66, 596, 850)"
Dask graph,30492 chunks in 127 graph layers,30492 chunks in 127 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 886.39 GiB 127.55 MiB Shape (15341, 3300, 4700) (66, 596, 850) Dask graph 30492 chunks in 127 graph layers Data type float32 numpy.ndarray",4700  3300  15341,

Unnamed: 0,Array,Chunk
Bytes,886.39 GiB,127.55 MiB
Shape,"(15341, 3300, 4700)","(66, 596, 850)"
Dask graph,30492 chunks in 127 graph layers,30492 chunks in 127 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


---

## Testing

In [15]:
# NOTE: disabled scratch check kept for reference — nothing in this cell runs.
# For each daily variable it would open the assets at positions 0-10 of that
# variable's listing and print the dataset dims and the variable's attrs,
# printing 'FAILED: <url>' when opening or inspecting an asset raises.
# NOTE(review): if this is re-enabled, the bare `except:` also swallows
# KeyboardInterrupt; `except Exception:` is probably what is wanted. It would
# also rebind `urls`, `url`, `s3_file_obj` and `xr_ds` used by earlier cells.
# for v in wb_variables:
#     print(v)
#     urls = s3.glob(f'{bucket}/*{v}.nc4')
#     #print(urls)
#     ly_noly = [0,1,2,3,4,5,6,7,8,9,10]
#     for y in ly_noly:
#         #print(y)
#         url = urls[y]
#         print(url)
#         try:
#             s3_file_obj = s3.open(url, mode='rb')
#             xr_ds = xr.open_dataset(s3_file_obj, chunks='auto', engine='h5netcdf')
#             print(xr_ds.dims)
#             print(xr_ds[v].attrs)
#         except:
#             print(f'FAILED: {url}')