# Creating consolidated metadata file for MUR SST

10/5/2021

NASA JPL PODAAC has put the entire [MUR SST](https://podaac.jpl.nasa.gov/dataset/MUR-JPL-L4-GLOB-v4.1) dataset on the AWS cloud, **but as ~7000 individual netCDF files.**\ Accessing one file works well, but accessing multiple files is **very slow** because the metadata for each file has to be queried separately. Here, we create **fast access** by consolidating the metadata into a single reference file so that the entire dataset can be read rapidly via zarr. More background on this project:
[medium article](https://medium.com/pangeo/fake-it-until-you-make-it-reading-goes-netcdf4-data-on-aws-s3-as-zarr-for-rapid-data-access-61e33f8fe685) and in this [repo](https://github.com/lsterzinger/fsspec-reference-maker-tutorial). We need help developing documentation and more test datasets. If you want to help, we are working in the [Pangeo Gitter](https://gitter.im/pangeo-data/cloud-performant-netcdf4).


To run this code:
- you need to set your AWS credentials up using `aws configure --profile esip-qhub`
- you need to set up your `.netrc` file in your home directory with your earthdata login info


Authors:
- [Chelle Gentemann](https://github.com/cgentemann)
- [Rich Signell](https://github.com/rsignell-usgs)
- [Lucas Sterzinger](https://github.com/lsterzinger/)
- [Martin Durant](https://github.com/martindurant)

Credit:
- Funding: Interagency Implementation and Advanced Concepts Team [IMPACT](https://earthdata.nasa.gov/esds/impact) for the Earth Science Data Systems (ESDS) program
- AWS Public Dataset [Program](https://registry.opendata.aws/mur/)
- [QuanSight](https://www.quansight.com/) for creating Qhub, [ESIP Labs](https://www.esipfed.org/lab) for deploying it, and [AWS Sustainability](https://aws.amazon.com/government-education/sustainability-research-credits/) for funding it!

In [None]:
import s3fs
import requests
from urllib import request
from http.cookiejar import CookieJar
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from json import dumps
from io import StringIO
from os.path import dirname, join
import netrc

import os
import sys
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
import hvplot.xarray

In [None]:
# Show which fsspec_reference_maker version this notebook was run with.
import fsspec_reference_maker
fsspec_reference_maker.__version__

- output file locations

In [None]:
# S3 prefix that receives one reference JSON per netCDF file (written by gen_json).
# Keep the trailing slash: the file name is appended with plain string formatting.
json_dir = 's3://esip-qhub/nasa/mur/jsons_subset_TEST/'
# S3 prefix and object name for the uploaded consolidated reference file.
# The two are concatenated directly, so this prefix needs its trailing slash too.
json_consolidated_dir = 's3://esip-qhub-public/nasa/mur_TEST/'
json_out1 = 'murv41_consolidated_subset_20211005_TEST.json'
# Local path the consolidated references are written to before being uploaded.
json_out = './../data/mur_consolidated_tem_subset_TEST.json'

## set up earthdata login credentials
- code for setting up earthdata_login from [here](https://github.com/podaac/tutorials/blob/master/notebooks/cloudwebinar/cloud_direct_access_s3.py)
- for the earthdata login to work you need to create a .netrc file in your home directory
- .netrc file contains:\
machine urs.earthdata.nasa.gov\
login 'earthdata username'\
password 'password'

In [2]:
#pip install earthdata

In [None]:
# Log in to Earthdata.
# NOTE(review): `auth` is not referenced again in this notebook — presumably
# login() is called only for its side effect; confirm.
from earthdata import Auth #, DataCollections, DataGranules, Accessor
auth = Auth().login()

In [None]:
def begin_s3_direct_access():
    """Return an ``s3fs.S3FileSystem`` built from PODAAC-issued S3 credentials.

    Requests an access key, secret key and session token from the PODAAC
    ``s3credentials`` endpoint and uses them to create a filesystem whose
    client is configured for the ``us-west-2`` region.

    Returns:
        s3fs.S3FileSystem: filesystem authenticated with the returned credentials.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
    response = requests.get(url)
    # Fail here with the HTTP status instead of later with a JSON decode error
    # or a KeyError on a body that carries no credentials.
    response.raise_for_status()
    creds = response.json()
    return s3fs.S3FileSystem(key=creds['accessKeyId'],
                             secret=creds['secretAccessKey'],
                             token=creds['sessionToken'],
                             client_kwargs={'region_name': 'us-west-2'})


In [None]:
# Add the hub's shared library directory to the import path
# (presumably where `ebdpy` lives — confirm).
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

# NOTE(review): this first call looks redundant — set_credentials is called
# again below with the same profile plus region and endpoint. Confirm before
# removing it.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
# Upper bound on workers requested for the cluster.
worker_max = 30
# `client` and `cluster` are closed / shut down explicitly once the per-file
# JSONs have been generated.
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Medium Worker', 
                                      propagate_env=True)

## Create a list of all MUR files that are on the PODAAC Cloud

In [None]:
%%time
fs = begin_s3_direct_access()

# Glob month by month over 2002-2004 and collect every matching netCDF key.
mur_prefix = join("podaac-ops-cumulus-protected/", "MUR-JPL-L4-GLOB-v4.1")
flist = []
for lyr in range(2002, 2005):
    for imon in range(1, 13):
        # File names start with YYYYMM, so this pattern selects one month.
        fstr = f'{lyr}{imon:02d}*.nc'
        flist.extend(fs.glob(join(mur_prefix, fstr)))
print('total number of individual netcdf files:', len(flist))

- add s3 to filenames

In [None]:
%%time
# The glob results are bucket/key paths; prefix them with the s3:// scheme.
urls = [f"s3://{f}" for f in flist]

# Keyword arguments passed to fs.open() for every source file.
so = {
    'mode': 'rb',
    'anon': True,
    'default_fill_cache': False,
    'default_cache_type': 'first',
}

- link to filesystem

In [None]:
# Second, non-anonymous S3 filesystem; it is the one used below to clear,
# write and list the reference JSONs and to upload the consolidated file.
fs2 = fsspec.filesystem('s3', anon=False)  

In [None]:
# Try the translation on the first file before processing the whole list.
# NOTE(review): `infile` is left open on purpose here — h5chunks still holds it,
# and presumably needs it open if translate() is called in a later cell.
infile = fs.open(urls[0], **so) #create link to file
h5chunks = SingleHdf5ToZarr(infile, urls[0], inline_threshold=300)
h5chunks

In [None]:
#ujson.dumps(h5chunks.translate()).encode()

In [None]:
# Start from an empty json_dir: everything found under it is listed and
# combined later, so references left over from an earlier run must not remain.
try:
    fs2.rm(json_dir, recursive=True)
except FileNotFoundError:
    # Nothing to remove (e.g. first run).
    pass
except Exception as err:
    # Stay best-effort, but report the failure instead of hiding it. Unlike a
    # bare `except`, this also lets KeyboardInterrupt through.
    print(f'warning: could not clear {json_dir}: {err!r}')

In [None]:
def gen_json(u):
    """Write the reference JSON for one source netCDF file into ``json_dir``.

    Args:
        u: URL of the source file; its last path component is the file name.

    The output object is named ``<first 8 characters of the file name>.<file
    name>.json`` (for MUR files the first 8 characters are the YYYYMMDD date).
    The name is printed before the object is written.
    """
    # Take the last path component instead of a fixed index, so the function
    # keeps working if the bucket layout has a different depth.
    fname = u.rsplit('/', 1)[-1]
    date = fname[:8]
    outf = f'{json_dir}{date}.{fname}.json'
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        print(outf)
        # Build the payload while the source file is open and before the output
        # is created, so a failed translation cannot leave an empty JSON behind.
        payload = ujson.dumps(h5chunks.translate()).encode()
    with fs2.open(outf, 'wb') as f:
        f.write(payload)

- Create all the individual files using dask

In [None]:
%%time
# One delayed gen_json call per source URL, computed together with retries=10.
tasks = [dask.delayed(gen_json)(u) for u in urls]
_ = dask.compute(*tasks, retries=10)

In [None]:
# List the per-file JSONs under json_dir and turn them into sorted s3:// URLs.
flist2 = fs2.ls(json_dir)
furls = [f's3://{f}' for f in flist2]
furls.sort()
print(len(furls))
furls[0]

In [None]:
# The per-file JSONs are written; close the client, then shut the cluster down.
client.close()
cluster.shutdown()

In [None]:
from dask.distributed import Client

In [None]:
# Replace the closed client with a new one created with default arguments
# (presumably a local cluster on this machine — confirm).
client = Client()

In [None]:
# Display the client summary in the notebook.
client

In [None]:
%%time
# Set up the combination of all per-file reference JSONs along "time".
mzz = MultiZarrToZarr(
    furls,
    # NOTE(review): presumably the options used to read the reference JSONs
    # listed in `furls` (authenticated access) — confirm against the library.
    storage_options={'anon': False},
    remote_protocol='s3',
    # `anon` is a boolean flag: pass True, not the string 'True' (which only
    # worked because any non-empty string is truthy).
    # NOTE(review): presumably the options for the referenced netCDF files.
    remote_options={'anon': True},
    # Open every per-file dataset without any decoding, and leave out the
    # 'reference_time' and 'crs' variables.
    xarray_open_kwargs={
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
        'drop_variables': ['reference_time', 'crs'],
        'decode_coords': False,
    },
    xarray_concat_args={
        # Alternatives tried and left disabled:
        #   "data_vars": "minimal",
        #   "coords": "minimal",
        #   "compat": "override",
        "join": "override",
        "combine_attrs": "override",
        "dim": "time",
    },
)

In [None]:
%%time
#%%prun -D multizarr_profile 
# Run the combination and write the consolidated references to the local
# json_out path (uploaded to S3 in the next cell).
mzz.translate(json_out)

In [None]:
# Copy the local consolidated JSON to its final S3 location.
rpath = f'{json_consolidated_dir}{json_out1}'
fs2.put_file(json_out, rpath)

# testing

#### Try a single json

In [None]:
# Request S3 credentials from PODAAC for the read tests below.
url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
credentials_reply = requests.get(url)
# Stop here with the HTTP status if the request was rejected, rather than
# failing later on a body that is not JSON or has no credential keys.
credentials_reply.raise_for_status()
response = credentials_reply.json()

In [None]:
# One of the per-file reference JSONs written by gen_json
# (named '<first 8 chars of file name>.<file name>.json').
turl = json_dir+'20021201.20021201090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc.json'

In [None]:
%%time
# Options passed as ref_storage_args, i.e. for reading the reference JSON.
s_opts = dict(requester_pays=True, skip_instance_cache=True)
# PODAAC credentials passed as remote_options, i.e. for the referenced data.
r_opts = dict(
    key=response['accessKeyId'],
    secret=response['secretAccessKey'],
    token=response['sessionToken'],
    client_kwargs=dict(region_name='us-west-2'),
)

# NOTE: this rebinds `fs`, which until now was the S3 filesystem used by
# gen_json; re-create that one before re-running the JSON generation cells.
fs = fsspec.filesystem(
    "reference",
    fo=turl,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
    simple_templates=True,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False)
ds

#### Try the consolidated JSON

In [None]:
%%time

# Location of the consolidated reference file uploaded earlier.
rpath = f'{json_consolidated_dir}{json_out1}'

# Options passed as ref_storage_args, i.e. for reading the reference JSON.
s_opts = dict(requester_pays=True, skip_instance_cache=True)
# PODAAC credentials passed as remote_options, i.e. for the referenced data.
r_opts = dict(
    key=response['accessKeyId'],
    secret=response['secretAccessKey'],
    token=response['sessionToken'],
    client_kwargs=dict(region_name='us-west-2'),
)

# NOTE: `fs` is rebound here to the reference filesystem for the consolidated
# file. simple_templates=True was tried here and is left disabled.
fs = fsspec.filesystem(
    "reference",
    fo=rpath,
    ref_storage_args=s_opts,
    remote_protocol='s3',
    remote_options=r_opts,
)
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", consolidated=False)
ds

In [None]:
import hvplot.xarray

In [None]:
%%time
# Select the time step nearest to 2002-12-20 12:00 and load it into memory,
# so the plot below works on in-memory data.
sst = ds['analysed_sst'].sel(time='2002-12-20 12:00', method='nearest').load()
# Plot the loaded field as a quadmesh on lon/lat.
sst.hvplot.quadmesh(x='lon', y='lat', geo=True, rasterize=True, cmap='turbo' )