# Creating consolidated metadata file for MUR SST

# step 2 consolidate metadata

Step 1 is [here](https://github.com/cgentemann/cloud_science/blob/master/zarr_meta/cloud_mur_v41-all-step1.ipynb).

NASA JPL PODAAC has put the entire [MUR SST](https://podaac.jpl.nasa.gov/dataset/MUR-JPL-L4-GLOB-v4.1) dataset on the AWS cloud, **but it is stored as ~7000 individual netCDF files.** Accessing one file works well, but accessing multiple files is **very slow** because the metadata for each file has to be queried separately. Here, we create **fast access** by consolidating the metadata so that the entire dataset can be accessed rapidly via zarr. More background on this project:
[medium article](https://medium.com/pangeo/fake-it-until-you-make-it-reading-goes-netcdf4-data-on-aws-s3-as-zarr-for-rapid-data-access-61e33f8fe685) and in this [repo](https://github.com/lsterzinger/fsspec-reference-maker-tutorial). We need help developing documentation and more test datasets. If you want to help, we are working in the [Pangeo Gitter](https://gitter.im/pangeo-data/cloud-performant-netcdf4).


To run this code:
- you need to set your AWS credentials up using `aws configure --profile esip-qhub`
- you need to set up your `.netrc` file in your home directory with your earthdata login info


Authors:
- [Chelle Gentemann](https://github.com/cgentemann)
- [Rich Signell](https://github.com/rsignell-usgs)
- [Lucas Sterzinger](https://github.com/lsterzinger/)
- [Martin Durant](https://github.com/martindurant)

Credit:
- Funding: Interagency Implementation and Advanced Concepts Team [IMPACT](https://earthdata.nasa.gov/esds/impact) for the Earth Science Data Systems (ESDS) program and AWS Public Dataset Program
- AWS Credit Program
- ESIP Hub

In [None]:
import s3fs
import requests
from urllib import request
from http.cookiejar import CookieJar
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from json import dumps
from io import StringIO
from os.path import dirname, join
import netrc
import dask.bag as db

import os
import sys
import fsspec
import ujson   # fast json
from fsspec_reference_maker.hdf import SingleHdf5ToZarr 
from fsspec_reference_maker.combine import MultiZarrToZarr
import xarray as xr
import dask
from dask.distributed import Client
import hvplot.xarray
import tempfile



In [None]:
# Display the installed fsspec_reference_maker version (the notebook shows the
# value of the cell's last expression).
import fsspec_reference_maker
fsspec_reference_maker.__version__

- output file locations

In [None]:
# S3 prefix that is listed below to find the individual reference JSON files.
json_dir = 's3://esip-qhub/nasa/mur/jsons_all/'
# S3 prefix and file name that together form the upload target of the
# consolidated reference file (joined as `json_consolidated_dir + json_out1`).
json_consolidated_dir = 's3://esip-qhub-public/nasa/mur/'
json_out1 = 'murv41_consolidated_20211011.json'
# Local path the consolidated JSON is written to before it is uploaded.
json_out = './../data/mur_consolidated_tem.json'

## set up earthdata login credentials
- code for setting up earthdata_login from [here](https://github.com/podaac/tutorials/blob/master/notebooks/cloudwebinar/cloud_direct_access_s3.py)
- for the earthdata login to work you need to create a .netrc file on your home directory
- .netrc file contains:\
machine urs.earthdata.nasa.gov\
login 'earthdata username'\
password 'password'

In [None]:
# Settings shared by the credential and cluster calls below.
profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
worker_max = 30

# Add the shared library directory to the import path before importing `ebdpy`
# (presumably that is where the helper module lives -- confirm).
shared_lib = os.path.join(os.environ['HOME'], 'shared', 'users', 'lib')
sys.path.append(shared_lib)
import ebdpy as ebd

# Credentials are set twice, exactly as before: first with the profile only,
# then again with an explicit region and S3 endpoint.
ebd.set_credentials(profile='esip-qhub')
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)

# Request the Dask client/cluster pair used by the computations below.
client, cluster = ebd.start_dask_cluster(
    profile=profile,
    worker_max=worker_max,
    region=region,
    use_existing_cluster=True,
    adaptive_scaling=False,
    wait_for_cluster=False,
    environment='pangeo',
    worker_profile='Medium Worker',
    propagate_env=True,
)

- Create list of all the individual files

In [None]:
# Authenticated (anon=False) S3 filesystem; also reused by later cells.
fs2 = fsspec.filesystem('s3', anon=False)

# List everything under the per-file JSON prefix and turn each listed path
# into a sorted, fully qualified s3:// URL.
flist2 = fs2.ls(json_dir)
furls = sorted(f's3://{path}' for path in flist2)

# Show how many reference files were found, then display the first URL.
print(len(furls))
furls[0]

# trying lucas's dask bag here

In [None]:
def preprocess(ds):
    """Set the ``_FillValue`` attribute of ``ds.time`` to 0 and return ``ds``.

    The attribute dictionary is modified in place, so the object that is
    returned is the same one that was passed in.
    """
    time_attrs = ds.time.attrs
    time_attrs['_FillValue'] = 0
    return ds
    
def gen_reference(furls):
    """Combine one batch of reference files into a single reference set.

    Parameters
    ----------
    furls : list
        URLs of the reference files that make up this batch; passed straight
        to ``MultiZarrToZarr``.

    Returns
    -------
    list
        A one-element list holding the result of
        ``MultiZarrToZarr.translate(template_count=None)``.
    """
    # All decoding is switched off and two variables are dropped when the
    # references are opened.
    open_kwargs = {
        'decode_cf': False,
        'mask_and_scale': False,
        'decode_times': False,
        'use_cftime': False,
        'drop_variables': ['dt_1km_data', 'sst_anomaly'],
        'decode_coords': False,
    }
    # The inputs are concatenated along the "time" dimension.
    concat_kwargs = {
        "join": "override",
        "combine_attrs": "override",
        "dim": "time",
    }
    combiner = MultiZarrToZarr(
        furls,
        remote_protocol="s3",
        remote_options={'anon': True},
        xarray_open_kwargs=open_kwargs,
        xarray_concat_args=concat_kwargs,
        preprocess=preprocess,
    )
    combined = combiner.translate(template_count=None)
    # Wrapped in a list: the caller expects one item per partition.
    return [combined]

In [None]:
# from_sequence: create a dask bag from the list of URLs and tell it how many
# partitions to split that list into
# map_partitions: apply a function to every partition across one or more bags

# test here with the first 1000 URLs
#b = db.from_sequence(furls[:1000], npartitions=10).map_partitions(gen_reference)
# run on all URLs; `b` is evaluated later via `b.compute(...)`
b = db.from_sequence(furls, npartitions=100).map_partitions(gen_reference)

In [None]:
# Run the bag computation with `retries=10` (presumably the number of retries
# allowed for a failed task -- confirm against the dask documentation).
out = b.compute(retries=10)
# `gen_reference` returns a one-element list per partition, so `out` should
# contain one entry per partition.
print(len(out),'this should == npartitions above')

In [None]:
# debug
#import logging
#logging.basicConfig(level=logging.DEBUG)
def preprocess(ds):
    """Force ``ds.time.attrs['_FillValue']`` to 0, in place, and return ``ds``."""
    ds.time.attrs.update({'_FillValue': 0})
    return ds

# Now combine the per-partition reference sets held in `out` into one
# consolidated reference file.

# Options used when the references are opened: all decoding is switched off
# and two variables are dropped.
arg_dict = {
    'decode_cf' : False,
    'mask_and_scale' : False,
    'decode_times' : False,
    'use_cftime' : False,
    'drop_variables': ['dt_1km_data', 'sst_anomaly'],
    'decode_coords' : False
}
# Options used when concatenating: the inputs are joined along "time".
concat_dict = {
    "join": "override",
    "combine_attrs": "override",
    "dim": "time"
}
mzz = MultiZarrToZarr(
    out,
    remote_protocol="s3",
    remote_options={'anon':True},
    xarray_open_kwargs=arg_dict,
    xarray_concat_args=concat_dict,
    preprocess=preprocess
)

# Write the combined references to the local path `json_out`.
mzz.translate(json_out, template_count=None)


In [None]:
# Remote destination: the consolidated-output prefix followed by the file name.
rpath = f'{json_consolidated_dir}{json_out1}'
# Copy the locally written JSON file to that destination.
fs2.put_file(lpath=json_out, rpath=rpath)

In [None]:
# Close the Dask client first, then the cluster.
client.close()
cluster.close()

# STOP HERE RUN ALL ABOVE

In [None]:
# Inspect the 'refs' mapping of the first entry in `out`.
out[0]['refs']

In [None]:
# Inspect the 'time/.zarray' entry of the first entry's references.
out[0]['refs']['time/.zarray']

In [None]:
# List the consolidated-output prefix with full details and print one entry
# per line.
fll = fs2.ls('s3://esip-qhub-public/nasa/mur/', detail=True)
# Iterate over the entries directly instead of indexing with range(len(...)).
for entry in fll:
    print(entry)