In [None]:
#code adapted from https://medium.com/pangeo/continuously-extending-zarr-datasets-c54fbad3967d
#David Brochart

import xarray as xr
import os
import zarr
import os.path
from os import path
from datetime import datetime, timedelta
import shutil
import subprocess
import pandas as pd
import numpy as np

def get_encoding(name):
    '''Read back the encodings of every variable stored in a Zarr archive.
    Arguments:
        - name: the name (path) of the Zarr archive to open.
    Returns:
        - encoding: a dict mapping each variable name of the archive to
          the encoding xarray reports for that variable.
    '''
    archive = xr.open_zarr(name)
    encoding = {}
    # ds.variables covers data variables as well as coordinates.
    for var_name in archive.variables:
        encoding[var_name] = archive[var_name].encoding
    return encoding

def create_zarr(ds, name, encoding=None):
    '''Write an Xarray Dataset to a fresh Zarr archive.
    Any existing archive called `name` is deleted first. The Dataset is
    rechunked so that each dimension is covered by a single chunk.
    Arguments:
        - ds: the Dataset to store.
        - name: the name of the Zarr archive.
        - encoding: the encoding to use for each variable (if None, the
          encoding is left to xarray and read back from the new archive).
    Returns:
        - encoding: the encoding used for each variable.
    '''
    # Start from a clean directory; a missing archive is not an error.
    shutil.rmtree(name, ignore_errors=True)

    # One chunk spanning the full length of every dimension.
    whole_dim_chunks = {}
    for dim in ds.dims:
        whole_dim_chunks[dim] = ds[dim].shape
    rechunked = ds.chunk(whole_dim_chunks)

    rechunked.to_zarr(name, encoding=encoding)
    return get_encoding(name) if encoding is None else encoding

def empty_zarr(name, variable=None):
    '''Empty the Zarr archive of its data (but not its metadata).
    Only files directly inside a variable's directory are removed; files
    and directories whose name starts with '.' (the Zarr metadata) are
    kept.
    Arguments:
        - name: the name of the archive.
        - variable: the name of the variable to empty (if None, empty all
          variables)
    '''
    for dname in os.listdir(name):
        if dname.startswith('.'):
            continue  # archive-level metadata
        # variable=None selects every variable; the previous condition
        # selected none in that case, making the default call a no-op.
        if variable is not None and dname != variable:
            continue
        var_dir = f'{name}/{dname}'
        if not os.path.isdir(var_dir):
            continue  # stray top-level file, nothing to empty
        for fname in os.listdir(var_dir):
            if not fname.startswith('.'):
                os.remove(f'{var_dir}/{fname}')

def append_zarr(src_name, dst_name):
    '''Append a Zarr archive to another one.
    Every array of the source except 'lat' and 'lon' is appended to the
    array of the same name in the destination. Afterwards the 'time'
    chunks of the destination are removed (presumably because the time
    coordinate is rebuilt and copied separately -- confirm).
    Arguments:
        - src_name: the name of the archive to append.
        - dst_name: the name of the archive to be appended to.
    '''
    zarr_src = zarr.open(src_name, mode='r')
    zarr_dst = zarr.open(dst_name, mode='a')
    # lat/lon are skipped so they are not duplicated on every append.
    skipped = ('lat', 'lon')
    for key in zarr_src.array_keys():
        if key in skipped:
            continue
        zarr_dst[key].append(zarr_src[key])
    # Empty the archive that was just appended to; the previous
    # hard-coded path pointed at a different directory than dst_name.
    empty_zarr(dst_name, 'time')

def download_files(datetime_0, datetime_nb,
                   dir_local='F:/data/sst/jpl_mur/v4.1/',
                   dir_podaac='Z:/OceanTemperature/ghrsst/data/GDS2/L4/GLOB/JPL/MUR/v4.1/'):
    '''
    Locate the daily MUR files for a run of consecutive dates.
    Nothing is transferred: for each date the local directory is probed
    first, then the mapped PO.DAAC drive, and the first path that exists
    is kept. Dates with no file in either place are silently skipped, so
    `filenames` can be shorter than `datetimes`.
    Arguments:
        - datetime_0: first date to look for.
        - datetime_nb: number of consecutive days (~files) to look for.
        - dir_local: root of the local copy (must end with '/').
        - dir_podaac: root of the mapped PO.DAAC drive (must end with '/').
    Returns:
        - filenames: list of the file paths that were found.
        - datetimes: list of all requested dates, found or not.
    '''
    mur_end = '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'

    datetimes = [datetime_0 + timedelta(hours=24*i) for i in range(datetime_nb)]
    filenames = []
    for dt in datetimes:
        doyear = dt.timetuple().tm_yday
        # Layout under each root: <year>/<day of year>/<yyyymmdd><mur_end>
        relative = (f'{dt.year}/{doyear:03d}/'
                    f'{dt.year}{dt.month:02d}{dt.day:02d}{mur_end}')
        # The local copy takes precedence over the mapped drive.
        for root in (dir_local, dir_podaac):
            candidate = root + relative
            if path.exists(candidate):
                filenames.append(candidate)
                break
    return filenames, datetimes

In [None]:
# Quick check: look up the single daily file for 2019-06-01 12:00 and
# show which path (if any) was found.
time = datetime(year=2019, month=6, day=1, hour=12)
filenames, datetimes = download_files(datetime_0=time, datetime_nb=1)
print(filenames)

In [None]:
# Run configuration read by the cells below.
dt0 = dt = datetime(2002, 6, 1, 12) # from this date (included)
dt1 = datetime(2002, 6, 3, 12)     # to that date (excluded)
resume = False  # if True, resume a previous upload
                # and dt0 and dt1 must be later than the previous date range
fake_gcs = True # if True, won't upload to Google Cloud Storage
                # but fake it in the local mur_bucket directory
if fake_gcs:
    # NOTE(review): the processing cell writes the archive to
    # mur_bucket/mur_v41, not to mur_bucket/ itself -- confirm this is
    # the path xr.open_zarr(store) should open when resuming.
    store = 'F:/data/sst/jpl_mur/mur_bucket/'
else:
    # gcsfs is not imported with the other modules, so this branch used
    # to fail with a NameError; import it only when it is needed.
    import gcsfs
    store = gcsfs.GCSMap('pangeo-data/mur_v41')

if resume:
    # Time values already in the store; prepended to the new ones when
    # the time coordinate is rebuilt.
    time_prev = xr.open_zarr(store).time.values
time_nb = 1  # number of daily files processed per loop iteration


In [None]:
# Walk from dt to dt1 in steps of time_nb days. The first step of a fresh
# run creates the mur_v41 archive; every other step writes the new data to
# a temporary mur_v41_new archive and appends it to mur_v41.
while dt < dt1:
    #print(f'Downloading {time_nb} files from {dt}...')
    filenames, datetimes = download_files(dt, time_nb)
    # Open the located files as one dataset concatenated along 'time'.
    # NOTE(review): download_files skips dates with no file, so filenames
    # may be empty here -- confirm how open_mfdataset should be guarded.
    ds = xr.open_mfdataset(filenames,combine='nested',concat_dim='time') #create_dataset(filenames, datetimes)
    if not resume and dt == dt0:
        # Fresh run, first step: (re)create the archive and keep the
        # encoding it was written with for the following steps.
        encoding = create_zarr(ds, 'F:/data/sst/jpl_mur/mur_bucket/mur_v41')
        # Drop the 'time' chunks (the time coordinate is written
        # separately after the loop).
        empty_zarr('F:/data/sst/jpl_mur/mur_bucket/mur_v41', 'time')
    else:
        if resume:
            # NOTE(review): the encoding is read from mur_v41_new, which
            # the next line deletes and recreates -- confirm this should
            # not read from mur_v41 instead.
            encoding = get_encoding('F:/data/sst/jpl_mur/mur_bucket/mur_v41_new')
        create_zarr(ds, 'F:/data/sst/jpl_mur/mur_bucket/mur_v41_new', encoding)
        # NOTE(review): per its docstring, empty_zarr without a variable
        # empties every variable of mur_v41. With the upload below
        # commented out, mur_v41 is the only copy -- confirm intended.
        empty_zarr('F:/data/sst/jpl_mur/mur_bucket/mur_v41')
        append_zarr('F:/data/sst/jpl_mur/mur_bucket/mur_v41_new', 'F:/data/sst/jpl_mur/mur_bucket/mur_v41')
    # The upload itself is currently disabled (commented out below).
    print('Uploading...')
    #if fake_gcs:
    #    subprocess.check_call('mkdir -p F:/data/sst/jpl_mur/mur_bucket/; cp -r mur_v41/* '
    #                          'F:/data/sst/jpl_mur/mur_bucket/; cp -r mur_v41/.[^.]* '
    #                          'F:/data/sst/jpl_mur/mur_bucket/', shell=True)
    #else:
    #    subprocess.check_call('gsutil -m cp -r mur_v41/ gs://pangeo-data/'
    #                          .split())
    # Advance to the first date not yet processed.
    dt += timedelta(hours=24*time_nb)

# Rebuild the complete time coordinate (previous values, if resuming, plus
# the dates just processed), write it to its own small Zarr archive and
# copy its 'time' array into the store.
time_new = pd.date_range(dt0, dt1-timedelta(hours=24), freq='1D')
if resume:
    time_var = np.hstack((time_prev, time_new))
else:
    time_var = time_new
time_ds = xr.DataArray(np.zeros(len(time_var)), coords=[time_var], dims=['time']).to_dataset(name='mur_time')

# Forward slashes: the previous 'F:\data\...' literals relied on invalid
# escape sequences, which Python deprecates.
mur_time_dir = 'F:/data/sst/jpl_mur/mur_time'
shutil.rmtree(mur_time_dir, ignore_errors=True)
time_ds.to_zarr(mur_time_dir)
if fake_gcs:
    # shutil instead of shelling out to rm/cp, which are not available
    # on a stock Windows machine (the paths here are Windows drives).
    # NOTE(review): the data archive lives in mur_bucket/mur_v41 while
    # 'time' is copied to mur_bucket/time -- confirm the destination.
    shutil.rmtree('F:/data/sst/jpl_mur/mur_bucket/time', ignore_errors=True)
    shutil.copytree(f'{mur_time_dir}/time', 'F:/data/sst/jpl_mur/mur_bucket/time')
    #subprocess.check_call('rm -rf trmm_bucket/time'.split())
    #subprocess.check_call('cp -r trmm_time/time trmm_bucket/'.split())
else:
    subprocess.check_call(['gsutil', '-m', 'rm', '-rf',
                           'gs://pangeo-data/mur_v41/time'])
    # Copy from where the archive was actually written above; the
    # previous relative 'mur_time/time/' depended on the working directory.
    subprocess.check_call(['gsutil', '-m', 'cp', '-r',
                           f'{mur_time_dir}/time/',
                           'gs://pangeo-data/mur_v41/'])


In [None]:
#    subprocess.check_call('rm -rf F:\data\sst\jpl_mur\zarr_bucket\mur_time\'.split())
#    subprocess.check_call('cp -r F:/data/sst/jpl_mur/zarr_time/mur_time F:/data/sst/jpl_mur/zarr_bucket/'.split())
# Replace zarr_bucket/mur_time with a fresh copy of zarr_time/mur_time.
# shutil is used instead of subprocess: 'rmdir' is a cmd.exe builtin (a
# plain command string without shell=True cannot run it) and 'cp' is not
# available on a stock Windows machine.
# NOTE(review): these zarr_bucket/zarr_time paths differ from the
# mur_bucket/mur_time paths used by the other cells -- confirm.
shutil.rmtree('F:/data/sst/jpl_mur/zarr_bucket/mur_time', ignore_errors=True)
shutil.copytree('F:/data/sst/jpl_mur/zarr_time/mur_time',
                'F:/data/sst/jpl_mur/zarr_bucket/mur_time')


In [None]:
# NOTE(review): this cell still uses the trmm_* names of the example the
# notebook was adapted from -- confirm whether it is still needed.
if fake_gcs:
    # shutil instead of shelling out to rm/cp, for portability.
    shutil.rmtree('trmm_bucket/time', ignore_errors=True)
    shutil.copytree('trmm_time/time', 'trmm_bucket/time')
else:
    subprocess.check_call(['gsutil', '-m', 'rm', '-rf',
                           'gs://pangeo-data/trmm_3b42rt/time'])
    subprocess.check_call(['gsutil', '-m', 'cp', '-r', 'trmm_time/time/',
                           'gs://pangeo-data/trmm_3b42rt/'])
# Forward slashes: in the previous non-raw literal the '\t' of '\time' was
# a TAB character, so the path never matched the intended directory.
shutil.rmtree('F:/data/sst/jpl_mur/zarr_bucket/time', ignore_errors=True)