# Download and preprocess NMME

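Download a 40°S–40°N subset (all longitudes) of NMME hindcasts, keeping only the shortest-lead-time forecasts (the first lead index), re-index them by the time each forecast is valid for, and save the result as zarr.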

Sample file characteristics: http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.CanCM4i/.FORECAST/.MONTHLY/.prec/

"The forecast starts occur at the beginning of a month of the year, and adding together the forecast start time and the lead time (3-month seasonal lead) determines the season for which the forecast is valid." from https://iridl.ldeo.columbia.edu/maproom/Global/Forecasts/NMME/Seasonal_Anomaly.html

In [None]:
import datetime as dt
import glob
import os
import re
import warnings

import cmocean
import numpy as np
import pandas as pd
import xagg as xa
import xarray as xr
from cartopy import crs as ccrs
from dateutil.relativedelta import relativedelta
from distributed import Client
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

from funcs_support import get_params,area_mean,utility_save
# Project path lookup; dir_list['raw'] is used further down as the root
# directory for the saved output.
# NOTE(review): get_params() lives in funcs_support (not shown here) — confirm
# which other keys it returns.
dir_list = get_params()

In [2]:
# Name of the precipitation variable as served by the IRI Data Library.
# NOTE(review): the cells visible here never read `var` — the download URL
# hard-codes '.prec' — so this may be unused; confirm before relying on it.
var = 'prec'

In [3]:
# Start a local Dask cluster with default settings. `Client` is imported with
# the rest of the dependencies in the import cell at the top of the notebook,
# so that all imports live in one place.
client = Client()
# Show the cluster summary (dashboard link, workers, threads, memory)
display(client)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36419 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:36419/status,

0,1
Dashboard: http://127.0.0.1:36419/status,Workers: 8
Total threads: 48,Total memory: 503.37 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:38369,Workers: 0
Dashboard: http://127.0.0.1:36419/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:45550,Total threads: 6
Dashboard: http://127.0.0.1:44364/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:35356,
Local directory: /tmp/dask-scratch-space/worker-_7m5wdje,Local directory: /tmp/dask-scratch-space/worker-_7m5wdje

0,1
Comm: tcp://127.0.0.1:45722,Total threads: 6
Dashboard: http://127.0.0.1:44161/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:38299,
Local directory: /tmp/dask-scratch-space/worker-h1bsirib,Local directory: /tmp/dask-scratch-space/worker-h1bsirib

0,1
Comm: tcp://127.0.0.1:45186,Total threads: 6
Dashboard: http://127.0.0.1:41192/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:46472,
Local directory: /tmp/dask-scratch-space/worker-mwdithvq,Local directory: /tmp/dask-scratch-space/worker-mwdithvq

0,1
Comm: tcp://127.0.0.1:39927,Total threads: 6
Dashboard: http://127.0.0.1:42086/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:36416,
Local directory: /tmp/dask-scratch-space/worker-gm69ldrf,Local directory: /tmp/dask-scratch-space/worker-gm69ldrf

0,1
Comm: tcp://127.0.0.1:45799,Total threads: 6
Dashboard: http://127.0.0.1:45168/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:42583,
Local directory: /tmp/dask-scratch-space/worker-7yel2vff,Local directory: /tmp/dask-scratch-space/worker-7yel2vff

0,1
Comm: tcp://127.0.0.1:44924,Total threads: 6
Dashboard: http://127.0.0.1:38611/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:36803,
Local directory: /tmp/dask-scratch-space/worker-z9kx1cu4,Local directory: /tmp/dask-scratch-space/worker-z9kx1cu4

0,1
Comm: tcp://127.0.0.1:37169,Total threads: 6
Dashboard: http://127.0.0.1:44495/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:34890,
Local directory: /tmp/dask-scratch-space/worker-b0f_lvtq,Local directory: /tmp/dask-scratch-space/worker-b0f_lvtq

0,1
Comm: tcp://127.0.0.1:38698,Total threads: 6
Dashboard: http://127.0.0.1:43691/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:40253,
Local directory: /tmp/dask-scratch-space/worker-dkuw9d_r,Local directory: /tmp/dask-scratch-space/worker-dkuw9d_r


In [4]:
# Candidate NMME model names (currently disabled, kept for reference):
#mods = ['CanCM4i','CanSIPS-IC3','CanSIPSv2','CMC1-CanCM3','CMC2-CanCM4',
#        'GEM-NEMO','GFDL-SPEAR','NASA-GEOSS2S','NCAR-CESM1','NCEP-CFSv2']
# Models actually processed by the download loop below.
mods = ['NCEP-CFSv2']

In [5]:
# Mapping from the short dimension / variable names used by the source
# datasets to the names used in the rest of this notebook. Both 'S' and 'T'
# map to 'forecast_time'; only keys that are actually present in a dataset
# are applied when renaming.
rename_dict = dict(
    Y='lat',
    X='lon',
    S='forecast_time',
    L='lead',
    M='run',
    T='forecast_time',
    prec='pr',
)

# Alternative configuration, kept for reference (presumably a Horn of Africa
# box with all leads retained — see the suffix):
#subset_params = {'lat':slice(-3,12.5),'lon':slice(32,55)}; suffix = '_HoAfrica'; lead0 = False

# Active configuration: 40S-40N, all longitudes, first lead only.
subset_params = dict(lat=slice(-40, 40), lon=slice(-180, 180))
# Appended to the output filename to identify this subset
suffix = '_40to40lead0'
# When True, only the first lead index is kept
lead0 = True

In [None]:
# Set to True to re-download and rewrite outputs that already exist on disk
overwrite=False

# Processed dataset for each model, keyed by model name
dss = dict()

for mod in mods:
    print('processing model '+mod)
    # OPeNDAP endpoint on the IRI Data Library (the MONTHLY alternative is kept for reference)
    #load_dir = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.'+mod+'/.HINDCAST/.MONTHLY/.prec/dods'
    load_dir = 'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.'+mod+'/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/dods'

    # Open lazily in chunks; time axes are left as raw numbers
    # (decode_times=False) and converted by hand below
    dss[mod] = xr.open_dataset(load_dir,
                               decode_times=False,
                               chunks={'L':-1,'S':-1,'Y':20,'X':20,'M':7})

    # Rename to standards (only the keys present in this dataset are renamed)
    dss[mod] = xa.fix_ds(dss[mod].rename(**{k:v for k,v in rename_dict.items() if k in dss[mod]}))

    # Forecast start time is stored as months since 1960-01-01; convert it to
    # timestamps. The epoch is loop-invariant, so it is built once here.
    epoch = pd.DatetimeIndex(['1960-01-01'])[0]
    dss[mod]['forecast_time'] = [epoch + relativedelta(months=t.values.item()) for t in dss[mod]['forecast_time']]

    # Subset
    dss[mod] = dss[mod].sel(**subset_params)
    # Lead subset (added 25/10/08)
    if lead0:
        dss[mod] = dss[mod].isel(lead = [0])

    # Clarify a few things
    if 'lead' in dss[mod]:
        dss[mod]['lead'].attrs['units'] = 'month'
    dss[mod]['pr'].attrs['units'] = 'mm/day'

    if 'run' in dss[mod]:
        dss[mod]['run'] = dss[mod]['run'].astype(int)

    # Get time that the forecast is actually for: start time plus the whole
    # number of lead months plus 15 days, for every (forecast_time, lead) pair.
    # NOTE(review): presumably leads are half-integers (0.5, 1.5, ...) so that
    # floor() gives the month offset — confirm against the source metadata.
    dst = dss[mod][['forecast_time','lead']].stack(tvars = ['forecast_time','lead'])
    dst['time'] = (('tvars'),[ft + relativedelta(months=np.floor(lt),days=15) for ft,lt in zip(pd.DatetimeIndex(dst.tvars.forecast_time),
                     dst.lead)])

    dss[mod]['time'] = dst.unstack()['time']

    # Sorted unique valid times; computed once and reused for both the
    # filename and the new time axis
    valid_times = np.unique(dss[mod]['time'])

    # Get date string (YYYYMM01-YYYYMM31 from the first and last valid times).
    # NOTE(review): the end day is hard-coded to '31' regardless of month
    # length; left unchanged so existing output filenames still match.
    datestr = (re.sub(r'\-','',str(valid_times[0])[0:8])+'01-'+
               re.sub(r'\-','',str(valid_times[-1])[0:8]+'31'))

    # Get output filename
    output_fn = dir_list['raw']+mod+'/pr_Amon_'+mod+'_hindcasts_NMME_'+datestr+suffix+'.zarr'

    if (overwrite) or (not os.path.exists(output_fn)):
        # Load data into memory
        #dss[mod] = dss[mod].load()

        # Change coordinate system to time instead of forecast time.
        # The target array is allocated NaN-filled in one step with np.full
        # (zeros*nan would briefly hold two full-size copies in memory).
        nontime_dims = ['lead','run','lat','lon']
        ds_tmp = xr.Dataset(data_vars = {'pr':(['time',*nontime_dims],
                                               np.full([len(valid_times),
                                                        *[dss[mod].sizes[k] for k in nontime_dims]],np.nan))},
                            coords = {'time':(['time'],valid_times),
                                      **{k:([k],dss[mod][k].values) for k in nontime_dims}})

        # Copy each (lead, forecast start) slice into the slot of the time it
        # is valid for; the selection is built once per iteration and reused
        # for both the target time and the data.
        for lead in tqdm(dss[mod].lead):
            for ftime in dss[mod].forecast_time:
                fcst = dss[mod].sel(lead=lead,forecast_time=ftime)
                ds_tmp['pr'].loc[{'lead':lead,'time':fcst['time']}] = fcst['pr']

        dss[mod] = ds_tmp

        # Export as new file

        dss[mod].attrs['SOURCE'] = 'download_nmme'
        dss[mod].attrs['DESCRIPTION'] = 'NMME downloaded from IRI DL, standardized, and re-index to time that forecast is _for_, not time that it is forecasted.'

        if '.zarr' in output_fn:
            dss[mod] = dss[mod].chunk({'lat':20,'lon':20,'lead':1})
        utility_save(dss[mod],output_fn)
        print(output_fn+' saved!')
    else:
        # Output already on disk: reuse it instead of downloading again
        dss[mod] = xr.open_dataset(output_fn)
        print(output_fn+' already exists, loaded!')

processing model NCEP-CFSv2


  0%|          | 0/1 [00:00<?, ?it/s]

HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 1:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = 'https://iridl.ldeo.