# Download and preprocess NMME

Download a 40S to 40N subset of NMME hindcasts (currently only the NCEP-CFSv2 model), keeping only the shortest-lead-time (0.5-month) forecasts.

Sample file characteristics: http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.CanCM4i/.FORECAST/.MONTHLY/.prec/

"The forecast starts occur at the beginning of a month of the year, and adding together the forecast start time and the lead time (3-month seasonal lead) determines the season for which the forecast is valid." from https://iridl.ldeo.columbia.edu/maproom/Global/Forecasts/NMME/Seasonal_Anomaly.html

In [35]:
import datetime as dt
import glob
import os
import re
import shutil
import warnings

import cmocean
import numpy as np
import pandas as pd
import xagg as xa
import xarray as xr
from cartopy import crs as ccrs
from dateutil.relativedelta import relativedelta
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

from funcs_support import get_params,area_mean,utility_save
# Project directory lookup returned by the local helper; dir_list['raw'] is
# used in the cells below as the root path for the output zarr stores
dir_list = get_params()

In [2]:
# Name of the precipitation variable in the source data (the '.prec' segment
# of the IRI Data Library URLs, renamed to 'pr' via rename_dict).
# NOTE(review): appears unused in the cells shown below - confirm before removing
var = 'prec'

In [3]:
from distributed import Client
# Start a dask client with default settings (the summary displayed below
# reports a distributed.LocalCluster) and show its dashboard/worker summary
client = Client()
display(client)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40122 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:40122/status,

0,1
Dashboard: http://127.0.0.1:40122/status,Workers: 8
Total threads: 48,Total memory: 503.37 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44136,Workers: 0
Dashboard: http://127.0.0.1:40122/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:38111,Total threads: 6
Dashboard: http://127.0.0.1:37486/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:46103,
Local directory: /tmp/dask-scratch-space/worker-wdxmp2d8,Local directory: /tmp/dask-scratch-space/worker-wdxmp2d8

0,1
Comm: tcp://127.0.0.1:42012,Total threads: 6
Dashboard: http://127.0.0.1:41171/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:35675,
Local directory: /tmp/dask-scratch-space/worker-2y8inm2i,Local directory: /tmp/dask-scratch-space/worker-2y8inm2i

0,1
Comm: tcp://127.0.0.1:46022,Total threads: 6
Dashboard: http://127.0.0.1:35157/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:45527,
Local directory: /tmp/dask-scratch-space/worker-vtwq01ze,Local directory: /tmp/dask-scratch-space/worker-vtwq01ze

0,1
Comm: tcp://127.0.0.1:34409,Total threads: 6
Dashboard: http://127.0.0.1:36216/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:43259,
Local directory: /tmp/dask-scratch-space/worker-76il5j95,Local directory: /tmp/dask-scratch-space/worker-76il5j95

0,1
Comm: tcp://127.0.0.1:38055,Total threads: 6
Dashboard: http://127.0.0.1:39648/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:41907,
Local directory: /tmp/dask-scratch-space/worker-y_wj6ztr,Local directory: /tmp/dask-scratch-space/worker-y_wj6ztr

0,1
Comm: tcp://127.0.0.1:35190,Total threads: 6
Dashboard: http://127.0.0.1:33231/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:39043,
Local directory: /tmp/dask-scratch-space/worker-bnowy3cu,Local directory: /tmp/dask-scratch-space/worker-bnowy3cu

0,1
Comm: tcp://127.0.0.1:45982,Total threads: 6
Dashboard: http://127.0.0.1:46020/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:38463,
Local directory: /tmp/dask-scratch-space/worker-ouq2lxkg,Local directory: /tmp/dask-scratch-space/worker-ouq2lxkg

0,1
Comm: tcp://127.0.0.1:36338,Total threads: 6
Dashboard: http://127.0.0.1:43214/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:45826,
Local directory: /tmp/dask-scratch-space/worker-leyxsz50,Local directory: /tmp/dask-scratch-space/worker-leyxsz50


In [4]:
# Full list of NMME models, kept for reference:
#mods = ['CanCM4i','CanSIPS-IC3','CanSIPSv2','CMC1-CanCM3','CMC2-CanCM4',
#        'GEM-NEMO','GFDL-SPEAR','NASA-GEOSS2S','NCAR-CESM1','NCEP-CFSv2']
mods = ['NCEP-CFSv2']

# Model name used to build output directories and filenames in the cells below.
# Defined explicitly here because those cells read `mod` but no visible cell
# assigns it (the `for mod in mods` loop is commented out), so the notebook
# otherwise depends on leftover kernel state.
mod = mods[0]

In [5]:
# Map the IRI Data Library dimension/variable names onto the names used in the
# rest of this notebook. Both 'S' and 'T' map to 'forecast_time'; only the
# keys actually present in a dataset are applied when renaming.
rename_dict = {
    'Y': 'lat',
    'X': 'lon',
    'S': 'forecast_time',
    'L': 'lead',
    'M': 'run',
    'T': 'forecast_time',
    'prec': 'pr',
}

# Alternative regional configuration, kept for reference:
#subset_params = {'lat':slice(-3,12.5),'lon':slice(32,55)}; suffix = '_HoAfrica'; lead0 = False

# Spatial subset applied after renaming, the filename suffix identifying it,
# and whether to keep only the first lead
subset_params = {'lat': slice(-40, 40), 'lon': slice(-180, 180)}
suffix = '_40to40lead0'
lead0 = True

proc_by_year = True

In [33]:
# Request the hindcasts in decade-long pieces: each piece starts in one of
# `start_years` and ends nine years later, with the final piece capped at 2025
start_years = np.arange(1982,2023,10)
end_years = list(np.minimum(start_years+9,2025))

base_url = 'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/'

# Server-side subsetting is encoded in the URL (%28 and %29 are parentheses,
# %20 is a space): Y in (40S)(40N), L in (0.5)(0.5), and S from
# (0000 1 Jan <first year>) to (0000 15 Dec <last year>), served over OPeNDAP
urls = [
    base_url
    + 'Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/'
    + f'S/%280000%201%20Jan%20{first_year}%29%280000%2015%20Dec%20{last_year}%29RANGEEDGES/dods'
    for first_year,last_year in zip(start_years,end_years)
]

['https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%201982%29%280000%2015%20Dec%201991%29RANGEEDGES/dods',
 'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%201992%29%280000%2015%20Dec%202001%29RANGEEDGES/dods',
 'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%202002%29%280000%2015%20Dec%202011%29RANGEEDGES/dods',
 'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%202012%29%280000%2015%20Dec%202021%29RANGEEDGES/dods',
 'https://iridl.ldeo.columbia.ed

In [None]:
# Set to True to regenerate output stores that already exist on disk
overwrite = False

# Download each decade chunk, standardize it, re-index it from forecast start
# time to the time the forecast is valid for, and save it as its own store.
# NOTE: `mod` (model name used in the output path) is read from the notebook
# namespace; it is not assigned in this cell.
for load_dir in urls:
    print(f'processing file {load_dir}')

    # Times are left undecoded on open; the forecast start axis is converted
    # by hand below
    ds = xr.open_dataset(load_dir,
                     decode_times=False,
                     chunks={'L':-1,'S':-1,'Y':20,'X':20,'M':7})

    # NOTE(review): the download happens before the "already exists" check
    # below, so existing outputs are still re-downloaded - confirm intended
    ds = ds.load() # Should take ~ 5 minute for a decade chunk

    # Rename to standards (only the keys present in this dataset)
    ds = xa.fix_ds(ds.rename(**{k:v for k,v in rename_dict.items() if k in ds}))

    # Date is in months since 1960-01-01
    # It's "Forecast Start Time (forecast_reference_time)" in CanCM4i. Not quite sure what that means yet
    epoch = pd.Timestamp('1960-01-01')
    ds['forecast_time'] = [epoch + relativedelta(months=t.values.item()) for t in ds['forecast_time']]

    # Subset
    ds = ds.sel(**subset_params)
    # Lead subset (added 25/10/08)
    if lead0:
        ds = ds.isel(lead = [0])

    # Clarify a few things
    if 'lead' in ds:
        ds['lead'].attrs['units'] = 'month'
    ds['pr'].attrs['units'] = 'mm/day'

    if 'run' in ds:
        ds['run'] = ds['run'].astype(int)

    # Get time that the forecast is actually for: forecast start plus the whole
    # number of lead months, plus 15 days. The lead is converted to a plain
    # int so relativedelta is not handed a 0-d DataArray.
    dst = ds[['forecast_time','lead']].stack(tvars = ['forecast_time','lead'])
    dst['time'] = (('tvars'),[ft + relativedelta(months=int(np.floor(float(lt))),days=15)
                              for ft,lt in zip(pd.DatetimeIndex(dst.tvars.forecast_time),
                                               dst.lead)])

    ds['time'] = dst.unstack()['time']

    # Get date string: first day of the first valid month to the last day of
    # the last valid month. The month end is computed rather than hard-coded
    # as '31', which gave invalid dates for shorter months.
    valid_times = np.unique(ds['time'])
    datestr = (pd.Timestamp(valid_times[0]).strftime('%Y%m')+'01-'+
               (pd.Timestamp(valid_times[-1]) + pd.offsets.MonthEnd(0)).strftime('%Y%m%d'))

    # Get output filename
    output_fn = dir_list['raw']+mod+'/pr_Amon_'+mod+'_hindcasts_NMME_'+datestr+suffix+'.zarr'

    if (overwrite) or (not os.path.exists(output_fn)):
        # Change coordinate system to time instead of forecast time:
        # start from an all-NaN array indexed by valid time and fill it in
        out_dims = ['lead','run','lat','lon']
        ds_tmp = xr.Dataset(data_vars = {'pr':(['time',*out_dims],
                                               np.full([len(valid_times),
                                                        *[ds.sizes[k] for k in out_dims]],np.nan))},
                            coords = {'time':(['time'],valid_times),
                                      **{k:([k],ds[k].values) for k in out_dims}})

        for lead in tqdm(ds.lead):
            for ftime in ds.forecast_time:
                # One forecast (all runs, lat, lon) and the time it is valid for
                fcst = ds.sel(lead=lead,forecast_time=ftime)
                ds_tmp['pr'].loc[{'lead':lead,'time':fcst['time']}] = fcst['pr']

        # Export as new file
        ds_tmp.attrs['SOURCE'] = 'download_nmme'
        ds_tmp.attrs['DESCRIPTION'] = 'NMME downloaded from IRI DL, standardized, and re-index to time that forecast is _for_, not time that it is forecasted.'

        if '.zarr' in output_fn:
            ds_tmp = ds_tmp.chunk({'lat':20,'lon':20,'lead':1})
        utility_save(ds_tmp,output_fn)
        print(output_fn+' saved!')
    else:
        print(output_fn+' already exists!')

processing file https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%201982%29%280000%2015%20Dec%201991%29RANGEEDGES/dods


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 2:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = 'https://iridl.ldeo.

  0%|          | 0/1 [00:00<?, ?it/s]

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_19820101-19911231_40to40lead0.zarr saved!
/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_19820101-19911231_40to40lead0.zarr saved!
processing file https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%201992%29%280000%2015%20Dec%202001%29RANGEEDGES/dods


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 3:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = 'https://iridl.ldeo.

  0%|          | 0/1 [00:00<?, ?it/s]

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_19920101-20011231_40to40lead0.zarr saved!
/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_19920101-20011231_40to40lead0.zarr saved!
processing file https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%202002%29%280000%2015%20Dec%202011%29RANGEEDGES/dods


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 3:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
HDF5-DIAG: Error detected in HDF5 (1.14.6)  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
 thread 3    minor: Can't get value
:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: 

  0%|          | 0/1 [00:00<?, ?it/s]

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_20020101-20111231_40to40lead0.zarr saved!
/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_20020101-20111231_40to40lead0.zarr saved!
processing file https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%202012%29%280000%2015%20Dec%202021%29RANGEEDGES/dods


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 4:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = 'https://iridl.ldeo.

  0%|          | 0/1 [00:00<?, ?it/s]

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_20120101-20211231_40to40lead0.zarr saved!
/dx06/data/climate_raw/NCEP-CFSv2/pr_Amon_NCEP-CFSv2_hindcasts_NMME_20120101-20211231_40to40lead0.zarr saved!
processing file https://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.NCEP-CFSv2/.HINDCAST/.PENTAD_SAMPLES_FULL/.prec/Y/%2840S%29%2840N%29RANGEEDGES/L/%280.5%29%280.5%29RANGEEDGES/S/%280000%201%20Jan%202022%29%280000%2015%20Dec%202025%29RANGEEDGES/dods


HDF5-DIAG: Error detected in HDF5 (1.14.6) thread 5:
  #000: H5F.c line 496 in H5Fis_accessible(): unable to determine if file is accessible as HDF5
    major: File accessibility
    minor: Not an HDF5 file
  #001: H5VLcallback.c line 3913 in H5VL_file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #002: H5VLcallback.c line 3848 in H5VL__file_specific(): file specific failed
    major: Virtual Object Layer
    minor: Can't operate on object
  #003: H5VLnative_file.c line 344 in H5VL__native_file_specific(): error in HDF5 file check
    major: File accessibility
    minor: Can't get value
  #004: H5Fint.c line 1055 in H5F__is_hdf5(): unable to open file
    major: File accessibility
    minor: Unable to initialize object
  #005: H5FD.c line 787 in H5FD_open(): can't open file
    major: Virtual File Layer
    minor: Unable to open file
  #006: H5FDsec2.c line 323 in H5FD__sec2_open(): unable to open file: name = 'https://iridl.ldeo.

In [37]:
#----- Collate into single file, delete individual files

# Only pick up the per-chunk stores written for this subset (`suffix`), so that
# stores for other subsets in the same directory are neither merged nor deleted
fns = sorted(glob.glob(dir_list['raw']+mod+'/*'+suffix+'.zarr'))

dss = xr.open_mfdataset(fns)

# Date string: first day of the first month to the last day of the last month.
# The month end is computed rather than hard-coded as '31', which gave
# invalid dates for shorter months.
all_times = np.unique(dss.time)
datestr = (pd.Timestamp(all_times[0]).strftime('%Y%m')+'01-'+
           (pd.Timestamp(all_times[-1]) + pd.offsets.MonthEnd(0)).strftime('%Y%m%d'))

output_fn = dir_list['raw']+mod+'/pr_Amon_'+mod+'_hindcasts_NMME_'+datestr+suffix+'.zarr'

dss.attrs = {'DESCRIPTION':f'{mod}, preprocessed to file system standards and reindexed to month forecast is for',
             'SOURCE':'preprocess_nmme.ipynb'}

dss.to_zarr(output_fn)

# Remove the per-chunk stores now that the collated store has been written.
# shutil.rmtree avoids building a shell command from a path and raises if a
# removal fails; the collated store itself is never removed.
for fn in fns:
    if os.path.abspath(fn) != os.path.abspath(output_fn):
        shutil.rmtree(fn)

In [61]:
#----- Save version of the mean forecast across ensemble members
# Reopen the collated store and average over the ensemble ('run') dimension
dss = xr.open_zarr(output_fn)
dss = dss.mean('run')

dss.attrs = {'DESCRIPTION':f'Ensemble mean of {mod} hind/forecasts, preprocessed to file system standards and reindexed to month forecast is for',
             'SOURCE':'preprocess_nmme.ipynb'}

# Swap the model name for '<model>-ensmean' everywhere in the path (directory
# and filename). Literal string replacement is used so the model name is not
# interpreted as a regular expression.
utility_save(dss,output_fn.replace(mod,mod+'-ensmean'))

/dx06/data/climate_raw/NCEP-CFSv2-ensmean created!




/dx06/data/climate_raw/NCEP-CFSv2-ensmean/pr_Amon_NCEP-CFSv2-ensmean_hindcasts_NMME_19820101-20251231_40to40lead0.zarr saved!
