# Download and preprocess NMME

Download a 40°S–40°N subset of NMME hindcasts, keeping only the shortest-lead-time forecast for each start date.

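Sample file characteristics (this link shows the `.FORECAST` dataset; the hindcasts downloaded below are read from the corresponding `.HINDCAST` path): http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.CanCM4i/.FORECAST/.MONTHLY/.prec/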

"The forecast starts occur at the beginning of a month of the year, and adding together the forecast start time and the lead time (3-month seasonal lead) determines the season for which the forecast is valid." from https://iridl.ldeo.columbia.edu/maproom/Global/Forecasts/NMME/Seasonal_Anomaly.html

In [1]:
import xarray as xr
import xagg as xa
import numpy as np
import pandas as pd
import datetime as dt
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm
import warnings

import os
import re
import glob
from matplotlib import pyplot as plt
from cartopy import crs as ccrs
import cmocean

from funcs_support import get_params,area_mean,utility_print
# Project directory lookup from funcs_support; `dir_list['raw']` is used below
# as the root directory under which each model's downloaded file is written
dir_list = get_params()

In [2]:
# Variable name as it appears in the IRI Data Library ('prec' = precipitation).
# NOTE(review): the download cell below hard-codes `.prec` in its URL and does
# not read `var` — confirm whether any later cell relies on it
var = 'prec'

In [3]:
from distributed import Client
# Start a dask client with default settings; with no arguments this spins up
# (or attaches to) a local cluster, as summarized in the cell output
client = Client()
# Render the client/cluster summary (dashboard link, workers, memory)
display(client)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37740 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:37740/status,

0,1
Dashboard: http://127.0.0.1:37740/status,Workers: 8
Total threads: 48,Total memory: 503.37 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41118,Workers: 0
Dashboard: http://127.0.0.1:37740/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:39026,Total threads: 6
Dashboard: http://127.0.0.1:46816/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:45698,
Local directory: /tmp/dask-scratch-space/worker-s408gtcc,Local directory: /tmp/dask-scratch-space/worker-s408gtcc

0,1
Comm: tcp://127.0.0.1:43069,Total threads: 6
Dashboard: http://127.0.0.1:43845/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:40812,
Local directory: /tmp/dask-scratch-space/worker-7da5nzjq,Local directory: /tmp/dask-scratch-space/worker-7da5nzjq

0,1
Comm: tcp://127.0.0.1:37080,Total threads: 6
Dashboard: http://127.0.0.1:36255/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:43869,
Local directory: /tmp/dask-scratch-space/worker-9tk6ygn1,Local directory: /tmp/dask-scratch-space/worker-9tk6ygn1

0,1
Comm: tcp://127.0.0.1:46529,Total threads: 6
Dashboard: http://127.0.0.1:40093/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:40296,
Local directory: /tmp/dask-scratch-space/worker-mremj1v6,Local directory: /tmp/dask-scratch-space/worker-mremj1v6

0,1
Comm: tcp://127.0.0.1:46510,Total threads: 6
Dashboard: http://127.0.0.1:36683/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:37791,
Local directory: /tmp/dask-scratch-space/worker-4j85ih40,Local directory: /tmp/dask-scratch-space/worker-4j85ih40

0,1
Comm: tcp://127.0.0.1:43627,Total threads: 6
Dashboard: http://127.0.0.1:35662/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:42646,
Local directory: /tmp/dask-scratch-space/worker-pyyh4fpp,Local directory: /tmp/dask-scratch-space/worker-pyyh4fpp

0,1
Comm: tcp://127.0.0.1:36612,Total threads: 6
Dashboard: http://127.0.0.1:32920/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:42409,
Local directory: /tmp/dask-scratch-space/worker-3cajti4n,Local directory: /tmp/dask-scratch-space/worker-3cajti4n

0,1
Comm: tcp://127.0.0.1:42266,Total threads: 6
Dashboard: http://127.0.0.1:34381/status,Memory: 62.92 GiB
Nanny: tcp://127.0.0.1:45842,
Local directory: /tmp/dask-scratch-space/worker-sr5anmm2,Local directory: /tmp/dask-scratch-space/worker-sr5anmm2


In [4]:
# NMME models to process; each name is inserted verbatim into the IRI Data
# Library URL and into the output directory / filename
mods = [
    'CanCM4i',
    'CanSIPS-IC3',
    'CanSIPSv2',
    'CMC1-CanCM3',
    'CMC2-CanCM4',
    'GEM-NEMO',
    'GFDL-SPEAR',
    'NASA-GEOSS2S',
    'NCAR-CESM1',
    'NCEP-CFSv2',
]

In [5]:
# Translation from IRI Data Library dimension/variable names to the names used
# in the rest of this notebook. Both 'S' and 'T' map to 'forecast_time'; only
# the keys actually present in a dataset are applied when renaming.
rename_dict = dict(
    Y='lat',
    X='lon',
    S='forecast_time',
    L='lead',
    M='run',
    T='forecast_time',
    prec='pr',
)

# Alternative, currently disabled configuration (regional box, all leads kept):
#subset_params = {'lat':slice(-3,12.5),'lon':slice(32,55)}; suffix = '_HoAfrica'; lead0 = False

# Active configuration: 40S-40N band at all longitudes, first lead only.
# `suffix` is appended to the output filename; `lead0` switches the lead subset.
subset_params = dict(lat=slice(-40, 40), lon=slice(-180, 180))
suffix = '_40to40lead0'
lead0 = True

In [6]:
# Set to True to re-download and rewrite output files that already exist
overwrite=False

# Processed dataset for each model, keyed by model name
dss = dict()

for mod in mods:
    print('processing model '+mod)
    # OPeNDAP endpoint of this model's monthly precipitation hindcasts
    load_dir = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.'+mod+'/.HINDCAST/.MONTHLY/.prec/dods'
    
    # Open lazily with dask chunks; time decoding is switched off because the
    # "months since" axis is converted by hand below
    dss[mod] = xr.open_dataset(load_dir,
                               decode_times=False,
                               chunks='auto')
    
    # Rename to standards (only the keys of rename_dict present in this
    # dataset), then pass through xagg's fix_ds
    dss[mod] = xa.fix_ds(dss[mod].rename(**{k:v for k,v in rename_dict.items() if k in dss[mod]}))

    # Date is in months since 1960-01-01
    # It's "Forecast Start Time (forecast_reference_time)" in CanCM4i, i.e. the
    # start (initialization) date of each forecast
    dss[mod]['forecast_time'] = [pd.DatetimeIndex(['1960-01-01'])[0] + relativedelta(months=t.values.item()) for t in dss[mod]['forecast_time']]
    
    # Spatial subset
    dss[mod] = dss[mod].sel(**subset_params)
    # Lead subset (added 25/10/08): keep only the first lead, as a length-1
    # dimension so that the code below can still loop over `lead`
    if lead0:
        dss[mod] = dss[mod].isel(lead = [0])
    
    # Clarify a few things
    if 'lead' in dss[mod]:
        dss[mod]['lead'].attrs['units'] = 'month'
    dss[mod]['pr'].attrs['units'] = 'mm/day'
    
    if 'run' in dss[mod]:
        dss[mod]['run'] = dss[mod]['run'].astype(int)
        
    # Get time that the forecast is actually for: forecast start plus the
    # whole-month part of the lead, plus 15 days
    dst = dss[mod][['forecast_time','lead']].stack(tvars = ['forecast_time','lead'])
    dst['time'] = (('tvars'),[ft + relativedelta(months=np.floor(lt),days=15) for ft,lt in zip(pd.DatetimeIndex(dst.tvars.forecast_time),
                     dst.lead)])

    dss[mod]['time'] = dst.unstack()['time']
    
    # Get date string (first valid month + '01' to last valid month + '31')
    # NOTE(review): the end day is hard-coded to '31', which yields invalid
    # dates for shorter months (e.g. '20211131'). Left unchanged on purpose:
    # the existence check below, and files already on disk, depend on this name.
    datestr = (re.sub(r'\-','',str(np.unique(dss[mod].time)[0])[0:8])+'01-'+
               re.sub(r'\-','',str(np.unique(dss[mod].time)[-1])[0:8]+'31'))
    
    # Get output filename
    output_dir = dir_list['raw']+mod+'/'
    output_fn = output_dir+'pr_Amon_'+mod+'_hindcasts_NMME_'+datestr+suffix+'.nc'

    if (overwrite) or (not os.path.exists(output_fn)):
        # Load data into memory (this is the actual download)
        dss[mod] = dss[mod].load()

        # Change coordinate system to time instead of forecast time.
        # Pre-allocate a NaN-filled array on the axis of unique valid times.
        valid_times = np.unique(dss[mod]['time'])
        other_dims = ['lead','run','lat','lon']
        ds_tmp = xr.Dataset(data_vars = {'pr':(['time',*other_dims],
                                               np.full([len(valid_times),
                                                        *[dss[mod].sizes[k] for k in other_dims]],
                                                       np.nan))},
                            coords = {'time':(['time'],valid_times),
                                      **{k:([k],dss[mod][k].values) for k in other_dims}})

        # Copy every (lead, forecast start) slice to the slot of its valid time
        for lead in tqdm(dss[mod].lead):
            for ftime in dss[mod].forecast_time:
                # Select once and reuse for both the target time and the data
                fcst = dss[mod].sel(lead=lead,forecast_time=ftime)
                ds_tmp['pr'].loc[{'lead':lead,
                                  'time':fcst['time']}] = fcst['pr']

        dss[mod] = ds_tmp

        # Export as new file; makedirs also creates missing parent directories
        if not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)
            print(output_dir+' created!')

        dss[mod].attrs['SOURCE'] = 'download_nmme'
        dss[mod].attrs['DESCRIPTION'] = 'NMME downloaded from IRI DL, standardized, and re-index to time that forecast is _for_, not time that it is forecasted.'

        if os.path.exists(output_fn):
            os.remove(output_fn)
            print(output_fn+' removed to allow overwrite!')
        
        dss[mod].to_netcdf(output_fn)
        print(output_fn+' saved!')
    else:
        # Already processed: replace the remote handle with the file on disk
        dss[mod] = xr.open_dataset(output_fn)
        print(output_fn+' already exists, loaded!')

processing model CanCM4i
/dx06/data/climate_raw/CanCM4i/pr_Amon_CanCM4i_hindcasts_NMME_19810101-20181231_40to40lead0.nc already exists, loaded!
processing model CanSIPS-IC3
/dx06/data/climate_raw/CanSIPS-IC3/pr_Amon_CanSIPS-IC3_hindcasts_NMME_19800101-20211131_40to40lead0.nc already exists, loaded!
processing model CanSIPSv2
/dx06/data/climate_raw/CanSIPSv2/pr_Amon_CanSIPSv2_hindcasts_NMME_19810101-20181231_40to40lead0.nc already exists, loaded!
processing model CMC1-CanCM3
/dx06/data/climate_raw/CMC1-CanCM3/pr_Amon_CMC1-CanCM3_hindcasts_NMME_19810101-20101231_40to40lead0.nc already exists, loaded!
processing model CMC2-CanCM4
/dx06/data/climate_raw/CMC2-CanCM4/pr_Amon_CMC2-CanCM4_hindcasts_NMME_19810101-20101231_40to40lead0.nc already exists, loaded!
processing model GEM-NEMO
/dx06/data/climate_raw/GEM-NEMO/pr_Amon_GEM-NEMO_hindcasts_NMME_19810101-20181231_40to40lead0.nc already exists, loaded!
processing model GFDL-SPEAR
/dx06/data/climate_raw/GFDL-SPEAR/pr_Amon_GFDL-SPEAR_hindcasts_

In [7]:
# Show the per-model datasets (dict keyed by model name) as a quick sanity check
dss

{'CanCM4i': <xarray.Dataset> Size: 1GB
 Dimensions:  (time: 456, lead: 1, run: 10, lat: 81, lon: 360)
 Coordinates:
   * time     (time) datetime64[ns] 4kB 1981-01-16 1981-02-16 ... 2018-12-16
   * lead     (lead) float32 4B 0.5
   * run      (run) int64 80B 1 2 3 4 5 6 7 8 9 10
   * lat      (lat) float32 324B -40.0 -39.0 -38.0 -37.0 ... 37.0 38.0 39.0 40.0
   * lon      (lon) float32 1kB -180.0 -179.0 -178.0 -177.0 ... 177.0 178.0 179.0
 Data variables:
     pr       (time, lead, run, lat, lon) float64 1GB ...
 Attributes:
     SOURCE:       download_nmme
     DESCRIPTION:  NMME downloaded from IRI DL, standardized, and re-index to ...,
 'CanSIPS-IC3': <xarray.Dataset> Size: 2GB
 Dimensions:  (time: 503, lead: 1, run: 20, lat: 81, lon: 360)
 Coordinates:
   * time     (time) datetime64[ns] 4kB 1980-01-16 1980-02-16 ... 2021-11-16
   * lead     (lead) float32 4B 0.5
   * run      (run) int64 160B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
   * lat      (lat) float32 324B -40.0