# Download and preprocess NMME

Sample file characteristics: http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.CanCM4i/.FORECAST/.MONTHLY/.prec/

"The forecast starts occur at the beginning of a month of the year, and adding together the forecast start time and the lead time (3-month seasonal lead) determines the season for which the forecast is valid." from https://iridl.ldeo.columbia.edu/maproom/Global/Forecasts/NMME/Seasonal_Anomaly.html

In [None]:
import xarray as xr
import xagg as xa
import numpy as np
import pandas as pd
import datetime as dt
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm
import warnings

import os
import re
import glob
from matplotlib import pyplot as plt
from cartopy import crs as ccrs
import cmocean

from funcs_support import get_params,area_mean,subset_to_srat,utility_print
# Project settings returned by funcs_support.get_params(); the 'raw' entry is
# used further down as the root directory for the per-model output files
dir_list = get_params()

In [None]:
# Variable name as spelled in the source archive.
# NOTE(review): not referenced in the cells visible here - the download URL and
# rename_dict spell out 'prec' directly; confirm whether this is still needed
var = 'prec'

In [None]:
from distributed import Client
# Start dask client (no arguments are passed, so the library defaults apply)
client = Client()
# Show the client summary in the notebook output (display is not imported here;
# presumably supplied by the notebook environment)
display(client)

In [None]:
# Models to process; each name is inserted verbatim into the download URL
# and into the output directory / file name
mods = [
    'CanCM4i',
    'CanSIPS-IC3',
    'CanSIPSv2',
    'CMC1-CanCM3',
    'CMC2-CanCM4',
    'GEM-NEMO',
    'GFDL-SPEAR',
    'NASA-GEOSS2S',
    'NCAR-CESM1',
    'NCEP-CFSv2',
]

In [None]:
# Source name -> project name. Both 'S' and 'T' map to 'forecast_time';
# only the keys actually present in a given dataset are applied when renaming.
rename_dict = dict(
    Y='lat',
    X='lon',
    S='forecast_time',
    L='lead',
    M='run',
    T='forecast_time',
    prec='pr',
)

# Alternative regional configuration, kept for reference:
#subset_params = {'lat':slice(-3,12.5),'lon':slice(32,55)}; suffix = '_HoAfrica'; lead0 = False

# Active configuration:
#   subset_params - label-based lat/lon slices applied to every model
#   suffix        - appended to the output file name
#   lead0         - when True, only the first lead is kept
subset_params = dict(lat=slice(-40,40), lon=slice(-180,180))
suffix = '_40to40lead0'
lead0 = True

In [None]:
# Set to True to regenerate output files that already exist on disk
overwrite=False

# Processed dataset for each model, keyed by model name
dss = dict()

# For every model: open the remote hindcast, standardize names/coordinates,
# subset, compute the time each forecast is valid for, re-index the data onto
# that time axis, and save it as a netCDF file (or load the existing file).
for mod in mods:
    print('processing model '+mod)
    load_dir = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/.'+mod+'/.HINDCAST/.MONTHLY/.prec/dods'

    # Open lazily; times are left undecoded and converted by hand below
    dss[mod] = xr.open_dataset(load_dir,
                               decode_times=False,
                               chunks='auto')

    # Rename to standards (only the keys present in this dataset are renamed)
    dss[mod] = xa.fix_ds(dss[mod].rename(**{k:v for k,v in rename_dict.items() if k in dss[mod]}))

    # Date is in months since 1960-01-01
    # It's "Forecast Start Time (forecast_reference_time)" in CanCM4i. Not quite sure what that means yet
    origin = pd.Timestamp('1960-01-01')
    dss[mod]['forecast_time'] = [origin + relativedelta(months=t.values.item()) for t in dss[mod]['forecast_time']]

    # Subset
    dss[mod] = dss[mod].sel(**subset_params)
    # Lead subset (added 25/10/08)
    if lead0:
        dss[mod] = dss[mod].isel(lead = [0])

    # Clarify a few things
    if 'lead' in dss[mod]:
        dss[mod]['lead'].attrs['units'] = 'month'
    dss[mod]['pr'].attrs['units'] = 'mm/day'

    if 'run' in dss[mod]:
        dss[mod]['run'] = dss[mod]['run'].astype(int)

    # Get time that the forecast is actually for:
    # forecast start + whole months of lead (lead is floored) + 15 days
    dst = dss[mod][['forecast_time','lead']].stack(tvars = ['forecast_time','lead'])
    dst['time'] = (('tvars'),[ft + relativedelta(months=np.floor(lt),days=15) for ft,lt in zip(pd.DatetimeIndex(dst.tvars.forecast_time),
                     dst.lead)])

    dss[mod]['time'] = dst.unstack()['time']

    # Sorted unique valid times; computed once and reused for the file name
    # and for the time axis of the re-indexed output
    valid_times = np.unique(dss[mod]['time'])

    # Get date string: YYYYMM01-YYYYMM31 from the first and last valid time
    # NOTE: the trailing '31' is a fixed label, not the real last day of the month
    datestr = (str(valid_times[0])[0:8].replace('-','')+'01-'+
               str(valid_times[-1])[0:8].replace('-','')+'31')

    # Get output directory and filename
    out_dir = dir_list['raw']+mod+'/'
    output_fn = out_dir+'pr_Amon_'+mod+'_hindcasts_NMME_'+datestr+suffix+'.nc'

    if (overwrite) or (not os.path.exists(output_fn)):
        # Load data into memory
        dss[mod] = dss[mod].load()
        src = dss[mod]

        # Change coordinate system to time instead of forecast time:
        # start from an all-NaN array on (time, lead, run, lat, lon)
        out_dims = ['lead','run','lat','lon']
        ds_tmp = xr.Dataset(data_vars = {'pr':(['time',*out_dims],
                                               np.full([len(valid_times),
                                                        *[src.sizes[k] for k in out_dims]],np.nan))},
                            coords = {'time':(['time'],valid_times),
                                      **{k:([k],src[k].values) for k in out_dims}})

        # Copy each (lead, forecast_time) field into the slot of its valid time
        for lead in tqdm(src.lead):
            for ftime in src.forecast_time:
                fcst = src.sel(lead=lead,forecast_time=ftime)
                ds_tmp['pr'].loc[{'lead':lead,
                                  'time':fcst['time']}] = fcst['pr']

        # ds_tmp is built from raw values, so the units set above would
        # otherwise be dropped from the saved file; carry them over explicitly
        for name in ['pr','lead']:
            if 'units' in src[name].attrs:
                ds_tmp[name].attrs['units'] = src[name].attrs['units']

        dss[mod] = ds_tmp

        # Export as new file
        if not os.path.exists(out_dir):
            # makedirs also creates missing parent directories and tolerates
            # the directory appearing between the check and the call
            os.makedirs(out_dir,exist_ok=True)
            print(out_dir+' created!')

        dss[mod].attrs['SOURCE'] = 'download_nmme'
        dss[mod].attrs['DESCRIPTION'] = 'NMME downloaded from IRI DL, standardized, and re-index to time that forecast is _for_, not time that it is forecasted.'

        if os.path.exists(output_fn):
            os.remove(output_fn)
            print(output_fn+' removed to allow overwrite!')

        dss[mod].to_netcdf(output_fn)
        print(output_fn+' saved!')
    else:
        # Reuse the previously processed file instead of downloading again
        dss[mod] = xr.open_dataset(output_fn)
        print(output_fn+' already exists, loaded!')