# Preprocess NDVI

Here we take our gridded NDVI data, summarise it over the ecoregions, interpolate each regional time series to daily values, then save the results as a pickle file.

The NDVI data was already smoothed and interpolated to bi-weekly in another notebook

We can then use the pickle file for extracting phenometrics in the next notebook.

We do this because daily NDVI over Australia is a huge amount of data, so it is better if we only do this once and save the results.

<!-- # current_debt = 615007+42778
# lvr = 0.8
# house_value = 790000
# amount_to_pay_off = current_debt - (lvr*house_value)
# amount_to_pay_off -->

In [None]:
import sys
import dask
import pickle
import warnings
import xarray as xr
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from odc.geo.xr import assign_crs
from odc.geo.geom import Geometry
%matplotlib inline

## Open a dask client

In [None]:
# Add the AusEFlux source directory to the import path so that `_utils`
# can be imported below (the append must happen before the import).
import sys
sys.path.append('/g/data/os22/chad_tmp/AusEFlux/src/')
from _utils import start_local_dask
# NOTE(review): start_local_dask is defined in the external `_utils` module;
# presumably it starts a local dask cluster and returns its client - confirm.
client = start_local_dask()
# Bare expression so the notebook shows the client as the cell output.
client

## Analysis Parameters


In [None]:
# NDVI product to process. Uncomment exactly one of the alternatives below;
# the chosen name is embedded in both the input and the output file names.
# product='AusENDVI-clim'
# product='AusENDVI-clim_MCD43A4'
# product='GIMMS-PKU'
# product='GIMMS-PKU_2022'
# product='GIMMSv1.1'
# product='GIMMSv1.2'
# product='MCD43A4'
# product='Landsat'
product = 'AusENDVI-clim_2000'

# Input: NetCDF file holding the smoothed NDVI for the selected product.
ds_path = f'/g/data/os22/chad_tmp/Aus_phenology/data/NDVI/NDVI_smooth_{product}.nc'

# Output: pickle file that will hold the per-region time series.
# save_file = '/g/data/os22/chad_tmp/Aus_phenology/data/IBRA_regions_NDVI_timeseries.pkl'
save_file = f'/g/data/os22/chad_tmp/Aus_phenology/data/pickle/IBRA_subregions_NDVI_{product}.pkl'

# Polygons used for the zonal summary (IBRA subregions; regions variant kept for reference).
# ecoregions_file = '/g/data/os22/chad_tmp/Aus_phenology/data/vectors/IBRAv7_regions_modified.geojson'
ecoregions_file = '/g/data/os22/chad_tmp/Aus_phenology/data/vectors/IBRAv7_subregions_modified.geojson'

# Attribute column whose values become the keys of the results dictionary.
# var='REG_NAME_7'
var = 'SUB_NAME_7'

## Open data

## Load ecoregions

In [None]:
# Read the ecoregion polygons from `ecoregions_file` into a GeoDataFrame;
# this eager copy is the one iterated over when building the delayed tasks.
gdf = gpd.read_file(ecoregions_file)

## Summarise NDVI over ecoregions

### Parallelisation version using "dask.delayed"

In [None]:
# Decorate the function so that each call builds a lazy dask task
@dask.delayed
def zonal_timeseries(index, ds, gdf, var):
    """
    Build a daily, spatially averaged NDVI time series for one polygon.

    The data are masked to the polygon, averaged over 'latitude' and
    'longitude', stripped of all-NaN time steps, and then resampled to
    daily steps using quadratic interpolation.

    Parameters
    ----------
    index : int
        Positional row number of the polygon in `gdf` (used with `.iloc`).
    ds : xarray object
        Gridded NDVI with 'latitude', 'longitude' and 'time' dimensions.
        It is tagged with CRS EPSG:4326 before masking.
    gdf : geopandas.GeoDataFrame
        Polygons; the geometry at row `index` and `gdf.crs` are used.
    var : str
        Not used inside this function; kept so existing callers still work.

    Returns
    -------
    xarray object or float
        The daily time series, or `np.nan` when the polygon has no valid
        NDVI or the interpolation fails.
    """
    ds = assign_crs(ds, crs='EPSG:4326')
    geom = Geometry(geom=gdf.iloc[index].geometry, crs=gdf.crs)
    yy = ds.odc.mask(poly=geom)
    yy = yy.dropna(dim='longitude',
          how='all').dropna(dim='latitude', how='all')

    #summarise into 1d timeseries
    yy = yy.mean(['latitude', 'longitude'])

    try:
        yy = yy.dropna(dim='time', how='all')

        # Polygons without any valid NDVI (e.g. small islands) leave an
        # empty time axis; report them with the same NaN sentinel instead
        # of relying on the resample call to raise.
        if yy.sizes['time'] == 0:
            return np.nan

        # ---Interpolate to daily with quadratic function-------
        yy = yy.resample(time='1D').interpolate(kind='quadratic')
    except Exception:
        # Best effort: any other failure also yields the NaN sentinel that
        # is filtered out downstream. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        return np.nan

    return yy

# Open the datasets lazily: wrapping the readers in dask.delayed turns the
# file reads into tasks that are passed as inputs to each zonal_timeseries task.
dss = dask.delayed(xr.open_dataarray)(ds_path)
gdff = dask.delayed(gpd.read_file)(ecoregions_file)

results = {}
# Lazily loop through polygons; nothing is computed until dask.compute runs.
# zonal_timeseries selects its row with `.iloc` (positional), so pass the
# row's position from enumerate rather than the index label from iterrows.
# The two only coincide when the GeoDataFrame has a default 0..n-1 index.
for index, (_, row) in enumerate(gdf.iterrows()):
    # Key each delayed result by the region name stored in column `var`.
    results[row[var]] = zonal_timeseries(index, dss, gdff, var)

In [None]:
%%time
# Run the delayed graph and bring everything into memory. dask.compute
# returns a tuple with one entry per argument, so unpack the single item.
(results,) = dask.compute(results)

# Drop the regions whose task returned the plain-float NaN placeholder
# instead of a time series.
results = {
    name: series
    for name, series in results.items()
    if type(series) is not float
}

## Save data

In [None]:
# Serialise the dictionary of per-region time series to `save_file`.
with open(save_file, mode='wb') as out_file:
    pickle.dump(results, out_file)

### Serialised version

In [None]:
# %%time
# results={}
# i=0
# for index, row in gdf[0:20].iterrows():
#     print("Feature {:02}/{:02}\r".format(i + 1, len(range(0, len(gdf)))), end="")
    
#     #clip to ecoregion
#     geom = Geometry(geom=row.geometry, crs=gdf.crs)
#     xx = ds_smooth.odc.mask(poly=geom)
#     xx = xx.dropna(dim='longitude', how='all').dropna(dim='latitude', how='all')
    
#     # #summarise into 1d timeseries
#     xx = xx.mean(['latitude', 'longitude'])

#     #handle case where islands have no NDVI data
#     if np.isnan(xx).sum() == len(xx.time):
#         i+=1
#         continue
#     # ---Interpolate to daily with quadratic function-------
#     xx = xx.dropna(dim='time', how='all').resample(time='1D').interpolate(kind='quadratic')
    
#     results[row[var]] = xx
    
    # i+=1

## Using "multiprocess"

This works, and it's fast, but it was hanging at the end with only a few polygons left to complete. The dask.delayed approach seems more robust, but maybe a little slower.

In [None]:
# import multiprocess as mp
# from tqdm import tqdm
# # import geopandas as gpd
# # import sys
# # from odc.geo.geom import Geometry

# def zonal_timeseries(index, var, results):
    
#     path='/g/data/os22/chad_tmp/Aus_phenology/data/NDVI_smooth.nc'
#     ds = assign_crs(xr.open_dataarray(path), crs='EPSG:4326')
#     gdff = gpd.read_file(ecoregions_file)
    
#     geom = Geometry(geom=gdff.iloc[index].geometry, crs=gdff.crs)
#     yy = ds.odc.mask(poly=geom)
#     yy = yy.dropna(dim='longitude',
#           how='all').dropna(dim='latitude', how='all')

#     #summarise into 1d timeseries
#     yy = yy.mean(['latitude', 'longitude'])
    
#     try:
#     # ---Interpolate to daily with quadratic function-------
#         yy = yy.dropna(dim='time',
#             how='all').resample(time='1D').interpolate(kind='quadratic')
        
#         results[gdff.iloc[index][var]] = yy
        
#     except:
#         results[gdff.iloc[index][var]] = np.nan

# # parallel function for above function
# def _parallel_fun(var, gdf, ncpus):

#     manager = mp.Manager()
#     results = manager.dict()

#     # progress bar
#     pbar = tqdm(total=len(gdf))

#     def update(*a):
#         pbar.update()

#     with mp.Pool(ncpus) as pool:
#         for index, row in gdf.iterrows():
#             pool.apply_async(
#                 zonal_timeseries,
#                 [index, var, results],
#                 callback=update,
#             )
                
#         pool.close()
#         pool.join()
#         pbar.close()
            
#     return results

# %%time
# results = _parallel_fun(var, gdf, ncpus=22)
# results = results._getvalue()