# Collecting various gridded datasets to use as predictors in a NEE model

- ANU climate data
- MODIS LAI
- Soil moisture from GRAFS

Results are saved to `/g/data/os22/chad_tmp/NEE_modelling/results/input_data/input_data_<YYYY>.nc`

In [None]:
# !pip install xarray
# !pip install datacube
# !pip install --extra-index-url="https://packages.dea.ga.gov.au" \
#   odc-algo

# !pip install rioxarray
# !pip install odc-geo
# !pip install dea_tools
# !pip install joblib
# !pip install tqdm
# !pip install geopandas
# !pip install -U scikit-learn
# !pip install dask-ml

In [None]:
import os
import odc.geo.xr
import xarray as xr
import numpy as np
import pandas as pd
from odc.algo import xr_reproject
from odc.geo.xr import assign_crs

import sys
sys.path.append('/g/data/os22/chad_tmp/NEE_modelling/')
from collect_gridded_data import collect_gridded_data

In [None]:
# Start a local Dask cluster; the chunked xarray operations in this notebook
# are evaluated through it.
from datacube.utils.dask import start_local_dask
client = start_local_dask(mem_safety_margin='2Gb')
# Bare expression so the notebook renders the client summary.
client

## Analysis Parameters

In [None]:
# Period to collect. This is a plain string (the original parentheses did not
# create a tuple) and it is handed to collect_gridded_data unchanged.
time = '2015'

In [None]:
%%time
# Run the project's collection routine (imported from collect_gridded_data)
# for the period chosen in `time`; the cell magic reports the wall time.
collect_gridded_data(time=time)

---
---
# Collecting and processing various datasets

## Resampling GRAFS and LAI to monthly

In [None]:
# client = start_local_dask(mem_safety_margin='1Gb')
# client

In [None]:
# --- GRAFS surface soil moisture: annual files -> monthly means --------------
grafs_years = range(2018, 2022)
for i, y in enumerate(grafs_years):
    print(" {:03}/{:03}\r".format(i + 1, len(grafs_years)), end="")

    year = str(y)

    sws = xr.open_dataset('/g/data/fj4/SatelliteSoilMoistureProducts/S-GRAFS/ANNUAL_NC/surface_soil_moisture_vol_1km_'+year+'.nc',
                      chunks=dict(lat=1000, lon=1000))
    # the CRS string is read from the last nine characters of the file's 'crs' attribute
    sws = assign_crs(sws, crs=sws.attrs['crs'][-9:])
    # work on the soil_moisture DataArray from here on, so the mask and the
    # resample both act on the same object
    sws = sws.soil_moisture
    sws = sws.where(sws >= 0)  # mask negative values
    # monthly mean, labelled 14 days after the start of each month
    sws = sws.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean().compute()
    sws = sws.rename({'lat':'latitude', 'lon':'longitude'})
    sws.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/GRAFS/GRAFS_1km_monthly_'+year+'.nc')

# --- MODIS LAI: annual files -> monthly means --------------------------------
base = '/g/data/ub8/au/MODIS/mosaic/MOD15A2H.006/'
lai_years = range(2016, 2022)
for i, y in enumerate(lai_years):
    print(" {:03}/{:03}\r".format(i + 1, len(lai_years)), end="")

    year = str(y)

    lai = xr.open_dataset(base+'MOD15A2H.006.b02.500m_lai.'+year+'.nc',
                          chunks=dict(latitude=1000, longitude=1000))
    lai = assign_crs(lai, crs='epsg:4326')
    lai = lai['500m_lai'].rename('lai') #tidy up the dataset
    lai = lai.where((lai <= 10) & (lai >=0)) #remove artefacts and 'no-data'
    lai = lai.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean().compute() # resample to monthly
    lai.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/LAI/LAI_500m_monthly_'+year+'.nc')

## Stitch together GO-SIF

In [None]:
import xarray as xr
import rioxarray
from odc.geo.xr import assign_crs
import pandas as pd
import numpy as np
import os

In [None]:
base='/g/data/os22/chad_tmp/NEE_modelling/data/SIF/GOSIF/'

files = os.listdir(base)
# Only GeoTIFFs are processed; filtering up front keeps the progress counter
# honest, and sorting makes the processing order reproducible.
tifs = sorted(f for f in files if f.endswith('.tif'))
if not tifs:
    raise FileNotFoundError('no .tif files found in ' + base)

arrs=[]
for i, f in enumerate(tifs):
    print(" Tif {:03}/{:03}\r".format(i + 1, len(tifs)), end="")
    # year and month are sliced from fixed positions at the end of the file name
    y=f[-12:-8]
    m=f[-6:-4]
    sif = rioxarray.open_rasterio(base+f).squeeze().drop_vars('band')
    sif = assign_crs(sif, crs='epsg:4326')
    # Time label is the month start plus 14 days. It is kept in `stamp` so the
    # notebook-level `time` analysis parameter is not overwritten.
    stamp = pd.date_range(np.datetime64(y+'-'+m), periods=1, freq="MS") + pd.Timedelta(14, 'd')
    sif = sif.expand_dims(time=stamp)
    sif = sif.where(sif < 32766) # clean up dataset: values of 32766 and above are masked
    sif = sif.sel(x=slice(110,155)).sel(y=slice(-8,-45)).astype('float32') #Australia only
    arrs.append(sif)

sif = xr.concat(arrs, dim='time').sortby('time')
sif = xr.where(sif < 0, 0, sif) #replace -ve values with 0
sif.name = 'SIF'
sif.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/SIF/GOSIF_2000_2020.nc')

## MODIS Land surface temp

In [None]:
# Full paths of every entry in the MYD11A1 mosaic folder whose name does not
# contain 'QC'.
base = '/g/data/ub8/au/MODIS/mosaic/MYD11A1.006/'
files = os.listdir(base)
paths = [base + name for name in files if 'QC' not in name]

In [None]:
# Aggregate each listed LST file to monthly means and write one file per year.
# NOTE(review): assumes every entry in `paths` is a yearly file that holds an
# `LST_Day_1km` variable and is named '...<YYYY>.nc' — confirm the listing.
for p in paths:
    y=p[-7:-3] #year: the four characters before the '.nc' suffix
    # open the file for *this* iteration (previously the 2020 file was opened
    # every time, so every output year contained 2020 data)
    lst = xr.open_dataset(p, chunks=dict(latitude=1000, longitude=1000))
    lst = assign_crs(lst, crs='epsg:4326')
    # select the variable first so only it is resampled
    lst = lst.LST_Day_1km.rename('LST')
    # monthly mean, labelled 14 days after the start of each month
    lst = lst.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean().compute()
    lst.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/LST/LST_'+y+'.nc')
    

## MODIS fPAR from GEE

https://github.com/aazuspan/wxee/blob/main/docs/examples/image_collection_to_xarray.ipynb

https://github.com/aazuspan/wxee/blob/main/docs/examples/modis.ipynb

In [None]:
# !pip install earthengine-api
# !pip install wxee

In [None]:
import ee
import wxee
import xarray as xr
import rioxarray
from odc.geo.xr import assign_crs
import pandas as pd
import numpy as np
import os
from odc.algo import xr_reproject

In [None]:
# Trigger the authentication flow.
ee.Authenticate()

# Initialize the library.
ee.Initialize()

In [None]:
# Approximate Australian extent, deliberately a little smaller than the
# continent (original note: "to trick google into giving me 1km res").
west, east = 114, 153.0
south, north = -43.0, -10.0

# Closed ring: the first corner is repeated as the last vertex.
ring = [
    [west, south],
    [east, south],
    [east, north],
    [west, north],
    [west, south],
]
region = ee.Geometry.Polygon([ring])

### Loop through months and resample 4-day fPAR MODIS data

In [None]:
# Use start and end dates to loop through months and load GEE FPAR data.
# `start` holds the first day of each month. `end` holds the first day of the
# *following* month, because filterDate treats its end date as exclusive; a
# month-end date here would silently drop images acquired on the last day.
start = pd.date_range(start='7/1/2002', end='12/1/2021', freq='MS')
end = pd.date_range(start='8/1/2002', end='1/1/2022', freq='MS')

fpar_dir = '/g/data/os22/chad_tmp/NEE_modelling/data/FPAR/'

# Grid that every monthly layer is reprojected onto. It does not change between
# iterations, so it is opened once, and a missing file fails immediately
# instead of producing one 'fail' line per month.
lst = xr.open_dataarray('/g/data/os22/chad_tmp/NEE_modelling/data/LST/LST_2002.nc').isel(time=0)

for i, (s, e) in enumerate(zip(start, end)):

    print(" {:03}/{:03}\r".format(i + 1, len(start)), end="")

    # the output is labelled month start + 14 days; skip months already on disk
    ss = s+pd.Timedelta(14, 'd')
    if os.path.isfile(fpar_dir+'FPAR_'+ss.strftime('%Y-%m-%d')+'.nc'):
        continue

    s = s.strftime('%Y-%m-%d')
    e = e.strftime('%Y-%m-%d')

    try:
        #download data from GEE
        ts = wxee.TimeSeries("MODIS/061/MCD15A3H").filterDate(s, e)
        ts = ts.select(["Fpar"])
        ds = ts.wx.to_xarray(region=region, scale=1000, crs="EPSG:3577", progress=False) #download at 1km res

        attrs=ds.attrs #extract attributes so we don't lose them
        ds = assign_crs(ds, crs='epsg:3577') #add geobox
        ds = ds.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean() #resample to monthly
        ds = ds.Fpar #convert to dataarray

        ds = xr_reproject(ds, geobox=lst.geobox, resampling='bilinear') #reproject
        ds = ds.where(ds!=0) # remove spurious zeros from reprojection
        ds = ds.assign_attrs(attrs) #add back attrs
        ds.to_netcdf(fpar_dir+'FPAR_'+np.datetime_as_string(ds.time.values[0], unit='D')+'.nc')

    except Exception as err:
        # Best effort: report the failed month and carry on with the next one.
        # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print('fail:', s, e, repr(err))

### Stitch together monthly files into annual netcdfs

In [None]:
# Combine the monthly FPAR files of each year into a single annual netCDF.
base='/g/data/os22/chad_tmp/NEE_modelling/data/FPAR/'
fpar_years = range(2002, 2022)
for i, y in enumerate(fpar_years):
    # the denominator now comes from the same range the loop iterates over
    print(" {:03}/{:03}\r".format(i + 1, len(fpar_years)), end="")

    year = str(y)
    # monthly files are written as FPAR_<YYYY-MM-DD>.nc, so match on that prefix
    files = sorted(base+f for f in os.listdir(base)
                   if f.startswith('FPAR_'+year) and f.endswith('.nc'))
    if not files:
        # xr.concat fails on an empty list; report the gap and move on
        print('no monthly FPAR files for', year)
        continue

    dss = [xr.open_dataset(f) for f in files]
    try:
        data = xr.concat(dss, dim='time').sortby('time')
        data.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/FPAR_annual/FPAR_MODIS_1km_'+year+'.nc')
    finally:
        # release the file handles once the annual file is written
        for ds in dss:
            ds.close()

## Himawari 8 Solar

/g/data/fj8/BoM/AWRA/DATA/CLIMATE/solar_exposure_day

## Standardised Precipitation Index (SPI)

In [None]:
# !pip install zarr
# !pip install climate_indices
# from climate_indices import compute,indices

### Create long time-series of rainfall

In [None]:
# Build one continuous monthly rainfall series (2002-2020) from the per-year
# ANUClimate folders and write it out as input for the SPI calculation.
base='/g/data/gh70/ANUClimate/v2-0/stable/month/'
years = [str(i) for i in range(2002,2021)]

pp = []
for y in years:
    year_dir = base+'rain/'+y+'/'
    # sorted so the files are opened in a reproducible order
    precip = xr.open_mfdataset([year_dir+i for i in sorted(os.listdir(year_dir))],
                              chunks=dict(lat=1000, lon=1000))
    precip = assign_crs(precip, crs='epsg:4283') #GDA94
    precip = precip.drop_vars('crs').rain
    # precip = precip.rename({'lat':'latitude', 'lon':'longitude'})
    pp.append(precip)

precip = xr.concat(pp, dim='time').sortby('time')
# the 'crs' variable was dropped above, so remove the attribute pointing at it;
# pop() tolerates the attribute being absent
precip.attrs.pop('grid_mapping', None)
precip.attrs['units'] = 'mm'
precip = precip.to_dataset()
# NOTE(review): this writes a zarr store to a path ending in '.nc', while the
# SPI command in the next cell reads a netCDF called
# 'ANUClim_rainfall_2002_2020.nc' — confirm which file/format is intended.
precip.to_zarr('/g/data/os22/chad_tmp/NEE_modelling/data/SPI/rainfall_2002_2020.nc')

### Use `climate_indices` command line to calculate SPI

In [None]:
# scales= < 6 produces NaNs in some places
# NOTE(review): this run uses --scales 12, whereas the following cell opens
# 'ANUClim_spi_gamma_06.nc' — confirm which scale is intended.
import os
import subprocess

# Run the climate_indices command-line tool. check=True raises
# CalledProcessError on a non-zero exit status, so a failed run is not
# silently ignored (os.system discarded the return code).
subprocess.run(
    [
        "process_climate_indices",
        "--index", "spi",
        "--periodicity", "monthly",
        "--netcdf_precip", "/g/data/os22/chad_tmp/NEE_modelling/data/SPI/ANUClim_rainfall_2002_2020.nc",
        "--var_name_precip", "rain",
        "--output_file_base", "/g/data/os22/chad_tmp/NEE_modelling/data/SPI/ANUClim",
        "--scales", "12",
        "--calibration_start_year", "2002",
        "--calibration_end_year", "2020",
        "--multiprocessing", "all",
    ],
    check=True,
)

In [None]:
# Load the 6-month SPI output and keep only the 2019 time steps.
spi6 = xr.open_dataset('/g/data/os22/chad_tmp/NEE_modelling/data/SPI/ANUClim_spi_gamma_06.nc')
spi6 = spi6.sel(time='2019')

In [None]:
# One panel per time step, four panels per row; the trailing semicolon
# suppresses the notebook's text output.
spi6['spi_gamma_06'].plot.imshow(col='time', col_wrap=4, cmap='RdBu');

In [None]:
# In-memory xarray processing of SPI; this works but is slow (but so is the command line...)

# precip_stacked = precip.stack(point=('latitude','longitude')).groupby('point') #.chunk(dict(time=-1, point=1000000))

# # apply SPI to each `point`
# scale = 6
# distribution = indices.Distribution.gamma
# data_start_year = 2000
# calibration_year_initial = 2000
# calibration_year_final = 2000
# periodicity = compute.Periodicity.monthly

# da_spi = xr.apply_ufunc(indices.spi,
#                         precip_stacked,
#                         scale,
#                         distribution,
#                         data_start_year,
#                         calibration_year_initial,
#                         calibration_year_final,
#                         periodicity,
#                         input_core_dims=[["time", "point"],[],[],[],[],[],[]],
#                         output_core_dims=[["time"]],
#                         # dask='allowed'
#                        )

# unstack the array back into original dimensions
# da_spi = da_spi.unstack('point')

## AWRA Climate

/g/data/fj8/BoM/AWRA/DATA/CLIMATE

In [2]:
import xarray as xr
import pandas as pd
from matplotlib import pyplot as plt
import os
from odc.geo.xr import assign_crs

In [None]:
# Monthly mean solar exposure from the daily AWRA climate files.
var='solar_exposure_day'

aa = []
for y in range(2000, 2022):
    year = str(y)
    ds = xr.open_dataset('/g/data/fj8/BoM/AWRA/DATA/CLIMATE/'+var+'/'+var+'_'+year+'.nc')
    # 'epsg:4236' in the original looks like a transposition of 4326, the
    # geographic CRS used for the other lat/lon grids in this notebook
    ds = assign_crs(ds, crs='epsg:4326')
    # presumably the source is daily exposure in MJ/m2 (0.0864 MJ/m2/day == 1 W/m2) — TODO confirm
    ds = ds / 0.0864 #convert to W/m2
    # monthly mean, labelled 14 days after the start of each month
    ds = ds.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean()
    aa.append(ds)

ds = xr.concat(aa, dim='time').sortby('time')
# NOTE(review): the loop covers 2000-2021 although the file name says 2022
ds.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/AWRA_Climate/solar_monthly_wm2_2000_2022.nc')

In [16]:
# Monthly rainfall totals from the daily AWRA climate files.
var='rain_day'

aa = []
for y in range(1991, 2022):
    year = str(y)
    ds = xr.open_dataset('/g/data/fj8/BoM/AWRA/DATA/CLIMATE/'+var+'/'+var+'_'+year+'.nc')
    # 'epsg:4236' in the original looks like a transposition of 4326, the
    # geographic CRS used for the other lat/lon grids in this notebook
    ds = assign_crs(ds, crs='epsg:4326')
    # monthly sum (not mean), labelled 14 days after the start of each month
    ds = ds.resample(time='MS', loffset=pd.Timedelta(14, 'd')).sum()
    aa.append(ds)

ds = xr.concat(aa, dim='time').sortby('time')
# NOTE(review): the loop covers 1991-2021 although the file name says 2022
ds.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/AWRA_Climate/rain_monthly_1991_2022.nc')

## FLUXCOM

Downloaded from https://www.bgc-jena.mpg.de/geodb/projects/DataDnld.php

Using RS-METEO driven by ERA5, ensemble of three ML models.

In [None]:
import xarray as xr
from matplotlib import pyplot as plt
import os

In [None]:
# Folder holding the downloaded FLUXCOM NEE files, and the names of its entries.
base = '/g/data/os22/chad_tmp/NEE_modelling/data/FLUXCOM/NEE/'
files = os.listdir(base)

In [None]:
# Open every FLUXCOM file as one dataset, drop the bounds variables, clip to
# the Australian extent and write the result to a single netCDF.
nee_paths = [base + name for name in files]
bounds_vars = ['lat_bnds','lon_bnds','time_bnds']

nee = xr.open_mfdataset(nee_paths).drop(bounds_vars) # clean up
# the latitude slice runs from -9 to -45, i.e. north to south
nee = nee.sel(lon=slice(110,155), lat=slice(-9,-45)) # clip to Aus
nee.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/FLUXCOM/nee_rs_meteo_era5.nc')

## Global Canopy Height



In [None]:
import rioxarray
import xarray as xr
from odc.algo import xr_reproject
from odc.geo.xr import assign_crs
from datacube.utils.dask import start_local_dask

# Start a local Dask cluster for the chunked reprojection below and print
# its summary.
client = start_local_dask(mem_safety_margin='2Gb')
print(client)

In [None]:
# Open the 2019 forest-height GeoTIFF as a single-band, dask-chunked DataArray.
ds = rioxarray.open_rasterio('/g/data/os22/chad_tmp/NEE_modelling/data/forest_height/Forest_height_2019_AUS.tif').squeeze().drop_vars('band')
# 'epsg:4236' in the original looks like a transposition of 4326; the label
# matters here because the array is reprojected onto another grid afterwards
ds = assign_crs(ds, crs='epsg:4326')
ds = ds.rename({'y':'latitude', 'x':'longitude'})
ds = ds.chunk(dict(latitude=1000,longitude=1000))

In [None]:
# Reference dataset whose geobox is the reprojection target (1 km resolution);
# only the first time step is kept.
lst = xr.open_dataset('/g/data/os22/chad_tmp/NEE_modelling/data/LST/LST_2019.nc')
lst = lst.isel(time=0)
lst = lst.chunk({'latitude': 1000, 'longitude': 1000})

In [None]:
# Resample the forest-height array onto the reference grid using 'average'
# resampling, then evaluate the result.
target_geobox = lst.geobox
ds = xr_reproject(ds, geobox=target_geobox, resampling='average')
ds = ds.compute()

In [None]:
# Name the array (this becomes the variable name in the file) and save it.
out_path = '/g/data/os22/chad_tmp/NEE_modelling/data/forest_height/Forest_height_1km_2019_AUS.nc'
ds = ds.rename('forest_height_AUS')
ds.to_netcdf(out_path)