# Collecting various gridded datasets to use as predictors in an NEE model

- ANU climate data
- MODIS LAI
- Soil moisture from GRAFS

Results are saved to `/g/data/os22/chad_tmp/NEE_modelling/results/input_data/input_data_<YYYY>.nc`

In [None]:
# !pip install xarray
# !pip install datacube
# !pip install --extra-index-url="https://packages.dea.ga.gov.au" \
#   odc-algo

# !pip install rioxarray
# !pip install odc-geo
# !pip install dea_tools
# !pip install joblib
# !pip install tqdm
# !pip install geopandas
# !pip install -U scikit-learn
# !pip install dask-ml

In [None]:
from datacube.utils.dask import start_local_dask
import sys
sys.path.append('/g/data/os22/chad_tmp/NEE_modelling/')
from collect_gridded_data import collect_gridded_data

In [None]:
# Start the Dask client used by the rest of the notebook via datacube's
# start_local_dask helper; mem_safety_margin='1Gb' is passed straight through.
client = start_local_dask(mem_safety_margin='1Gb')
# Print the client so its details appear in the cell output.
print(client)

# Disabled alternative: build the client from a dask_jobqueue SLURMCluster
# (16 cores, 1 process, 47GB) instead of the local cluster above.
# from dask.distributed import Client,Scheduler
# from dask_jobqueue import SLURMCluster
# cluster = SLURMCluster(cores=16,processes=1,memory="47GB")
# client = Client(cluster)
# cluster.scale(cores=16)
# print(client)

## Analysis Parameters

In [None]:
# Analysis period handed to collect_gridded_data: a single year, held as a
# plain string (not a tuple).
time = '2020'

In [None]:
%%time
# Run the gridded-data collection for the period held in `time` and keep the
# returned object in `data`; the %%time cell magic reports how long it takes.
data = collect_gridded_data(time=time)

---
---
# Collecting and processing various datasets

## Stitch together GO-SIF

In [None]:
import xarray as xr
import rioxarray
from odc.geo.xr import assign_crs
import pandas as pd
import numpy as np
import os

In [None]:
# Stitch the per-month GOSIF GeoTIFFs into a single time series clipped to
# Australia and save it as one netCDF file.
base='/g/data/os22/chad_tmp/NEE_modelling/data/SIF/GOSIF/'

# Only GeoTIFFs are processed, so the progress counter is based on those alone.
tifs = [f for f in os.listdir(base) if f.endswith('.tif')]
if not tifs:
    raise FileNotFoundError('no .tif files found in ' + base)

arrs = []
for i, f in enumerate(tifs):
    print(" Tif {:03}/{:03}\r".format(i + 1, len(tifs)), end="")

    # Year and month are read from fixed positions at the end of the file name.
    y = f[-12:-8]
    m = f[-6:-4]

    # Open the raster, collapse length-1 dimensions and discard the leftover
    # 'band' coordinate.
    sif = rioxarray.open_rasterio(base+f).squeeze().drop_vars('band')
    sif = assign_crs(sif, crs='epsg:4326')

    # Stamp the layer with the 15th of its month (month start + 14 days).
    # The name `timestamp` is used rather than `time` so the notebook-level
    # `time` analysis parameter is not overwritten by this cell.
    timestamp = pd.date_range(np.datetime64(y+'-'+m), periods=1, freq="MS") + pd.Timedelta(14, 'd')
    sif = sif.expand_dims(time=timestamp)

    sif = sif.where(sif < 32766) # clean up dataset: values >= 32766 become NaN
    sif = sif.sel(x=slice(110,155)).sel(y=slice(-8,-45)).astype('float32') #Australia only
    arrs.append(sif)

# os.listdir gives no ordering guarantee, so sort along time after concatenating.
sif = xr.concat(arrs, dim='time').sortby('time')
sif = xr.where(sif < 0, 0, sif) #replace -ve values with 0
sif.name = 'SIF'
sif.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/SIF/GOSIF_2000_2020.nc')

## MODIS Land surface temp

In [None]:
# Directory holding the MODIS MYD11A1.006 mosaics.
base='/g/data/ub8/au/MODIS/mosaic/MYD11A1.006/'
files = os.listdir(base)

# Collect the full path of every entry whose name does not contain 'QC'.
paths = []
for fname in files:
    if 'QC' in fname:
        continue
    paths.append(base+fname)

In [None]:
# Reduce each MODIS LST file in `paths` to monthly means and write one netCDF
# per year, named after the year embedded in the input file name.
# NOTE(review): `paths` only excludes names containing 'QC'; if the directory
# also holds variables other than LST_Day_1km, the attribute access below would
# fail for them - confirm the directory contents.
for p in paths:
    y = p[-7:-3]  # year, taken from the '.YYYY.nc' suffix of the path

    # Open the file for this iteration so each output year holds its own data.
    lst = xr.open_dataset(p, chunks=dict(latitude=1000, longitude=1000))
    lst = assign_crs(lst, crs='epsg:4326')

    # Monthly mean, with each label shifted from the month start to the 15th.
    # NOTE(review): `loffset` is deprecated in recent xarray releases - confirm
    # the pinned xarray version still accepts it.
    lst = lst.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean().compute()

    # Keep only the daytime LST variable, renamed to 'LST'.
    lst = lst.LST_Day_1km.rename('LST')
    lst.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/LST/LST_'+y+'.nc')
    

## MODIS fPAR from GEE

https://github.com/aazuspan/wxee/blob/main/docs/examples/image_collection_to_xarray.ipynb

https://github.com/aazuspan/wxee/blob/main/docs/examples/modis.ipynb

In [None]:
# !pip install earthengine-api
# !pip install wxee

In [None]:
import ee
import wxee
import xarray as xr
import rioxarray
from odc.geo.xr import assign_crs
import pandas as pd
import numpy as np
import os
from odc.algo import xr_reproject

In [None]:
# Trigger the Earth Engine authentication flow for this session.
ee.Authenticate()

# Initialize the Earth Engine library so the `ee` / `wxee` calls in the
# following cells can be made.
ee.Initialize()

In [None]:
# Approximate bounding box for Australia, as (longitude, latitude) corners of a
# closed ring. Per the original author's note it is deliberately a little
# smaller than the full extent, to get Earth Engine to serve 1 km resolution.
west, east = 114, 153.0
south, north = -43.0, -10.0

region = ee.Geometry.Polygon([[
            [west, south],
            [east, south],
            [east, north],
            [west, north],
            [west, south]]])

### Loop through months and resample 4-day fPAR MODIS data

In [None]:
# Download MODIS MCD15A3H Fpar from Google Earth Engine one calendar month at
# a time, average each month, reproject it onto the LST grid and write one
# netCDF per month. Months whose output file already exists are skipped, so
# the cell can be re-run to resume after a failure.

#use start and end dates to loop through months and load GEE FPAR data
start = pd.date_range(start='7/1/2002', end='12/1/2021', freq='MS')
end = pd.date_range(start='7/1/2002', end='12/31/2021', freq='M')

# Geobox of the grid to reproject onto; loaded on first use and then reused
# instead of re-opening the reference file for every month.
target_geobox = None

for i, (s, e) in enumerate(zip(start, end)):

    print(" {:03}/{:03}\r".format(i + 1, len(start)), end="")

    #use this to check if file already exists (outputs are stamped with the 15th)
    ss = s+pd.Timedelta(14, 'd')
    if os.path.isfile('/g/data/os22/chad_tmp/NEE_modelling/data/FPAR/FPAR_'+ss.strftime('%Y-%m-%d')+'.nc'):
        continue

    s = s.strftime('%Y-%m-%d')
    e = e.strftime('%Y-%m-%d')

    try:
        #download data from GEE
        # NOTE(review): Earth Engine's filterDate treats the end date as
        # exclusive, so images stamped on the last day of the month are not
        # included - confirm this is intended.
        ts = wxee.TimeSeries("MODIS/061/MCD15A3H").filterDate(s, e)
        ts = ts.select(["Fpar"])
        ds = ts.wx.to_xarray(region=region, scale=1000, crs="EPSG:3577", progress=False) #download at 1km res

        attrs=ds.attrs #extract attributes so we don't lose them
        ds = assign_crs(ds, crs='epsg:3577') #add geobox
        # NOTE(review): `loffset` is deprecated in recent xarray releases -
        # confirm the pinned xarray version still accepts it.
        ds = ds.resample(time='MS', loffset=pd.Timedelta(14, 'd')).mean() #resample to monthly
        ds = ds.Fpar #convert to dataarray

        if target_geobox is None:
            #use the first LST time step as the grid to reproject to
            target_geobox = xr.open_dataarray('/g/data/os22/chad_tmp/NEE_modelling/data/LST/LST_2002.nc').isel(time=0).geobox
        ds = xr_reproject(ds, geobox=target_geobox, resampling='bilinear') #reproject
        ds = ds.where(ds!=0) # remove spurious zeros from reprojection
        ds = ds.assign_attrs(attrs) #add back attrs
        ds.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/FPAR/FPAR_'+np.datetime_as_string(ds.time.values[0], unit='D')+'.nc')

    except Exception as err:
        # Best-effort: report the failed month with its cause and carry on.
        # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print('fail:', s, e, repr(err))

### Stitch together monthly files into annual netcdfs

In [None]:
# Combine the monthly FPAR files into one netCDF per year.
base='/g/data/os22/chad_tmp/NEE_modelling/data/FPAR/'

# One range drives both the loop and the progress total so they cannot drift.
years = range(2002, 2022)

for i, y in enumerate(years):
    print(" {:03}/{:03}\r".format(i + 1, len(years)), end="")

    year = str(y)
    files = [base+f for f in os.listdir(base) if year in f]

    # A year with no monthly files would make xr.concat fail; report it and
    # move on to the next year instead.
    if not files:
        print('no monthly files found for', year)
        continue

    dss = [xr.open_dataset(f) for f in files]
    try:
        # os.listdir gives no ordering guarantee, so sort along time.
        # Named `annual` rather than `data` so the collect_gridded_data result
        # held in `data` is not overwritten by this cell.
        annual = xr.concat(dss, dim='time').sortby('time')
        annual.to_netcdf('/g/data/os22/chad_tmp/NEE_modelling/data/FPAR_annual/FPAR_MODIS_1km_'+year+'.nc')
    finally:
        # Release the monthly file handles once the annual file is written.
        for ds in dss:
            ds.close()

## Himawari 8 Solar

/g/data/fj8/BoM/AWRA/DATA/CLIMATE/solar_exposure_day

## AWRA

In [None]:
# var='rain_day'
# flux = xr.open_dataset('/g/data/fj8/public/BoM/AWRA/Outputs/DailyScheduledRun/processed/values/month/'+var+'.nc')
# flux.isel(time=2)[var].plot.imshow()