# Harmonization of MODIS and AVHRR

Following [Rifai et al. (2022)](https://bg.copernicus.org/articles/19/491/2022/), AVHRR is calibrated against MODIS with a generalized additive model (GAM) of the form:

    <var>-MODIS = s(<var>-AVHRR)+s(month)+s(SZA)+s(TOD)+s(x,y)

This process was run using an R script and the `mgcv` package. This notebook post-processes the output of the GAM harmonization:
- Creating plots of before and after harmonization
- Merging the calibrated AVHRR with MODIS to create a continuous time-series 1982-2022
- Gap-filling
    
        
    


In [None]:
import numpy as np
import xarray as xr
import seaborn as sb
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from odc.geo.xr import assign_crs
from scipy.stats import gaussian_kde
from sklearn.metrics import mean_absolute_error

import sys
sys.path.append('/g/data/os22/chad_tmp/NEE_modelling/')
from _collect_prediction_data import round_coords


## Open datasets

In [None]:
# Variable to process and the CRS assigned to the datasets opened in this notebook.
var = 'lst'
crs = 'epsg:4326'

# Harmonization output for the selected variable; the cells below read the
# '<var>_mcd', '<var>_cdr' and '<var>_mcd_pred' layers from it.
harmonized_path = (
    '/g/data/os22/chad_tmp/climate-carbon-interactions/data/'
    + var.upper() + '_harmonization/Harmonized_'
    + var.upper() + '_AVHRR_MODIS_1982_2013.nc'
)
merge = xr.open_dataset(harmonized_path)
merge = assign_crs(merge, crs=crs)

## Convert to dataframes for nice plotting

In [None]:
# Years used for the before/after comparison.
overlap_years = slice('2001', '2013')

# Flatten each layer (MODIS, original AVHRR, GAM-adjusted AVHRR) to a 1-D array.
modis_flat, avhrr_flat, avhrr_adjust = (
    merge[var + suffix].sel(time=overlap_years).values.flatten()
    for suffix in ('_mcd', '_cdr', '_mcd_pred')
)

df = pd.DataFrame(
    {
        'MODIS': modis_flat,
        'AVHRR-original': avhrr_flat,
        'AVHRR-adjusted': avhrr_adjust,
    }
).dropna()

# Too many pixels to plot, so work with a fixed-seed random sample.
df_sample = df.sample(n=10000, random_state=1)

In [None]:
# df = merge.ndvi_mcd.mean(['x', 'y']).rename('MODIS').drop('spatial_ref').to_dataframe()
# df['AVHRR-original'] = merge.ndvi_cdr.mean(['x', 'y']).drop('spatial_ref').to_dataframe()
# df['AVHRR-adjusted'] = merge.ndvi_mcd_pred.mean(['x', 'y']).drop('spatial_ref').to_dataframe()
# df = df.dropna()
# df.head()

## Scatter plots of before and after harmonization

In [None]:
products = ['AVHRR-original', 'AVHRR-adjusted']
font = 15

fig, panels = plt.subplots(1, 2, figsize=(10, 5), sharey=True)

# One panel per AVHRR product, each compared against MODIS on the x-axis.
for prod, ax in zip(products, panels.ravel()):
    obs = df_sample['MODIS'].values
    pred = df_sample[prod].values

    # Agreement statistics shown in the panel annotation.
    fit = stats.linregress(obs, pred)
    r2 = fit.rvalue ** 2
    ac = mean_absolute_error(obs, pred)

    # Kernel density estimate evaluated at every sample, used to colour the points.
    xy = np.vstack([obs, pred])
    z = gaussian_kde(xy)(xy)

    sb.scatterplot(data=df_sample, x='MODIS', y=prod, c=z, s=50, lw=1, alpha=0.5, ax=ax)
    # Blue: fitted regression line. Black dashed: the 1:1 line (MODIS against itself).
    sb.regplot(data=df_sample, x='MODIS', y=prod, scatter=False, color='blue', ax=ax)
    sb.regplot(data=df_sample, x='MODIS', y='MODIS', color='black', scatter=False,
               line_kws={'linestyle': 'dashed'}, ax=ax)

    ax.set_title(prod, fontsize=font)
    ax.set_xlabel('MODIS ' + var.upper(), fontsize=font)
    ax.set_ylabel('')
    ax.text(.05, .90, 'r\N{SUPERSCRIPT TWO}={:.2f}'.format(r2),
            transform=ax.transAxes, fontsize=font)
    ax.text(.05, .825, 'MAE={:.2g}'.format(ac),
            transform=ax.transAxes, fontsize=font)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=font)

# Shared y-label for both panels.
fig.supylabel('AVHRR ' + var.upper(), fontsize=font)
plt.tight_layout();

## Per-pixel Correlations

In [None]:
# Correlate each AVHRR version with MODIS along time, pixel by pixel.
corr_period = slice('2001', '2013')
modis_reference = merge[var + '_mcd'].sel(time=corr_period)

adjusted_corr = xr.corr(
    modis_reference,
    merge[var + '_mcd_pred'].sel(time=corr_period),
    dim='time',
)

orig_corr = xr.corr(
    modis_reference,
    merge[var + '_cdr'].sel(time=corr_period),
    dim='time',
)

In [None]:
# Order matters: the plotting cell zips this list with `products`
# (['AVHRR-original', 'AVHRR-adjusted']) to title each map.
corr_data = [orig_corr, adjusted_corr]

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)

# One map per product, on a common 0-1 colour scale so a single colourbar serves both.
for ax, corr_map, title in zip(axes.flat, corr_data, products):
    im = corr_map.plot.imshow(ax=ax, cmap='magma', vmin=0, vmax=1.0, add_colorbar=False)
    ax.set_title(title, fontsize=20)
    # Strip axis labels and tick labels from the maps.
    ax.set_yticklabels([])
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.set_xticklabels([])

fig.subplots_adjust(wspace=0.05)
fig.colorbar(im, ax=list(axes.flat), pad=0.01, label='Correlation');

## Merge the calibrated AVHRR with MODIS

In the AVHRR record, the last 3 months of 1994 are all NaNs, which in the R script means they were discarded from the time-series. This means the time-series in `Harmonized_<VAR>_AVHRR_MODIS_1982_2013.nc` has missing time-steps. Below, this is rectified by simply adding 3 empty time-slices.

In [None]:
# Reference 5 km dataset. It is opened once and used for two things:
#   * its geobox is the reprojection target (only needed when var is 'ndvi');
#   * its time coordinate supplies the Oct-Dec 1994 time-stamps.
reference_path = '/g/data/os22/chad_tmp/climate-carbon-interactions/data/5km/WCF_5km_monthly_1990_2022.nc'
reference_ds = xr.open_dataset(reference_path)

if var == 'ndvi':
    gbox = reference_ds.odc.geobox

# grab some time-slices from another dataset
missing_time_steps = reference_ds.sel(time=slice('1994-10', '1994-12')).time

### Open the calibrated AVHRR

Extract the 1982-2000 time-series and, for NDVI only, resample it to epsg:4326.

In [None]:
calibrated_path = (
    '/g/data/os22/chad_tmp/climate-carbon-interactions/data/'
    + var.upper() + '_harmonization/Harmonized_'
    + var.upper() + '_AVHRR_MODIS_1982_2013.nc'
)
ds = assign_crs(xr.open_dataset(calibrated_path), crs=crs)

# Keep only the '<var>_mcd_pred' layer for 1982-2000, renamed to the upper-case variable name.
ds_pred = (
    ds[var + '_mcd_pred']
    .rename(var.upper())
    .sel(time=slice('1982', '2000'))
)
ds_pred.attrs['nodata'] = np.nan

# NDVI only: reproject onto the reference geobox with bilinear resampling.
if var == 'ndvi':
    ds_pred = ds_pred.odc.reproject(gbox, resampling='bilinear')

ds_pred = round_coords(ds_pred)

### Creating three empty slices of data

and merge back with calibrated avhrr

In [None]:
# Build all-NaN placeholder slices on the same grid as ds_pred, to stand in for
# the time-steps that are missing from the calibrated AVHRR record.
extras = ds_pred.isel(time=range(0,3)) # template only: the first 3 time-slices (their values are discarded on the next line)
extras = xr.where(extras>100000,np.nan, np.nan) # both branches are NaN, so every value becomes NaN whatever the condition evaluates to
extras['time'] = missing_time_steps # relabel with the missing time-stamps (assumes exactly 3 of them -- TODO confirm)
extras = round_coords(extras)

In [None]:
# Append the NaN placeholder slices, then sort by time so they land in their
# proper position within the series rather than at the end.
ds_pred = xr.concat([ds_pred, extras], dim='time').sortby('time')

### Merge calibrated AVHRR with MODIS

In [None]:
# Monthly MODIS record (2001-2022) for the selected variable.
modis_path = (
    '/g/data/os22/chad_tmp/climate-carbon-interactions/data/'
    + var.upper() + '_harmonization/MODIS_'
    + var.upper() + '_5km_monthly_2001_2022.nc'
)
mod = xr.open_dataset(modis_path)
mod = assign_crs(mod, crs=crs)

In [None]:
# Prepare MODIS the same way as the calibrated AVHRR, then append it along time.
mod.attrs['nodata'] = np.nan
if var=='ndvi':
    # NDVI only: reproject onto the reference geobox with bilinear resampling.
    mod = mod.odc.reproject(gbox, resampling='bilinear')
    mod = round_coords(mod)

# NOTE(review): for NDVI this is a second consecutive round_coords call; it looks
# redundant if round_coords is idempotent -- confirm before removing either call.
mod = round_coords(mod)
mod = mod[var.upper()+'_median']  # pick the '<VAR>_median' variable out of the dataset
mod = mod.rename(var.upper())  # name it like ds_pred (also renamed to var.upper()) before concatenating
merged = xr.concat([ds_pred,mod], dim='time')
# NOTE(review): both inputs already carry this name, so this rename looks like a
# safeguard rather than a change -- confirm.
merged = merged.rename(var.upper())

In [None]:
# Save the merged 1982-2022 series (gaps not yet filled) as float32.
gappy_output_path = (
    '/g/data/os22/chad_tmp/climate-carbon-interactions/data/5km/'
    + var.upper() + '_5km_monthly_1982_2022_wGaps.nc'
)
merged_float32 = merged.astype(np.float32)
merged_float32.to_netcdf(gappy_output_path)

## Gap-filling

Try with climatologies and linear interp

In [None]:
import os
import xarray as xr
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from odc.geo.xr import assign_crs

In [None]:
# Start a local dask client through datacube's helper; the dataset in the next
# cell is opened with dask chunks.
from datacube.utils.dask import start_local_dask
client = start_local_dask(mem_safety_margin='2Gb')
client  # bare expression so the notebook shows the client as the cell output

In [None]:
var = 'lst'
crs = 'epsg:4326'

# Merged series written by the previous section, gaps still present.
file = (
    '/g/data/os22/chad_tmp/climate-carbon-interactions/data/5km/'
    + var.upper() + '_5km_monthly_1982_2022_wGaps.nc'
)
# time=-1 keeps the whole time axis in a single chunk.
chunks = {'latitude': 1000, 'longitude': 1000, 'time': -1}

ds = xr.open_dataset(file, chunks=chunks)
ds = assign_crs(ds, crs=crs)

### Count fraction of missing data

In [None]:
gappy_layer = ds[var.upper()]

# True for pixels whose time-mean is not NaN.
mask = ~np.isnan(gappy_layer.mean('time').compute())
total_valid_pixels = mask.sum().values

# Per time-step: NaN count inside the mask, divided by the number of masked-in pixels.
nans = np.isnan(gappy_layer.compute())
num_of_nans = nans.where(mask).sum(['latitude', 'longitude'])
num_of_nans_normalised = num_of_nans / total_valid_pixels

### Plot fraction missing against rainfall

To see if missing data coincides with increased cloud coverage

In [None]:
# Monthly rainfall, used to compare against the fraction of missing data.
rain_path = '/g/data/os22/chad_tmp/climate-carbon-interactions/data/5km/rain_5km_monthly_1981_2022.nc'
rain = xr.open_dataset(rain_path)['rain']

In [None]:
# #compute climatology
# rain_clim = rain.sel(time=slice('1982', '2012')).groupby("time.month").mean("time")#.compute()
# rain_anom = rain.groupby("time.month") - rain_clim

In [None]:
# Spatial-mean rainfall for 1982-2000, one row per time-step.
rain = rain.sel(time=slice('1982', '2000')).mean(['latitude', 'longitude'])
df = rain.to_dataframe('Rainfall')

# Add fractions missing to dataframe for easy plotting.
# `drop_vars` replaces the deprecated `drop` for removing a coordinate.
# Presumably 'spatial_ref' is dropped so that to_dataframe() yields a single
# column that can be assigned directly -- TODO confirm.
fraction_missing = (
    num_of_nans_normalised
    .sel(time=slice('1982', '2000'))
    .drop_vars('spatial_ref')
)
df['Fraction Missing'] = fraction_missing.to_dataframe()

In [None]:
fig, ax = plt.subplots(figsize=(13, 4))
ax2 = ax.twinx()

# Rainfall as bars on the secondary axis.
ax2.bar(df.index, height=df['Rainfall'], width=20, color='skyblue')
ax2.set_ylabel('Mean Rainfall (mm)')

# Fraction of missing data as a line on the primary axis.
sb.lineplot(data=df, x=df.index, y='Fraction Missing', ax=ax, c='black')

# Put the bar axes one z-level below the line axes, and switch off the line
# axes' frame (presumably so its background does not hide the bars).
ax2.set_zorder(ax.get_zorder() - 1)
ax.set_frame_on(False)

### Per pixel fractions of missing data

In [None]:
# NOTE: despite the name, this holds the *inverse* of the NaN flags, i.e. True
# where a value is present, restricted to 1982-2000.
avhrr_nans = (~nans).sel(time=slice('1982', '2000'))

In [None]:
# Per pixel: number of time-steps with a value divided by the number of time-steps.
n_time_steps = len(avhrr_nans.time)
available_fraction = avhrr_nans.sum('time') / n_time_steps

available_fraction.where(mask).plot(add_labels=False, vmin=0, vmax=1, cmap='inferno', size=6)
plt.title('AVHRR Available Fraction of Data, monthly: 1982-2000');

## Test some interpolation methods

#### Select some areas

In [None]:
def _test_area_series(lat_start, lon_start, size=5):
    """Return the 1982-2000 spatial-mean series of a size x size pixel window.

    The window starts at index (lat_start, lon_start) along latitude/longitude.
    """
    window = ds[var.upper()].isel(
        latitude=range(lat_start, lat_start + size),
        longitude=range(lon_start, lon_start + size),
    )
    return window.sel(time=slice('1982', '2000')).mean(['latitude', 'longitude'])


test_area_1 = _test_area_series(115, 355)  # Northern Territory tropical savannah
test_area_2 = _test_area_series(455, 100)  # SW WA cropping
test_area_3 = _test_area_series(545, 745)  # SE Aus forest

### Conduct various interpolations

In [None]:
def _interpolation_variants(series, limit=3):
    """Apply every gap-filling variant under test to one area-mean time series.

    Parameters
    ----------
    series : time series with a 'time' dimension (one of the test_area_N arrays).
    limit : passed to ``interpolate_na``; caps how many consecutive NaNs are filled.

    Returns
    -------
    tuple of (linear, slinear, cubic, monthly_clim, clim_filled, combined):
        linear / slinear / cubic : ``interpolate_na`` along time with that method.
        monthly_clim : mean of ``series`` per calendar month.
        clim_filled : ``series`` with its NaNs filled from ``monthly_clim``.
        combined : ``linear`` with its remaining NaNs filled from ``clim_filled``
            (computed eagerly).
    """
    linear = series.interpolate_na(method='linear', dim='time', limit=limit)
    slinear = series.interpolate_na(method='slinear', dim='time', limit=limit)
    cubic = series.interpolate_na(method='cubic', dim='time', limit=limit)

    # fill with climatology
    monthly_clim = series.groupby("time.month").mean("time")
    clim_filled = series.groupby("time.month").fillna(monthly_clim)

    # combined fill: linear first, climatology for whatever is still missing
    combined = linear.groupby("time.month").fillna(clim_filled).compute()

    return linear, slinear, cubic, monthly_clim, clim_filled, combined


# The copy-pasted per-pixel climatology that every section used to assign to
# `test_area_1_clim` has been removed: in section 1 it was overwritten on the
# next line, and in sections 2 and 3 it silently replaced area 1's climatology
# with a different quantity.

## ---------Interpolate area 1------------------------------------------------------------
(linear_area_1, slinear_area_1, cubic_area_1,
 test_area_1_clim, clim_area_1, combine_area_1) = _interpolation_variants(test_area_1)

## ---------Interpolate area 2-------------------------------------------------------------
(linear_area_2, slinear_area_2, cubic_area_2,
 test_area_2_clim, clim_area_2, combine_area_2) = _interpolation_variants(test_area_2)

## ---------Interpolate area 3--------------------------------------------------------------
(linear_area_3, slinear_area_3, cubic_area_3,
 test_area_3_clim, clim_area_3, combine_area_3) = _interpolation_variants(test_area_3)

### Plot the interpolation results

In [None]:
fig, ax = plt.subplots(figsize=(13, 4), sharey=True)

# Drawn in this order so the legend reads: combined fill, linear fill, original.
for series, label, line_style in (
    (combine_area_1, 'linear+climatology', {'linestyle': '--', 'c': 'green'}),
    (linear_area_1, 'linear', {'linestyle': 'dashdot', 'c': 'red'}),
    (test_area_1, 'original', {'c': 'black'}),
):
    series.plot(ax=ax, label=label, **line_style)

ax.set_title('Northern Territory test area for interpolation')
ax.legend();

In [None]:
fig, ax = plt.subplots(figsize=(13, 4), sharey=True)

# Drawn in this order so the legend reads: combined fill, linear fill, original.
for series, label, line_style in (
    (combine_area_2, 'linear+climatology', {'linestyle': '--', 'c': 'green'}),
    (linear_area_2, 'linear', {'linestyle': 'dashdot', 'c': 'red'}),
    (test_area_2, 'original', {'c': 'black'}),
):
    series.plot(ax=ax, label=label, **line_style)

ax.set_title('SW WA test area for interpolation')
ax.legend();

In [None]:
fig, ax = plt.subplots(figsize=(13, 4), sharey=True)

# Drawn in this order so the legend reads: combined fill, linear fill, original.
for series, label, line_style in (
    (combine_area_3, 'linear+climatology', {'linestyle': '--', 'c': 'green'}),
    (linear_area_3, 'linear', {'linestyle': 'dashdot', 'c': 'red'}),
    (test_area_3, 'original', {'c': 'black'}),
):
    series.plot(ax=ax, label=label, **line_style)

ax.set_title('SE Aus forest test area for interpolation')
ax.legend();

### Fill dataset using the combined interpolation approach

First, linearly interpolate along time, filling at most 3 consecutive missing time-steps. Then fill the remaining gaps with the monthly climatology.

In [None]:
# Monthly climatology: mean of each calendar month over the whole record.
clim = ds.groupby("time.month").mean("time")

# Linear interpolation along time, filling at most 3 consecutive NaNs.
ds_linear = ds.interpolate_na(dim='time', method='linear', limit=3)

# Anything still missing takes the climatological value of its calendar month.
ds_combine = (
    ds_linear
    .groupby("time.month")
    .fillna(clim)
    .compute()
)

### Compare

In [None]:
# fig,ax = plt.subplots(1,1, figsize=(13,5), sharey=True)
# ds.NDVI.sel(time=slice('1982','2000')).mean(['latitude', 'longitude']).plot(ax=ax, label='No-fill')
# ds_linear.NDVI.sel(time=slice('1982','2000')).mean(['latitude', 'longitude']).plot(ax=ax, label='linear')
# ds_combine.NDVI.sel(time=slice('1982','2000')).mean(['latitude', 'longitude']).plot(ax=ax, label='linear+climatology')
# ax.legend()
# plt.title('Australia wide')

## Consider using CLIMFILL for gapfilling:

https://github.com/climachine/climfill/blob/master/example_workflow.py

## Export results

In [None]:
# Write the gap-filled series, without the 'month' coordinate that ds_combine carries.
# `drop_vars` replaces the deprecated `drop` for removing a coordinate.
ds_combine.drop_vars('month').to_netcdf('/g/data/os22/chad_tmp/climate-carbon-interactions/data/5km/'+var.upper()+'_5km_monthly_1982_2022.nc')