# Demo of default functions dc.load

*****

__This script is the "official demo" of a function. If you want to modify it, please work on your own copy.__

Load data as an ``xarray`` object.  Each measurement will be a data variable in the :class:`xarray.Dataset`.

The default `dc.load` function has plenty of options which are worth exploring. This script presents the reprojection options:
* `output_crs`
* `resolution`

The default `resampling` option ('nearest neighbour') is kept.

Documentation for a given function can be accessed simply by adding ? at the end of the function in a cell. e.g. `dc.load?` or by selecting the function and pressing `Shift-Tab`.

In [None]:
# Make sure the script is using the proper kernel.
# The check script is first run from the parent directory's swiss_utils folder;
# if that run raises, the swiss_utils folder next to this notebook is tried.
try:
    %run ../swiss_utils/assert_env.py
except:
    %run ./swiss_utils/assert_env.py

In [None]:
# Import modules

# reload module before executing code
%load_ext autoreload
%autoreload 2

# define modules locations (you might have to adapt define_mod_locs.py)
%run ../swiss_utils/define_mod_locs.py

# to plot figures
%matplotlib inline

import time
import rioxarray # !pip3 install rioxarray
import numpy as np
import pandas as pd

from datetime import datetime
from pyproj import Proj
from pyproj import Transformer
from itertools import product as iterprod

from swiss_utils.data_cube_utilities.sdc_utilities import write_geotiff_from_xr

import datacube
# Datacube handle used by every dc.load call in this notebook
dc = datacube.Datacube()

The next cell contains the dataset configuration information:
- product
- geographical extent
- time period
- bands

You can generate it in three ways:
1. manually from scratch,
2. by manually copy/pasting the final cell content of the [config_tool](config_tool.ipynb) notebook,
3. by loading the final cell content of the [config_tool](config_tool.ipynb) notebook using the magic `%load config_cell.txt`.

**To make this notebook run without modification you will need to use any one of the Landsat or Sentinel 2 products with the single blue band.**

In [None]:
%load config_cell.txt

__Let's use the function with the minimum of options__

In [None]:
# Time a dc.load call given only the product, the period, the extent and the
# measurements; no other option is passed.
start_time = time.time()
default_query = dict(product=product,
                     time=(start_date, end_date),
                     lon=(min_lon, max_lon),
                     lat=(min_lat, max_lat),
                     measurements=measurements)
dataset_in = dc.load(**default_query)
dif_time = time.time() - start_time
dataset_in

In [None]:
# Plot the blue band with one panel per time step (5 panels per row)
# to help pick an appropriate time to export
dataset_in['blue'].plot(col_wrap=5, col='time')

In [None]:
# Export a given time for comparison
# t is the index (along the time dimension) of the slice written to disk
t = 1
write_geotiff_from_xr(tif_path = 'default.tif',
                      dataset = dataset_in.isel(time = t).astype('int16'),
                      compr = 'DEFLATE')

In [None]:
# And store characteristics for comparison
lat_values = dataset_in.latitude.values
lon_values = dataset_in.longitude.values

default_min_lat, default_max_lat = lat_values.min(), lat_values.max()
default_min_lon, default_max_lon = lon_values.min(), lon_values.max()
default_crs = dataset_in.crs
default_x_dim = len(dataset_in.longitude)
default_y_dim = len(dataset_in.latitude)
# resolution = coordinate span divided by the number of steps between coordinates
default_res_lon = (default_max_lon - default_min_lon) / (default_x_dim - 1)
default_res_lat = (default_max_lat - default_min_lat) / (default_y_dim - 1)

# First row of the comparison table: extent, resolutions and processing time
# are stored as strings with 5 decimals
default_extent = (default_min_lat, default_max_lat, default_min_lon, default_max_lon)
default_row = ['default']
default_row += ['{:.5f}'.format(value) for value in default_extent]
default_row += [str(default_crs)]
default_row += ['{:.5f}'.format(value) for value in (default_res_lat, default_res_lon)]
default_row += [default_x_dim, default_y_dim, len(dataset_in.time)]
default_row += ['{:.5f}'.format(dif_time)]
dt = [default_row]

table_columns = ['type',
                 'min_lat', 'max_lat', 'min_lon', 'max_lon',
                 'crs', 'res_lat', 'res_lon',
                 'x_dim', 'y_dim', 't_dim', 'proc_time']
pd.DataFrame(dt, columns=table_columns)

In [None]:
# By default lat and lon use EPSG:4326 which is the CRS used to store SDC data.
# Let's reproject the xarray.Dataset into (in our case Swiss CRS) CH1903+ / LV95 (EPSG:2056).
# The reprojection time is added to the time already spent on the default load.
start_time = time.time()
dataset_with_crs = dataset_in.rio.set_crs("epsg:4326")
dataset_in = dataset_with_crs.rio.reproject("epsg:2056")
dif_time += time.time() - start_time
dataset_in

In [None]:
# The xarray.Dataset CRS metadata still refers to the previous CRS,
# so let's update the 'crs' attribute
start_time = time.time()
dataset_in.attrs.update({'crs': 'EPSG:2056'})
dif_time += time.time() - start_time
dataset_in

In [None]:
# Notice how latitude and longitude were converted into y and x.
# Even if the majority of functions will work as it is, some might fail,
# so let's set the names back to "normal"
start_time = time.time()
dataset_in = dataset_in.rename(x='longitude', y='latitude')
dif_time += time.time() - start_time

In [None]:
# Export same time for comparison
# (t is the time index chosen when the default dataset was exported)
write_geotiff_from_xr(tif_path = 'riorepro.tif',
                      dataset = dataset_in.isel(time = t).astype('int16'),
                      compr = 'DEFLATE')

In [None]:
# Store characteristics and compare
lat_values = dataset_in.latitude.values
lon_values = dataset_in.longitude.values
# resolution = coordinate span divided by the number of steps between coordinates
res_lon = (lon_values.max() - lon_values.min()) / (len(dataset_in.longitude) - 1)
res_lat = (lat_values.max() - lat_values.min()) / (len(dataset_in.latitude) - 1)

# extent and resolutions are stored with 1 decimal, processing time with 5
one_decimal = '{:.1f}'.format
riorepro_row = ['riorepro',
                one_decimal(lat_values.min()), one_decimal(lat_values.max()),
                one_decimal(lon_values.min()), one_decimal(lon_values.max()),
                str(dataset_in.crs), one_decimal(res_lat), one_decimal(res_lon),
                len(dataset_in.longitude), len(dataset_in.latitude), len(dataset_in.time),
                '{:.5f}'.format(dif_time)]
dt.append(riorepro_row)

table_columns = ['type',
                 'min_lat', 'max_lat', 'min_lon', 'max_lon',
                 'crs', 'res_lat', 'res_lon',
                 'x_dim', 'y_dim', 't_dim', 'proc_time']
pd.DataFrame(dt, columns=table_columns)

__Notice how reprojection changed the x/y ratio !__

__The "same" result can be obtained by directly using the `output_crs` option, but the `resolution` option then needs to be provided. Let's estimate it by reprojecting the default bbox.__

In [None]:
# Get the real bbox of the default dataset: by default the extent is given to
# the center of the corner pixels, so each side is pushed out by half a pixel
half_res_lon = default_res_lon / 2
half_res_lat = default_res_lat / 2
real_min_lon = default_min_lon - half_res_lon
real_max_lon = default_max_lon + half_res_lon
real_min_lat = default_min_lat - half_res_lat
real_max_lat = default_max_lat + half_res_lat

# Reproject the four real bbox corners from the default CRS to the riorepro CRS
# source: https://hatarilabs.com/ih-en/how-to-translate-coordinate-systems-for-xy-point-data-tables-with-python-pandas-and-pyproj
transformer = Transformer.from_crs(default_crs.lower(), dataset_in.crs.lower(), always_xy=True)
corners = [(lon, lat)
           for lon in (real_min_lon, real_max_lon)
           for lat in (real_min_lat, real_max_lat)]
trans_corners = np.array(list(transformer.itransform(corners)))

# Estimated resolution = reprojected bbox span divided by the default pixel count
trans_x = trans_corners[:, 0]
trans_y = trans_corners[:, 1]
repr_res_lon = (trans_x.max() - trans_x.min()) / default_x_dim
repr_res_lat = (trans_y.max() - trans_y.min()) / default_y_dim

print('Estimated reprojected resolution: {:.1f}, {:.1f}'.format(repr_res_lat, repr_res_lon))

__You will probably get quite different values. Let's use an average value.__

In [None]:
# Then we can load again using output_crs and resolution options to directly get a reprojected
# xarray.Dataset
start_time = time.time()
# a single resolution is used for both axes: the mean of the two estimates
repr_res = (repr_res_lat + repr_res_lon) / 2
dataset_in = dc.load(product = product,
                     time = (start_date, end_date), lon = (min_lon, max_lon), lat = (min_lat,max_lat),
                     measurements = measurements,
                     output_crs = 'EPSG:2056', resolution = (-repr_res, repr_res))
# This load is independent of the earlier default load and rio reprojection, so the
# timer is restarted here instead of being accumulated on top of the riorepro total
# (otherwise the outputcrs proc_time would also include the riorepro processing time)
dif_time = time.time() - start_time
dataset_in

In [None]:
# As previously: name the x / y dimensions longitude / latitude
# and add the time spent to the running total
start_time = time.time()
axis_names = {'x': 'longitude', 'y': 'latitude'}
dataset_in = dataset_in.rename(axis_names)
dif_time += time.time() - start_time
dataset_in

In [None]:
# Export same time for comparison
# (t is the time index chosen when the default dataset was exported)
write_geotiff_from_xr(tif_path = 'output_crs.tif',
                      dataset = dataset_in.isel(time = t).astype('int16'),
                      compr = 'DEFLATE')

In [None]:
# Store characteristics and compare
lat_values = dataset_in.latitude.values
lon_values = dataset_in.longitude.values
n_lon = len(dataset_in.longitude)
n_lat = len(dataset_in.latitude)
# resolution = coordinate span divided by the number of steps between coordinates
res_lon = (lon_values.max() - lon_values.min()) / (n_lon - 1)
res_lat = (lat_values.max() - lat_values.min()) / (n_lat - 1)

# extent and resolutions are stored with 1 decimal, processing time with 5
extent = (lat_values.min(), lat_values.max(), lon_values.min(), lon_values.max())
outputcrs_row = ['outputcrs']
outputcrs_row.extend('{:.1f}'.format(value) for value in extent)
outputcrs_row.append(str(dataset_in.crs))
outputcrs_row.extend('{:.1f}'.format(value) for value in (res_lat, res_lon))
outputcrs_row.extend([n_lon, n_lat, len(dataset_in.time)])
outputcrs_row.append('{:.5f}'.format(dif_time))
dt.append(outputcrs_row)

table_columns = ['type',
                 'min_lat', 'max_lat', 'min_lon', 'max_lon',
                 'crs', 'res_lat', 'res_lon',
                 'x_dim', 'y_dim', 't_dim', 'proc_time']
pd.DataFrame(dt, columns=table_columns)

__The reprojected datasets differ greatly!__

In [None]:
# Let's compare riorepro and outputcrs dataset
df = pd.DataFrame(dt, columns=['type',
                          'min_lat', 'max_lat', 'min_lon', 'max_lon',
                          'crs', 'res_lat', 'res_lon',
                          'x_dim', 'y_dim', 't_dim', 'proc_time'])


def _table_value(row_type, column):
    """Return `column` of the latest `row_type` row of df as a float.

    The scalar is extracted explicitly with .iloc[-1]: calling float() on a
    one-element Series is deprecated in recent pandas, and taking the last
    match keeps the cell working when a previous cell was re-run and appended
    the same row type to dt more than once.
    """
    return float(df.loc[df['type'] == row_type, column].iloc[-1])


def _riorepro_minus_outputcrs(column):
    """Return the riorepro value minus the outputcrs value of `column`."""
    return _table_value('riorepro', column) - _table_value('outputcrs', column)


dif_min_lat = _riorepro_minus_outputcrs('min_lat')
dif_max_lat = _riorepro_minus_outputcrs('max_lat')
dif_min_lon = _riorepro_minus_outputcrs('min_lon')
dif_max_lon = _riorepro_minus_outputcrs('max_lon')
print('latitude min/max differences: {:.1f}/{:.1f}'.format(dif_min_lat, dif_max_lat))
print('longitude min/max differences: {:.1f}/{:.1f}'.format(dif_min_lon, dif_max_lon))

dif_res_lat = _riorepro_minus_outputcrs('res_lat')
dif_res_lon = _riorepro_minus_outputcrs('res_lon')
print('resolution lat/lon differences: {:.1f}/{:.1f}'.format(dif_res_lat, dif_res_lon))

__Obviously the estimated resolution (*outputcrs*) is far from reality. Moreover, using `gdalwarp` and `gdalinfo` gives the same result as *riorepro*.__

__Let's use a theoretical but realistic value (30 m resolution).__

In [None]:
# Load again with the output_crs and resolution options to get a reprojected
# xarray.Dataset, this time with a fixed resolution of (-30, 30).
# The timer is restarted and covers both the load and the dimension renaming.
start_time = time.time()
round_res = 30
round_query = dict(product=product,
                   time=(start_date, end_date),
                   lon=(min_lon, max_lon),
                   lat=(min_lat, max_lat),
                   measurements=measurements,
                   output_crs='EPSG:2056',
                   resolution=(-round_res, round_res))
dataset_in = dc.load(**round_query).rename({'x': 'longitude', 'y': 'latitude'})
dif_time = time.time() - start_time

In [None]:
# Export same time for comparison
# (t is the time index chosen when the default dataset was exported)
write_geotiff_from_xr(tif_path = 'roundres.tif',
                      dataset = dataset_in.isel(time = t).astype('int16'),
                      compr = 'DEFLATE')

In [None]:
# Store characteristics and compare
lat_values = dataset_in.latitude.values
lon_values = dataset_in.longitude.values
lat_min, lat_max = lat_values.min(), lat_values.max()
lon_min, lon_max = lon_values.min(), lon_values.max()
# resolution = coordinate span divided by the number of steps between coordinates
res_lon = (lon_max - lon_min) / (len(dataset_in.longitude) - 1)
res_lat = (lat_max - lat_min) / (len(dataset_in.latitude) - 1)

# extent and resolutions are stored with 1 decimal, processing time with 5
fmt_1 = '{:.1f}'.format
fmt_5 = '{:.5f}'.format
dt.append(['roundres',
           fmt_1(lat_min), fmt_1(lat_max),
           fmt_1(lon_min), fmt_1(lon_max),
           str(dataset_in.crs), fmt_1(res_lat), fmt_1(res_lon),
           len(dataset_in.longitude), len(dataset_in.latitude), len(dataset_in.time),
           fmt_5(dif_time)])

table_columns = ['type',
                 'min_lat', 'max_lat', 'min_lon', 'max_lon',
                 'crs', 'res_lat', 'res_lon',
                 'x_dim', 'y_dim', 't_dim', 'proc_time']
pd.DataFrame(dt, columns=table_columns)