# Lake tables - GloFAS5
***

**Author:** Chus Casado Rodríguez<br>
**Date:** 03-12-2024<br>

**Introduction:**<br>
This notebook extracts the attributes required by LISFLOOD for the lakes selected to be modelled in GloFAS5:

* Lake area
* Outlet width
* Average inflow

**To do:**

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray as rxr
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import cartopy.feature as cfeature
import cartopy.crs as ccrs
from tqdm.notebook import tqdm
from shapely.geometry import Point
from pathlib import Path
import yaml
from scipy.optimize import curve_fit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from lisfloodreservoirs.utils.utils import outlet_width
from lisfloodreservoirs.utils.plots import plot_reservoir_map, compare_attributes

In [None]:
def width_from_area(area: float, c: float) -> float:
    """Estimate the river width as a power function of the catchment area:
            width = area**c

    The notebook also calls it with pandas Series, in which case the power
    is applied element-wise.

    Parameters:
    -----------
    area: float
        catchment area in km2
    c: float
        exponent of the power function

    Returns:
    --------
    width: float
        river width in meters
    """
    return pow(area, c)

## Configuration

In [None]:
# paths
PATH_GLOFAS4 = Path('Z:/nahaUsers/casadje/EFASv5')
PATH_GLOFAS5 = Path('Z:/nahaUsers/casadje/EFASv6')
PATH_DATASETS = Path('Z:/nahaUsers/casadje/datasets/')
PATH_GLWD = PATH_DATASETS / 'lakes' / 'GLWD'
PATH_HYLAK = PATH_DATASETS / 'lakes' / 'HydroLAKES' / 'LakeATLAS_v10_shp'
PATH_OUT = PATH_GLOFAS5 / 'lakes' / 'tables'
PATH_OUT.mkdir(parents=True, exist_ok=True)
# the figures of this notebook are saved in PATH_OUT / 'plots';
# `savefig` does not create missing folders, so it is created here
(PATH_OUT / 'plots').mkdir(parents=True, exist_ok=True)

# minimum values used as thresholds in the notebook
MIN_VOLUME = 10 # hm³
MIN_AREA = 5 # km²
MIN_CATCHMENT = 50 # km²
# MIN_DOR = 30 # days

## DATA

### GLOFAS4

#### Lakes

In [None]:
# load GLOFAS4 lakes
glofas4 = gpd.read_file(PATH_GLOFAS4 / 'tables' / 'GLOFAS4_lakes.shp').set_index('LakID', drop=True)

# add attributes from the tables: every file 'lake<var>.txt' is read as a
# space-separated table with two columns (lake ID and value) and becomes column <var>
# NOTE(review): `efas_attrs` is not defined in the visible notebook; the clean-up is kept as it was
if 'efas_attrs' in locals():
    del efas_attrs
prefix = 'lake'
for file in (PATH_GLOFAS4 / 'tables').glob(f'{prefix}*.txt'):
    var = file.stem.removeprefix(prefix)
    try:
        df = pd.read_csv(file, sep=' ', header=None)
        # a trailing separator creates an empty column
        df.dropna(axis=1, how='all', inplace=True)
        df.columns = ['LakID', var]
        df.set_index('LakID', inplace=True, drop=True)
        # assign the Series (aligned by LakID) rather than a one-column DataFrame
        glofas4[var] = df[var]
    except Exception as error:
        # best effort: report the table that could not be added and carry on.
        # `Exception` (instead of a bare `except`) lets KeyboardInterrupt/SystemExit through
        print(file, repr(error))
        continue
# convert area to km2
glofas4['area'] /= 1e6
# 'a' is the attribute name left after removing the prefix 'lake' from the file name
glofas4.rename(columns={'a': 'alpha'}, inplace=True)
glofas4.drop(['AREA_KM2', 'LON_ORG', 'LAT_ORG', 'LISFLOODX', 'LISFLOODY'], axis=1, inplace=True)

In [None]:
# number of GLOFAS4 lakes per value of the column SOURCE
glofas4['SOURCE'].value_counts()

#### Static_maps

In [None]:
# folder with the GLOFAS4 static maps
path_maps = PATH_GLOFAS4 / 'static_maps'

# upstream area
uparea = rxr.open_rasterio(path_maps / 'upArea_European_01min.nc')
uparea = uparea.squeeze(dim='band')

# channel width: negative cells are replaced by NaN
chanbw = rxr.open_rasterio(path_maps / 'chanbw_European_01min.nc')
chanbw = chanbw.squeeze(dim='band')
chanbw = chanbw.where(chanbw >= 0, np.nan)

### GLOFAS5
#### Lakes

In [None]:
# import shapefile
glofas5 = gpd.read_file(PATH_GLOFAS5 / 'lakes' / 'LakesEfasV6.shp')

# keep only selected lakes (those whose `checked` value is not 3)
glofas5 = glofas5[glofas5.checked != 3]

# select columns; the explicit copy makes the assignments below act on an
# independent table instead of a slice of the original (no SettingWithCopyWarning)
id_cols = ['LakID', 'GLWD_ID', 'Hylak_id']
glofas5 = glofas5[id_cols + ['geometry']].copy()

# remove IDs equal to 0: IDs become nullable integers and 0 is replaced by a missing value
for col in id_cols:
    glofas5[col] = glofas5[col].astype('Int64')
    glofas5.loc[glofas5[col] == 0, col] = pd.NA

# glofas5.columns = [col.upper() if col != 'geometry' else col for col in glofas5.columns]

# correct HydroLakes IDs using the mapping LakID -> Hylak_id in the YAML file
with open('lakes_GLOFAS4_HydroLakes.yaml', 'r') as file:
    map_efas_hylak = yaml.safe_load(file)
correct_hylak_ids = glofas5.LakID.map(map_efas_hylak).astype('Int64').dropna()
glofas5.loc[correct_hylak_ids.index, 'Hylak_id'] = correct_hylak_ids.values

# add GLWD IDs using the mapping LakID -> GLWD_ID in the YAML file
with open('lakes_GLOFAS4_GLWD.yaml', 'r') as file:
    map_efas_glwd = yaml.safe_load(file)
correct_glwd_ids = glofas5.LakID.map(map_efas_glwd).astype('Int64').dropna()
glofas5.loc[correct_glwd_ids.index, 'GLWD_ID'] = correct_glwd_ids.values

# sort by HydroLakes ID and use the new position as the feature ID (FID)
glofas5.sort_values(['Hylak_id'], ascending=True, inplace=True)
glofas5.reset_index(drop=True, inplace=True)
glofas5.index.name = 'FID'
print(f'{glofas5.shape[0]} lakes in GLOFAS5')
print(f'\t* {glofas5.LakID.notnull().sum()} were in GLOFAS4')
print(f'\t* {glofas5.Hylak_id.notnull().sum()} are in HydroLakes')
print(f'\t* {glofas5.GLWD_ID.notnull().sum()} are in GLWD')

In [None]:
# points = pd.concat((glofas5.geometry.x, glofas5.geometry.y), axis=1)
# points.columns = ['lon', 'lat']
# points.index.name = 'ID'

# points.to_csv(PATH_GLOFAS5 / 'ncextract' / f'lakes_glofas5_{glofas5.shape[0]}.csv')

#### Reservoirs with low degree of regulation

In [None]:
# reservoirs from the sheet 'as_lakes' of the GLOFAS5 reservoir table, indexed by FID
reservoirs_file = PATH_GLOFAS5 / 'reservoirs' / 'tables' / 'glofas5_reservoirs.xlsx'
reservoirs = pd.read_excel(reservoirs_file, sheet_name='as_lakes', index_col='FID')

# IDs as nullable integers
res_id_cols = ['ResID', 'GRanD_id', 'Hylak_id']
reservoirs[res_id_cols] = reservoirs[res_id_cols].astype('Int64')

### GLWD

In [None]:
# level 1: keep only the features whose GLWD_ID is assigned to a GLOFAS5 lake
glwd1 = gpd.read_file(PATH_GLWD / 'level1' / 'glwd_1.shp')
glwd1 = glwd1.set_index('GLWD_ID', drop=True)
glwd1 = glwd1.loc[glwd1.index.intersection(glofas5.GLWD_ID)]

# level 2: same selection
glwd2 = gpd.read_file(PATH_GLWD / 'level2' / 'glwd_2.shp')
glwd2 = glwd2.set_index('GLWD_ID', drop=True)
glwd2 = glwd2.loc[glwd2.index.intersection(glofas5.GLWD_ID)]

# concatenate both levels in a single table
glwd = pd.concat([glwd1, glwd2], axis=0)

# warn if some GLWD_ID in "glofas5" was not found (or was found more than once)
n_with_glwd = glofas5.GLWD_ID.notnull().sum()
if glwd.shape[0] != n_with_glwd:
    print('The number of lakes in "glwd" does not match the number of lakes in "glofas5" with assigned GLWD_ID')

### HydroLakes

In [None]:
# HydroLakes features whose Hylak_id is assigned to a GLOFAS5 lake
hylak = gpd.read_file(PATH_GLOFAS5 / 'lakes' / 'hydrolakes_domain.shp')
hylak = hylak.set_index('Hylak_id', drop=True)
hylak = hylak.loc[hylak.index.intersection(glofas5.Hylak_id)]
# a Grand_id of 0 is treated as missing; IDs are stored as nullable integers
hylak['Grand_id'] = hylak['Grand_id'].replace(0, np.nan).astype('Int64')

# warn if the selection does not contain every distinct Hylak_id in "glofas5"
n_hylak = hylak.shape[0]
n_glofas5 = glofas5.Hylak_id.nunique()
if n_hylak != n_glofas5:
    print('The number of lakes in "hylak" ({0}) does not match the number of lakes in "glofas5" with assigned Hylak_id ({1})'.format(n_hylak, n_glofas5))

In [None]:
# number of selected HydroLakes features per lake type
hylak['Lake_type'].value_counts()

There are 4 lakes in the selection that HydroLakes classifies as controlled lakes; they therefore have a `Grand_id`, which I add to the `glofas5` table as `GRanD_id`.

In [None]:
# add the GRanD ID to the lakes in GLOFAS5, looked up in HydroLakes through the Hylak_id
grand_by_hylak = hylak['Grand_id']
glofas5['GRanD_id'] = glofas5['Hylak_id'].map(grand_by_hylak)

## Attributes

### Catchment area

In [None]:
# one column per data source, plus the selected value (GLOFAS5) and its origin (SOURCE)
catchment = pd.DataFrame(index=glofas5.index, columns=['UPAREA', 'GLWD', 'HYLAK', 'GLOFAS5', 'SOURCE'], dtype=float)
catchment['SOURCE'] = catchment['SOURCE'].astype(str)

# upstream area map: value of the cell nearest to each lake point, multiplied by 1e-6
for fid, geom in glofas5.geometry.items():
    cell = uparea.sel(x=geom.x, y=geom.y, method='nearest')
    catchment.loc[fid, 'UPAREA'] = cell.item() * 1e-6

# GLWD: attribute CATCH_TSKM multiplied by 1e3
with_glwd = glofas5.loc[glofas5.GLWD_ID.notnull(), 'GLWD_ID']
catchment.loc[with_glwd.index, 'GLWD'] = glwd.loc[with_glwd.values, 'CATCH_TSKM'].values * 1e3

# HydroLakes: attribute Wshd_area, only for the IDs found in "hylak"
with_hylak = glofas5.loc[glofas5.Hylak_id.isin(hylak.index), 'Hylak_id']
catchment.loc[with_hylak.index, 'HYLAK'] = hylak.loc[with_hylak.values, 'Wshd_area'].values

#### Comparison

In [None]:
# pair-wise comparison of the catchment area given by each source
# NOTE(review): the threshold is MIN_AREA (lake area), not MIN_CATCHMENT — confirm this is intended
catchment_sources = ['UPAREA', 'GLWD', 'HYLAK']
compare_attributes(
    catchment[catchment_sources],
    title='catchment (km²)',
    thr=MIN_AREA,
    vmin=1,
    vmax=1e7
)
# plt.savefig(PATH_OUT / 'plots' / 'catchment_pairplot.jpg', dpi=300, bbox_inches='tight')

#### Select values

In [None]:
# select the first available value following the priority HYLAK, GLWD, UPAREA
for source in ['HYLAK', 'GLWD', 'UPAREA']:
    # lakes still without a value for which this source has one
    fill = catchment.GLOFAS5.isnull() & catchment[source].notnull()
    catchment.loc[fill, 'GLOFAS5'] = catchment.loc[fill, source]
    catchment.loc[fill, 'SOURCE'] = source

# map of the selected catchment area (colour in logarithmic scale)
fig, ax = plt.subplots(figsize=(20, 5), subplot_kw=dict(projection=ccrs.PlateCarree()))
ax.add_feature(cfeature.NaturalEarthFeature('physical', 'land', '10m', edgecolor='face', facecolor='lightgray'), alpha=.5, zorder=0)
sct = ax.scatter(
    glofas5.geometry.x,
    glofas5.geometry.y,
    c=np.log10(catchment.GLOFAS5),
    cmap='viridis_r',
    s=5,
    # vmin=0,
    # vmax=2.5
)
cbar = plt.colorbar(sct, shrink=.5, label='catchment (km2)')
# ticks are placed on the log10 values and labelled with the actual area
ticks = [2, 3, 4, 5]
cbar.set_ticks(ticks)
cbar.set_ticklabels([10**x for x in ticks])
plt.axis('off');
plt.savefig(PATH_OUT / 'plots' / 'catchment_map.jpg', dpi=300, bbox_inches='tight')

# add values to GLOFAS5: one column at a time (aligned by index), so that CATCH_SKM
# keeps its float dtype instead of becoming `object` together with the text column
glofas5['CATCH_SKM'] = catchment['GLOFAS5']
glofas5['CATCH_SRC'] = catchment['SOURCE']
# the count uses the same threshold that the message reports
n_small = (glofas5.CATCH_SKM < MIN_CATCHMENT).sum()
print(f'{n_small} lakes do not comply with the minimum catchment area of {MIN_CATCHMENT} km²')

### Lake area

In [None]:
# one column per data source, plus the selected value (GLOFAS5) and its origin (SOURCE)
area = pd.DataFrame(index=glofas5.index, columns=['GLOFAS4', 'GLWD', 'HYLAK', 'GLOFAS5', 'SOURCE'], dtype=float)
area['SOURCE'] = area['SOURCE'].astype(str)

# GLOFAS4: lakes that have a LakID
with_lakid = glofas5.loc[glofas5.LakID.notnull(), 'LakID']
area.loc[with_lakid.index, 'GLOFAS4'] = glofas4.loc[with_lakid.values, 'area'].values

# GLWD: attribute AREA_SKM
with_glwd = glofas5.loc[glofas5.GLWD_ID.notnull(), 'GLWD_ID']
area.loc[with_glwd.index, 'GLWD'] = glwd.loc[with_glwd.values, 'AREA_SKM'].values

# HydroLakes: attribute Lake_area, only for the IDs found in "hylak"
with_hylak = glofas5.loc[glofas5.Hylak_id.isin(hylak.index), 'Hylak_id']
# with_hylak = glofas5.loc[glofas5.Hylak_id.notnull(), 'Hylak_id']
area.loc[with_hylak.index, 'HYLAK'] = hylak.loc[with_hylak.values, 'Lake_area'].values

In [None]:
# number of missing values in each column of the area table
area.isna().sum()

#### Comparison

In [None]:
# pair-wise comparison of the lake area given by each source
area_sources = ['GLOFAS4', 'GLWD', 'HYLAK']
compare_attributes(
    area[area_sources],
    title='area (km2)',
    thr=MIN_AREA,
    vmin=1e-1,
    vmax=1e6
)
area_pairplot = PATH_OUT / 'plots' / 'area_pairplot.jpg'
plt.savefig(area_pairplot, dpi=300, bbox_inches='tight')

#### Select values

Some `Hylak_id` are repeated in GLOFAS5 because HydroLakes considers as a single lake cases where GLWD (and therefore GLOFAS4) considers several lakes. Therefore, using the HYLAK value will overestimate total area. I will use as main source GLWD, then GLOFAS4 and lastly HYLAK.

In [None]:
# select the first available value following the priority GLWD, GLOFAS4, HYLAK
for source in ['GLWD', 'GLOFAS4', 'HYLAK']:
    # lakes still without a value for which this source has one
    fill = area.GLOFAS5.isnull() & area[source].notnull()
    area.loc[fill, 'GLOFAS5'] = area.loc[fill, source]
    area.loc[fill, 'SOURCE'] = source

# map of the selected lake area (colour in logarithmic scale)
fig, ax = plt.subplots(figsize=(20, 5), subplot_kw=dict(projection=ccrs.PlateCarree()))
ax.add_feature(cfeature.NaturalEarthFeature('physical', 'land', '10m', edgecolor='face', facecolor='lightgray'), alpha=.5, zorder=0)
sct = ax.scatter(
    glofas5.geometry.x,
    glofas5.geometry.y,
    c=np.log10(area.GLOFAS5),
    cmap='viridis_r',
    s=5,
    # vmin=0,
    # vmax=2.5
)
cbar = plt.colorbar(sct, shrink=.5, label='area (km2)')
# ticks are placed on the log10 values and labelled with the actual area
ticks = [2, 3, 4]
cbar.set_ticks(ticks)
cbar.set_ticklabels([10**x for x in ticks])
plt.axis('off');
plt.savefig(PATH_OUT / 'plots' / 'area_map.jpg', dpi=300, bbox_inches='tight')

# add values to GLOFAS5: one column at a time (aligned by index), so that AREA_SKM
# keeps its float dtype instead of becoming `object` together with the text column
glofas5['AREA_SKM'] = area['GLOFAS5']
glofas5['AREA_SRC'] = area['SOURCE']

n_small = (glofas5.AREA_SKM < MIN_AREA).sum()
print(f'{n_small} lakes do not comply with the minimum lake area of {MIN_AREA} km2')

### $\alpha$: width of the outlet

In [None]:
# one column per estimation method, plus the selected value (GLOFAS5) and its origin (SOURCE)
alpha = pd.DataFrame(index=glofas5.index, columns=['GLOFAS4', 'BUREK', 'EXP', 'CHANBW', 'GLOFAS5', 'SOURCE'], dtype=float)
alpha.SOURCE = alpha.SOURCE.astype(str)

# GLOFAS4
idx = glofas5[glofas5.LakID.notnull()].LakID
alpha.loc[idx.index, 'GLOFAS4'] = glofas4.loc[idx.values, 'alpha'].values

# using Burek's formula
alpha['BUREK'] = catchment.UPAREA * 0.0032

# fit a power function (width = area**c) to the GLOFAS4 data
# lakes with the minimum alpha are excluded (presumably a default value — TODO confirm)
# and so are rows with missing values, which `curve_fit` rejects
aux = glofas4.loc[glofas4.alpha != glofas4.alpha.min(), ['CATCH_AREA', 'alpha']].dropna()
params, covariance = curve_fit(width_from_area, aux.CATCH_AREA, aux.alpha, p0=[.5])
print('c = {0:.2f}'.format(params[0]))
# CATCH_SKM is cast to float in case it was stored with `object` dtype
alpha['EXP'] = width_from_area(glofas5.CATCH_SKM.astype(float), c=params[0])

# channel width
for ID, point in glofas5.geometry.items():
    alpha.loc[ID, 'CHANBW'] = outlet_width(chanbw, uparea, point.x, point.y, n_points=3)

#### Comparison 

In [None]:
# fig, axes = plt.subplots(ncols=3, figsize=(10, 3), sharex=True, sharey=True)

# vmin, vmax = .01, 1000
# for ax, col in zip(axes, ['CHANBW', 'BUREK', 'EXP']):
#     ax.scatter(alpha[col], alpha.GLOFAS4, s=8, alpha=.5)
#     ax.plot([vmin, vmax], [vmin, vmax], 'k', lw=.5, zorder=0)
#     ax.set(
#         xlabel=col,
#         xscale='log',
#         yscale='log'
#     )
#     if ax == axes[0]:
#         ax.set_ylabel('GLOFAS4')
#         ax.set(
#             xlim=(vmin, vmax),
#             ylim=(vmin, vmax)
#         )

In [None]:
# pair-wise comparison of the outlet width given by each method
width_methods = ['GLOFAS4', 'BUREK', 'EXP', 'CHANBW']
compare_attributes(
    alpha[width_methods],
    title='outlet width (m)',
    thr=0,
    vmin=.1,
    vmax=2000
)
width_pairplot = PATH_OUT / 'plots' / 'outlet_width_pairplot.jpg'
plt.savefig(width_pairplot, dpi=300, bbox_inches='tight')

#### Select values
I select the GLOFAS4 value where it is available. Otherwise, I use the power function fitted above (`EXP`: width = area^c).

In [None]:
# select the GLOFAS4 value if available; otherwise, the value from the fitted function (EXP)
for source in ['GLOFAS4', 'EXP']:
    # lakes still without a value for which this source has one
    fill = alpha.GLOFAS5.isnull() & alpha[source].notnull()
    alpha.loc[fill, 'GLOFAS5'] = alpha.loc[fill, source]
    alpha.loc[fill, 'SOURCE'] = source

# map of the selected outlet width
fig, ax = plt.subplots(figsize=(20, 5), subplot_kw=dict(projection=ccrs.PlateCarree()))
ax.add_feature(cfeature.NaturalEarthFeature('physical', 'land', '10m', edgecolor='face', facecolor='lightgray'), alpha=.5, zorder=0)
sct = ax.scatter(
    glofas5.geometry.x,
    glofas5.geometry.y,
    c=alpha.GLOFAS5,
    cmap='viridis_r',
    s=5,
    # vmin=0,
    # vmax=2.5
)
cbar = plt.colorbar(sct, shrink=.5, label='outlet width (m)')
plt.axis('off');
# plt.savefig(PATH_OUT / 'plots' / 'outlet_width_map.jpg', dpi=300, bbox_inches='tight')

# add values to GLOFAS5: one column at a time (aligned by index), so that WIDTH_M
# is stored as float instead of `object` together with the text column
glofas5['WIDTH_M'] = alpha['GLOFAS5'].astype(float)
glofas5['WIDTH_SRC'] = alpha['SOURCE']

### Average inflow

In [None]:
# one column per data source, plus the selected value (GLOFAS5) and its origin (SOURCE)
dis_avg = pd.DataFrame(index=glofas5.index, columns=['GLOFAS4', 'NAT_FLOW', 'GLWD', 'HYLAK', 'GLOFAS5', 'SOURCE'], dtype=float)
# the text column is initialised from this table (it was copied from `alpha` before)
dis_avg.SOURCE = dis_avg.SOURCE.astype(str)

# GLOFAS4
idx = glofas5[glofas5.LakID.notnull()].LakID
dis_avg.loc[idx.index, 'GLOFAS4'] = glofas4.loc[idx.values, 'avinflow'].values

# GLOFAS4 naturalised long-term run: mean discharge from 1993-01-02 onwards
with xr.open_dataset(PATH_GLOFAS5 / 'lakes' / 'ncextract' / 'dis_215.nc') as ds:
    dis = ds['dis'].sel(time=slice('1993-01-02', None))
    # xarray reads the data lazily, so the mean is computed before the file is closed;
    # the resulting series is aligned with the table by index
    dis_avg['NAT_FLOW'] = dis.mean('time').to_pandas()

# GLWD
idx = glofas5[glofas5.GLWD_ID.notnull()].GLWD_ID
dis_avg.loc[idx.index, 'GLWD'] = glwd.loc[idx.values, 'INFLOW_CMS'].values

# HydroLakes
idx = glofas5[glofas5.Hylak_id.isin(hylak.index)].Hylak_id
dis_avg.loc[idx.index, 'HYLAK'] = hylak.loc[idx.values, 'Dis_avg'].values

#### Comparison

In [None]:
# pair-wise comparison of the average inflow given by each source
inflow_sources = ['GLOFAS4', 'NAT_FLOW', 'GLWD', 'HYLAK']
compare_attributes(
    dis_avg[inflow_sources],
    title='average inflow (m3/s)',
    thr=0,
    vmin=0.01,
    vmax=1000
)
inflow_pairplot = PATH_OUT / 'plots' / 'dis_avg_pairplot.jpg'
plt.savefig(inflow_pairplot, dpi=300, bbox_inches='tight')

#### Select value

I will select in all cases the value from the GLOFAS4 naturalised run.

In [None]:
# the selected value is always the one from the naturalised run
dis_avg['GLOFAS5'] = dis_avg['NAT_FLOW']
dis_avg['SOURCE'] = 'GLOFAS4_natflow'

In [None]:
# average discharge divided by the catchment area; the factor 3.6 * 24 = 86.4
# converts m3/s per km2 into mm/day
specific_discharge = dis_avg['GLOFAS5'] / glofas5['CATCH_SKM'] * 3.6 * 24

# map of the specific discharge at every lake
land = cfeature.NaturalEarthFeature('physical', 'land', '10m', edgecolor='face', facecolor='lightgray')
fig, ax = plt.subplots(figsize=(20, 5), subplot_kw=dict(projection=ccrs.PlateCarree()))
ax.add_feature(land, alpha=.5, zorder=0)
sct = ax.scatter(
    glofas5.geometry.x,
    glofas5.geometry.y,
    c=specific_discharge,
    cmap='viridis_r',
    s=5,
)
cbar = plt.colorbar(sct, shrink=.5, label='average discharge\n(mm/day)')
plt.axis('off');
# plt.savefig(PATH_OUT / 'plots' / 'dis_avg_map.jpg', dpi=300, bbox_inches='tight')

In [None]:
# add values to GLOFAS5: the columns are taken from `dis_avg` (not `alpha`) and assigned
# one at a time (aligned by index), so that DISAVG_CMS keeps its float dtype
# instead of becoming `object` together with the text column
glofas5['DISAVG_CMS'] = dis_avg['GLOFAS5']
glofas5['DIS_SRC'] = dis_avg['SOURCE']

## Export

In [None]:
lakes = glofas5.copy()

# add coordinates in the LISFLOOD grid
lakes['LisfloodX'] = lakes.geometry.x
lakes['LisfloodY'] = lakes.geometry.y

# add info from HydroLakes (HydroLakes column -> column in the lake table)
hylak_cols = {'Lake_name': 'LAKE_NAME',
              'Country': 'COUNTRY',
              'Pour_long': 'LONG_DD',
              'Pour_lat': 'LAT_DD'}
# every attribute is looked up through the Hylak_id, one column at a time:
#   * no dict is passed as a `.loc` indexer (not supported in pandas >= 2.0)
#   * lakes whose Hylak_id is missing or absent from `hylak` get a missing value
#   * each column keeps its own dtype (coordinates stay numeric)
for hylak_col, lake_col in hylak_cols.items():
    lakes[lake_col] = lakes.Hylak_id.map(hylak[hylak_col])

# reorder columns: keep, in the order given by `cols`, those available in the table
cols = ['LakID', 'ResID', 'GLWD_ID', 'Hylak_id', 'GRanD_id', 'ICOLD_id',
        'LAKE_NAME', 'COUNTRY',
        'LONG_DD', 'LAT_DD', 'LisfloodX', 'LisfloodY',
        'CATCH_SKM', 'CATCH_SRC', 'AREA_SKM', 'AREA_SRC', 'WIDTH_M', 'WIDTH_SRC', 'DISAVG_CMS', 'DIS_SRC']
lakes = lakes[[col for col in cols if col in lakes.columns]]

In [None]:
# compute the outlet width of the reservoirs with the fitted function
exponent = params[0]
reservoirs['WIDTH_M'] = width_from_area(reservoirs['CATCH_SKM'], c=exponent)
reservoirs['WIDTH_SRC'] = 'EXP'
# rename the name column and keep only the columns listed in `cols`
reservoirs.rename(columns={'RES_NAME': 'LAKE_NAME'}, inplace=True, errors='ignore')
kept_cols = reservoirs.columns.intersection(cols)
reservoirs = reservoirs[kept_cols]

In [None]:
# merge lakes and reservoirs in a single table with the columns in `cols`
lakes_reservoirs = pd.concat([lakes, reservoirs], axis=0)
lakes_reservoirs = lakes_reservoirs[cols]
lakes_reservoirs.sort_values(by=['LakID', 'ResID'], inplace=True)

In [None]:
# reset index: rows with a LakID keep it as FID;
# the rest get consecutive numbers starting at 1001
idx = []
i = 1000
for lak_id in lakes_reservoirs['LakID'].values:
    if not pd.isna(lak_id):
        idx.append(lak_id)
        continue
    i += 1
    idx.append(i)
lakes_reservoirs['FID'] = idx
lakes_reservoirs.set_index('FID', drop=True, inplace=True)
lakes_reservoirs.sort_index(axis=0, inplace=True)

In [None]:
# export the table to Excel (floats written with 4 decimals)
xlsx_file = PATH_OUT / 'glofas5_lakes.xlsx'
lakes_reservoirs.to_excel(xlsx_file, float_format='%.4f')