# Download DAHITI
***

***Author:** Chus Casado Rodríguez*<br>
***Date:** 10-02-2025*<br>

**Description**<br>

This code downloads data from the [DAHITI](https://dahiti.dgfi.tum.de/en/) dataset using its API version 2. 

It requires two inputs:
1. A TXT file (_api_key.txt_) with the API key associated to your DAHITI user.
2. A YAML file with the DAHITI ID of the points of interes.

It searches for available time series of water level, surface area and volume variation.

In [16]:
import os
os.environ['USE_PYGEOS'] = '0'
import numpy as np
import pandas as pd
import requests
import json
import yaml
from tqdm.auto import tqdm
import geopandas as gpd
from shapely import Point

In [3]:
import pprint
from pathlib import Path

## Configuration

In [4]:
PATH_DAHITI = Path('Z:/nahaUsers/casadje/datasets/DAHITI')

API_URL = 'https://dahiti.dgfi.tum.de/api/v2/'

# personal API key
with open('api_key.txt', 'r') as txt:
    api_key = txt.readline().strip()

# selection of targets
with open('arguments.yml', 'r') as file:
    args = yaml.safe_load(file)
args['api_key'] = api_key

print(args)

{'api_key': '2676835AF649F11F825EB24729EEACFDC945263C0F12F2561CF24D315F8F10D1'}


## Data
### Targets

In [10]:
# requests targets
response = requests.post(
    url=f'{API_URL}/list-targets/',
    json=args
)

if response.status_code == 200:
    targets = json.loads(response.text)
else:
    error = json.loads(response.text)
    print('Error {0}:\t{1}'.format(error['code'], error['message']))

In [11]:
# export list of targets
with open(PATH_DAHITI / 'list-targets.json', 'w') as f:
    json.dump(targets['data'], f, indent=4)
print('DAHITI contains {0} targets.'.format(len(targets['data'])))

DAHITI contains 11241 targets.


In [33]:
# filter reservoirs by country
reservoirs = [target for target in targets['data'] if target['type'].lower() == 'reservoir']
print('DAHITI contains {0} reservoirs.'.format(len(reservoirs)))

# filter reservoirs by country
reservoirs = [reservoir for reservoir in reservoirs if reservoir['country'] in ['Brazil', 'Argentina', 'Paraguay']]
print('DAHITI contains {0} reservoirs in Brazil, Argentina and Paraguay.'.format(len(reservoirs)))

# convert to DataFrame
reservoir_list = []
variables = []
for reservoir in reservoirs:
    serie = pd.concat((
        pd.Series({key: value for key, value in reservoir.items() if key != 'data_access'}),
        pd.Series(reservoir['data_access'])
    ), axis=0)
    variables += list(reservoir['data_access'])
    reservoir_list.append(serie)
reservoirs = pd.concat(reservoir_list, axis=1).T
reservoirs.dahiti_id = reservoirs.dahiti_id.astype(int)
reservoirs[['longitude', 'latitude']] = reservoirs[['longitude', 'latitude']].astype(float)
variables = list(set(variables))
reservoirs[variables] = reservoirs[variables].astype(bool)
reservoirs.set_index('dahiti_id', drop=True, inplace=True)

# convert to GeoDataFrame
geometry = [Point(xy) for xy in zip(reservoirs.longitude, reservoirs.latitude)]
reservoirs = gpd.GeoDataFrame(reservoirs, geometry=geometry)
reservoirs = reservoirs.set_crs(epsg=4326)

DAHITI contains 528 reservoirs.
DAHITI contains 70 reservoirs in Brazil, Argentina and Paraguay.


In [29]:
# export shapefile of reservoirs
reservoirs.to_file(PATH_DAHITI / 'reservoirs_BrArPy.shp', driver='ESRI Shapefile')

  reservoirs.to_file(PATH_DAHITI / 'reservoirs_BrArPy.shp', driver='ESRI Shapefile')


In [131]:
# reservoirs.to_fi(PATH_DAHITI / 'reservoirs_BrArPy.csv')

### Timeseries

In [42]:
variables = [var.replace('_', '-') for var in variables]

In [72]:
reservoirs_parana = gpd.read_file(PATH_DAHITI / 'reservoirs_parana.shp').set_index('dahiti_id', drop=True)
reservoirs_parana.index = reservoirs_parana.index.astype(int)
reservoirs_parana.GRAND_ID = reservoirs_parana.GRAND_ID.astype('Int64')
rename_cols = {
    'water_leve': 'water-level',
    'surface_ar': 'surface-area',
    'water_occu': 'water-occurrence-mask',
    'land_water': 'land-water-mask',
    'volume_var': 'volume-variation',
    'hypsometry': 'hypsometry',
    'bathymetry': 'bathymetry',
    'water_le_1': 'water-level-hypsometry',
    'discharge': 'discharge'
}
reservoirs_parana[list(rename_cols)] = reservoirs_parana[list(rename_cols)].astype(bool)
reservoirs_parana.rename(columns=rename_cols, inplace=True)

In [74]:
# with open('reservoirs_Parana.yml', 'r', encoding='utf-8') as file:
#     reservoirs = yaml.safe_load(file)

VARS = [
    'water-level',
    'surface-area',
    'volume-variation',
    'hypsometry'
]

attributes = pd.DataFrame()
timeseries = {}
for ID in tqdm(reservoirs_parana.index):
    ts_id = pd.DataFrame(dtype=float)
    for var in VARS:
        if reservoirs_parana.loc[ID, var] is False:
            continue

        # request info to the API
        response = requests.post(
            f'{API_URL}download-{var}/',
            json={
                'api_key': api_key,
                'format': 'json',
                'dahiti_id': ID,
            }
        )

        if response.status_code == 200:
            # load as dictionary
            data = json.loads(response.text)
            
            # extract attributes
            try:
                attributes = pd.concat(
                    (
                        attributes,
                        pd.DataFrame.from_dict([data['info']]).set_index('dahiti_id')
                    ),
                    axis=0)
            except Exception as e:
                print(f'Attributes from ID {ID} could not be retrieved:\n{e}', flush=True)

            # extract timeseries
            try:
                df = pd.DataFrame.from_dict(data['data'])
                index_col = [col for col in df.columns if col.startswith('date')][0]
                df.set_index(index_col, drop=True, inplace=True)
                df.index.name = 'date'
                df.index = pd.to_datetime(df.index).date
                # concatenate to the timeseries of other variables
                ts_id = pd.concat((ts_id, df), axis=1)
            except Exception as e:
                print(f'Time series from ID {ID} could not be retrieved:\n{e}', flush=True)
            
            del data, df
        else:
            error = json.loads(response.text)
            print('Error {0}:\t{1}'.format(error['code'], error['message']))
            continue
            
    if len(ts_id) > 0:
        ts_id.sort_index(inplace=True)
        timeseries[ID] = ts_id

  0%|          | 0/25 [00:00<?, ?it/s]

Error 403:	Permission Denied (to access target `11473`)!
Error 403:	Permission Denied (to access target `11473`)!
Error 403:	Permission Denied (to access target `11473`)!
Error 403:	Permission Denied (to access target `41419`)!
Error 403:	Permission Denied (to access target `41419`)!
Error 403:	Permission Denied (to access target `41419`)!
Error 403:	Permission Denied (to access target `41420`)!
Error 403:	Permission Denied (to access target `41420`)!
Error 403:	Permission Denied (to access target `41420`)!
Error 403:	Permission Denied (to access target `10344`)!
Error 403:	Permission Denied (to access target `10344`)!
Error 403:	Permission Denied (to access target `10344`)!
Error 403:	Permission Denied (to access target `10346`)!
Error 403:	Permission Denied (to access target `10346`)!
Error 403:	Permission Denied (to access target `10346`)!
Error 403:	Permission Denied (to access target `11266`)!
Error 403:	Permission Denied (to access target `11266`)!
Error 403:	Permission Denied (t

NameError: name 'df' is not defined

In [None]:
# add boolean columns with available variables
variables = attributes.dataset.unique().tolist()
attributes[variables] = 0
attributes.variables = attributes.variables.astype(boolean)
for ID in attributes.index.unique():
    vars_id = attributes.loc[[ID], 'dataset'].unique().tolist()
    attributes.loc[ID, vars_id] = 1
attributes.drop('dataset', axis=1, inplace=True)

In [None]:
# remove duplicated IDs
attributes.drop_duplicates(keep='first', inplace=True)

## Export