# Download DAHITI
***

***Author:** Chus Casado Rodríguez*<br>
***Date:** 10-02-2025*<br>

**Description**<br>

This code downloads data from the [DAHITI](https://dahiti.dgfi.tum.de/en/) dataset using its API version 2. 

It requires two inputs:
1. A TXT file (_api_key.txt_) with the API key associated with your DAHITI user.
2. A YAML file with the DAHITI IDs of the points of interest.

It searches for available time series of water level, surface area and volume variation.

In [7]:
import numpy as np
import pandas as pd
import requests
import json
import yaml
from tqdm.auto import tqdm

# --- inputs ---------------------------------------------------------------
# API key: first line of the text file, stripped of surrounding whitespace
with open('api_key.txt', 'r') as key_file:
    api_key = key_file.readline().strip()

# DAHITI IDs to download, parsed from the YAML file
with open('reservoirs_Parana.yml', 'r', encoding='utf-8') as yaml_file:
    reservoirs = yaml.safe_load(yaml_file)

# --- API configuration ----------------------------------------------------
# root of the API; each endpoint is built as f'{URL}download-{var}/'
URL = 'https://dahiti.dgfi.tum.de/api/v2/'
# variables requested for every ID ('hypsometry' is currently disabled)
VARS = ['water-level', 'surface-area', 'volume-variation']

# Download the attributes and time series of every ID and variable.
# - `attributes`: one row per (ID, variable) successfully retrieved, indexed by 'dahiti_id'
# - `timeseries`: maps each ID to a DataFrame indexed by date, one block of columns per variable
attributes = pd.DataFrame()
attribute_frames = []  # collected here and concatenated once after the loop
timeseries = {}
quota_exhausted = False
for ID in tqdm(reservoirs):
    ts_id = pd.DataFrame(dtype=float)
    for var in VARS:

        # request info to the API
        response = requests.post(f'{URL}download-{var}/',
                                 json={
                                     'api_key': api_key,
                                     'dahiti_id': ID,
                                     'format': 'json'
                                 })

        if response.status_code != 200:
            # the API is expected to describe the error as JSON with 'code' and
            # 'message'; fall back to the raw response when it does not
            try:
                error = json.loads(response.text)
                print('Error {0}:\t{1}'.format(error['code'], error['message']))
            except (ValueError, KeyError, TypeError):
                print('Error {0}:\t{1}'.format(response.status_code, response.text))
            if response.status_code == 429:
                # too many requests: any further request would be rejected as well
                quota_exhausted = True
                break
            continue

        # load as dictionary
        try:
            data = json.loads(response.text)
        except ValueError as e:
            print(f'Response for ID {ID} ({var}) is not valid JSON:\n{e}', flush=True)
            continue

        # extract attributes
        try:
            attribute_frames.append(
                pd.DataFrame.from_dict([data['info']]).set_index('dahiti_id')
            )
        except Exception as e:
            print(f'Attributes from ID {ID} could not be retrieved:\n{e}', flush=True)

        # extract timeseries
        try:
            df = pd.DataFrame.from_dict(data['data'])
            index_col = [col for col in df.columns if col.startswith('date')][0]
            df.set_index(index_col, drop=True, inplace=True)
            # convert to dates first and name the index afterwards: replacing the
            # index discards any name that was set before
            df.index = pd.to_datetime(df.index).date
            df.index.name = 'date'
            # concatenate to the timeseries of other variables
            ts_id = pd.concat((ts_id, df), axis=1)
        except Exception as e:
            print(f'Time series from ID {ID} could not be retrieved:\n{e}', flush=True)

    # keep whatever was retrieved for this ID, even if only some variables succeeded
    if len(ts_id) > 0:
        ts_id.sort_index(inplace=True)
        ts_id.index.name = 'date'
        timeseries[ID] = ts_id

    if quota_exhausted:
        print('Request quota reached: stopping the download.', flush=True)
        break

# build the attribute table in one go instead of growing it inside the loop
if attribute_frames:
    attributes = pd.concat(attribute_frames, axis=0)

  0%|          | 0/24 [00:00<?, ?it/s]

{"code": 429, "message": "You have reached the maximum number of allowed requests in last 24 hours (251 of 250 requests)"}
429


In [None]:
# add boolean columns with available variables
# `attributes` may hold several rows per ID (IDs are the index), each with its own
# value in 'dataset'; every row of an ID receives the same flags.
# NOTE(review): presumably 'dataset' names the variable each row was downloaded
# for -- confirm against the API response.
variables = attributes['dataset'].unique().tolist()
for var in variables:
    # IDs that have at least one row for this variable
    ids_with_var = attributes.index[attributes['dataset'] == var]
    # True for every row whose ID has the variable, False otherwise
    attributes[var] = attributes.index.isin(ids_with_var)
attributes.drop('dataset', axis=1, inplace=True)

In [None]:
# remove duplicated IDs
# The IDs are the index, which `drop_duplicates` does not take into account (it
# only compares column values), so keep the first row of every index value instead.
attributes = attributes[~attributes.index.duplicated(keep='first')]