# Download DAHITI
***

***Author:** Chus Casado Rodríguez*<br>
***Date:** 17-02-2025*<br>

**Description**<br>

This code downloads data from the [DAHITI](https://dahiti.dgfi.tum.de/en/) dataset using its API version 2. 

It requires two inputs:
1. A TXT file (_api_key.txt_) with the API key associated with your DAHITI account.
2. A YAML file with the configuration: URL, type of point of interest (river, reservoir), countries of interest, variables of interest, and output path.

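For every selected target, it downloads the available time series of the requested variables (water level, surface area and volume variation) and saves them as one CSV file per target. The targets are read from an existing shapefile (_DAHITI_{type}.shp_) located in the _{type}/targets_ subfolder of the output path.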

In [1]:
import os
os.environ['USE_PYGEOS'] = '0'
import numpy as np
import pandas as pd
import requests
import json
import yaml
from tqdm.auto import tqdm
import geopandas as gpd
from shapely import Point
import pprint
from pathlib import Path

## Configuration

In [9]:
# configuration
with open('config.yml', 'r', encoding='utf-8') as file:
    # an empty YAML file is parsed as None; fall back to an empty mapping so the defaults apply
    cfg = yaml.safe_load(file) or {}

API_URL = cfg.get('api_url', 'https://dahiti.dgfi.tum.de/api/v2/')
# the request URLs are built as f'{API_URL}download-{var}/', so the base URL must end with a slash
if not API_URL.endswith('/'):
    API_URL += '/'

# TYPE is used to build both the input and the output paths, so it cannot be omitted
TYPE = cfg.get('type', None)
if TYPE is None:
    raise ValueError("The configuration file must define 'type' (e.g. river, reservoir)")

# COUNTRY is either None (no filter) or a list of country names;
# a single name given as a scalar in the YAML file is wrapped into a list
COUNTRY = cfg.get('country', None)
if isinstance(COUNTRY, str):
    COUNTRY = [COUNTRY]

# VARS is always a list of variable names: iterating over a bare string would yield single characters
VARS = cfg.get('variables', ['water-level'])
if isinstance(VARS, str):
    VARS = [VARS]

PATH_DAHITI = Path(cfg.get('output_path', './'))

# personal API key (first line of the file, surrounding whitespace removed)
with open('api_key.txt', 'r', encoding='utf-8') as txt:
    api_key = txt.readline().strip()
if not api_key:
    raise ValueError("The first line of 'api_key.txt' is empty; it must contain the DAHITI API key")

In [3]:
# Mapping between the long variable names (keys) and the short column names (values)
# used in the table of targets. The cell that loads the targets renames the short
# columns to the long names with underscores replaced by hyphens; the 'target_name'
# entry is left out of that renaming.
MAP_VAR_NAMES = dict(
    target_name='name',
    water_level_altimetry='level',
    surface_area='area',
    water_occurrence_mask='occurrence',
    land_water_mask='land_water',
    volume_variation='volume',
    water_level_hypsometry='level_hyps',
)

## Targets

### Load

In [8]:
# load targets: one row per DAHITI target, indexed by its DAHITI identifier
input_file = PATH_DAHITI / TYPE / 'targets' / f'DAHITI_{TYPE}.shp'
targets = gpd.read_file(input_file).set_index('dahiti_id')
print(f'Loaded input file: {input_file}')
print('DAHITI contains {0} targets of type {1}'.format(len(targets), TYPE))

# filter targets by country; when no country is configured, all targets are kept
# (previously `targets_sel` was left undefined in that case and the cell failed with a NameError)
if COUNTRY is not None:
    # accept a single country name as well as a list of names
    countries = [COUNTRY] if isinstance(COUNTRY, str) else list(COUNTRY)
    mask_country = targets.country.isin(countries)
    targets_sel = targets[mask_country].copy()
    print('DAHITI contains {0} targets of type {1} in {2}.'.format(len(targets_sel), TYPE, ', '.join(countries)))
else:
    targets_sel = targets.copy()

# # load shapefile of selected targets
# targets_sel = gpd.read_file(PATH_DAHITI / TYPE / 'targets' / 'DAHITI_reservoir_krishna.shp').set_index('dahiti_id', drop=True)
# targets_sel.index = targets_sel.index.astype(int)
# if 'GRAND_ID' in targets_sel.columns:
#     targets_sel.GRAND_ID = targets_sel.GRAND_ID.astype('Int64')

# rename columns: from the short names in the shapefile to the long, hyphenated variable names
map_columns = {new: old.replace('_', '-') for old, new in MAP_VAR_NAMES.items() if 'name' not in old}
targets_sel = targets_sel.rename(columns=map_columns)
# the variable columns are cast to integers; the download cell skips a variable when its value is 0
cols = list(map_columns.values())
targets_sel[cols] = targets_sel[cols].astype(int)
# the download cell looks up each requested variable name as a column, hence the shorter 'water-level'
targets_sel = targets_sel.rename(columns={'water-level-altimetry': 'water-level'})

## Time series
### Download

In [10]:
# seconds to wait for the API before giving up on a single request;
# without a timeout one stalled connection would block the notebook indefinitely
REQUEST_TIMEOUT = 60

# download the time series: {target ID: DataFrame indexed by date, with the columns of all variables}
timeseries = {}
for ID in tqdm(targets_sel.index):
    ts_id = pd.DataFrame(dtype=float)

    for var in VARS:
        # a value of 0 in the table of targets means there is nothing to request for this variable
        if targets_sel.loc[ID, var] == 0:
            continue

        # request info to the API; a network failure skips this variable instead of
        # aborting the loop and losing the time series already downloaded
        try:
            response = requests.post(
                f'{API_URL}download-{var}/',
                json={
                    'api_key': api_key,
                    'format': 'json',
                    'dahiti_id': ID,
                },
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as e:
            print(f'Request of {var} for target {ID} failed:\n{e}', flush=True)
            continue

        if response.status_code != 200:
            # the error body is expected to be JSON with 'code' and 'message', but e.g. a gateway
            # error page is not; fall back to the HTTP status in that case
            try:
                error = json.loads(response.text)
                detail = '{0}:\t{1}'.format(error['code'], error['message'])
            except (ValueError, KeyError, TypeError):
                detail = f'HTTP status {response.status_code}'
            print('Error while downloading {0} for target {1}:\n{2}'.format(var, ID, detail))
            continue

        # extract timeseries (best effort: a malformed response only skips this variable)
        try:
            # load as dictionary
            data = json.loads(response.text)
            df = pd.DataFrame.from_dict(data['data'])
            # the time column is the first one whose name starts with 'date'
            index_col = [col for col in df.columns if col.startswith('date')][0]
            df = df.set_index(index_col, drop=True)
            df.index = pd.to_datetime(df.index).date
            # every variable carries an 'error' column; prefix it so that they do not collide
            df = df.rename(columns={'error': f'{var}_error'})
            df.index.name = 'date'
            # concatenate to the timeseries of other variables
            ts_id = pd.concat((ts_id, df), axis=1)
        except Exception as e:
            print(f'Time series from ID {ID} could not be retrieved:\n{e}', flush=True)

    # keep only the targets for which some data was retrieved
    if len(ts_id) > 0:
        ts_id.sort_index(inplace=True)
        timeseries[ID] = ts_id

  0%|          | 0/38 [00:00<?, ?it/s]

### Export

In [11]:
# export timeseries: one CSV file per target, named after the target ID
PATH_OUT = PATH_DAHITI / TYPE / 'time_series'
PATH_OUT.mkdir(parents=True, exist_ok=True)
for ID, ts in timeseries.items():
    # build the destination once and use it for both writing and reporting
    output_file = PATH_OUT / f'{ID}.csv'
    ts.to_csv(output_file)
    print(f'File saved: {output_file}')

File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\41627.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\14940.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\41625.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\7031.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\41626.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\13024.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\2256.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\2257.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\13028.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\39439.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\13075.csv
File saved: Z:\nahaUsers\casadje\datasets\DAHITI\reservoir\time_series\13073.csv
File saved: Z:\nahaUsers\casadj