Global Runoff Data Center (GRDC) https://portal.grdc.bafg.de/

Download process:
1. Pick sub-region(s) or station(s) of interest.
2. Submit data request form. Select to include watershed boundaries.
3. Download zip file from request email response.

Download components:
- stationbasins.geojson: metadata and geometry of the watershed upstream of each gauge.
- subregions.geojson: global geodataframe of subregions
- {stationID}_Q_Day.Cmd.txt: time series of Q (m3/s) for each station

In [34]:
import pandas as pd
import os
import geopandas as gpd
import numpy as np
import xarray as xr
import rioxarray as rxr
import codebase
from multiprocessing import Pool

In [46]:
# Directory holding the GRDC download. Paths below are built by plain string
# concatenation, so the trailing '/' is required.
datadir = '/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/'
# Basin label: filename prefix of the geojson files and the basin_str passed
# to the station-data loader.
basin = 'orange'

In [47]:
# Read the basin's sub-region polygons and per-station watershed metadata.
subregions_meta = gpd.read_file(f'{datadir}{basin}_subregions.geojson')
stations_meta = gpd.read_file(f'{datadir}{basin}_stationbasins.geojson')
# Index rows by GRDC station number while keeping 'grdc_no' as a regular column.
stations_meta = stations_meta.set_index('grdc_no', drop=False)

In [48]:
# Station IDs taken from the 'grdc_no' column; each one is later passed to
# get_start_stop_dates to read that station's record.
grdc_ids = stations_meta['grdc_no']
len(grdc_ids)  # number of stations listed in the basin metadata

122

In [49]:
def get_start_stop_dates(grdc_id_OI):
    """Return the first and last index values of one station's record.

    The record is read with ``codebase.load_data.load_GRDC_station_data_by_ID``
    using the notebook-level ``datadir`` and ``basin`` settings.

    Parameters
    ----------
    grdc_id_OI
        GRDC station ID of interest.

    Returns
    -------
    tuple
        ``(start_date, stop_date)`` taken from the first and last entries of
        the loaded frame's index, or ``(np.nan, np.nan)`` when the loaded
        frame is empty.
    """
    _metadata, station_df = codebase.load_data.load_GRDC_station_data_by_ID(
        grdc_id_OI,
        filepath=datadir,
        basin_str=basin,
    )

    # Stations whose file holds no observations get NaN placeholders.
    if station_df.empty:
        return np.nan, np.nan

    record_index = station_df.index
    return record_index[0], record_index[-1]

## REMOVE EMPTY DATA FILES

In [50]:
def identify_empty_files(stations_meta):
    """List the data filenames of stations that have no start and no stop date.

    Parameters
    ----------
    stations_meta : DataFrame
        Must contain 'start_date' and 'stop_date' columns; its index holds the
        station IDs (cast to int here).

    Returns
    -------
    list of str
        One '<id>_Q_Day.Cmd.txt' filename per station whose start and stop
        dates are both null.

    Notes
    -----
    Prints the index of matching station IDs as a side effect.
    """
    # A station counts as empty only when BOTH date columns are null.
    both_missing = stations_meta[['start_date', 'stop_date']].isnull().all(axis=1)
    empty_ids = both_missing[both_missing].index.astype(int)
    print(empty_ids)

    # Build the filename for each empty station ID.
    return [f'{station_id}_Q_Day.Cmd.txt' for station_id in empty_ids]

In [51]:
def remove_empty_files(stations_meta,data_dir,safety_on=True):
    """Delete the data files of stations that have no start and no stop date.

    Parameters
    ----------
    stations_meta : DataFrame
        Station metadata with 'start_date' and 'stop_date' columns; passed to
        ``identify_empty_files`` to get the filenames to delete.
    data_dir : str or path-like
        Directory that contains the station data files.
    safety_on : bool, default True
        When True (the default) nothing is deleted; the only effect is the
        ID listing printed by ``identify_empty_files``. Pass False to delete.

    Returns
    -------
    None
    """
    empty_files = identify_empty_files(stations_meta)

    # Dry run: stop before touching the filesystem.
    if safety_on:
        return

    for empty_file in empty_files:
        # Resolve against the data_dir argument. The previous version ignored
        # it and silently used the notebook-level global `datadir`.
        file_path = os.path.join(data_dir, empty_file)
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already removed (e.g. on a re-run); report and continue.
            print('No file',empty_file)

In [52]:
# Read every station's record in parallel. pool.map returns one
# (start_date, stop_date) tuple per station, in the same order as grdc_ids.
with Pool() as pool:
    result = pool.map(get_start_stop_dates, grdc_ids)

In [53]:
# One row per station, in the order of grdc_ids; column 0 holds the start
# date and column 1 the stop date returned by get_start_stop_dates.
result_df = pd.DataFrame(result)

In [55]:
# Assignment is positional (.values drops result_df's own index), so it relies
# on result_df rows being in the same order as the rows of stations_meta.
stations_meta[['start_date','stop_date']] = result_df.values
stations_meta[['start_date','stop_date']]

Unnamed: 0_level_0,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1159100.0,2000-01-01,2024-12-31
1159103.0,2000-01-01,2024-12-31
1159105.0,2014-05-14,2024-12-31
1159110.0,2000-01-01,2023-05-29
1159120.0,2000-01-01,2023-04-17
...,...,...
1259520.0,NaT,NaT
1259600.0,NaT,NaT
1259620.0,NaT,NaT
1259800.0,NaT,NaT


In [56]:
# Preview which stations have no data: the function prints their IDs, and the
# trailing ';' suppresses display of the returned filename list.
identify_empty_files(stations_meta);

Index([1159131, 1159321, 1159325, 1159410, 1159512, 1159600, 1159800, 1159850,
       1159855, 1159860, 1259111, 1259150, 1259151, 1259152, 1259170, 1259200,
       1259220, 1259230, 1259240, 1259250, 1259260, 1259300, 1259301, 1259310,
       1259320, 1259321, 1259360, 1259500, 1259510, 1259520, 1259600, 1259620,
       1259800, 1259810],
      dtype='int64', name='grdc_no')


In [None]:
# With safety_on = True this call only prints the IDs of the empty stations.
# Set it to False to actually delete their files.
safety_on = True
remove_empty_files(stations_meta,datadir,safety_on)

Index([1159131, 1159321, 1159325, 1159410, 1159512, 1159600, 1159800, 1159850,
       1159855, 1159860, 1259111, 1259150, 1259151, 1259152, 1259170, 1259200,
       1259220, 1259230, 1259240, 1259250, 1259260, 1259300, 1259301, 1259310,
       1259320, 1259321, 1259360, 1259500, 1259510, 1259520, 1259600, 1259620,
       1259800, 1259810],
      dtype='int64', name='grdc_no')


## CALCULATE ATTRIBUTES