Global Runoff Data Center (GRDC) https://portal.grdc.bafg.de/

Download process:
1. Pick sub-region(s) or station(s) of interest.
2. Submit data request form. Select to include watershed boundaries.
3. Download zip file from request email response.

Download components:
- stationbasins.geojson: metadata and geometry of the watershed upstream of the gauge.
- subregions.geojson: global geodataframe of subregions.
- `<stationID>_Q_Day.Cmd.txt`: time series of discharge Q (m3/s), one file per station.

In [18]:
import pandas as pd
import os
import geopandas as gpd
import numpy as np
import xarray as xr
import rioxarray as rxr
import codebase
from multiprocessing import Pool

In [19]:
# Basin being processed; also used as the filename prefix of the basin-level
# GeoJSON files.
basin = 'niger'
# Directory holding the unzipped GRDC download. Keep the trailing slash: file
# paths are built from this value by plain string concatenation.
datadir = '/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/'

In [20]:
# Basin-prefixed GeoJSON files that ship with the GRDC download.
subregions_meta = gpd.read_file(f'{datadir}{basin}_subregions.geojson')
# Index the station table by GRDC number; drop=False keeps `grdc_no` available
# as a regular column as well.
stations_meta = gpd.read_file(f'{datadir}{basin}_stationbasins.geojson').set_index(
    'grdc_no', drop=False
)

In [21]:
# Station numbers to process (one per row of stations_meta), then their count.
grdc_ids = stations_meta['grdc_no']
len(grdc_ids)

110

In [22]:
def get_start_stop_dates(grdc_id_OI):
    """Return the first and last index labels of one station's data record.

    The record is loaded with
    ``codebase.load_data.load_GRDC_station_data_by_ID`` using the
    notebook-level ``datadir`` and ``basin`` settings.

    Parameters
    ----------
    grdc_id_OI
        GRDC station number of interest.

    Returns
    -------
    tuple
        ``(start_date, stop_date)``: the first and last entries of the loaded
        frame's index, or ``(np.nan, np.nan)`` when the frame is empty.
    """
    # The loader returns a pair; only the second element (the data frame) is used.
    _unused, station_df = codebase.load_data.load_GRDC_station_data_by_ID(
        grdc_id_OI,
        filepath=datadir,
        basin_str=basin,
    )

    # Guard clause: an empty frame has no first/last index entry.
    if station_df.empty:
        return np.nan, np.nan

    record_index = station_df.index
    return record_index[0], record_index[-1]

## REMOVE EMPTY DATA FILES

In [23]:
def identify_empty_files(stations_meta, suffix='_Q_Day.Cmd.txt'):
    """List the data filenames of stations that have no start and no stop date.

    A station counts as empty only when BOTH ``start_date`` and ``stop_date``
    are null; a row with just one of them missing is not reported.

    Parameters
    ----------
    stations_meta : DataFrame
        Table indexed by station number, with ``start_date`` and ``stop_date``
        columns.
    suffix : str, optional
        Text appended to each integer station ID to form the filename.

    Returns
    -------
    list of str
        One ``'<id><suffix>'`` entry per empty station, in table order.

    Notes
    -----
    The integer IDs of the empty stations are printed as a side effect.
    """
    # True for rows where every one of the two date columns is null.
    both_missing = stations_meta[['start_date', 'stop_date']].isnull().all(axis=1)
    # Cast the index labels to int so float labels such as 1134020.0 do not
    # leak a ".0" into the filenames.
    empty_ids = both_missing[both_missing].index.astype(int)
    print(empty_ids)

    return [f'{station_id}{suffix}' for station_id in empty_ids]

In [24]:
def remove_empty_files(stations_meta,data_dir,safety_on=True):
    """Delete the data files of stations that have no start/stop dates.

    The files are identified with ``identify_empty_files`` (which also prints
    the affected station IDs). Nothing is deleted unless ``safety_on`` is
    explicitly set to False, so the default call is a dry run.

    Parameters
    ----------
    stations_meta : DataFrame
        Station table passed through to ``identify_empty_files``.
    data_dir : str or path-like
        Directory that contains the station data files.
    safety_on : bool, optional
        When True (default) no file is removed.

    Returns
    -------
    None
    """
    empty_files = identify_empty_files(stations_meta)

    if safety_on:
        return

    for empty_file in empty_files:
        try:
            # Resolve against the `data_dir` argument (not a notebook-level
            # variable) so the caller decides where files get removed;
            # os.path.join works with or without a trailing separator.
            os.remove(os.path.join(data_dir, empty_file))
        except FileNotFoundError:
            print('No file',empty_file)

In [25]:
# Look up every station's record span in parallel worker processes; `result`
# holds one (start_date, stop_date) tuple per entry of grdc_ids.
with Pool() as worker_pool:
    result = worker_pool.map(get_start_stop_dates, grdc_ids)

In [26]:
# One row per station, in the order of `result`. Label the two columns so the
# frame is self-describing instead of carrying the default 0/1 headers.
result_df = pd.DataFrame(result, columns=['start_date', 'stop_date'])

In [28]:
# Spot-check the lookup for the first station ID, outside the worker pool.
get_start_stop_dates(grdc_ids.iloc[0])

(nan, nan)

In [29]:
# Inspect the station IDs. NOTE(review): the saved output shows them as
# float64 (e.g. 1134020.0), not integers.
grdc_ids

grdc_no
1134020.0    1134020.0
1134030.0    1134030.0
1134040.0    1134040.0
1134050.0    1134050.0
1134060.0    1134060.0
               ...    
1934575.0    1934575.0
1934645.0    1934645.0
1934680.0    1934680.0
1934715.0    1934715.0
1934890.0    1934890.0
Name: grdc_no, Length: 110, dtype: float64

In [30]:
# Attach each station's record span. `.values` discards result_df's index, so
# rows are matched to stations_meta purely by position: result_df must be in
# the same row order as stations_meta.
stations_meta[['start_date','stop_date']] = result_df.values
stations_meta[['start_date','stop_date']]

Unnamed: 0_level_0,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1134020.0,NaT,NaT
1134030.0,2000-01-01,2001-04-29
1134040.0,NaT,NaT
1134050.0,NaT,NaT
1134060.0,NaT,NaT
...,...,...
1934575.0,NaT,NaT
1934645.0,NaT,NaT
1934680.0,NaT,NaT
1934715.0,NaT,NaT


In [31]:
# The trailing semicolon hides the returned filename list; the station IDs
# printed inside the function are still shown.
identify_empty_files(stations_meta);

Index([1134020, 1134040, 1134050, 1134060, 1134080, 1134110, 1134150, 1134200,
       1134220, 1134310, 1134320, 1134350, 1134400, 1134420, 1134450, 1134460,
       1134480, 1134505, 1134600, 1134620, 1134630, 1134650, 1134705, 1134850,
       1234050, 1234080, 1234100, 1234180, 1234190, 1234200, 1234250, 1234500,
       1234550, 1234650, 1234680, 1234700, 1335011, 1335014, 1335122, 1335181,
       1335241, 1335381, 1335451, 1335500, 1434200, 1434300, 1434350, 1434370,
       1434390, 1434500, 1434510, 1434700, 1434740, 1434750, 1434780, 1434790,
       1434810, 1434850, 1535100, 1535110, 1634100, 1634350, 1634420, 1634440,
       1634550, 1634600, 1634610, 1634650, 1634700, 1634800, 1734200, 1734300,
       1734400, 1734410, 1734450, 1734480, 1734550, 1734560, 1734600, 1834100,
       1835300, 1835500, 1835800, 1934575, 1934645, 1934680, 1934715, 1934890],
      dtype='int64', name='grdc_no')


In [None]:
# Dry run: with safety_on=True remove_empty_files only reports the empty
# stations. Set safety_on = False to actually delete their files.
safety_on = True
remove_empty_files(stations_meta,datadir,safety_on)

Index([1134020, 1134040, 1134050, 1134060, 1134080, 1134110, 1134150, 1134200,
       1134220, 1134310, 1134320, 1134350, 1134400, 1134420, 1134450, 1134460,
       1134480, 1134505, 1134600, 1134620, 1134630, 1134650, 1134705, 1134850,
       1234050, 1234080, 1234100, 1234180, 1234190, 1234200, 1234250, 1234500,
       1234550, 1234650, 1234680, 1234700, 1335011, 1335014, 1335122, 1335181,
       1335241, 1335381, 1335451, 1335500, 1434200, 1434300, 1434350, 1434370,
       1434390, 1434500, 1434510, 1434700, 1434740, 1434750, 1434780, 1434790,
       1434810, 1434850, 1535100, 1535110, 1634100, 1634350, 1634420, 1634440,
       1634550, 1634600, 1634610, 1634650, 1634700, 1634800, 1734200, 1734300,
       1734400, 1734410, 1734450, 1734480, 1734550, 1734560, 1734600, 1834100,
       1835300, 1835500, 1835800, 1934575, 1934645, 1934680, 1934715, 1934890],
      dtype='int64', name='grdc_no')


## CALCULATE ATTRIBUTES

In [6]:
# Inspect the station metadata table.
# NOTE(review): the saved output of this cell lists ORANGE RIVER stations while
# `basin` is set to 'niger' in the config cell, and the execution counts restart
# here — the output looks stale; re-run the notebook from a fresh kernel.
stations_meta

Unnamed: 0_level_0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source,geometry
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1159100.0,1159100.0,ORANGE RIVER,VIOOLSDRIF (27811003),866486.0,152.0,-28.7580,17.7216,-28.76040,17.72290,0.3,784765.190,Medium,Automatic,Area difference 5-10% and distance <= 5 km,hydrosheds,"MULTIPOLYGON (((17.17920 -22.50420, 17.18330 -..."
1159103.0,1159103.0,ORANGE RIVER,PELLA MISSION,859920.0,-999.0,-28.9634,19.1519,-28.96040,19.15210,0.3,763221.400,Low,Automatic,Area difference 10-50% and distance <= 5 km,hydrosheds,"MULTIPOLYGON (((17.17920 -22.50420, 17.18330 -..."
1159105.0,1159105.0,ORANGE RIVER,SENDELINGSDRIF,985370.0,-999.0,-28.0758,16.8983,-28.07710,16.89790,0.1,879726.400,Low,Automatic,Area difference 10-50% and distance <= 5 km,hydrosheds,"MULTIPOLYGON (((17.17920 -22.50420, 17.18330 -..."
1159110.0,1159110.0,VISRIVIER-OOS,HARDEHEUWEL (27814003),1502.0,1077.0,-31.8069,20.3581,-31.80708,20.35792,0.0,1500.000,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((20.33130 -31.88290, 20.33130 -31.882..."
1159120.0,1159120.0,RENOSTERRIVIER,BONEKRAAL (27814011),1679.0,1140.0,-31.8155,20.5781,-31.81625,20.57708,0.1,1680.000,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((20.54130 -31.82040, 20.54130 -31.820..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259520.0,1259520.0,MOTETE RIVER,MAHLASELA (66819314),81.0,-999.0,-28.8333,28.7200,-28.83292,28.71958,0.1,69.977,Low,Automatic,Area difference 10-50% and distance <= 5 km,merit,"POLYGON ((28.73120 -28.80790, 28.73120 -28.809..."
1259600.0,1259600.0,MANTSONYANE RIVER,SESHOTES (66819342),699.0,-999.0,-29.2817,28.5667,-29.28208,28.56708,0.1,673.950,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((28.56880 -29.27710, 28.56880 -29.277..."
1259620.0,1259620.0,MANTSONYANE RIVER,MANTSONYANE (66819364),268.0,-999.0,-29.4706,28.3531,-29.51460,28.29790,7.2,258.000,High,Manual,"Station and river name could be identified, ar...",hydrosheds,"POLYGON ((28.32500 -29.38750, 28.33750 -29.387..."
1259800.0,1259800.0,ORANGE RIVER,MOKHOTLONG (66819306),1701.0,2000.0,-29.2833,28.9833,-29.29000,28.98833,0.9,1675.600,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((29.07210 -29.32120, 29.06880 -29.321..."


In [7]:
# Look up every station's record span in parallel worker processes; `result`
# holds one (start_date, stop_date) tuple per entry of grdc_ids.
# NOTE(review): this repeats a pool.map cell that already appears earlier in
# the notebook.
with Pool() as worker_pool:
    result = worker_pool.map(get_start_stop_dates, grdc_ids)

In [10]:
# One row per station, in the order of `result`. Label the two columns so the
# frame is self-describing instead of carrying the default 0/1 headers.
result_df = pd.DataFrame(result, columns=['start_date', 'stop_date'])

In [11]:
# Attach each station's record span. `.values` discards result_df's index, so
# rows are matched to stations_meta purely by position: result_df must be in
# the same row order as stations_meta.
stations_meta[['start_date','stop_date']] = result_df.values
stations_meta[['start_date','stop_date']]

Unnamed: 0_level_0,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1159100.0,2000-01-01,2024-12-31
1159103.0,2000-01-01,2024-12-31
1159105.0,2014-05-14,2024-12-31
1159110.0,2000-01-01,2023-05-29
1159120.0,2000-01-01,2023-04-17
...,...,...
1259520.0,NaT,NaT
1259600.0,NaT,NaT
1259620.0,NaT,NaT
1259800.0,NaT,NaT


In [14]:
# Inspect the subregion table. The saved output has 840 rows (index 0-839)
# spanning several continents, i.e. it is not limited to the selected basin.
subregions_meta

Unnamed: 0,GmlID,FID,OBJECTID,WMOBB,SUBREGNUM,SUBREGNAME,SUBREG_DES,SUM_SUB_AR,REGNUM,REGNAME,RIVERBASIN,OCEANNUM,OCEAN,geometry
0,hydroreg_portal.0,0,1,101.0,1011.0,MEDJERDA (also MAJARDAH),Medjerda river basin,23171.5,1,Africa,MEDJERDA (also MAJARDAH),2,Atlantic Ocean,"MULTIPOLYGON (((10.17917 37.12083, 10.13333 37..."
1,hydroreg_portal.1,1,2,102.0,1021.0,MELRHIR,Chott Melrhir interior basin,146405,1,Africa,MELRHIR,9,---,"MULTIPOLYGON (((1.97917 33.99583, 1.95933 34.0..."
2,hydroreg_portal.2,2,3,102.0,1022.0,RHARSA,Chott el Rharsa interior basin,114355.5,1,Africa,RHARSA,9,---,"MULTIPOLYGON (((7.19167 35.37917, 7.12696 35.3..."
3,hydroreg_portal.3,3,4,103.0,1031.0,DJERID,Chott el Djerid interior basin,59021,1,Africa,DJERID,9,---,"MULTIPOLYGON (((10.11667 32.43750, 10.14498 32..."
4,hydroreg_portal.4,4,5,104.0,1040.0,"Mediterranean Sea (ex 161, 627, 628, 691, 693)",basins draining into Mediterranean Sea between...,102164.9,1,Africa,---,2,Atlantic Ocean,"MULTIPOLYGON (((-1.53253 35.23700, -1.49238 35..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,hydroreg_portal.836,836,837,698.0,6983.0,Greenland (Atlantic Ocean),basins on Greenland island draining into the A...,269795.7,6,Europe,---,2,Atlantic Ocean,"MULTIPOLYGON (((-43.23750 59.92083, -43.21343 ..."
837,hydroreg_portal.837,837,838,698.0,6984.0,"Greenland (Arctic Ocean, Baffin Bay)",basins on Greenland island draining into Baffi...,844154.7,6,Europe,---,2,Atlantic Ocean,"MULTIPOLYGON (((-51.04895 69.92576, -51.09316 ..."
838,hydroreg_portal.838,838,839,487.0,4870.0,Barbados,basins on Barbados island,463.3,4,"North America, Central America and the Caribbean",---,2,Atlantic Ocean,"MULTIPOLYGON (((-59.46862 13.18693, -59.53665 ..."
839,hydroreg_portal.839,839,840,488.0,4880.0,San Andres y Providencia,"basins on San Andres, Providencia and Santa Ca...",71.1,4,"North America, Central America and the Caribbean",---,2,Atlantic Ocean,"MULTIPOLYGON (((-80.36576 14.28250, -80.35777 ..."


In [13]:
# List the attribute columns available in the subregion table.
subregions_meta.columns

Index(['GmlID', 'FID', 'OBJECTID', 'WMOBB', 'SUBREGNUM', 'SUBREGNAME',
       'SUBREG_DES', 'SUM_SUB_AR', 'REGNUM', 'REGNAME', 'RIVERBASIN',
       'OCEANNUM', 'OCEAN', 'geometry'],
      dtype='object')

In [16]:
# Show the `area` and `area_calc` columns side by side for comparison.
stations_meta[['area','area_calc']]

Unnamed: 0_level_0,area,area_calc
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1159100.0,866486.0,784765.190
1159103.0,859920.0,763221.400
1159105.0,985370.0,879726.400
1159110.0,1502.0,1500.000
1159120.0,1679.0,1680.000
...,...,...
1259520.0,81.0,69.977
1259600.0,699.0,673.950
1259620.0,268.0,258.000
1259800.0,1701.0,1675.600


In [15]:
# Stations whose `area` is negative. In the saved output these rows carry
# -999.0 — presumably a missing-value sentinel; TODO confirm against the GRDC
# documentation before using `area` in calculations.
stations_meta.loc[stations_meta['area']< 0]

Unnamed: 0_level_0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source,geometry,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1159547.0,1159547.0,MOOIRIVIER,HOOGEKRAAL,-999.0,-999.0,-26.8805,26.9643,-26.8813,26.9604,0.4,5739.6,High,Manual,Station and river name could be identified,hydrosheds,"POLYGON ((26.98750 -25.90000, 27.04170 -25.900...",2000-01-01,2023-10-05
1259370.0,1259370.0,AUOB,STAMPRIET,-999.0,1150.0,-24.3167,18.4333,-24.3313,18.4271,1.7,20082.0,Low,Manual,Stations were relocated manually,hydrosheds,"POLYGON ((17.25420 -22.61670, 17.25830 -22.616...",2005-10-01,2020-04-14
