Global Runoff Data Center (GRDC) https://portal.grdc.bafg.de/

Download process:
1. Pick sub-region(s) or station(s) of interest.
2. Submit the data request form, selecting the option to include watershed boundaries.
3. Download the zip file from the link in the request's email response.

Download components:
- `stationbasins.geojson`: metadata and geometry of the watershed upstream of each gauge (read below as `<basin>_stationbasins.geojson`).
- `subregions.geojson`: global GeoDataFrame of subregions.
- `<stationID>_Q_Day.Cmd.txt`: time series of discharge Q (m3/s) for each station.

In [1]:
import pandas as pd
import os
import geopandas as gpd
import numpy as np
import xarray as xr
import rioxarray as rxr
import codebase
from multiprocessing import Pool

In [2]:
# Directory containing the GRDC files read by this notebook. It must end with '/'
# because the cells below build paths by plain string concatenation.
# NOTE(review): absolute, user-specific scratch path — consider making it configurable.
datadir = '/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/'
# Basin tag: used as the filename prefix of '<basin>_stationbasins.geojson' and
# forwarded to the station loader as `basin_str`.
basin = 'santee'

In [3]:
# Sub-region polygons from the GRDC download.
subregions_meta = gpd.read_file(f"{datadir}subregions.geojson")

# Station metadata + watershed geometry for the selected basin, indexed by GRDC
# station number; drop=False keeps 'grdc_no' available as a regular column too.
stations_meta = gpd.read_file(f"{datadir}{basin}_stationbasins.geojson").set_index('grdc_no', drop=False)

In [4]:
# Station IDs to process; the cell's output is the number of stations in the basin file.
grdc_ids = stations_meta.loc[:, 'grdc_no']
grdc_ids.shape[0]

8

In [5]:
def get_start_stop_dates(grdc_id_OI):
    """Return the first and last index labels of one station's data record.

    The station is loaded with ``codebase.load_data.load_GRDC_station_data_by_ID``
    using the notebook-level ``datadir`` and ``basin`` settings.

    Parameters
    ----------
    grdc_id_OI
        GRDC station number of interest.

    Returns
    -------
    tuple
        ``(start_date, stop_date)`` taken from the first and last entries of the
        loaded frame's index, or ``(np.nan, np.nan)`` when the frame is empty.
    """
    _, station_df = codebase.load_data.load_GRDC_station_data_by_ID(
        grdc_id_OI, filepath=datadir, basin_str=basin
    )

    # Guard clause: a station with no records has no dates to report.
    if station_df.empty:
        return np.nan, np.nan

    record_index = station_df.index
    return record_index[0], record_index[-1]

## REMOVE EMPTY DATA FILES

In [6]:
def identify_empty_files(stations_meta):
    """List the data filenames of stations that have neither a start nor a stop date.

    Parameters
    ----------
    stations_meta : pandas.DataFrame
        Station table indexed by GRDC station number, containing 'start_date'
        and 'stop_date' columns.

    Returns
    -------
    list of str
        One '<id>_Q_Day.Cmd.txt' filename per station whose two date columns are
        both null. Stations missing only one of the two dates are not included.

    Notes
    -----
    Prints the integer index of the matching station IDs as a side effect.
    """
    # A station counts as empty only when BOTH dates are missing.
    dates_missing = stations_meta[['start_date', 'stop_date']].isnull().all(axis=1)

    # IDs may be stored as floats (e.g. 4148550.0); cast to int so the filename
    # carries no trailing '.0'.
    empty_ids = stations_meta.index[dates_missing.to_numpy()].astype(int)
    print(empty_ids)

    return [f'{station_id}_Q_Day.Cmd.txt' for station_id in empty_ids]

In [7]:
def remove_empty_files(stations_meta,data_dir,safety_on=True):
    """Delete the data files of stations that have no start and no stop date.

    Parameters
    ----------
    stations_meta : pandas.DataFrame
        Station table passed on to ``identify_empty_files``.
    data_dir : str
        Directory that contains the station data files to delete.
    safety_on : bool, default True
        When True (dry run) nothing is deleted; the empty station IDs are still
        printed by ``identify_empty_files``. Pass False to actually remove files.

    Returns
    -------
    None
    """
    empty_files = identify_empty_files(stations_meta)

    if safety_on:
        return

    for empty_file in empty_files:
        try:
            # Use the directory passed by the caller, not the notebook-level
            # global, so the function deletes from the location it was asked to.
            os.remove(os.path.join(data_dir, empty_file))
        except FileNotFoundError:
            # Already removed (or never downloaded): report and continue.
            print('No file',empty_file)

In [8]:
# Load every station in parallel; pool.map returns one (start, stop) tuple per ID,
# in the same order as grdc_ids.
with Pool() as pool:
    result = pool.map(get_start_stop_dates, grdc_ids)

In [9]:
# Tabulate the per-station tuples: column 0 is the start date, column 1 the stop date.
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,0,1
0,2000-01-01,2024-11-07
1,2000-01-01,2024-10-24
2,2000-01-01,2024-11-12
3,2000-01-01,2014-07-14
4,2000-01-01,2024-12-31
5,2000-01-01,2024-12-31
6,2000-01-01,2024-12-31
7,NaT,NaT


In [10]:
# Spot-check: call the loader directly (no Pool) on the first station ID.
get_start_stop_dates(grdc_ids.iloc[0])

(Timestamp('2000-01-01 00:00:00'), Timestamp('2024-11-07 00:00:00'))

In [11]:
# Inspect the station IDs; the saved output shows they are stored as float64.
grdc_ids

grdc_no
4148410.0    4148410.0
4148530.0    4148530.0
4148535.0    4148535.0
4148540.0    4148540.0
4148541.0    4148541.0
4148545.0    4148545.0
4148546.0    4148546.0
4148550.0    4148550.0
Name: grdc_no, dtype: float64

In [12]:
# Attach the record bounds to the station table. The assignment is positional, so it
# relies on result_df rows being in the same order as the rows of stations_meta.
stations_meta[['start_date', 'stop_date']] = result_df.to_numpy()
stations_meta.loc[:, ['start_date', 'stop_date']]

Unnamed: 0_level_0,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
4148410.0,2000-01-01,2024-11-07
4148530.0,2000-01-01,2024-10-24
4148535.0,2000-01-01,2024-11-12
4148540.0,2000-01-01,2014-07-14
4148541.0,2000-01-01,2024-12-31
4148545.0,2000-01-01,2024-12-31
4148546.0,2000-01-01,2024-12-31
4148550.0,NaT,NaT


In [13]:
# Report stations without data. The trailing ';' suppresses display of the returned
# filename list, so only the IDs printed inside the function are shown.
identify_empty_files(stations_meta);

Index([4148550], dtype='int64', name='grdc_no')


In [None]:
# Dry run by default: with safety_on=True no file is deleted.
# Set safety_on to False to actually remove the empty station files.
safety_on = True
remove_empty_files(stations_meta, data_dir=datadir, safety_on=safety_on)

Index([4148550], dtype='int64', name='grdc_no')


## CALCULATE ATTRIBUTES

In [17]:
# Display the full station table, including the start/stop dates added above.
# NOTE(review): the saved output lists different station IDs than the cells above,
# so it appears to come from a run with another basin — re-run top to bottom to confirm.
stations_meta

Unnamed: 0_level_0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source,geometry,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
4150600.0,4150600.0,TRINITY RIVER,"ROMAYOR, TEX.",44511.7,7.9,30.4252,-94.8508,30.425,-94.85083,0.0,44137.0,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-98.27040 33.19380, -98.27380 33.193...",2000-01-01,2024-10-14
4150601.0,4150601.0,TRINITY RIVER,"LIBERTY, TX",45242.1,-0.68,30.0577,-94.8183,30.05708,-94.81792,0.1,44965.0,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-98.27040 33.19380, -98.27380 33.193...",2000-01-01,2024-10-14
4150605.0,4150605.0,ELM FORK TRINITY RIVER,"NEAR CARROLLTON, TX",6368.8,131.49,32.966,-96.9444,32.96583,-96.94417,0.0,6366.0,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-97.46880 33.10460, -97.46880 33.105...",2000-01-01,2024-12-31
4150680.0,4150680.0,NECHES RIVER,"EVADALE, TEX.",20593.0,2.52,30.3558,-94.0932,30.35583,-94.09333,0.0,20534.0,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-95.43380 31.44960, -95.43540 31.449...",2000-01-01,2024-12-17
4150681.0,4150681.0,NECHES RIVER,"NEAR ROCKLAND, TX",9417.2,26.95,31.025,-94.3994,31.025,-94.39917,0.0,9369.1,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-95.36620 32.26620, -95.36620 32.265...",2000-01-01,2024-12-31
4150682.0,4150682.0,NECHES RIVER,"NEAR NECHES,TX",2965.6,80.49,31.8924,-95.4308,31.8925,-95.43167,0.1,2962.8,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-95.68040 31.94960, -95.68210 31.949...",2000-01-01,2024-12-09
4150700.0,4150700.0,SABINE RIVER,"NEAR RULIFF, TEX.",24162.1,-1.8,30.3038,-93.7438,30.305,-93.74083,0.3,24075.0,High,Automatic,Area difference <= 5% and distance <= 5 km,merit,"POLYGON ((-95.81960 32.52710, -95.81960 32.526...",2000-01-01,2024-12-09


In [7]:
# NOTE(review): repeats the Pool cell from the "remove empty data files" section; its
# execution count is lower than that of the cells before it, which indicates
# out-of-order execution. Consider wrapping this step in a function and calling it once.
with Pool() as pool:
    result = pool.map(get_start_stop_dates, grdc_ids)

In [10]:
# Tabulate the (start, stop) tuples again; this repeat of the earlier cell does not display the frame.
result_df = pd.DataFrame(result)

In [11]:
# Repeat of the earlier assignment cell: writes the record bounds positionally, so
# result_df must be in the same row order as stations_meta.
stations_meta[['start_date', 'stop_date']] = result_df.to_numpy()
stations_meta.loc[:, ['start_date', 'stop_date']]

Unnamed: 0_level_0,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
1159100.0,2000-01-01,2024-12-31
1159103.0,2000-01-01,2024-12-31
1159105.0,2014-05-14,2024-12-31
1159110.0,2000-01-01,2023-05-29
1159120.0,2000-01-01,2023-04-17
...,...,...
1259520.0,NaT,NaT
1259600.0,NaT,NaT
1259620.0,NaT,NaT
1259800.0,NaT,NaT


In [14]:
# Inspect the sub-region table loaded from subregions.geojson.
subregions_meta

Unnamed: 0,GmlID,FID,OBJECTID,WMOBB,SUBREGNUM,SUBREGNAME,SUBREG_DES,SUM_SUB_AR,REGNUM,REGNAME,RIVERBASIN,OCEANNUM,OCEAN,geometry
0,hydroreg_portal.0,0,1,101.0,1011.0,MEDJERDA (also MAJARDAH),Medjerda river basin,23171.5,1,Africa,MEDJERDA (also MAJARDAH),2,Atlantic Ocean,"MULTIPOLYGON (((10.17917 37.12083, 10.13333 37..."
1,hydroreg_portal.1,1,2,102.0,1021.0,MELRHIR,Chott Melrhir interior basin,146405,1,Africa,MELRHIR,9,---,"MULTIPOLYGON (((1.97917 33.99583, 1.95933 34.0..."
2,hydroreg_portal.2,2,3,102.0,1022.0,RHARSA,Chott el Rharsa interior basin,114355.5,1,Africa,RHARSA,9,---,"MULTIPOLYGON (((7.19167 35.37917, 7.12696 35.3..."
3,hydroreg_portal.3,3,4,103.0,1031.0,DJERID,Chott el Djerid interior basin,59021,1,Africa,DJERID,9,---,"MULTIPOLYGON (((10.11667 32.43750, 10.14498 32..."
4,hydroreg_portal.4,4,5,104.0,1040.0,"Mediterranean Sea (ex 161, 627, 628, 691, 693)",basins draining into Mediterranean Sea between...,102164.9,1,Africa,---,2,Atlantic Ocean,"MULTIPOLYGON (((-1.53253 35.23700, -1.49238 35..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
836,hydroreg_portal.836,836,837,698.0,6983.0,Greenland (Atlantic Ocean),basins on Greenland island draining into the A...,269795.7,6,Europe,---,2,Atlantic Ocean,"MULTIPOLYGON (((-43.23750 59.92083, -43.21343 ..."
837,hydroreg_portal.837,837,838,698.0,6984.0,"Greenland (Arctic Ocean, Baffin Bay)",basins on Greenland island draining into Baffi...,844154.7,6,Europe,---,2,Atlantic Ocean,"MULTIPOLYGON (((-51.04895 69.92576, -51.09316 ..."
838,hydroreg_portal.838,838,839,487.0,4870.0,Barbados,basins on Barbados island,463.3,4,"North America, Central America and the Caribbean",---,2,Atlantic Ocean,"MULTIPOLYGON (((-59.46862 13.18693, -59.53665 ..."
839,hydroreg_portal.839,839,840,488.0,4880.0,San Andres y Providencia,"basins on San Andres, Providencia and Santa Ca...",71.1,4,"North America, Central America and the Caribbean",---,2,Atlantic Ocean,"MULTIPOLYGON (((-80.36576 14.28250, -80.35777 ..."


In [13]:
# List the attribute columns available for each sub-region.
subregions_meta.columns

Index(['GmlID', 'FID', 'OBJECTID', 'WMOBB', 'SUBREGNUM', 'SUBREGNAME',
       'SUBREG_DES', 'SUM_SUB_AR', 'REGNUM', 'REGNAME', 'RIVERBASIN',
       'OCEANNUM', 'OCEAN', 'geometry'],
      dtype='object')

In [18]:
# Show the two area columns side by side
# (presumably reported vs. calculated catchment area — TODO confirm against GRDC docs).
stations_meta.loc[:, ['area', 'area_calc']]

Unnamed: 0_level_0,area,area_calc
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1
4150600.0,44511.7,44137.0
4150601.0,45242.1,44965.0
4150605.0,6368.8,6366.0
4150680.0,20593.0,20534.0
4150681.0,9417.2,9369.1
4150682.0,2965.6,2962.8
4150700.0,24162.1,24075.0


In [15]:
# Stations with a negative 'area'. In the saved output these are all -999.0, which
# looks like a missing-value sentinel rather than a real area — TODO confirm.
stations_meta.loc[stations_meta['area'].lt(0)]

Unnamed: 0_level_0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source,geometry,start_date,stop_date
grdc_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1159547.0,1159547.0,MOOIRIVIER,HOOGEKRAAL,-999.0,-999.0,-26.8805,26.9643,-26.8813,26.9604,0.4,5739.6,High,Manual,Station and river name could be identified,hydrosheds,"POLYGON ((26.98750 -25.90000, 27.04170 -25.900...",2000-01-01,2023-10-05
1259370.0,1259370.0,AUOB,STAMPRIET,-999.0,1150.0,-24.3167,18.4333,-24.3313,18.4271,1.7,20082.0,Low,Manual,Stations were relocated manually,hydrosheds,"POLYGON ((17.25420 -22.61670, 17.25830 -22.616...",2005-10-01,2020-04-14
