In [2]:
import numpy as np
import pandas as pd
import os
import geopandas as gpd
from datetime import date
from pathlib import Path
import pickle
import codebase
from codebase import ml_pipeline

topo_vars: gauge_id;gauge_lat;gauge_lon;elev_mean;slope_mean;area_gages2;

| topo_vars    | GRDC eq.  | Notes      |
|--------------|-----------|------------|
| gauge_id     | grdc_no   |            |
| gauge_lat    | lat_pp    | from Google Maps inspection, more accurate than lat_org   |
| gauge_lon    | long_pp   | from Google Maps inspection, more accurate than long_org  |
| elev_mean    |           |            |
| slope_mean   |           |            |
| area_gages2  | area_calc | some 'area' columns have -999, so using 'area_calc'       |

All subregion files are the same.

In [28]:
## define experimental set-up

# grdc stored as floats in the downlaod jsons
grdc_id = 1834101
basin_str = 'magdalena'

In [3]:
## Other variables and filepaths
grdc_dir = "/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/"
met_dir = "/global/scratch/users/ann_scheliga/era5_data/"
res_dir = "/global/scratch/users/ann_scheliga/CYGNSS_daily/time_series/"
basin_data_dir = "/global/scratch/users/ann_scheliga/basin_forcing_processed/"

In [29]:
subregions_meta = gpd.read_file(grdc_dir + basin_str + '_subregions.geojson')
stations_meta = gpd.read_file(grdc_dir + basin_str + '_stationbasins.geojson')

In [8]:
test_subregion = gpd.read_file(grdc_dir + 'colorado' + '_subregions.geojson')

In [30]:
print(test_subregion.shape, subregions_meta.shape)

(841, 14) (841, 14)


In [31]:
(test_subregion == subregions_meta).sum()

GmlID         841
FID           841
OBJECTID      841
WMOBB         841
SUBREGNUM     841
SUBREGNAME    841
SUBREG_DES    841
SUM_SUB_AR    841
REGNUM        841
REGNAME       841
RIVERBASIN    841
OCEANNUM      841
OCEAN         841
geometry      841
dtype: int64

In [32]:
stations_meta['lat_lon_pp'] = gpd.points_from_xy(y =stations_meta['lat_pp'], x =  stations_meta['long_pp'])
pp_points = stations_meta.set_geometry('lat_lon_pp', crs = "EPSG:4326")

In [33]:
stations_meta['lat_lon'] = gpd.points_from_xy(y =stations_meta['lat_org'], x =  stations_meta['long_org'])
org_points = stations_meta.set_geometry('lat_lon', crs = "EPSG:4326")

In [60]:
# m = org_points.explore(style_kwds=dict(color='red'),tooltip=['grdc_no','station','river','quality'])
# pp_points.explore(m=m,style_kwds=dict(color='blue'),tooltip=['grdc_no','station','river','quality'])
# m

In [67]:
def write_topo_features(stations_meta, output_path):
    if isinstance(stations_meta, (Path, str)):
        stations_meta = gpd.read_file(stations_meta)
    topo_df = stations_meta[['grdc_no','lat_pp','long_pp','area_calc']]
    topo_df.to_csv(output_path, mode='a', header=not os.path.exists(output_path),sep=';',index=False)
    print('Saved to',output_path)

In [4]:
station_fns = [Path(grdc_dir) / f for f in os.listdir(grdc_dir) if 'stationbasins' in f]

output_path = Path(basin_data_dir)/'attributes' / 'topo_attr.csv'
# [write_topo_features(fn,output_path) for fn in station_fns]

In [7]:
full_df

Unnamed: 0,grdc_no;lat_pp;long_pp;area_calc
0,5608090.0;-16.13333;128.74;45190.0
1,5608091.0;-17.42583;127.60083;550.86
2,5608093.0;-16.6475;128.0925;2529.5
3,5608095.0;-17.37375;128.85375;19404.0
4,5608096.0;-15.5729;128.6937;51205.5
...,...
533,1259520.0;-28.83292;28.71958;69.977
534,1259600.0;-29.28208;28.56708;673.95
535,1259620.0;-29.5146;28.2979;258.0
536,1259800.0;-29.29;28.98833;1675.6


In [6]:
full_df = pd.read_csv(output_path)


In [77]:
sorted_df = full_df.sort_values(by='grdc_no')
sorted_df.to_csv(output_path, header=True,sep=';',index=False)