In [1]:
import numpy as np
import pandas as pd
import os
import geopandas as gpd
from datetime import date
from pathlib import Path
import pickle
import codebase
from codebase import ml_pipeline

All subregion files are the same.

In [2]:
## define experimental set-up

# grdc stored as floats in the downlaod jsons
grdc_id = 4150600
basin_str = 'east_texas'

In [3]:
## Other variables and filepaths
grdc_dir = "/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/"
met_dir = "/global/scratch/users/ann_scheliga/era5_data/"
res_dir = "/global/scratch/users/ann_scheliga/CYGNSS_daily/time_series/"
basin_data_dir = "/global/scratch/users/ann_scheliga/basin_forcing_processed/"

In [5]:
subregions_meta = gpd.read_file(grdc_dir + 'subregions.geojson')
stations_meta = gpd.read_file(grdc_dir + basin_str + '_stationbasins.geojson')

## TOPO_VARS
topo_vars: gauge_id;gauge_lat;gauge_lon;elev_mean;slope_mean;area_gages2;

| topo_vars    | GRDC eq.  | Notes      |
|--------------|-----------|------------|
| gauge_id     | grdc_no   |            |
| gauge_lat    | lat_pp    | from Google Maps inspection, more accurate than lat_org   |
| gauge_lon    | long_pp   | from Google Maps inspection, more accurate than long_org  |
| elev_mean    |           |            |
| slope_mean   |           |            |
| area_gages2  | area_calc | some 'area' columns have -999, so using 'area_calc'       |


In [32]:
stations_meta['lat_lon_pp'] = gpd.points_from_xy(y =stations_meta['lat_pp'], x =  stations_meta['long_pp'])
pp_points = stations_meta.set_geometry('lat_lon_pp', crs = "EPSG:4326")

In [33]:
stations_meta['lat_lon'] = gpd.points_from_xy(y =stations_meta['lat_org'], x =  stations_meta['long_org'])
org_points = stations_meta.set_geometry('lat_lon', crs = "EPSG:4326")

In [60]:
# m = org_points.explore(style_kwds=dict(color='red'),tooltip=['grdc_no','station','river','quality'])
# pp_points.explore(m=m,style_kwds=dict(color='blue'),tooltip=['grdc_no','station','river','quality'])
# m

In [67]:
def write_topo_features(stations_meta, output_path):
    if isinstance(stations_meta, (Path, str)):
        stations_meta = gpd.read_file(stations_meta)
    topo_df = stations_meta[['grdc_no','lat_pp','long_pp','area_calc']]
    topo_df.to_csv(output_path, mode='a', header=not os.path.exists(output_path),sep=';',index=False)
    print('Saved to',output_path)

In [12]:
basins = ['']

In [13]:
station_fns = [Path(grdc_dir) / f for f in os.listdir(grdc_dir) if any(basin +'_stationbasins' in f for basin in basins)]

output_path = Path(basin_data_dir)/'attributes' / 'topo_attr.csv'
# [write_topo_features(fn,output_path) for fn in station_fns]

station_fns

[PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/ord_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/santee_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/east_texas_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/shire_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/narmada_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/zambezi_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/colorado_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/magdalena_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/niger_stationbasins.geojson'),
 PosixPath('/global/scratch/users/ann_scheliga/au

In [4]:
full_df = pd.read_csv(output_path)


## CLIM_VARS
clim_vars: gauge_id;p_mean;pet_mean;p_seasonality;frac_snow;aridity;high_prec_freq;high_prec_dur;high_prec_timing;low_prec_freq;low_prec_dur;low_prec_timing
| clim_vars       | GRDC eq.  | Notes      |
|-----------------|-----------|------------|
| gauge_id        | grdc_no   |            |
| p_mean          |           |            |
| pet_mean        |           |            |
| p_seasonality   |           |            |
| frac_snow       |           |            |
| aridity         |           |            |
| high_prec_freq  |           |            |
| high_prec_dur   |           |            |
| high_prec_timing|           |            |
| low_prec_freq   |           |            |
| low_prec_dur    |           |            |
| low_prec_timing |           |            |

In [28]:
stations_to_run = ['4150600','4150680']

In [29]:
all_pkls = [fn for fn in os.listdir(basin_data_dir) if any((station in fn) and ('.pkl' in fn) for station in stations_to_run)]
all_pkls.sort()
all_pkls

['4150600_livingston_dam.pkl', '4150680_sam_rayburn_dam_and_reservoir.pkl']

In [44]:
fn = all_pkls[0]
test_read = pickle.load(open(basin_data_dir+fn, 'rb'))
grdc_id = list(test_read.keys())[0]
print('GRDC ID:',grdc_id)
basin_df = test_read[grdc_id]

GRDC ID: 1159302


In [37]:
output_path = Path(basin_data_dir)/'attributes' / 'clim_attr.csv'

In [89]:
precip_ts = basin_df['precipm_tot0'].loc[(basin_df['precipm_tot0'].index.date >= date(2000,1,1)) * (basin_df['precipm_tot0'].index.date < date(2024,1,1))]

p_test = precip_ts.mean()

In [62]:
np.array([[grdc_id,p_test]]).shape

(1, 2)

In [72]:
test_df = pd.DataFrame(columns=['grdc_no','p_mean'],data=np.array([[grdc_id,p_test]]))

In [90]:
cats_to_run = ['clim','topo']

In [92]:
'vege' in cats_to_run

False

In [85]:
from codebase.dataprocessing import standard_precip_mean
def write_clim_features(basin_clim, output_path,grdc_id=''):
    if isinstance(stations_meta, (Path, str)):
        data_dict = pickle.load(open(basin_clim, 'rb'))
        grdc_id = list(data_dict.keys())[0]
        print('GRDC ID:',grdc_id)
        basin_clim = data_dict[grdc_id]
    # precip_ts = basin_clim['precipm_tot0'].loc[(basin_clim['precipm_tot0'].index.date >= date(2000,1,1)) * (basin_clim['precipm_tot0'].index.date < date(2024,1,1))]
    p_mean = standard_precip_mean(basin_clim)

    clim_df = pd.DataFrame(columns=['grdc_no','p_mean'],data=np.array([[grdc_id,p_mean]]))
    clim_df['grdc_no'] = clim_df['grdc_no'].astype(int)

    print(clim_df)
    clim_df.to_csv(output_path, mode='a', header=not os.path.exists(output_path),sep=';',index=False)
    print('Saved to',output_path)

In [86]:
write_clim_features(basin_df,output_path,grdc_id)

   grdc_no    p_mean
0  1159302  0.050178
Saved to /global/scratch/users/ann_scheliga/basin_forcing_processed/attributes/clim_attr.csv
