In [215]:
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import date
from statistics import mode
import codebase

- below Hoover Dam: 4152103
- Lee's Ferry: 4152450
- US-Mexico border: 4152050
- upstream of Lake Powell (San Juan River trib): 4152600

In [55]:
## define experimental set-up

# grdc stored as floats in the downlaod jsons
grdc_id = 4152450
grdc_sub_ids = [4152600]
dam_name = 'glen canyon'
start_year = 2018
stop_year_ex = 2024

In [76]:
## Other variables and filepaths
grdc_dir = "/global/scratch/users/ann_scheliga/aux_dam_datasets/GRDC_CRB/"
met_dir = "/global/scratch/users/ann_scheliga/era5_data/"
res_dir = "/global/scratch/users/ann_scheliga/CYGNSS_daily/"

In [None]:
# For debugging
def check_data_format(df):
    print(df.head(2))
    print(df.tail(2))
    print('structure type:',type(df))
    print('index type:',type(df.index))
    print('first index:',df.index[0])
    print('Inferred frequency:',df.index.inferred_freq)


In [153]:
# Create output dataframe
full_time = pd.date_range(start=date(start_year,1,1), end=date(stop_year_ex,1,1),freq='D')
output_df = pd.DataFrame(index = full_time)

In [154]:
## import sw_area
sw_area = codebase.load_data.load_daily_reservoir_CYGNSS_area(
    dam_name, filepath=res_dir
)

output_df['SW_area'] = sw_area
check_data_format(sw_area)

2019-01-01    380.752060
2019-01-02    362.621009
Name: Area km2, dtype: float64
2023-12-31    298.685200
2024-01-01    293.913871
Name: Area km2, dtype: float64
structure type: <class 'pandas.core.series.Series'>
index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
first index: 2019-01-01 00:00:00
Inferred frequency: D


In [155]:
## Calculate SW_flag
output_df['SW_flag'] = 0
# where SW_area has a value, SW_flag is true
output_df.loc[~output_df['SW_area'].isna(),'SW_flag'] = 1 

check_data_format(output_df['SW_flag'])

2018-01-01    0
2018-01-02    0
Freq: D, Name: SW_flag, dtype: int64
2023-12-31    1
2024-01-01    1
Freq: D, Name: SW_flag, dtype: int64
structure type: <class 'pandas.core.series.Series'>
index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
first index: 2018-01-01 00:00:00
Inferred frequency: D


In [156]:
## import GRDC
watershed_gpd, grdc_Q = codebase.load_data.load_GRDC_station_data_by_ID(
    grdc_id,
    filepath=grdc_dir,
    timeseries_dict={"start_year": start_year, "stop_year": stop_year_ex},
)

output_df['Q'] = grdc_Q
check_data_format(grdc_Q)

              Q m3s
Date               
2018-01-01  356.792
2018-01-02  396.435
              Q m3s
Date               
2023-12-03  252.303
2023-12-04  288.831
structure type: <class 'pandas.core.frame.DataFrame'>
index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
first index: 2018-01-01 00:00:00
Inferred frequency: D


In [96]:
## Can't get mode function to work easily.
# Tried scipy and statistics modules
# type_precip_test = codebase.area_subsets.era5_shape_subset_and_concat_from_file_pattern(
#     filepath = met_dir,
#     input_pattern = r'daily_precip_type',
#     subset_gpd = watershed_gpd,
#     concat_dict = concat_dict,
#     agg_function = mode
# )

In [118]:
def add_era5_met_data_by_shp(input_gpd):
    
    concat_dict = {"dim": "valid_time"}

    __ , tempK_1dim = codebase.area_subsets.era5_shape_subset_and_concat_from_file_pattern(
        filepath = met_dir,
        input_pattern = r'daily_tempK',
        subset_gpd = input_gpd,
        concat_dict = concat_dict,
        agg_function = np.nanmean
    )
    tempK_1dim.rename('tempK',inplace=True)

    __ , precip_1dim = codebase.area_subsets.era5_shape_subset_and_concat_from_file_pattern(
        filepath = met_dir,
        input_pattern = r'daily_tot_precip',
        subset_gpd = input_gpd,
        concat_dict = concat_dict,
        agg_function = np.nansum
    )
    precip_1dim.rename('precipm',inplace=True)
    met_df = pd.concat([tempK_1dim, precip_1dim],axis=1)
    return met_df

In [None]:
sub_extension = '_tot0'
met_df = add_era5_met_data_by_shp(watershed_gpd).add_suffix(sub_extension)
output_df = output_df.join(met_df, how='left')

In [164]:
output_df

Unnamed: 0,SW_area,SW_flag,Q,tempK_tot00,precipm_tot00
2018-01-01,,0,356.792,,1.053810e-04
2018-01-02,,0,396.435,,4.053116e-05
2018-01-03,,0,413.425,,9.536743e-07
2018-01-04,,0,402.099,,4.768372e-07
2018-01-05,,0,407.762,,0.000000e+00
...,...,...,...,...,...
2023-12-28,311.090655,1,,,2.384186e-06
2023-12-29,293.913871,1,,,4.768372e-06
2023-12-30,298.685200,1,,,1.359463e-03
2023-12-31,298.685200,1,,,1.325607e-03


In [204]:
# Given that sub-basins exist
subbasins_GRDC = list(map(
    lambda id: codebase.load_data.load_GRDC_station_data_by_ID(
        id,filepath=grdc_dir,
        timeseries_dict={"start_year": start_year, "stop_year": stop_year_ex}
        ),
    grdc_sub_ids))
# subbasin_zipped = list(zip(grdc_sub_ids,subbasins_GRDC))
# drop flow timeseries from loaded into dict, leave just the geoDataFrame
subbasin_shps = [output[0] for output in subbasins_GRDC] 

In [None]:
processed_shps = gpd.GeoDataFrame()
processed_shps

Unnamed: 0,grdc_no,river,station,area,altitude,lat_org,long_org,lat_pp,long_pp,dist_km,area_calc,quality,type,comment,source,geometry
11,4152450.0,COLORADO RIVER,"LEES FERRY, ARIZ.",289562.0,946.76,36.8647,-111.5882,36.8646,-111.5896,0.1,279509.1,High,Automatic,Area difference <= 5% and distance <= 5 km,hydrosheds,"MULTIPOLYGON (((-110.90830 41.35830, -110.9042..."


In [221]:
shp_to_diff = subbasin_shps[0].iloc[0,-1]

In [223]:
processed_shps.difference(shp_to_diff)

11    MULTIPOLYGON (((-110.90420 41.35830, -110.9042...
dtype: geometry

In [132]:
if grdc_sub_ids:
    print('hello')

hello
