In [None]:
import csv
from io import StringIO
import json
import os
import tempfile
import zipfile

import pandas as pd
import intake
from shapely.geometry import Point, Polygon
import geopandas as gpd
import numpy as np
import xarray as xr
import dask
import panel as pn
from dask.distributed import progress
from dask.distributed import Client
from climakitae.cluster import Cluster

# Initialise the Panel extension for this notebook session.
pn.extension()

# Keep variable attributes (e.g. units) when xarray operations create new objects.
xr.set_options(keep_attrs=True)
# Ask dask to split oversized chunks that slicing would otherwise produce.
dask.config.set({"array.slicing.split_large_chunks": True})


# Start a dask cluster so later `.compute()` calls can run in parallel.
# Start-up takes a while, but processing should be faster in the long run.
cluster = Cluster()
# Alternative (smaller) adaptive scaling limit kept for reference:
#cluster.adapt(minimum=0, maximum=16)
# Adaptive scaling between 0 and 30 (presumably workers — confirm against the Cluster docs).
cluster.adapt(minimum=0, maximum=30)
client = cluster.get_client()

# Last expression of the cell: display the client so its summary/link is shown.
client

In [None]:
# VARIABLES
# Bounding box used to clip the data around the watershed of interest.
# latitude = [34.775317,42.432494]
# longitude = [-123.097421,-117.980799]
bbox = {
    "maxy": 42.432494,
    "miny": 34.775317,
    "minx": -123.097421,
    "maxx": -117.980799,
}

# Run list to process and the zip archive its results are written to.
# Exactly one pair is active; to run another batch, comment the active pair
# out and uncomment the wanted one.
# run_list_path, file_zip = "data/GCM_Run_List_All.csv", "GCM_All.zip"
# run_list_path, file_zip = "data/GCM_Run_List_ACCESS-CM2_4.csv", "GCM_Test_4.zip"
# run_list_path, file_zip = "data/GCM_Run_List_1-43.csv", "GCM_1-43.zip"
# run_list_path, file_zip = "data/GCM_Run_List_44-82.csv", "GCM_44-82.zip"
run_list_path = "data/GCM_Run_List_83-117.csv"
file_zip = "GCM_83-117.zip"

# Input locations.
basin_weights_csv = "data/Basin_Weights.csv"
esm_datastore = "https://cadcat.s3.amazonaws.com/cae-collection.json"
mask_path = "mask/mask.npy"

# Output folder and the sub-folder names used inside the zip archive.
output_folder = "outputs"
dir_area_weighted = 'AREA_WEIGHTED_CENTRALVALLEY'
dir_flow_weighted = 'FLOW_WEIGHTED_CENTRALVALLEY'
dir_individual = 'INDIVIDUAL_BASIN_LOCA2'

In [None]:
# Build the region lookup from the basin-weights CSV: integer region ID ->
# region name plus its flow and area weighting ratios.
with open(basin_weights_csv, "r") as weights_file:
    region_dict = {
        int(record["ID"]): {
            "region_name": record["Regions"],
            "flow_ratio": float(record["Flow Ratio"]),
            "area_ratio": float(record["Area Ratio"]),
        }
        for record in csv.DictReader(weights_file)
    }

# Echo the parsed table so the weights can be checked by eye.
for region_id, region_info in region_dict.items():
    print(f"{region_id}: {region_info}")

In [None]:
# removed 99: {'region_name': 'Diversion from Echo Lake', 'weighting_factor': 0.0},
# DEPRECATED
# region_dict = {
#     0:  {'region_name': 'Goose Lake', 'weighting_factor': 0.0},
#     11: {'region_name': 'Westside Streams of SJR', 'weighting_factor': 0.002758980030193925},
#     16: {'region_name': 'Other Rim Inflows of Sac', 'weighting_factor': 0.06520559638738632},
#     9:  {'region_name': 'Eastside Streams of Delta', 'weighting_factor': 0.047054700553417206},
#     17: {'region_name': 'Other Rim Inflows of SJR', 'weighting_factor': 0.008974390104413033},
#     5:  {'region_name': 'Upper Stanislaus River', 'weighting_factor': 0.03673909977078438},
#     8:  {'region_name': 'Lake Millerton', 'weighting_factor': 0.05571430176496506},
#     10: {'region_name': 'Westside Streams of Sac', 'weighting_factor': 0.0789882019162178},
#     12: {'region_name': 'Valley Floor of Sac', 'weighting_factor': 0.06745839864015579},
#     18: {'region_name': 'Lower Yuba-Bear Rim Inflow', 'weighting_factor': 0.018660200759768486},
#     14: {'region_name': 'Tulare Basin', 'weighting_factor': 0.0},
#     1:  {'region_name': 'Lake Shasta', 'weighting_factor': 0.1778690069913864},
#     15: {'region_name': 'Lake Trinity', 'weighting_factor': 0.04051230102777481},
#     2:  {'region_name': 'Upper Feather River', 'weighting_factor': 0.13809999823570251},
#     13: {'region_name': 'Valley Floor of SJR', 'weighting_factor': 0.008356500416994095},
#     3:  {'region_name': 'Upper Yuba River', 'weighting_factor': 0.07005230337381363},
#     19: {'region_name': 'Delta', 'weighting_factor': 0.026663200929760933},
#     4:  {'region_name': 'Upper American River', 'weighting_factor': 0.08627369999885559},
#     6:  {'region_name': 'Upper Tuolumne River', 'weighting_factor': 0.05876690149307251},
#     7:  {'region_name': 'Upper Merced River', 'weighting_factor': 0.030512800440192223}
# }

In [None]:
def get_model_params(run_list_path):
    """Read each set of model parameters from a CSV file.

    Every data row becomes one dictionary keyed by the header-row column
    names; values are returned as the strings found in the file.

    Parameters
    ----------
    run_list_path : str or path-like
        Path to the run-list CSV file.

    Returns
    -------
    list of dict
        One dictionary per data row, in file order (empty if there are no rows).
    """
    # newline="" hands newline handling to the csv module, as its documentation
    # recommends; otherwise newlines inside quoted fields are altered on read.
    with open(run_list_path, "r", newline="") as src:
        return list(csv.DictReader(src))

In [None]:
def get_dataset(esm_datastore, model_params):
    """Return the monthly xarray.Dataset described by one set of model parameters.

    The intake-esm catalog at ``esm_datastore`` is searched for the ``pr``,
    ``tasmax`` and ``tasmin`` variables of the requested run. The matching
    dataset is cut to the ``start_year``..``end_year`` window and converted
    from daily to monthly values. No spatial trimming happens here.
    """
    # Open the catalog of available datasets using the intake-esm package.
    catalog = intake.open_esm_datastore(esm_datastore)

    search_fields = (
        "activity_id",
        "institution_id",
        "table_id",
        "experiment_id",
        "grid_label",
        "member_id",
        "source_id",
    )
    query = {field: model_params[field] for field in search_fields}
    matches = catalog.search(variable_id=['pr', 'tasmax', 'tasmin'], **query)

    # Materialise the search result as a {key: dataset} dictionary.
    datasets = matches.to_dataset_dict(storage_options={'anon': True})

    # The dictionary key is the dot-joined run description built below.
    key_fields = (
        'activity_id',
        'institution_id',
        'source_id',
        'experiment_id',
        'table_id',
        'grid_label',
    )
    dataset_key = "{}.{}.{}.{}.{}.{}".format(*(model_params[field] for field in key_fields))

    # Restrict to the requested years, then aggregate daily values to monthly.
    windowed = slice_by_time_years_dataset(
        datasets[dataset_key],
        model_params['start_year'],
        model_params['end_year'],
    )
    return convert_daily_to_monthly_dataset(windowed)

In [None]:
def add_mask_to_dataset(mask_path, ds):
    """Attach the region mask stored at ``mask_path`` to ``ds`` as coordinate 'mask'.

    The array is registered on the ('lat', 'lon') dimensions. ``ds`` is
    modified in place and also returned.

    Note: the file is loaded with ``allow_pickle=True``, which can execute
    code from pickled object arrays — only load trusted mask files.
    """
    region_mask = np.load(mask_path, allow_pickle=True)
    ds.coords['mask'] = (('lat', 'lon'), region_mask)
    return ds

In [None]:
def convert_daily_to_monthly_dataset(ds):
    """Aggregate a daily dataset to monthly precipitation totals and temperature means.

    ``pr`` is multiplied by 86400 and summed per month; ``tasmin`` and
    ``tasmax`` are averaged per month and shifted by -273.15. The ``units``
    attributes are updated to 'mm/mon' and 'degC'. The two results are merged
    into one dataset.

    Note: the ``pr`` variable of the dataset passed in is overwritten with
    the scaled values.
    """
    # 86400 x kg/m2/s = daily value (mm) — flagged in the original as still needing a check.
    ds['pr'] = ds.pr * 86400
    ds.pr.attrs["units"] = 'mm/day'

    # Precipitation accumulates over the month.
    monthly_pr = ds['pr'].resample(time="M").sum()
    monthly_pr.attrs["units"] = 'mm/mon'

    # Temperatures are averaged over the month, then converted to degC.
    monthly_temp = ds[['tasmin', 'tasmax']].resample(time="M").mean() - 273.15
    monthly_temp.tasmin.attrs["units"] = 'degC'
    monthly_temp.tasmax.attrs["units"] = 'degC'

    # Merge the pieces back into one dataset.
    return xr.merge([monthly_pr, monthly_temp])

In [None]:
def slice_by_time_years_dataset(ds,startyear,endyear):
    """Return ``ds`` selected along ``time`` from ``startyear`` to ``endyear``.

    Both bounds are converted to strings and passed to ``ds.sel`` as a slice.
    """
    year_window = slice(str(startyear), str(endyear))
    return ds.sel(time=year_window)

In [None]:
def trim_dataset_to_bbox(ds, bbox):
    """Clip ``ds`` to the bounding box given by ``bbox``.

    ``bbox`` must provide the keys 'minx', 'miny', 'maxx' and 'maxy'. The
    spatial dimensions (lon/lat) and the CRS (EPSG:4326) are registered on
    ``ds`` in place first; the clipped dataset is returned.

    NOTE(review): the ``.rio`` accessor is not imported explicitly in this
    notebook — presumably another import registers it; confirm.
    """
    # The accessor needs the spatial dims and CRS before it can clip.
    ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
    ds.rio.write_crs("EPSG:4326", inplace=True)

    # Get the subset of data for the watershed.
    bounds = {edge: bbox[edge] for edge in ("minx", "miny", "maxx", "maxy")}
    return ds.rio.clip_box(**bounds)

In [None]:
def get_output_file_name_monthly(model_params,end_part):
    """Return '<source_id>_<experiment_id>_<member_id>_<end_part>.csv' for a model run."""
    run_parts = tuple(
        model_params[field] for field in ('source_id', 'experiment_id', 'member_id')
    )
    return '%s_%s_%s_%s.csv' % (run_parts + (end_part,))

In [None]:
def load_dataset_with_mask(esm_datastore_in, model_params_in, mask_path,bbox):
    """Load one model run as a monthly dataset, attach the region mask and clip it.

    Parameters
    ----------
    esm_datastore_in : str
        Location of the intake-esm datastore passed on to ``get_dataset``.
    model_params_in : dict
        Parameters of the model run passed on to ``get_dataset``.
    mask_path : str
        Path of the mask file passed on to ``add_mask_to_dataset``.
    bbox : dict
        Bounding box passed on to ``trim_dataset_to_bbox``.

    Returns
    -------
    The dataset returned by ``trim_dataset_to_bbox``.
    """
    # Use the arguments, not same-named notebook globals, so the function
    # loads the run it was actually asked for.
    ds = get_dataset(esm_datastore_in, model_params_in)
    ds = add_mask_to_dataset(mask_path, ds)
    ds = trim_dataset_to_bbox(ds, bbox)
    return ds
    

In [None]:
def process_one(esm_datastore_in, model_params_in, mask_path_in=None, bbox_in=None):
    """Load the masked, clipped dataset for a single model run (work in progress).

    Parameters
    ----------
    esm_datastore_in : str
        Location of the intake-esm datastore.
    model_params_in : dict
        Parameters of the model run to load.
    mask_path_in : str, optional
        Mask file path; defaults to the notebook-level ``mask_path``.
    bbox_in : dict, optional
        Clipping bounding box; defaults to the notebook-level ``bbox``.

    Returns
    -------
    The dataset returned by ``load_dataset_with_mask``. The per-basin and
    weighted processing steps listed below are not implemented yet.
    """
    # load_dataset_with_mask needs all four arguments; the mask path and
    # bounding box fall back to the notebook configuration when not given.
    ds = load_dataset_with_mask(
        esm_datastore_in,
        model_params_in,
        mask_path if mask_path_in is None else mask_path_in,
        bbox if bbox_in is None else bbox_in,
    )
    # TODO for each subbasin:
    #     Process outputs without weights and save to CSV.
    #     Process 30 year rolling average and save to CSV.
    # TODO Process with weights. This should be 1 value for all subbasins.
    # TODO Process 30 year rolling average.
    return ds
        

In [None]:
def get_df_map_mask(id_region,ds,use_full_mask = False):
    """Return a monthly table of spatially averaged values for one mask region.

    Cells of ``ds`` whose ``mask`` equals ``id_region`` are kept. With
    ``use_full_mask=True`` the cells whose mask differs from ``id_region``
    are kept instead (e.g. ``id_region=-1`` for the entire domain). ``pr``,
    ``tasmax`` and ``tasmin`` are then averaged over 'lat' and 'lon'.

    The returned frame starts with 'time' (kept for later rolling averages),
    'Year' and 'Month', followed by the three renamed variable columns and
    'Tave (degC)', the row-wise mean of tasmax and tasmin.
    """
    # Spatial mask.
    if use_full_mask:
        selection = ds.mask != id_region
    else:
        selection = ds.mask == id_region
    map_data = ds.where(selection)

    # Spatial means per time step, with units re-attached.
    mean_pr = map_data.pr.mean(['lat', 'lon'], skipna=True)
    mean_pr.attrs["units"] = 'mm/mon'

    mean_tasmin = map_data.tasmin.mean(['lat', 'lon'])
    mean_tasmin.attrs["units"] = 'degC'

    mean_tasmax = map_data.tasmax.mean(['lat', 'lon'])
    mean_tasmax.attrs["units"] = 'degC'

    # Merge and convert to a pandas dataframe indexed by time.
    table = xr.merge([mean_pr, mean_tasmax, mean_tasmin]).to_pandas()
    table = table.drop('spatial_ref', axis=1)

    table['Year'] = table.index.strftime('%Y')
    table['Month'] = table.index.month
    table['Tave (degC)'] = table[['tasmax', 'tasmin']].mean(axis=1)
    table = table.rename(
        {'pr': 'Pr (mm)', 'tasmax': 'Tasmax (degC)', 'tasmin': 'Tasmin (degC)'},
        axis=1,
    )

    # Positional reorder: Year, Month, the three variable columns, then Tave.
    ordered = table.iloc[:, [3, 4, 0, 1, 2, 5]]
    # The time index becomes a regular column; it is needed for the rolling average.
    return ordered.reset_index()
    

In [None]:
def get_weighted_dataframe(df_in, weight):
    """Return a copy of ``df_in`` with its value columns replaced by weighted ones.

    For each of 'Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)' and 'Tave (degC)'
    a '<name> Weighted' column holding ``value * weight`` is appended and the
    unweighted column is removed. All other columns are kept unchanged.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing the four value columns; it is not modified.
    weight : float
        Factor applied to every value column.

    Returns
    -------
    pandas.DataFrame
    """
    value_columns = ['Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)']
    # Work on a copy so the caller's frame is left untouched.
    weighted = df_in.copy()
    for column in value_columns:
        # Use the `weight` argument rather than a notebook-level global.
        weighted[column + ' Weighted'] = df_in[column] * weight
    return weighted.drop(value_columns, axis=1)
    

In [None]:
def get_sum_dataframes(df_in, df_to_add):
    """Add the weighted columns of ``df_to_add`` onto ``df_in`` and return ``df_in``.

    The four '... Weighted' columns are summed element-wise, with missing
    values treated as 0. ``df_in`` is updated in place.
    """
    weighted_columns = (
        'Pr (mm) Weighted',
        'Tasmax (degC) Weighted',
        'Tasmin (degC) Weighted',
        'Tave (degC) Weighted',
    )
    for column in weighted_columns:
        df_in[column] = df_in[column].add(df_to_add[column], fill_value=0)
    return df_in

The loop below iterates over all GCM runs and stores the individual sub-basin, area-weighted, and flow-weighted results in dictionaries, keyed by output filename, with the result DataFrame as the value.

In [None]:
# main loop
# For every model run in the run list: load the masked/clipped monthly dataset,
# compute one table per region, and accumulate area- and flow-weighted sums
# over all regions.
all_model_params = get_model_params(run_list_path)

# Per-region tables, keyed by output filename.
results_dict = {}
# Flow-weighted sums, one table per model run, keyed by output filename.
weighted_results_dict = {}
# Area-weighted sums, one table per model run (despite the "non_weighted" name).
non_weighted_basin_results_dict = {}
for model_params in all_model_params:
    # Load the dataset with the region mask attached, clipped to the bounding box.
    ds = load_dataset_with_mask(esm_datastore, model_params, mask_path,bbox)

    # Force load the dataset so the per-region steps below work on computed data.
    ds = ds.compute()

    # Running flow-weighted (df_w) and area-weighted (df_a) sums for this run.
    df_w = None
    df_a = None
    for id_region, v in region_dict.items():

        # Get this region's results.
        df_n = get_df_map_mask(id_region,ds)
        # Filename suffix is the zero-padded region id followed by "-19", e.g. "01-19".
        output_filename = get_output_file_name_monthly(model_params, '%s-19'%'{:02d}'.format(id_region))
        #print('Adding.. %s'%output_filename)
        # The per-region output omits 'time'; df_n keeps it for the weighted tables.
        df_out = df_n.drop('time' , axis=1)
        results_dict[output_filename] = df_out

        # Get the area-weighted dataframe and add it to the running sum.
        # `weighting_factor` is a notebook-level global; check
        # get_weighted_dataframe before renaming it.
        weighting_factor = v['area_ratio']
        df_weighted_a = get_weighted_dataframe(df_n.copy(deep=True),weighting_factor)
        if df_a is None:
            df_a = df_weighted_a.copy(deep=True)
        else:
            df_a = get_sum_dataframes(df_a,df_weighted_a)

        # Get the flow-weighted results and add them to the running sum.
        weighting_factor = v['flow_ratio']
        df_weighted = get_weighted_dataframe(df_n.copy(deep=True),weighting_factor)
        if df_w is None:
            df_w = df_weighted.copy(deep=True)
        else:
            df_w = get_sum_dataframes(df_w,df_weighted)
    # Progress message: the run's filename stem with the '.csv' extension removed.
    print('Processed %s...'%get_output_file_name_monthly(model_params,'').replace('.csv',''))       
    # Add the weighted dataframes to the output dictionaries.
    output_filename = get_output_file_name_monthly(model_params, "19FlowWeighted")
    weighted_results_dict[output_filename] = df_w
    output_filename = get_output_file_name_monthly(model_params, "19AreaWeighted")
    non_weighted_basin_results_dict[output_filename] = df_a
   


import matplotlib.pyplot as plt

# Quick-look check for one hard-coded run: monthly flow-weighted precipitation
# together with its 360-row (30 years of monthly values) rolling mean.
preview_key = 'ACCESS-CM2_historical_r1i1p1f1_19FlowWeighted.csv'

# The run is only present when the active run list contains it; skip the
# preview instead of stopping the notebook with a KeyError.
if preview_key in weighted_results_dict:
    dftest = weighted_results_dict[preview_key].copy(deep=True)
    dftest['Rolling_pr'] = dftest['Pr (mm) Weighted'].rolling(360).mean()

    # NOTE(review): storing the frame back means the extra 'Rolling_pr' column
    # is also written to the zip export for this one run — confirm intended.
    weighted_results_dict[preview_key] = dftest
    plt.plot(dftest['time'], dftest['Pr (mm) Weighted'])
    plt.plot(dftest['time'], dftest['Rolling_pr'])
    plt.title('Data With Rolling Average')
    plt.show()
else:
    print('Skipping preview plot: %s is not in weighted_results_dict.' % preview_key)


In [None]:
# Write all result tables to one zip archive.
# Create the output folder first: ZipFile cannot create a file in a missing folder.
os.makedirs(output_folder, exist_ok=True)
zip_path = os.path.join(output_folder, file_zip)


def _frame_to_csv_text(frame):
    """Return ``frame`` serialised as CSV text without the index column."""
    text_stream = StringIO()
    frame.to_csv(text_stream, index=False)
    return text_stream.getvalue()


def _write_weighted_frames(zf, folder, frames_by_name):
    """Write each frame with 'time' to <folder>/Raw/ and without it to <folder>/."""
    for name, frame in frames_by_name.items():
        zf.writestr(folder + '/Raw/' + name, _frame_to_csv_text(frame))
        # 'time' is dropped in place, as before, so the stored frames lose it.
        # Re-running this cell without re-running the main loop therefore
        # raises a KeyError here.
        frame.drop('time' , axis=1, inplace=True)
        zf.writestr(folder + '/' + name, _frame_to_csv_text(frame))


with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    # Individual per-region tables.
    for k, v in results_dict.items():
        zf.writestr(dir_individual + '/' + k, _frame_to_csv_text(v))
    # Flow-weighted tables, then area-weighted tables.
    _write_weighted_frames(zf, dir_flow_weighted, weighted_results_dict)
    _write_weighted_frames(zf, dir_area_weighted, non_weighted_basin_results_dict)
    

In [None]:
# Close the dask client; the cells below in this notebook only use the
# in-memory result dictionaries.
# NOTE(review): `cluster` itself is not closed here — confirm whether it should be shut down too.
client.close()

In [None]:
# Per-region keys end in '<two-digit region id>-19.csv' (built in the main
# loop), so region 1 of this run is stored under '..._01-19.csv'.
df = results_dict['ACCESS-CM2_historical_r1i1p1f1_01-19.csv']

In [None]:
# Number of per-region result tables collected by the main loop.
len(results_dict)

In [None]:
# Number of flow-weighted result tables (one per output filename).
len(weighted_results_dict)

In [None]:
# Collect the flow-weighted frames in dictionary order.
weighted_df_list = list(weighted_results_dict.values())

In [None]:
from functools import reduce

In [None]:
# Element-wise sum of all flow-weighted frames, treating missing values as 0.
# NOTE(review): this also "adds" the Year/Month columns of every frame and
# raises on an empty list — confirm that this total is meaningful.
weighted_result_df = reduce(lambda a, b: a.add(b, fill_value=0), weighted_df_list)

In [None]:
# Display the combined frame.
weighted_result_df

In [None]:
# List the output filenames of all per-region tables.
for result_name in results_dict:
    print(result_name)

In [None]:
# Write the per-region tables to a flat test archive (no sub-folders).
zip_path = os.path.join(output_folder, "test5.zip")

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for member_name, frame in results_dict.items():
        csv_buffer = StringIO()
        frame.to_csv(csv_buffer, index=False)
        archive.writestr(member_name, csv_buffer.getvalue())