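Extracts precipitation (pr) and minimum/maximum temperature (tasmin, tasmax) for each GCM run listed in a run-list CSV, clips the data to a bounding box, averages it over each basin defined by a mask, and writes one CSV per run and basin into a zip archive.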

In [None]:
import csv
from io import StringIO
import json
import os
import tempfile
import zipfile

import pandas as pd
import intake
from shapely.geometry import Point, Polygon
import geopandas as gpd
import numpy as np
import xarray as xr
import dask
import panel as pn
from dask.distributed import progress
from dask.distributed import Client
from climakitae.cluster import Cluster

# VARIABLES
# Use these coordinates to clip around the watershed of interest.
# latitude = [34.775317,42.432494]
# longitude = [-123.097421,-117.980799]
# Bounding box passed to trim_dataset_to_bbox (x = longitude, y = latitude).
bbox = {
    "maxy": 42.432494,
    "miny": 34.775317,
    "minx": -123.097421,
    "maxx": -117.980799,
}


# Run list / output archive selection.
# NOTE: the pairs below are assigned one after another, so only the LAST
# uncommented pair takes effect.  Comment out the later pairs (or move the
# wanted pair to the end) to process a different batch.

# run_list_path = "data/GCM_Run_List_All.csv"
# file_zip = "GCM_All.zip"

#run_list_path = "data/GCM_Run_List_ACCESS-CM2_4.csv"
#file_zip = "GCM_Test_4.zip"


run_list_path = "data/GCM_Run_List_1-43.csv"
file_zip = "GCM_1-43.zip"

run_list_path = "data/GCM_Run_List_44-82.csv"
file_zip = "GCM_44-82.zip"

run_list_path = "data/GCM_Run_List_83-117.csv"
file_zip = "GCM_83-117.zip"

# Currently active selection (last assignment wins).
run_list_path = "data/GCM_Run_List_QC_1Year.csv"
file_zip = "GCM_QC.zip"

# CSV with one row per basin: ID, Regions, Flow Ratio, Area Ratio.
basin_weights_csv = "data/Basin_Weights.csv"
# intake-esm catalog opened by get_dataset.
esm_datastore = "https://cadcat.s3.amazonaws.com/cae-collection.json"
# Folder that receives the NetCDF files and the zip archive.
output_folder = "outputs"
# NumPy file holding the basin-ID mask that add_mask_to_dataset attaches on (lat, lon).
mask_path = "mask/mask.npy"
# Directory names used inside the output zip.  Only dir_individual is used by
# the code in this file; the others are defined for the weighted / rolling outputs.
dir_area_weighted = 'AREA_WEIGHTED_CENTRALVALLEY'
dir_flow_weighted = 'FLOW_WEIGHTED_CENTRALVALLEY'
dir_individual = 'INDIVIDUAL_BASIN_LOCA2'
dir_area_weighted_rolling = 'AREA_WEIGHTED_30_YEAR_ROLLING_AVE_CENTRALVALLEY'
dir_flow_weighted_rolling = 'FLOW_WEIGHTED_30_YEAR_ROLLING_AVE_CENTRALVALLEY'

# Initialise the Panel notebook extension.
pn.extension()
# Ask xarray to keep variable attributes (e.g. units) through operations.
xr.set_options(keep_attrs=True)
dask.config.set({"array.slicing.split_large_chunks": True})
# Start a dask cluster for faster computing.
# Note: this takes a while to start, but processing should be faster in the
# long run once compute() is called.
cluster = Cluster()
# Let the cluster scale adaptively between 0 and 16 workers.
cluster.adapt(minimum=0, maximum=16)
client = cluster.get_client()

# Display the client (in a notebook this shows the client summary / link).
client

In [None]:
# Build the basin lookup: basin ID -> region name plus flow and area weighting ratios,
# read from the basin weights CSV (columns: ID, Regions, Flow Ratio, Area Ratio).
with open(basin_weights_csv, "r") as csv_file:
    region_dict = {
        int(record["ID"]): {
            "region_name": record["Regions"],
            "flow_ratio": float(record["Flow Ratio"]),
            "area_ratio": float(record["Area Ratio"]),
        }
        for record in csv.DictReader(csv_file)
    }
#for k, v in region_dict.items():
#    print(f"{k}: {v}")

In [None]:
def get_model_params(run_list_path):
    """Load the model run list from a CSV file.

    Every data row of the file at ``run_list_path`` becomes one dictionary
    keyed by the CSV header names.

    Return the dictionaries as a list, in file order.
    """
    with open(run_list_path, "r") as run_list_file:
        return list(csv.DictReader(run_list_file))

In [None]:
def get_dataset(esm_datastore, model_params):
    """Return the xarray dataset described by one row of model parameters.

    The intake-esm catalog at ``esm_datastore`` is searched for the ``pr``,
    ``tasmax`` and ``tasmin`` variables of the run identified by
    ``model_params``.  The matching dataset is cut down to the
    ``start_year``..``end_year`` window and its units are converted
    (see ``convert_units_only_dataset``).
    """
    catalog = intake.open_esm_datastore(esm_datastore)

    # Search criteria: everything comes from model_params except the variables.
    search_terms = {
        "activity_id": model_params["activity_id"],
        "institution_id": model_params["institution_id"],
        "table_id": model_params["table_id"],
        "variable_id": ['pr', 'tasmax', 'tasmin'],
        "experiment_id": model_params["experiment_id"],
        "grid_label": model_params["grid_label"],
        "member_id": model_params["member_id"],
        "source_id": model_params["source_id"],
    }
    matches = catalog.search(**search_terms)

    # Open the matching catalog entries (anonymous access to the storage).
    datasets_by_key = matches.to_dataset_dict(storage_options={'anon': True})

    # The wanted entry is looked up under this dot-separated key.
    dataset_key = (
        f"{model_params['activity_id']}.{model_params['institution_id']}."
        f"{model_params['source_id']}.{model_params['experiment_id']}."
        f"{model_params['table_id']}.{model_params['grid_label']}"
    )

    # Restrict to the requested years, then convert units.
    windowed = slice_by_time_years_dataset(
        datasets_by_key[dataset_key],
        model_params['start_year'],
        model_params['end_year'],
    )
    return convert_units_only_dataset(windowed)

In [None]:
def add_mask_to_dataset(mask_path, ds):
    """Attach the mask array stored at ``mask_path`` to ``ds``.

    The array is registered as the ``mask`` coordinate on the
    ``('lat', 'lon')`` dimensions.  ``ds`` is modified in place and returned.
    """
    # NOTE(review): allow_pickle=True lets NumPy unpickle objects from the
    # file, so the mask file must come from a trusted source.
    with open(mask_path, 'rb') as mask_file:
        basin_mask = np.load(mask_file, allow_pickle=True)
    mask_dims = ('lat', 'lon')
    ds.coords['mask'] = (mask_dims, basin_mask)
    return ds

In [None]:
def convert_units_only_dataset(ds):
    """ Convert the units of pr, tasmin and tasmax without changing the time step.
        Precip is multiplied by 86400 (kg/m2/s -> mm per day) and labelled 'mm/day'.
        Temperatures have 273.15 subtracted (K -> C) and are labelled 'degC'.
        Returns a new dataset holding the three converted variables.
    """
    # 86400 seconds per day: kg/m2/s x 86400 = daily value (mm)
    precip_mm = ds.pr * 86400
    precip_mm.attrs["units"] = 'mm/day'

    # Kelvin to Celsius for both temperature variables.
    temps_c = ds[['tasmin','tasmax']] - 273.15
    for temp_name in ('tasmin', 'tasmax'):
        temps_c[temp_name].attrs["units"] = 'degC'

    # Recombine precipitation and temperatures into a single dataset.
    return xr.merge([precip_mm, temps_c])

In [None]:
def convert_daily_to_monthly_dataset(ds):
    """ Convert daily values to monthly.  Precip is accumulated and temperature is averaged.
        Precip is converted to mm from kg/m2/s (86400 x kg/m2/s = daily value in mm)
        and summed per month; the result is labelled 'mm/mon'.
        Temperature is averaged per month and converted to C (C = K - 273.15).

        The input dataset is left untouched: the scaled precipitation is kept in a
        local variable instead of being written back into ``ds``, so calling this
        function twice on the same dataset no longer scales ``pr`` twice.

        Returns a new dataset with monthly pr, tasmin and tasmax.
    """
    # 86400 x kg/m2/s = daily value (mm).  Do NOT assign this back to ds['pr'].
    precip_daily = ds.pr * 86400
    ds_precip = precip_daily.resample(time="M").sum()
    ds_precip.attrs["units"] = 'mm/mon'

    # Monthly mean temperatures, converted from K to C.
    ds_temp = ds[['tasmin','tasmax']].resample(time="M").mean()
    ds_temp = ds_temp[['tasmin','tasmax']] - 273.15
    ds_temp.tasmin.attrs["units"]  = 'degC'
    ds_temp.tasmax.attrs["units"]  = 'degC'

    # Merge the pieces back into one dataset.
    return xr.merge([ds_precip,ds_temp])

In [None]:
def slice_by_time_years_dataset(ds,startyear,endyear):
    """ Return ds restricted along time to startyear..endyear.

    Both bounds are converted to strings and passed to ``ds.sel`` as a slice.
    """
    year_window = slice(str(startyear), str(endyear))
    return ds.sel(time=year_window)

In [None]:
def trim_dataset_to_bbox(ds, bbox):
    """ Clip the dataset to the rectangle described by bbox.

    bbox must provide the keys "minx", "miny", "maxx" and "maxy".
    The spatial dimensions and CRS are set on ds in place before clipping;
    the clipped dataset is returned.
    """
    # The .rio accessor needs to know the spatial dims and CRS before it can clip.
    # NOTE(review): .rio is presumably registered by rioxarray, which this file
    # does not import directly - confirm a dependency imports it.
    ds.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
    ds.rio.write_crs("EPSG:4326", inplace=True)

    # Subset of the data covering the watershed.
    bounds = {edge: bbox[edge] for edge in ("minx", "miny", "maxx", "maxy")}
    return ds.rio.clip_box(**bounds)

In [None]:
def get_output_file_name_monthly(model_params,end_part):
    """ Build '<source_id>_<experiment_id>_<member_id>_<end_part>.csv' from model_params. """
    name_parts = (
        model_params['source_id'],
        model_params['experiment_id'],
        model_params['member_id'],
        end_part,
    )
    return "_".join(str(part) for part in name_parts) + ".csv"

In [None]:
def load_dataset_with_mask(esm_datastore_in, model_params_in, mask_path,bbox):
    """ Loads the dataset, adds the mask, and trims the dataset to the box around the area of interest.

    esm_datastore_in: catalog location passed to get_dataset.
    model_params_in:  parameters of the run to load, passed to get_dataset.
    mask_path:        path of the mask file passed to add_mask_to_dataset.
    bbox:             bounding box passed to trim_dataset_to_bbox.

    Returns the masked, clipped dataset.
    """
    # Use the arguments that were passed in.  The previous version ignored them
    # and read the module-level names esm_datastore / model_params instead, so it
    # only worked when globals with exactly those names happened to exist.
    ds = get_dataset(esm_datastore_in, model_params_in)
    ds = add_mask_to_dataset(mask_path, ds)
    ds = trim_dataset_to_bbox(ds, bbox)
    return ds
    

In [None]:
def get_df_map_mask(id_region,ds,use_full_mask = False):
    """ Return a dataframe of spatially averaged pr, tasmax and tasmin for one region.

    By default only cells whose mask equals id_region are kept.  With
    use_full_mask=True the test is inverted and every cell whose mask differs
    from id_region is kept (e.g. id_region = -1 to keep the whole masked domain).

    The returned frame keeps the time column (needed for the rolling average)
    followed by Year, Month and the renamed value columns.
    """
    if use_full_mask:
        keep_cells = ds.mask != id_region
    else:
        keep_cells = ds.mask == id_region
    masked = ds.where(keep_cells)

    # Average every variable over the spatial dimensions.
    mean_pr = masked.pr.mean(['lat','lon'],skipna=True)
    mean_pr.attrs["units"] = 'mm/day'

    mean_tasmin = masked.tasmin.mean(['lat','lon'])
    mean_tasmin.attrs["units"] = 'degC'

    mean_tasmax = masked.tasmax.mean(['lat','lon'])
    mean_tasmax.attrs["units"] = 'degC'

    # Combine into one table indexed by time.
    frame = xr.merge([mean_pr, mean_tasmax, mean_tasmin]).to_pandas()
    frame = frame.drop(columns='spatial_ref')

    frame['Year'] = frame.index.strftime('%Y')
    frame['Month'] = frame.index.month
    frame['Tave (degC)'] = frame[['tasmax','tasmin']].mean(axis=1)
    frame = frame.rename(
        columns={'pr': 'Pr (mm)','tasmax': 'Tasmax (degC)','tasmin' : 'Tasmin (degC)'}
    )

    # Positional reorder: moves Year and Month in front of the value columns
    # (assumes the three value columns come first at this point - TODO confirm).
    # The time index is turned into a column rather than dropped.
    return frame.iloc[:,[3,4,0,1,2,5]].reset_index()
    

In [None]:
def get_weighted_dataframe(df_in, weight):
    """ Computes the weighted values of df_in and returns the resulting dataframe.

    The 'Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)' and 'Tave (degC)' columns are
    multiplied by ``weight``; all other columns are carried over unchanged.

    The result is a new dataframe.  df_in itself is not modified, so a frame
    that is also stored elsewhere (e.g. as an individual-basin result) keeps
    its unweighted values.
    """
    # The previous version multiplied by an undefined name (weighting_factor)
    # instead of the ``weight`` parameter.
    df_weighted = df_in.copy()
    for column in ('Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)'):
        df_weighted[column] = df_weighted[column] * weight
    return df_weighted
    

In [None]:
def get_sum_dataframes(df_in, df_to_add):
    """ Accumulate the value columns of df_to_add onto df_in and return df_in.

    'Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)' and 'Tave (degC)' are added
    row by row; a value missing on one side only is treated as 0.
    df_in is updated in place.
    """
    for column in ('Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)'):
        df_in[column] = df_in[column].add(df_to_add[column], fill_value=0)
    return df_in

In [None]:
def get_rolling_ave(dict_df_weighted_all,num_month_rolling,append_name):
    """Gets the rolling average for each scenario (ssp) run appended to its historical run.

    dict_df_weighted_all: dict with file names as keys and dataframes as values.
        File names are split on '_': part 0 is the GCM, part 1 the experiment
        and part 2 the realization.  Entries whose file name contains
        'historical' are treated as historical runs, all others as scenarios.
    num_month_rolling: size of the rolling window, in rows of the concatenated
        frame (previously ignored in favour of a hard-coded 360).
    append_name: last part of the generated output file names.

    For every historical frame, each scenario frame of the same GCM and
    realization is appended to it and the 'Pr (mm)', 'Tasmax (degC)',
    'Tasmin (degC)' and 'Tave (degC)' columns are replaced by their rolling mean.
    Historical runs with no matching scenario (and scenarios lacking the
    realization) are skipped instead of raising KeyError.

    Returns a dict with '<gcm>_<scenario>_<realization>_<append_name>.csv' keys
    and the rolling-average dataframes as values.  The input frames are not modified.
    """
    value_columns = ('Pr (mm)', 'Tasmax (degC)', 'Tasmin (degC)', 'Tave (degC)')

    # Group the frames as gcm -> experiment -> realization -> [dataframes].
    dict_gcm_hist_realization = {}   # historical dataframes
    dict_gcm_other_realization = {}  # scenario (ssp) dataframes
    for key_file, value_df in dict_df_weighted_all.items():
        lst_file_parts = key_file.split('_')
        if 'historical' in key_file:
            target = dict_gcm_hist_realization
        else:
            target = dict_gcm_other_realization
        by_experiment = target.setdefault(lst_file_parts[0], {})
        by_realization = by_experiment.setdefault(lst_file_parts[1], {})
        by_realization.setdefault(lst_file_parts[2], []).append(value_df)

    # Rolling average of history + scenario, keyed by output file name.
    dict_fil_rolling_df = {}
    for key_gcm, hist_by_experiment in dict_gcm_hist_realization.items():
        # A GCM may have historical runs only (e.g. when the run list is split).
        scenarios_for_gcm = dict_gcm_other_realization.get(key_gcm, {})
        for hist_by_realization in hist_by_experiment.values():
            for key_realization, hist_frames in hist_by_realization.items():
                for df_history in hist_frames:
                    for key_ssp_other, ssp_by_realization in scenarios_for_gcm.items():
                        for df_ssp in ssp_by_realization.get(key_realization, []):
                            # concat builds a new frame, so the inputs stay untouched.
                            df_rolling = pd.concat([df_history,df_ssp], axis=0)
                            for column in value_columns:
                                df_rolling[column] = df_rolling[column].rolling(num_month_rolling).mean()
                            file_out = '%s_%s_%s_%s.csv'%(key_gcm,key_ssp_other,key_realization,append_name)
                            dict_fil_rolling_df[file_out] = df_rolling
    return dict_fil_rolling_df
    

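The loop below goes through all GCM runs in the run list and stores results in dictionaries keyed by output file name, with the result dataframe as the value. Dictionaries are defined for the individual sub-basin, the area-weighted, and the flow-weighted results; the loop as written fills only the individual sub-basin dictionary.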

In [None]:
# main loop
# For every run in the run list: load the masked, clipped dataset, save the
# masked grid as NetCDF, and collect one dataframe per basin in results_dict.
all_model_params = get_model_params(run_list_path)

# The NetCDF files are written into output_folder; create it if it is missing
# so the first to_netcdf call does not fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# Define output dicts (file name -> dataframe).
results_dict = {}
flow_weighted_results_dict = {}
area_weighted_basin_results_dict = {}

for model_params in all_model_params:
    # Load the dataset with the mask attached, clipped to the bounding box.
    ds = load_dataset_with_mask(esm_datastore, model_params, mask_path,bbox)

    # Force load the dataset.
    ds = ds.compute()
    output_filename = '%s_%s_%s_%s.nc'%(model_params['source_id'],model_params['experiment_id'],model_params['member_id'],'1950')
    # Keep every cell whose mask value is not -1 and save that grid.
    map_data = ds.where(ds.mask != -1)
    map_data.to_netcdf(os.path.join(output_folder, output_filename))

    # Placeholders for weighted accumulators; not used in this cell.
    df_w = None
    df_a = None
    for id_region in region_dict:
        # Get this region's results.
        df_n = get_df_map_mask(id_region,ds)
        output_filename = get_output_file_name_monthly(model_params, '%s-19'%'{:02d}'.format(id_region))
        results_dict[output_filename] = df_n
    
   

In [None]:
# Write the individual-basin results to the output zip archive.
# Create the output folder first so opening the archive cannot fail because
# the directory is missing.
os.makedirs(output_folder, exist_ok=True)
zip_path = os.path.join(output_folder, file_zip)

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    for k, v in results_dict.items():
        # k is the CSV file name, v the dataframe; to_csv without a target
        # returns the CSV text, which is stored under the individual-basin directory.
        fileout = dir_individual + '/' + k
        zf.writestr(fileout, v.to_csv(index=False))

    

In [None]:
# Close the dask client created during setup now that all outputs are written.
client.close()