# Investigate the impacts of all realizations on all users 

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import duckdb
import ast
import matplotlib.pyplot as plt
import itertools

## Load drought counts and classify each realization

In [2]:
# Read the per-realization drought counts, then relabel the rows 1..N so a
# row label can be used directly as the realization order number.
drought_counts_path = '../rival_framings_demand/drought_counts_all_realizations.csv'
droughts_df = pd.read_csv(drought_counts_path, index_col=0)
droughts_df.index = np.arange(1, droughts_df.shape[0] + 1)

In [3]:
# We will use these counts to classify whether a realization belongs
# in history, paleo variability, or all-encompassing experiments

# Express the drought counts as a percentage of 105 (presumably the number of
# years in each realization -- TODO confirm) once, instead of recomputing the
# same expression inside every condition.
decadal_pct = droughts_df['Decadal counts'] / 105 * 100
multidecadal_pct = droughts_df['Multidecadal counts'] / 105 * 100

# create a list of our conditions (np.select uses the first one that matches)
conditions = [
    # history criteria
    (decadal_pct <= 21) & (multidecadal_pct == 0),
    # paleo criteria
    ((decadal_pct > 21) | (multidecadal_pct > 0))
    & (decadal_pct <= 57) & (multidecadal_pct <= 42),
    # all-encompassing criteria
    (decadal_pct > 57) | (multidecadal_pct > 42),
]

# create a list of the values we want to assign for each condition
values = ['History', 'Paleo', 'Encompassing']

# create a new column and use np.select to assign values to it using our lists as arguments.
# An explicit string default is required: np.select's implicit default is the
# integer 0, which newer NumPy releases refuse to combine with string choices
# (TypeError) and which older releases silently turn into the label '0'.
# Rows matching no condition (e.g. missing counts) are labelled 'Unclassified'.
droughts_df['Classification'] = np.select(conditions, values, default='Unclassified')

# display updated DataFrame
droughts_df.head()

Unnamed: 0,Decadal counts,Multidecadal counts,Total counts,Drought years,Classification
1,0,0,0,"[[], []]",History
2,21,0,21,"[[], [79, 80, 81, 82, 83, 84, 85, 86, 87, 88, ...",History
3,0,50,50,"[[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, ...",Encompassing
4,15,0,15,"[[], [96, 97, 98, 99, 100, 101, 102, 103, 89, ...",History
5,22,0,22,"[[], [91, 92, 93, 94, 95, 96, 97, 98, 99, 36, ...",History


In [4]:
# Tally how many realizations were assigned to each class
classification_counts = droughts_df['Classification'].value_counts()
classification_counts

Encompassing    533
History         304
Paleo           163
Name: Classification, dtype: int64

In [5]:
# directory where the experiment data is stored; the realization loop below
# reads one parquet file per realization from it (S<sample>_<realization>.parquet)
flow_data_dir = '../rival_framings_demand/xdd_parquet_flow'

## Loop through every realization and calculate summary impact metrics

Function converting order number to sample and realization for file retrieval

In [6]:
def no_to_realization(x):
    """Convert a 1-based order number into a (sample, realization) pair.

    Order numbers are laid out in consecutive groups of ten: 1-10 map to
    sample 1 with realizations 1-10, 11-20 map to sample 2, and so on.
    """
    offset = x - 1
    sample = int(offset / 10) + 1
    realization = offset % 10 + 1
    return sample, realization

Create empty dataframe to store impacts summary per realization

In [7]:
# Column name -> dtype for the per-realization impact summary
dtypes = np.dtype([("Duration", int), ("Magnitude", float), ("%_users", float)])

# One zero-filled row per realization, labelled 1..1000 so that row labels
# match the realization order numbers.
n_realizations = 1000
df_impacts = pd.DataFrame(
    {name: np.zeros(n_realizations, dtype=dtypes[name]) for name in dtypes.names},
    index=np.arange(1, n_realizations + 1),
)

Define function to calculate duration of continuous occurrences

In [8]:
def shortage_duration(sequence, threshold):
    """Return the lengths of consecutive runs where values reach the threshold.

    Parameters
    ----------
    sequence : indexable sequence of numbers (positions 0..len-1 are read)
    threshold : value an element must meet or exceed to count as a shortage

    Returns
    -------
    list of int
        One entry per uninterrupted run of elements >= threshold, in order
        of appearance. Empty when no element reaches the threshold.
    """
    run_lengths = []
    current_run = 0
    for position in range(len(sequence)):
        if sequence[position] >= threshold:
            # still inside a shortage run
            current_run += 1
        elif current_run:
            # run just ended; record it and start over
            run_lengths.append(current_run)
            current_run = 0
    # a run that reaches the end of the sequence is still open here
    if current_run:
        run_lengths.append(current_run)
    return run_lengths

Loop through all realizations and calculate user impacts

In [9]:
# Number of users the per-user arrays are sized for; also the denominator of
# the "%_users" metric.
n_users = 343

for k in df_impacts.index:

    # Per-user stores, reset for every realization
    duration_array = np.zeros(n_users)              # longest run at/above the user's mean ratio, divided by 12
    magnitude_array = np.zeros(n_users)             # mean shortage-to-demand ratio (%) over the drought records
    experienced_shortage_array = np.zeros(n_users)  # 1 if the user saw any shortage, else 0

    realization_index = k
    # get sample and realization for file retrieval
    sample, real = no_to_realization(realization_index)
    # get years that are in drought in specific realization; the stored values
    # are offsets, so 1908 is added to compare them against the `year` column
    drought_years = ast.literal_eval(droughts_df.at[int(realization_index), 'Drought years'])
    years = tuple(x + 1908 for x in itertools.chain.from_iterable(drought_years))

    # only continue if there's at least one drought year
    # if there is no drought, values will stay default (zero)
    if not years:
        continue

    # target path of this realization's parquet file
    glob_path = os.path.join(flow_data_dir, f'S{sample}_{real}.parquet')

    # Report a missing file up front so the message does not depend on which
    # exception type the installed duckdb raises for an unreadable path.
    # The realization's impacts stay at their default (zero).
    if not os.path.exists(glob_path):
        print(f'missing file S{sample}_{real}')
        continue

    # Build the IN list explicitly: interpolating a one-element tuple would
    # render as "(1987,)" with a trailing comma inside the SQL.
    years_sql = ", ".join(str(year) for year in years)

    # filter out gauge structures and keep only drought years
    sql = f"""
    SELECT 
        *
    FROM
        '{glob_path}'
    WHERE
        year IN ({years_sql})
        AND structure_id NOT LIKE '09%' ; 
    """
    # Only the query is guarded, so an unrelated error in the calculations
    # below is no longer reported as a missing file.
    try:
        df = duckdb.query(sql).df()
    except RuntimeError:
        print(f'missing file S{sample}_{real}')
        continue

    # Calculate shortage to demand percentage (0/0 becomes 0)
    df['ratio'] = (df['shortage'] / df['demand'] * 100).fillna(0)

    # Loop through all users and calculate individual impacts.
    # sort=False keeps users in order of first appearance and each group
    # keeps its rows in their original order.
    for user_index, (_, user_impacts) in enumerate(df.groupby('structure_id', sort=False)):
        user_ratio = user_impacts['ratio']
        if np.sum(user_ratio) > 0:  # check if user experienced any impacts
            experienced_shortage_array[user_index] = 1
            # calculate mean user impacts
            magnitude_array[user_index] = np.around(np.mean(user_ratio), decimals=1)
            # calculate longest duration of at least mean
            user_shrt_dur = shortage_duration(user_ratio.values,
                                              magnitude_array[user_index])
            if user_shrt_dur:
                # /12 presumably converts monthly records to years -- TODO confirm
                duration_array[user_index] = np.around(np.max(user_shrt_dur) / 12, decimals=1)

    # Summarize impacts across all users
    df_impacts.at[realization_index, "%_users"] = np.around(np.sum(experienced_shortage_array) / n_users * 100, decimals=0)
    df_impacts.at[realization_index, "Magnitude"] = np.around(np.mean(magnitude_array), decimals=0)
    # Average the per-user longest durations. Previously this averaged
    # `user_shrt_dur`, i.e. only the run lengths of the last user processed
    # (possibly left over from an earlier realization, or undefined).
    df_impacts.at[realization_index, "Duration"] = np.around(np.mean(duration_array), decimals=0)

missing file S7_3
missing file S28_6
missing file S49_1
missing file S93_2
missing file S93_7
missing file S94_5


In [12]:
# Attach each realization's class (rows are matched on the shared index)
# and write the combined summary to disk.
impacts_csv_path = 'drought_impacts_all_realizations.csv'
df_impacts['Classification'] = droughts_df['Classification']
df_impacts.to_csv(impacts_csv_path)