In [1]:
import os
import warnings
from datetime import datetime
from math import log10
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from scipy.stats import ecdf
from scipy.stats import linregress

In [3]:
# Read the CSV for gage 10378500 into a DataFrame for a quick look.
f = pd.read_csv(
    filepath_or_buffer="../data/pnwNP_modeledData/10378500.csv",
)

In [4]:
# Display the loaded frame as the cell output (the rendering elides the middle rows).
f

Unnamed: 0.1,Unnamed: 0,streamflow_VIC,time,streamflow_PRMS,streamflow_NWM2d0,streamflow_NWM2d1,streamflow_NWIS,gage
0,0,0.767205,1950-01-01,12.308974,,,0.045307,10378500
1,1,1.181407,1950-01-02,17.368828,,,0.042475,10378500
2,2,1.245326,1950-01-03,15.873023,,,0.042475,10378500
3,3,1.250622,1950-01-04,13.633054,,,0.045307,10378500
4,4,1.247074,1950-01-05,11.714673,,,0.048139,10378500
...,...,...,...,...,...,...,...,...
25928,25928,,2020-12-27,,,0.720000,,10378500
25929,25929,,2020-12-28,,,0.718333,,10378500
25930,25930,,2020-12-29,,,0.710000,,10378500
25931,25931,,2020-12-30,,,0.710000,,10378500


In [None]:
# Names of the per-gage CSV files to process. os.listdir returns entries in
# arbitrary order, so sort them: metrics_fun/execute address a file by its
# position in this list, and the order also fixes the row order of the output.
files = sorted(
    name
    for name in os.listdir('data/daily_data_with_ climate_and_PET/csv')
    if name.endswith('.csv')
)

In [None]:
def metrics_fun(n):
    def recession_fun(m):
        t = df[df['event_id'] == m].copy()
        t['nf_start'].fillna(0, inplace=True)
        
        if t['nf_start'].sum() != 0 and t['q'].iloc[0] != 0 and t['q_peak'].sum() != 0:
            t['dry_event_id'] = t['nf_start'].cumsum()
            t['dry_event_id'] = t['dry_event_id'].where(t['q'] > 0, 0)
            
            dry_date = t[t['dry_event_id'] > 0].groupby('dry_event_id').agg({'date': 'min'}).reset_index()
            dry_date = dry_date.sort_values(by=['n', 'date'], ascending=[False, True]).iloc[0]['date']
            
            t = t[t['date'] <= dry_date]
            
            event_id = t['event_id'].iloc[0]
            peak_date = datetime.strptime(t['date'].iloc[0], '%Y-%m-%d').timetuple().tm_yday
            peak_value = t['q'].iloc[0]
            peak_quantile = ecdf(df['q'])(peak_value)
            peak2zero = len(t)
            
            t['dQ'] = t['q'].shift() - t['q']
            t = t[t['dQ'] >= 0]
            
            model = pd.DataFrame({'log10_dQ': log10(t['dQ'] + 0.1), 'log10_q': log10(t['q'] + 0.1)})
            model = model.dropna()
            
            drying_rate = model['log10_q'].iloc[1]
            p_value = model['log10_dQ'].iloc[1]
            
            output = pd.DataFrame({'event_id': [event_id], 
                                   'peak_date': [peak_date], 
                                   'peak_value': [peak_value], 
                                   'peak_quantile': [peak_quantile], 
                                   'peak2zero': [peak2zero], 
                                   'drying_rate': [drying_rate], 
                                   'p_value': [p_value]})
        else:
            output = pd.DataFrame({'event_id': [t['event_id'].iloc[0]],
                                   'peak_date': [None], 'peak_value': [None],
                                   'peak_quantile': [None],
                                   'peak2zero': [None],
                                   'drying_rate': [None],
                                   'p_value': [None]})
        
        return output
    
    def dry_fun(m):
        t = df[df['event_id'] == m].copy()
        t['nf_start'].fillna(0, inplace=True)
        
        if t['nf_start'].sum() != 0:
            t['dry_event_id'] = t['nf_start'].cumsum()
            t['dry_event_id'] = t['dry_event_id'].where(t['q'] > 0, 0)
            
            dry_event = t[t['dry_event_id'] > 0].groupby('dry_event_id').agg({'date': 'min'}).reset_index()
            dry_event = dry_event.sort_values(by=['n', 'date'], ascending=[False, True]).iloc[0]['dry_event_id']
            
            t = t[t['dry_event_id'] == dry_event]
            
            output = pd.DataFrame({'event_id': [t['event_id'].iloc[0]],
                                   'calendar_year': [t['date'].iloc[0].year],
                                   'season': ['Winter' if t['date'].iloc[0].month <= 3 else 'Spring' if t['date'].iloc[0].month <= 6 else 'Summer' if t['date'].iloc[0].month <= 9 else 'Fall'],
                                   'meteorologic_year': [t['date'].iloc[0].year - 1 if t['season'].iloc[0] == 'Winter' else t['date'].iloc[0].year],
                                   'dry_date_start': [datetime.strptime(t['date'].iloc[0], '%Y-%m-%d').timetuple().tm_yday],
                                   'dry_date_mean': [t['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').timetuple().tm_yday).mean()],
                                   'dry_dur': [len(t)]})
        else:
            output = pd.DataFrame({'event_id': [t['event_id'].iloc[0]],
                                   'calendar_year': [None],
                                   'season': [None], 
                                   'meteorologic_year': [None],
                                   'dry_date_start': [None],
                                   'dry_date_mean': [None],
                                   'dry_dur': [None]})
        
        return output
    
    gage = os.path.splitext(files[n])[0]
    
    df = pd.read_csv(os.path.join('data/daily_data_with_ climate_and_PET/csv', files[n]), parse_dates=['Date'])
    df['q'] = df['X_00060_00003']
    df = df[['Date', 'q']].dropna()
    df['q'] = df['q'].round(1)
    df['q_peak'] = df['q'].where(df['q'] > df['q'].quantile(0.25), 0)
    
    df['slp_b'] = (df['q_peak'] - df['q_peak'].shift()) / (df['Date'] - df['Date'].shift())
    df['slp_f'] = (df['q_peak'].shift(-1) - df['q_peak']) / (df['Date'].shift(-1) - df['Date'])
    df['slp_f'] = (df['q_peak'].shift(-1) - df['q_peak']) / (df['Date'].shift(-1) - df['Date'])
    
    df['peak_flag'] = (df['slp_b'] > 0.0001) & (df['slp_f'] < 0)
    df['peak_flag'] = df['peak_flag'].fillna(0)
    
    df['event_id'] = df['peak_flag'].cumsum() + 1
    df['nf_start'] = (df['q'] == 0) & (df['q'].shift() != 0)
    
    metrics = pd.concat([recession_fun(m) for m in range(1, df['event_id'].max() + 1)]).reset_index(drop=True)
    metrics = pd.concat([metrics, dry_fun(m) for m in range(1, df['event_id'].max() + 1)]).reset_index(drop=True)
    metrics['gage'] = gage
    metrics = metrics.dropna(subset=['dry_dur'])
    
    return metrics

In [None]:
def execute(a):
    """Run ``metrics_fun`` for one file without letting a failure stop the batch.

    Parameters
    ----------
    a : int
        Index into the module-level ``files`` list.

    Returns
    -------
    pandas.DataFrame
        The result of ``metrics_fun(a)``. If that call raises, a warning
        naming the gage and the error is issued and a single placeholder row
        is returned instead: every metric column is empty and ``gage`` is
        taken from the file name.
    """
    try:
        return metrics_fun(a)
    except Exception as e:
        gage = os.path.splitext(files[a])[0]
        # Report the failure instead of hiding it; the batch still continues.
        warnings.warn(f'metrics_fun failed for gage {gage}: {e!r}', RuntimeWarning)
        # Carry every metric column so the placeholder lines up with real results.
        return pd.DataFrame({'event_id': [None],
                             'peak_date': [None],
                             'peak_value': [None],
                             'peak_quantile': [None],
                             'peak2zero': [None],
                             'drying_rate': [None],
                             'p_value': [None],
                             'calendar_year': [None],
                             'season': [None],
                             'meteorologic_year': [None],
                             'dry_date_start': [None],
                             'dry_date_mean': [None],
                             'dry_dur': [None],
                             'gage': [gage]})

if __name__ == '__main__':
    t0 = datetime.now()

    # One task per gage file, spread over all available cores.
    with Pool(cpu_count()) as p:
        # ignore_index gives the combined frame one clean running index
        # instead of repeating each gage's own 0..k index.
        output = pd.concat(p.map(execute, range(len(files))), ignore_index=True)

    output.to_csv('./data/metrics_by_event.csv', index=False)

    tf = datetime.now()
    # A bare expression inside an `if` block is never displayed by the
    # notebook, so print the elapsed time explicitly.
    print(f'Processed {len(files)} files in {tf - t0}')