In [None]:
from importlib import reload
#reload(Utilities)
#reload(clm)

import sys, os
import re

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns
from packaging import version
import itertools
from dateutil.parser import parse
from operator import itemgetter

from pmdarima import auto_arima
import statsmodels.api as sm
from statsmodels.tsa.stattools import acovf, acf, pacf, pacf_yw, pacf_ols
from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.graphics.tsaplots import month_plot, quarter_plot, seasonal_plot
from statsmodels.tsa.arima_model import ARMA, ARIMA, ARMAResults, ARIMAResults
from statsmodels.tsa.statespace.sarimax import SARIMAX

from arch import arch_model

from scipy.stats.mstats import trim

#---------------------------------------------------------------------
import pyodbc
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
import CommonLearningMethods as clm
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
# import constants for the days of the week
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
import Utilities_dt
import Plot_Box_sns
import GrubbsTest
import DickeyFullerTest as dft

In [3]:
def convert_timestamp_to_utc_in_df(df, timestamp_col, placement_col=None, inplace=False):
    """
    Convert a column of POSIX timestamps (seconds since the epoch) into
    timezone-naive UTC datetimes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing timestamp_col.
    timestamp_col : str
        Name of the column holding the numeric epoch timestamps (seconds).
    placement_col : str, optional
        Name of the column to write. Defaults to f'{timestamp_col}_from_timestamp'.
    inplace : bool
        If False (default), a copy of df is modified and returned; otherwise df
        itself is modified and returned.

    Returns
    -------
    pd.DataFrame
        The frame with placement_col added (or overwritten).
    """
    if not inplace:
        df = df.copy()
    if placement_col is None:
        placement_col = f'{timestamp_col}_from_timestamp'
    # Vectorized conversion instead of Series.apply(datetime.datetime.utcfromtimestamp):
    #   - utcfromtimestamp is deprecated as of Python 3.12
    #   - missing timestamps become NaT instead of raising
    df[placement_col] = pd.to_datetime(df[timestamp_col], unit='s')
    return df

def build_utc_time_column(df, time_col, placement_col=None, naive=True, inplace=False):
    """
    Add UTC version(s) of one or more time columns to df.

    Parameters
    ----------
    df : pd.DataFrame
    time_col : str or list of str
        A single column name, or a list of column names to convert.
    placement_col : str, list of str, or None
        Destination column name(s). When time_col is a list this must be None
        or a list of the same length. When None, each destination defaults to
        f'{time_col}_utc'.
    naive : bool
        If True, the timezone information is dropped from each entry.
        If False, it is kept (and is always UTC, i.e. +00:00).
    inplace : bool
        If False (default) a copy of df is modified and returned.

    Returns
    -------
    pd.DataFrame
    """
    work_df = df if inplace else df.copy()
    #-----------------------------------
    assert(isinstance(time_col, (str, list)))
    if isinstance(time_col, list):
        assert(placement_col is None
               or (isinstance(placement_col, list) and len(placement_col)==len(time_col)))
        destinations = [None]*len(time_col) if placement_col is None else placement_col
        # The copy (if requested) was already made above, so each single-column
        # call operates directly on work_df via inplace=True
        for source_col, destination_col in zip(time_col, destinations):
            work_df = build_utc_time_column(work_df, time_col=source_col,
                                            placement_col=destination_col,
                                            naive=naive, inplace=True)
        return work_df
    #-----------------------------------
    destination_col = f'{time_col}_utc' if placement_col is None else placement_col
    utc_times = pd.to_datetime(work_df[time_col], utc=True)
    if naive:
        # .dt.tz_localize(None) drops the timezone information, giving timezone-naive entries
        # (per the original author's note, calling .tz_localize(None) without .dt had no visible effect)
        utc_times = utc_times.dt.tz_localize(None)
    work_df[destination_col] = utc_times
    return work_df

def convert_timezoneoffset_col_to_timedelta(df, timezoneoffset_col, inplace=False):
    """
    Ensure timezoneoffset_col holds timedelta values.

    If the column already has a timedelta64 dtype it is left untouched.
    Otherwise every entry is passed through
    Utilities_dt.get_timedelta_from_timezoneoffset (defined outside this file)
    and the column is overwritten with the results.

    Parameters
    ----------
    df : pd.DataFrame
    timezoneoffset_col : str
        Name of the column to convert.
    inplace : bool
        If False (default) a copy of df is modified and returned.

    Returns
    -------
    pd.DataFrame
    """
    result_df = df if inplace else df.copy()
    if is_timedelta64_dtype(result_df[timezoneoffset_col]):
        # Nothing to convert
        return result_df
    result_df[timezoneoffset_col] = result_df[timezoneoffset_col].apply(
        Utilities_dt.get_timedelta_from_timezoneoffset
    )
    return result_df

def strip_tz_info_and_convert_to_dt(df, time_col, placement_col=None, 
                                    run_quick=True, n_strip=6, inplace=True):
    """
    Strip the trailing timezone offset from string timestamps and convert them
    to (timezone-naive) datetimes.

    Entries in time_col should be strings of format e.g. '2020-01-01T00:00:00-05:00'.

    Parameters
    ----------
    df : pd.DataFrame
    time_col : str or list of str
        A single column name, or a list of column names to convert.
    placement_col : str, list of str, or None
        Destination column name(s). When time_col is a list this must be None
        or a list of the same length. When None, each source column is
        overwritten with its converted values.
    run_quick : bool
        If True, the last n_strip characters are sliced off and the remainder
        is parsed with pd.to_datetime.
        If False, each entry is passed to Utilities_dt.clean_timeperiod_entry
        (defined outside this file) and n_strip is not used.
    n_strip : int
        Number of characters to strip off the end of each entry.
        For the example above n_strip=6, as len('-05:00')==6.
        n_strip=0 strips nothing.
    inplace : bool
        NOTE: defaults to True here, so df itself is modified unless
        inplace=False is passed.

    Returns
    -------
    pd.DataFrame
    """
    if not inplace:
        df = df.copy()    
    #-----------------------------------
    assert(isinstance(time_col, (str, list)))
    if isinstance(time_col, list):
        # Note: inplace already taken care of above, so don't need to waste memory
        #       copying df for each iteration, which is why inplace=True below
        assert((isinstance(placement_col, list) and len(placement_col)==len(time_col)) 
               or placement_col is None)
        for i,col in enumerate(time_col):
            df = strip_tz_info_and_convert_to_dt(df, time_col=col, 
                                                 placement_col=None if placement_col is None else placement_col[i], 
                                                 run_quick=run_quick, 
                                                 n_strip=n_strip, 
                                                 inplace=True)
        return df
    #-----------------------------------  
    if placement_col is None:
        placement_col = time_col
    if run_quick:
        # A slice of [:-0] is the same as [:0] and would blank out every string
        # (yielding NaT for every row), so n_strip=0 must map to "no upper bound"
        stop = -n_strip if n_strip else None
        df[placement_col] = pd.to_datetime(df[time_col].str[:stop])
    else:
        df[placement_col] = df[time_col].apply(Utilities_dt.clean_timeperiod_entry)
    return df

In [None]:
def plot_usage_around_outage(fig, ax, 
                             data, x, y, hue, 
                             out_t_beg, out_t_end, expand_time, data_label='', 
                             title_args=None, ax_args=None, 
                             xlabel_args=None, ylabel_args=None, 
                             df_mean=None, df_mean_col=None, mean_args=None, 
                             draw_outage_limits=True, draw_without_hue_also=False, 
                             seg_line_freq=None, palette='colorblind'):
    """
    Draw y vs. x from data on ax for the window
    [out_t_beg - expand_time, out_t_end + expand_time].

    Parameters
    ----------
    fig, ax
        Figure and axes to draw on; both are returned unchanged as a tuple.
    data
        Data handed to sns.lineplot after being sliced to the window
        (sliced as data[start:stop]).
    x, y, hue
        Forwarded to sns.lineplot. Per the original author's note, hue=None
        aggregates over repeated values to show the mean and 95% confidence
        interval.
    out_t_beg, out_t_end
        Outage beginning and end. Marked by red / lawngreen vertical lines
        when draw_outage_limits is True.
    expand_time
        Amount by which the window is extended on either side of the outage.
    data_label : str
        Label passed to the main sns.lineplot call.
    title_args : str, dict, or None
        A str is converted to dict(label=title_args); a dict is unpacked into
        ax.set_title.
    ax_args, xlabel_args, ylabel_args : dict or None
        Unpacked into ax.set, ax.set_xlabel and ax.set_ylabel respectively.
    df_mean, df_mean_col, mean_args
        Optional frame holding an average curve, the column to plot from it
        (defaults to y), and the kwargs for its .plot call. It is only drawn
        when hue is not None and draw_without_hue_also is False.
    draw_without_hue_also : bool
        If True (and hue is not None), the data are drawn a second time with
        hue=None, labelled 'AVG'.
    seg_line_freq : str or None
        e.g. seg_line_freq='D'. When set, dashed black vertical lines are
        drawn at that frequency across the window.
    palette
        Forwarded to the main sns.lineplot call.

    Returns
    -------
    (fig, ax)
    """
    window_beg = out_t_beg - expand_time
    window_end = out_t_end + expand_time
    if df_mean_col is None:
        df_mean_col = y
    #----------------------------
    windowed_data = data[window_beg:window_end]
    sns.lineplot(ax=ax, data=windowed_data, 
                 x=x, y=y, hue=hue, 
                 palette=palette, label=data_label)
    hue_given = hue is not None
    if draw_without_hue_also and hue_given:
        sns.lineplot(ax=ax, data=windowed_data, 
                     x=x, y=y, hue=None, 
                     color='deeppink', linestyle='--', label='AVG')
    #----------------------------
    # Note: if hue=None is drawn, then average will already be drawn!
    if df_mean is not None and hue_given and not draw_without_hue_also:
        if mean_args is None:
            avg_label = f'{data_label} AVG' if data_label else 'AVG'
            mean_args = dict(style='--', linewidth=3, alpha=0.50, color='deeppink', label=avg_label, legend=True)
        windowed_mean = df_mean[window_beg:window_end]
        windowed_mean[df_mean_col].plot(ax=ax, **mean_args)
    #----------------------------
    if draw_outage_limits:
        for limit, limit_color in ((out_t_beg, 'red'), (out_t_end, 'lawngreen')):
            ax.axvline(limit, color=limit_color)
    #----------------------------
    if seg_line_freq is not None:
        first_seg = pd.to_datetime(window_beg).round(seg_line_freq)
        last_seg = pd.to_datetime(window_end).round(seg_line_freq)
        for seg in pd.date_range(first_seg, last_seg, freq=seg_line_freq):
            ax.axvline(seg, color='black', linestyle='--')
    #----------------------------
    if isinstance(title_args, str):
        title_args = dict(label=title_args)
    if title_args is not None:
        ax.set_title(**title_args)
    #----------------------------
    if ax_args is not None:
        ax.set(**ax_args)
    if xlabel_args is not None:
        ax.set_xlabel(**xlabel_args)
    if ylabel_args is not None:
        ax.set_ylabel(**ylabel_args)

    return fig, ax

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
def get_agg_cols_for_rounds_of_aggregation(agg_cols, agg_types, mix_agg_functions, n_rounds=2, identifiers=None):
    """
    Build the column bookkeeping for successive rounds of aggregation.

    Each round's columns are described both by the two-level label produced by
    aggregating (the 'mult' tuple: (source column, aggregate function)) and by
    the flattened name given to it ('flat': f'{agg_type}{identifier} {source column}').
    Round i aggregates the flat columns of round i-1.

    Parameters
    ----------
    agg_cols : list
        Columns aggregated in round 1.
    agg_types : list
        Aggregate functions, e.g. ['mean', 'sum'].
    mix_agg_functions : bool
        If False, rounds 2 and up only keep combinations whose aggregate
        function equals the one used in the previous round.
    n_rounds : int
        Number of rounds. Round 1 is always built.
    identifiers : list or None
        Tags added to the aggregate names in each round. If not None, its
        length must be at least n_rounds.
        Typical case: first, aggregate of meters for each time; second, time resampling
            ---> identifiers = ['_mtrs', '_TRS']
                 '_mtrs' represents aggregate of meters
                 '_TRS' stands for Time ReSampled

    Returns
    -------
    dict
        Keys run from 1 to n_rounds (not 0 to n_rounds-1). Each value is
        {'columns': [{'mult':..., 'flat':...}, ...], 'rename_dict': {mult: flat, ...}}
    """
    if identifiers is not None:
        assert(len(identifiers)>=n_rounds)

    def _tag_for(round_number):
        # Identifier appended to the aggregate name for the given (1-based) round
        return '' if identifiers is None else identifiers[round_number-1]

    def _as_round(entries):
        # Bundle the column entries together with their mult -> flat rename mapping
        return {'columns': entries,
                'rename_dict': {entry['mult']: entry['flat'] for entry in entries}}

    #-----------------
    first_tag = _tag_for(1)
    first_entries = [{'mult': (source_col, func), 'flat': f'{func}{first_tag} {source_col}'}
                     for source_col in agg_cols
                     for func in agg_types]
    rounds = {1: _as_round(first_entries)}
    #-----------------
    for round_number in range(2, n_rounds+1):
        tag = _tag_for(round_number)
        entries = [{'mult': (earlier['flat'], func), 'flat': f"{func}{tag} {earlier['flat']}"}
                   for earlier in rounds[round_number-1]['columns']
                   for func in agg_types
                   if mix_agg_functions or earlier['mult'][1]==func]
        rounds[round_number] = _as_round(entries)
    return rounds


def rename_agg_and_other_cols_for_round_of_aggregation(df, agg_cols, agg_types, mix_agg_functions, t_round, 
                                                       identifiers=None, other_cols_to_keep=[]):
    """
    Flatten the (column, aggregate function) column labels of an aggregated
    frame and rename them to the flat names for aggregation round t_round.

    Parameters
    ----------
    df : pd.DataFrame
        Result of an aggregation, whose columns are flattened via
        columns.to_flat_index(). The caller's frame is not modified; a renamed
        frame is returned.
    agg_cols, agg_types, mix_agg_functions, identifiers
        Forwarded to get_agg_cols_for_rounds_of_aggregation to obtain the
        rename mapping.
    t_round : int
        Stands for 'this round' and should be 1 or greater (keys of the
        per-round bookkeeping run from 1 to n_rounds).
    other_cols_to_keep : list or None
        Columns assumed to have been aggregated with 'first'; their
        (col, 'first') labels are renamed back to col. None is treated as [].

    Returns
    -------
    pd.DataFrame
    """
    if other_cols_to_keep is None:
        other_cols_to_keep = []
    agg_cols_for_rounds = get_agg_cols_for_rounds_of_aggregation(agg_cols, agg_types, 
                                                                 mix_agg_functions=mix_agg_functions, 
                                                                 n_rounds=t_round, identifiers=identifiers)
    full_rename_dict = {**agg_cols_for_rounds[t_round]['rename_dict'], 
                        **{(f'{col}', 'first'):col for col in other_cols_to_keep}}
    # set_axis returns a new frame, so the caller's df keeps its original column
    # labels (assigning to df.columns would silently flatten them in place)
    df = df.set_axis(df.columns.to_flat_index(), axis=1)
    df = df.rename(columns=full_rename_dict)
    return df

In [None]:
#TODO Probably don't use ensure_other_cols_to_keep_are_appropriate or decide_other_cols_to_keep
#     Either improve methods or remove functionality
def build_df_aggregated_for_each_time_index(df_15T, time_col_for_agg, agg_cols, agg_types, other_cols_to_keep=[], 
                                            ensure_other_cols_to_keep_are_appropriate=False, 
                                            decide_other_cols_to_keep=False):
    """
    Group df_15T by time_col_for_agg and aggregate each group with agg_types.

    Typically, this corresponds to an aggregate (e.g. average) over all meters
    in the collection for each time index.

    Parameters
    ----------
    df_15T : pd.DataFrame
    time_col_for_agg : str
        The time column by which to group.
        Note: if the timestamp is in the index of df_15T, the easiest method is
        to give the index a name (e.g. df_15T.index.name = 'time_idx') and feed
        that name in here.
    agg_cols : list
        The columns to be aggregated, e.g. ['value'].
    agg_types : list
        The aggregate functions to use, e.g. ['mean', 'sum', 'std'].
    other_cols_to_keep : list or None
        If [] or None, only time_col_for_agg (as the index) and the aggregated
        agg_cols are contained in the output. Otherwise these columns are
        carried along using the 'first' aggregate, so each should hold a single
        unique value per group.
    ensure_other_cols_to_keep_are_appropriate : bool
        If True (and decide_other_cols_to_keep is False), asserts on
        are_other_cols_to_keep_appropriate(df_15T, time_col_for_agg).
        NOTE(review): that helper is not defined in the visible part of this
        file, and other_cols_to_keep is not passed to it -- confirm.
    decide_other_cols_to_keep : bool
        If True, other_cols_to_keep is replaced by the return value of
        decide_which_other_cols_to_keep (also not visible here).

    Returns
    -------
    pd.DataFrame
        Indexed by time_col_for_agg. The two-level columns produced by the
        aggregation are flattened and renamed as round 1 with identifiers
        ['_mtrs', '_TRS'], e.g. ('value', 'mean') --> 'mean_mtrs value'.
        Columns in other_cols_to_keep retain their original names.

    TODO (from original author): ensure_other_cols_to_keep_are_appropriate and
    decide_other_cols_to_keep should probably not be used; either improve the
    methods or remove the functionality.
    TODO (from original author): allow grouping by index, e.g. through df.groupby(level=0)?
    """
    kept_cols = [] if other_cols_to_keep is None else other_cols_to_keep
    if decide_other_cols_to_keep:
        kept_cols = decide_which_other_cols_to_keep(df_15T, time_col_for_agg, kept_cols)
    elif ensure_other_cols_to_keep_are_appropriate:
        # Only checked when the columns were not chosen by decide_which_other_cols_to_keep
        assert(are_other_cols_to_keep_appropriate(df_15T, time_col_for_agg))
    #---------------------------
    # agg_types are applied to every column in agg_cols, whereas 'first' is used
    # for every column in kept_cols (each group is assumed to hold a single
    # unique value for those columns)
    agg_spec = {col: agg_types for col in agg_cols}
    agg_spec.update({col: 'first' for col in kept_cols})
    grouped_agg = df_15T.groupby(time_col_for_agg).agg(agg_spec)
    #---------------------------
    # Flatten the two-level (original name, aggregate function) columns
    return rename_agg_and_other_cols_for_round_of_aggregation(grouped_agg, agg_cols, agg_types, 
                                                              mix_agg_functions=False, 
                                                              t_round=1, identifiers=['_mtrs', '_TRS'], 
                                                              other_cols_to_keep=kept_cols)


def build_resampled_df_aggregated_for_each_time_index(df_15T_agg, agg_cols, agg_types, 
                                                      other_cols_to_keep=[], freq='H', mix_agg_functions=True):
    """
    Time resample df_15T_agg, aggregating its round-1 aggregate columns again.

    The intent is for this to be used on a DataFrame built with
    build_df_aggregated_for_each_time_index:
      - df_15T contains multiple meters for multiple time periods
      - df_15T_agg = build_df_aggregated_for_each_time_index(df_15T, ...) then
        contains aggregate meter values for each time period
      - this function resamples df_15T_agg to, e.g., df_H_agg (hourly)

    Parameters
    ----------
    df_15T_agg : pd.DataFrame
        Must contain the round-1 flat columns (e.g. 'mean_mtrs value') and be
        resample-able (df_15T_agg.resample(freq) is called).
    agg_cols, agg_types : list
        The same values used to build df_15T_agg, e.g. ['value'] and ['mean', 'sum'].
    other_cols_to_keep : list or None
        Columns carried along with the 'first' aggregate. None is treated as [].
    freq : str or list of str
        A single frequency (e.g. 'H') --> a single pd.DataFrame is returned.
        A list of frequencies (e.g. ['H', '4H', 'D', 'MS']) --> a dict is
        returned whose keys are the frequencies and whose values are the
        corresponding pd.DataFrames.
    mix_agg_functions : bool
        If True, every aggregate function is applied to every round-1 column:
          'mean_TRS mean_mtrs value', 'sum_TRS mean_mtrs value',
          'mean_TRS sum_mtrs value',  'sum_TRS sum_mtrs value'
        If False, each round-1 column is only re-aggregated with the function
        that produced it:
          'mean_TRS mean_mtrs value' and 'sum_TRS sum_mtrs value'

    Returns
    -------
    pd.DataFrame or dict of pd.DataFrame
    """
    assert(isinstance(freq, (str, list)))
    if other_cols_to_keep is None:
        other_cols_to_keep = []
    if isinstance(freq, list):
        return {f: build_resampled_df_aggregated_for_each_time_index(df_15T_agg, agg_cols, agg_types, 
                                                                     other_cols_to_keep, freq=f, 
                                                                     mix_agg_functions=mix_agg_functions)
                for f in freq}
    #-----------------------------------------------------------------------------
    identifiers = ['_mtrs', '_TRS']
    agg_cols_for_rounds = get_agg_cols_for_rounds_of_aggregation(agg_cols, agg_types, mix_agg_functions, 
                                                                 n_rounds=2, identifiers=identifiers)
    agg_dict_2 = {}
    for col in agg_cols_for_rounds[1]['columns']:
        assert(col['flat'] not in agg_dict_2)
        # When not mixing, only re-apply the aggregate that built this column (col['mult'][1]).
        # Applying every agg_type here would produce combinations absent from the round-2
        # rename mapping, which would be left behind as tuple-labelled columns.
        agg_dict_2[col['flat']] = agg_types if mix_agg_functions else [col['mult'][1]]
    agg_dict_2.update({x:'first' for x in other_cols_to_keep})
    #----------------------
    df_resampled_agg = df_15T_agg.resample(freq).agg(agg_dict_2)
    # Flatten down the columns of df_resampled_agg
    df_resampled_agg = rename_agg_and_other_cols_for_round_of_aggregation(df_resampled_agg, agg_cols, agg_types, 
                                                                          mix_agg_functions=mix_agg_functions, 
                                                                          t_round=2, identifiers=identifiers, 
                                                                          other_cols_to_keep=other_cols_to_keep)
    return df_resampled_agg

In [None]:
def build_resampled_df(df_15T, freq, other_grouper_cols, agg_cols, agg_types, other_cols_to_keep=[], identifier=['_TRS']):
    """
    Resample df_15T to freq, grouping additionally by other_grouper_cols, and
    aggregate agg_cols with agg_types.

    Parameters
    ----------
    df_15T : pd.DataFrame
        Grouped with pd.Grouper(freq=freq), which operates on the frame's index.
    freq : str or list of str
        A single frequency (e.g. 'H') --> a single pd.DataFrame is returned.
        A list of frequencies (e.g. ['H', '4H', 'D', 'MS']) --> a dict is
        returned whose keys are the frequencies and whose values are the
        corresponding pd.DataFrames.
    other_grouper_cols : list, tuple, str, or None
        Additional grouping keys placed after the time grouper. None means no
        additional keys; a single str is treated as a one-element list.
    agg_cols : list
        Columns to aggregate.
    agg_types : list
        Aggregate functions applied to every column in agg_cols.
    other_cols_to_keep : list or None
        Columns carried along with the 'first' aggregate; each group is assumed
        to hold a single unique value for them. None is treated as [].
    identifier : list
        Passed as `identifiers` for the round-1 renaming, so its first element
        is the tag added to each aggregate name.

    Returns
    -------
    pd.DataFrame or dict of pd.DataFrame
        The two-level columns produced by the aggregation are flattened and
        renamed, e.g. with the default identifier ('value', 'mean') --> 'mean_TRS value'.
        Columns in other_cols_to_keep retain their original names.
    """
    assert(isinstance(freq, str) or isinstance(freq, list))
    # Accept the same "nothing extra" spelling (None) as build_df_aggregated_for_each_time_index
    if other_cols_to_keep is None:
        other_cols_to_keep = []
    if other_grouper_cols is None:
        other_grouper_cols = []
    elif isinstance(other_grouper_cols, str):
        # A bare string must not be split into characters by list() below
        other_grouper_cols = [other_grouper_cols]
    if isinstance(freq, list):
        return {f: build_resampled_df(df_15T, f, 
                                      other_grouper_cols, agg_cols, agg_types, other_cols_to_keep, identifier)
                for f in freq}
    #-----------------------------------
    # agg_types are applied to all columns in agg_cols, and 'first' to all
    # columns in other_cols_to_keep
    agg_dict = {**{x:agg_types for x in agg_cols}, **{x:'first' for x in other_cols_to_keep}}
    groupers = [pd.Grouper(freq=freq)] + list(other_grouper_cols)
    df_resampled = df_15T.groupby(groupers).agg(agg_dict)
    #--------------------------------------------------------------
    # Flatten the two-level (original name, aggregate function) columns
    df_resampled = rename_agg_and_other_cols_for_round_of_aggregation(df_resampled, agg_cols, agg_types, 
                                                                      mix_agg_functions=False, 
                                                                      t_round=1, identifiers=identifier, 
                                                                      other_cols_to_keep=other_cols_to_keep)
    
    return df_resampled

In [None]:
def get_resampled_dfs(df_15T, base_freq='15T', freqs=['H', '4H', 'D', 'MS'], 
                      other_grouper_cols=['serialnumber'], other_cols_to_keep=[], flatten_idxs=True, 
                      build_agg_dfs=True, time_col_for_agg='endtimeperiod_utc', agg_cols=['value'], agg_types=['mean'], 
                      other_cols_to_keep_agg=[], mix_agg_functions=True, 
                      df_key='df', df_agg_key='df_agg'):
    """
    Build a dict of resampled versions of df_15T, keyed by frequency.

    Returns
    -------
    dict
        return_dict[freq][df_key]     : df_15T itself for base_freq, otherwise
                                        the output of build_resampled_df for freq
        return_dict[freq][df_agg_key] : (only if build_agg_dfs) for base_freq the
                                        output of build_df_aggregated_for_each_time_index,
                                        otherwise that frame resampled to freq

    Notes
    -----
    - A 'date' column (taken from level 0 of the index) is added to every
      df_key frame. For base_freq that frame is df_15T itself, so the caller's
      df_15T gains the 'date' column.
    - By default, when grouping, the grouped columns become indices.
      If flatten_idxs=True, all index levels after the first are reset back to
      columns for the resampled frames (df_15T is not grouped, so it does not
      need to be flattened).
      If flatten_idxs=False, other_grouper_cols are instead appended to the
      index of the base_freq frame.
    - Every df_agg_key frame has its index named 'time_idx' and gets a
      time_col_for_agg column holding the index values.
    """
    return_dict = {base_freq: {df_key: df_15T}}
    #------------------
    # Build resampled dfs and add to return_dict
    resampled_by_freq = build_resampled_df(df_15T, freqs, other_grouper_cols, agg_cols, agg_types, other_cols_to_keep)
    for freq, resampled_df in resampled_by_freq.items():
        assert(freq not in return_dict and freq in freqs)
        return_dict[freq] = {df_key: resampled_df}
    # Add 'date' column to each df, as that was done in first version of code
    for freq, frames in return_dict.items():
        frame = frames[df_key]
        frame['date'] = frame.index.get_level_values(0)
        if flatten_idxs and freq != base_freq:
            levels_to_reset = list(range(1, frame.index.nlevels))
            frames[df_key] = frame.reset_index(level=levels_to_reset)
        elif not flatten_idxs and freq==base_freq:
            frames[df_key] = frame.set_index(other_grouper_cols, append=True)
    #------------------
    if not build_agg_dfs:
        return return_dict
    #------------------
    # First, build 15_T aggregate df, which has all entries for each time index aggregated
    df_15T_agg = build_df_aggregated_for_each_time_index(df_15T, time_col_for_agg, agg_cols, agg_types, other_cols_to_keep_agg)
    return_dict[base_freq][df_agg_key] = df_15T_agg
    # Resample df_15T_agg to other desired frequencies and add to return_dict
    resampled_agg_by_freq = build_resampled_df_aggregated_for_each_time_index(df_15T_agg, agg_cols, agg_types, 
                                                                              other_cols_to_keep_agg, freqs, mix_agg_functions)
    for freq, resampled_agg_df in resampled_agg_by_freq.items():
        assert(freq in return_dict)
        return_dict[freq][df_agg_key] = resampled_agg_df
    # Add index name and create time_col_for_agg for each df_agg, as that was done in first version of code
    for frames in return_dict.values():
        agg_frame = frames[df_agg_key]
        agg_frame.index.name = 'time_idx'
        agg_frame[time_col_for_agg] = agg_frame.index
    return return_dict

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
def get_index_level(index_name, df):
    """
    Return the position (level number) of the index level named index_name in df.

    Raises AssertionError unless exactly one index level carries that name.
    """
    matching_levels = [level for level, level_name in enumerate(df.index.names)
                       if level_name==index_name]
    # Zero matches (name not found) and multiple matches (ambiguous) are both rejected
    assert(len(matching_levels)==1)
    return matching_levels[0]

def find_datetime_idx(df):
    """
    Locate the datetime level in the index of df.

    A level counts as datetime if it has a datetime64 dtype or, failing that,
    if its entry in the first row is a datetime.datetime instance. Using the
    dtype means empty frames and one-level MultiIndexes are handled as well.

    Returns
    -------
    dict or None
        {'idx_level': level position, 'idx_name': level name}, or None if no
        index level holds datetimes.

    Raises
    ------
    AssertionError
        NOTE: for now, this fails if more than one datetime index level is found.
    """
    index = df.index
    is_multi = isinstance(index, pd.MultiIndex)
    # Dtypes per level, obtained without materializing each level's full values
    level_dtypes = [lvl.dtype for lvl in index.levels] if is_multi else [index.dtype]
    # First row as a tuple with one entry per level (None when df is empty)
    sample = None
    if len(index) > 0:
        sample = index[0] if is_multi else (index[0],)
    #-------------------------
    datetime_idx = None
    for level, level_name in enumerate(index.names):
        holds_datetimes = pd.api.types.is_datetime64_any_dtype(level_dtypes[level])
        if not holds_datetimes and sample is not None:
            # Fallback for levels whose dtype is not datetime64 but whose
            # entries are datetime objects
            holds_datetimes = isinstance(sample[level], datetime.datetime)
        if holds_datetimes:
            assert(datetime_idx is None)
            datetime_idx = {'idx_level':level, 'idx_name':level_name}
    return datetime_idx

def find_in_df_columns_or_indices(df, names):
    """
    Determine where each of names is located in df: in the columns or in the
    index (with a special case for a datetime index level).

    A name found in both the columns and the index is reported as a column.
    An index level counts as datetime if it has a datetime64 dtype or, failing
    that, if its entry in the first row is a datetime.datetime instance.

    Returns
    -------
    dict
        {'names_in_cols':names_in_cols, 'names_in_idxs':names_in_idxs}
        names_in_cols is a list of all names found in the columns of df
        names_in_idxs is a dictionary with:
            'datetime_idx' = None or {'idx_level':idx_level, 'idx_name':idx_name} if found to be datetime
            'regular_idxs' = list of non-datetime indices, where each element
                             is a dict of the form {'idx_level':idx_level, 'idx_name':idx_name}
                             (this includes a non-datetime single-level index)

    Raises
    ------
    AssertionError
        If a name is found in neither the columns nor the index, or if more
        than one of the names refers to a datetime index level.
    """
    names_in_idxs = {'datetime_idx':None, 'regular_idxs':[]}
    names_in_cols = []
    #-------------------------
    col_names = df.columns.tolist()
    index = df.index
    idx_names = list(index.names)
    is_multi = isinstance(index, pd.MultiIndex)
    # Dtypes per level, obtained without materializing each level's full values
    level_dtypes = [lvl.dtype for lvl in index.levels] if is_multi else [index.dtype]
    # First row as a tuple with one entry per level (None when df is empty, so
    # that an empty frame does not fail when only columns are requested)
    sample_idx = None
    if len(index) > 0:
        sample_idx = index[0] if is_multi else (index[0],)
    #-------------------------
    for name in names:
        if name in col_names:
            names_in_cols.append(name)
        elif name in idx_names:
            idx_level = idx_names.index(name)
            is_datetime_level = pd.api.types.is_datetime64_any_dtype(level_dtypes[idx_level])
            if not is_datetime_level and sample_idx is not None:
                is_datetime_level = isinstance(sample_idx[idx_level], datetime.datetime)
            level_info = {'idx_level':idx_level, 'idx_name':name}
            if is_datetime_level:
                assert(names_in_idxs['datetime_idx'] is None)
                names_in_idxs['datetime_idx'] = level_info
            else:
                # Applies to single-level indices too, which would otherwise be
                # found but never reported
                names_in_idxs['regular_idxs'].append(level_info)
        else:
            raise AssertionError(f'Name: {name} NOT FOUND in df columns or indices')
    names_in_cols_and_idxs_dict = {'names_in_cols':names_in_cols, 'names_in_idxs':names_in_idxs}
    return names_in_cols_and_idxs_dict

def get_list_of_idx_level_name_value_dicts(list_of_idx_level_name_dicts, names_vals_dict):
    """
    Attach an 'idx_value' to each index level/name dict.

    Parameters
    ----------
    list_of_idx_level_name_dicts : list of dict
        [{'idx_level':idx_level_a, 'idx_name': idx_name_a},
         {'idx_level':idx_level_b, 'idx_name': idx_name_b},
                            ...
         {'idx_level':idx_level_m, 'idx_name': idx_name_m}]
    names_vals_dict : dict
        {'idx_name_1':idx_val_1, 'idx_name_2':idx_val_2, ... , 'idx_name_n':idx_val_n}
        Must include at least every 'idx_name' appearing in
        list_of_idx_level_name_dicts, but can also include more without effect.

    Returns
    -------
    list of dict
        [{'idx_level':idx_level_a, 'idx_name': idx_name_a, 'idx_value':idx_val_a},
         {'idx_level':idx_level_b, 'idx_name': idx_name_b, 'idx_value':idx_val_b},
                            ...
         {'idx_level':idx_level_m, 'idx_name': idx_name_m, 'idx_value':idx_val_m}]
        Only the members included in list_of_idx_level_name_dicts are in the
        returned list. New dicts are built; the input dicts are not modified.

    Raises
    ------
    AssertionError
        If an 'idx_name' is not a key in names_vals_dict.
    """
    def _with_value(level_and_name):
        # Copy of the entry extended by the value registered under its name
        idx_name = level_and_name['idx_name']
        assert(idx_name in names_vals_dict)
        return dict(level_and_name, idx_value=names_vals_dict[idx_name])

    return [_with_value(level_and_name) for level_and_name in list_of_idx_level_name_dicts]

In [None]:
def build_idx_select_arg_list(list_of_idx_level_name_value_dicts, df_idx_names):
    """
    Build a tuple, with one entry per index level, usable for selecting on a (Multi)Index.

    list_of_idx_level_name_value_dicts can include entries for all levels in df, but does not have to.
    Any missing level is assigned slice(None), which is equivalent to no slicing on that level.
    An 'idx_value' of None is likewise converted to slice(None).

    Example
    -------
    list_of_idx_level_name_value_dicts = [{'idx_level':0, 'idx_name':'time_idx', 'idx_value':pd.to_datetime('2018-10-04 12:00:00')},
                                          {'idx_level':2, 'idx_name': 'prem_nb', 'idx_value':109826791}]
    returns (pd.to_datetime('2018-10-04 12:00:00'), slice(None), 109826791)

    Parameters
    ----------
    list_of_idx_level_name_value_dicts : list of dict
        Each dict has keys 'idx_level', 'idx_name' and 'idx_value'.
        This list is NOT modified.
    df_idx_names : sequence
        Names of the index levels, ordered by level.

    Returns
    -------
    tuple
        Selection values ordered by index level.

    Raises
    ------
    AssertionError
        If more entries than levels are supplied, or an entry has an unknown name or level.
    KeyError
        If, after filling in the missing names, some level still has no value
        (i.e., a supplied 'idx_level' is inconsistent with its 'idx_name').
    """
    df_idx_names = list(df_idx_names)
    n_idx_levels_in_df = len(df_idx_names)
    assert(len(list_of_idx_level_name_value_dicts) <= n_idx_levels_in_df)
    assert(all(x['idx_name'] in df_idx_names for x in list_of_idx_level_name_value_dicts))
    #-------------------------
    # Work on a shallow copy so the filler entries are never appended to the caller's list
    level_name_value_dicts = list(list_of_idx_level_name_value_dicts)
    supplied_names = {x['idx_name'] for x in level_name_value_dicts}
    level_name_value_dicts.extend(
        {'idx_level':level, 'idx_name':df_idx_name, 'idx_value':slice(None)}
        for level,df_idx_name in enumerate(df_idx_names)
        if df_idx_name not in supplied_names
    )

    # Make sure each entry refers to a real index name and level
    assert(all(x['idx_name'] in df_idx_names for x in level_name_value_dicts))
    assert(all(x['idx_level'] in range(n_idx_levels_in_df) for x in level_name_value_dicts))

    # Map level -> value, then emit the values sorted by index level.
    # Direct indexing (not .get) is intentional: a level without a value is an error.
    value_by_level = {x['idx_level']:x['idx_value'] for x in level_name_value_dicts}
    idx_select_arg_list = [value_by_level[level] for level in range(n_idx_levels_in_df)]
    # Convert idx_select_arg_list to usable form
    return tuple(slice(None) if x is None else x for x in idx_select_arg_list)


def replace_cols_in_gpd_df_with_values_from_df_singlegroup(gpd_vals_dict, gpd_df, df, resample_freq, cols_to_replace):
    """
    For the single group identified by gpd_vals_dict, look up the (unique) value of each
    column in cols_to_replace within df, and write those values into the matching rows of gpd_df.

    Parameters
    ----------
    gpd_vals_dict : dict
        Keys are index names of gpd_df, values are the index values identifying the group.
        In df, each key may be located either in the columns or in the index.
    gpd_df : pd.DataFrame
        Frame whose columns are overwritten.  It is modified IN PLACE (via .loc assignment)
        and also returned.
    df : pd.DataFrame
        Frame from which the replacement values are taken.  Only the local name is
        re-bound to successively narrower selections; the caller's object is not assigned to.
    resample_freq : str
        Only used when gpd_vals_dict contains a datetime value, to build the time window
        [t, t + resample_freq] selected from df.
    cols_to_replace : list of str
        Columns (not indices) of gpd_df to be replaced.  These must also be columns in df.

    Returns
    -------
    pd.DataFrame
        gpd_df, with cols_to_replace overwritten for the selected group.

    Raises
    ------
    AssertionError
        If any precondition checked below fails (cols_to_replace found in gpd_df indices,
        key of gpd_vals_dict not an index of gpd_df, datetime index mismatch between gpd_df and df,
        datetime index not at level 0 of df, or more than one unique value found for a column).
    """
    # TODO!!!!!!!!!!!!!!
    # if datetime index included, resample_freq MUST match the frequency used for gpd_df
    #
    # gpd_vals_dict should be a dictionary containing keys which are index names from gpd_df
    #   and values which are the corresponding values defining the group
    # If all indices from gpd_df are included, this is the same thing as replacing the column
    #   values row by row.  This can be very time consuming
    # In most cases (indeed in the case for which this was designed) the time index can be excluded
    #
    # cols_to_replace should be columns in gpd_df, not indices
    # Enforce this
    # NOTE(review): find_in_df_columns_or_indices is defined elsewhere in this file; its returned dict has
    #   keys 'names_in_cols' and 'names_in_idxs', the latter holding 'datetime_idx' and 'regular_idxs'
    tmp_dict = find_in_df_columns_or_indices(gpd_df, cols_to_replace)
    assert(tmp_dict['names_in_idxs']['datetime_idx'] is None)
    assert(len(tmp_dict['names_in_idxs']['regular_idxs'])==0)

    # All keys in gpd_vals_dict should be indices in gpd_df
    # Assert this
    # At the same time, find if any datetime indices supplied in gpd_vals_dict
    # (pd.Timestamp is a subclass of datetime.datetime, so it is detected by the isinstance check)
    # If more than one datetime value is supplied, only the last one encountered is kept
    gpd_df_idx_names = list(gpd_df.index.names)
    gpd_datetime_idx=None
    for key in gpd_vals_dict:
        assert(key in gpd_df_idx_names)
        if isinstance(gpd_vals_dict[key], datetime.datetime):
            gpd_datetime_idx = {'idx_level': gpd_df_idx_names.index(key), 'idx_name': key}

    # Determine where in df the gpd_vals_dict.keys() are located
    names_in_cols_and_idxs_dict = find_in_df_columns_or_indices(df, gpd_vals_dict.keys())
    gpd_cols_in_df_cols = names_in_cols_and_idxs_dict['names_in_cols']
    gpd_cols_in_df_idxs = names_in_cols_and_idxs_dict['names_in_idxs']

    # If datetime index found in gpd_df, one should also be found in df
    # More generally, gpd_datetime_idx should equal gpd_cols_in_df_idxs['datetime_idx'], 
    #   whether they equal a datetime idx, or both equal None
    # NOTE(review): this is a dict equality, so it also requires the datetime index to sit at the
    #   same level and have the same name in both frames -- presumably intended; confirm
    assert(gpd_cols_in_df_idxs['datetime_idx']==gpd_datetime_idx)
    #------------------------------------------------------------------
    # First, apply selection via indices, then columns
    # If datetime index, apply first
    if gpd_datetime_idx is not None:
        # The methodology here will only work if the time index is the first index
        # TODO THIS COULD BE CHANGED USING THE 'idx_level' info!
        assert(gpd_cols_in_df_idxs['datetime_idx']['idx_level']==0)
        t_slice_beg = gpd_vals_dict[gpd_datetime_idx['idx_name']]
        # A frequency without a leading number (e.g., 'H') is prefixed with '1' before conversion to a timedelta
        t_slice_end = t_slice_beg + pd.to_timedelta(resample_freq if resample_freq[0].isnumeric() else '1'+resample_freq)
        # NOTE(review): label-based .loc slicing includes BOTH endpoints, so a row stamped exactly at
        #   t_slice_end (i.e., the start of the next bin) would be included here -- confirm this is intended
        df = df.loc[t_slice_beg:t_slice_end]
    # Now, other indices
    regular_idxs = gpd_cols_in_df_idxs['regular_idxs']
    if len(regular_idxs)>0:
        # Make sure entries in regular_idxs are in correct order
        regular_idxs = sorted(regular_idxs, key=lambda x: x['idx_level'])
        # Get values for indices
        # NOTE(review): regular_idx_vals is never used below
        regular_idx_vals = [gpd_vals_dict[x['idx_name']] for x in regular_idxs]
        regular_list_of_idx_level_name_value_dicts = get_list_of_idx_level_name_value_dicts(regular_idxs, gpd_vals_dict)
        # Levels of df not present in regular_idxs are filled with slice(None) by build_idx_select_arg_list
        regular_idx_select_arg_list = build_idx_select_arg_list(regular_list_of_idx_level_name_value_dicts, list(df.index.names))
                
        df = df.loc(axis=0)[pd.IndexSlice[regular_idx_select_arg_list]]
    # ------------------------------------------------------------
    # Now, apply selection via columns
    # Start from an all-True mask and AND in one equality condition per grouping column found in df's columns
    col_bool_mask = [True]*df.shape[0]
    for col in gpd_cols_in_df_cols:
        col_bool_mask = (col_bool_mask) & (df[col]==gpd_vals_dict[col])
    df = df.loc[col_bool_mask]
    # ------------------------------------------------------------
    # Now, df is reduced down to only the elements of interest
    # Need to grab the values for all columns in cols_to_replace
    # There should only be one unique value for each, let's enforce this with assert
    replace_dict = {}
    for col in cols_to_replace:
        # value_counts excludes NaNs by default, hence the n_unique==0 case below
        value_counts = df[col].value_counts()
        n_unique = len(value_counts)
        # n_unique should be 1, with the only exception being
        # when a column is full of NaNs, in which case n_unique will be 0
        assert(n_unique==1 or n_unique==0)
        if n_unique==0:
            # All values should be NaNs
            # (this also holds trivially when the selection above left df empty)
            assert(all(df[col].isna()))
            value = np.nan
        else:
            value = value_counts.index[0]
        assert(col not in replace_dict)
        replace_dict[col] = value
    # ------------------------------------------------------------
    # Finally, replace the value(s) in gpd_df with replace_dict
    # NOTE(review): get_index_level is defined elsewhere in this file; presumably it returns the
    #   integer level of the named index in gpd_df -- confirm
    list_of_idx_level_name_dicts_to_include = [{'idx_level': get_index_level(x, gpd_df), 'idx_name': x} for x in gpd_vals_dict]
    list_of_idx_level_name_value_dicts = get_list_of_idx_level_name_value_dicts(list_of_idx_level_name_dicts_to_include, gpd_vals_dict)
    idx_select_arg_list = build_idx_select_arg_list(list_of_idx_level_name_value_dicts, list(gpd_df.index.names))
    # itemgetter returns a single value when cols_to_replace has one element and a tuple of values otherwise;
    # it raises TypeError if cols_to_replace is empty
    gpd_df.loc[pd.IndexSlice[idx_select_arg_list], cols_to_replace] = itemgetter(*tuple(cols_to_replace))(replace_dict)
    
    # Could also return df if wanted for debugging/answer checking
    return gpd_df


# For each entry in gpd_df, I need to grab the values of the variables used for grouping.
# Then, I need to get that group from df_kwh_15T to find the values of all other columns to be set.
#
# Typically (but not always): 
#     the time_idx grouped will be the index of df_kwh_15T
#     any other grouping variables are columns in df_kwh_15T
# However, it may occur that df_kwh_15T has a MultiIndex in which the grouping
# variables are contained.  Any of these cases should work.

def replace_cols_in_gpd_df_with_values_from_df(gpd_df, df, resample_freq, cols_to_replace, exclude_time_idx=True):
    """
    For every group in gpd_df (defined by its index levels), replace the columns in cols_to_replace
    with the corresponding values found in df.

    The work for each group is delegated to replace_cols_in_gpd_df_with_values_from_df_singlegroup,
    which assigns into gpd_df via .loc; gpd_df is therefore modified in place as well as returned.

    Parameters
    ----------
    gpd_df : pd.DataFrame
        Grouped/aggregated frame whose columns are to be replaced.
    df : pd.DataFrame
        Frame from which the replacement values are taken.
    resample_freq : str
        Passed through to the single-group function.
    cols_to_replace : list of str
        Columns of gpd_df to replace.
    exclude_time_idx : bool, default True
        If True, the datetime index level of gpd_df (when one is found) is left out of the grouping.
        Not recommended to set exclude_time_idx=False unless dfs are small,
        as this will perform the replacement row-by-row.
        exclude_time_idx=True is far more efficient.

    Returns
    -------
    pd.DataFrame
        gpd_df with the requested columns replaced.  An empty gpd_df is returned unchanged.

    Raises
    ------
    AssertionError
        If no index level is left to group on (e.g., gpd_df only has a datetime index and
        exclude_time_idx=True).
    """
    #TODO build in option to allow user to input the names of indices to be included
    #    Right now, all indices in gpd_df are included, except possibly the time index
    gpd_datetime_idx = find_datetime_idx(gpd_df)
    level_to_skip = None
    if exclude_time_idx and gpd_datetime_idx is not None:
        level_to_skip = gpd_datetime_idx['idx_level']
    # index.nlevels is valid for a plain Index, a MultiIndex and an empty frame alike
    # (len(gpd_df.index[0]) is only meaningful for a non-empty MultiIndex)
    level = [i for i in range(gpd_df.index.nlevels) if i != level_to_skip]
    assert len(level)>0, 'No index levels left in gpd_df to group on'
    idx_names_to_include = [gpd_df.index.names[lvl] for lvl in level]
    #-------------------------
    gpd_vals_dicts = []
    for grp in gpd_df.groupby(level=level).groups.keys():
        # When grouping on a single level, the group keys may be scalars rather than 1-tuples
        # (this depends on the pandas version), so normalize before zipping with the names
        if not isinstance(grp, tuple):
            grp = (grp,)
        assert(len(grp)==len(idx_names_to_include))
        gpd_vals_dicts.append(dict(zip(idx_names_to_include, grp)))
    #-------------------------
    for gpd_vals_dict in gpd_vals_dicts:
        gpd_df = replace_cols_in_gpd_df_with_values_from_df_singlegroup(gpd_vals_dict=gpd_vals_dict, 
                                                                        gpd_df=gpd_df, 
                                                                        df=df, 
                                                                        resample_freq=resample_freq, 
                                                                        cols_to_replace=cols_to_replace)
    return gpd_df

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
def determine_us_timezone(shifts, assert_found=False):
    """
    Determine the US timezone(s) consistent with the given UTC offset(s), in whole hours.

    Parameters
    ----------
    shifts : int, numpy integer, or list/tuple of one or two integers
        A single (negative) UTC shift, or the pair of shifts (standard and daylight time,
        in any order) observed for the timezone.
        If only a single shift is given, this function can at best return two possible timezones.
    assert_found : bool, default False
        If True, an AssertionError is raised when no timezone matches.

    Returns
    -------
    str, list of str, or None
        - the timezone name (e.g., 'US/Eastern') if exactly one timezone matches
        - a list of names if more than one matches (possible for a single shift)
        - None if nothing matches and assert_found is False

    Raises
    ------
    AssertionError
        If shifts is a sequence whose length is not 1 or 2, or if no timezone is
        found and assert_found is True.
    """
    # [standard time shift, daylight saving time shift] for each timezone
    utc_shifts = {
        'US/Eastern':  [-5, -4], 
        'US/Central':  [-6, -5], 
        'US/Mountain': [-7, -6], 
        'US/Pacific':  [-8, -7], 
        'US/Alaska':   [-9, -8], 
        'US/Hawaii':   [-10, -10]
    }
    #-------------------------------
    # numpy integers (e.g., values taken from a pandas/numpy object) are not instances of int,
    # so they must be checked for explicitly before falling through to len()
    is_single_value = isinstance(shifts, (int, np.integer))
    if is_single_value or len(shifts)==1:
        shift = shifts if is_single_value else shifts[0]
        # A single shift matches any timezone whose [standard, daylight] range contains it
        found_tz = [tz for tz,tz_shifts in utc_shifts.items() if tz_shifts[0] <= shift <= tz_shifts[1]]
    else:
        assert(len(shifts)==2)
        shifts = sorted(shifts)
        found_tz = [tz for tz,tz_shifts in utc_shifts.items() if shifts==tz_shifts]
    #-------------------------------
    if len(found_tz)==0:
        assert not assert_found, f'No US timezone found for UTC shifts = {shifts}'
        return None
    if len(found_tz)==1:
        return found_tz[0]
    return found_tz

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
def convert_local_to_utc_time(t_local, timezone):
    """
    Convert timezone-naive local time(s) to timezone-naive UTC time(s).

    Parameters
    ----------
    t_local : datetime-like, or list/tuple of datetime-likes
        Local time(s), in any form accepted by pd.to_datetime.
        Lists/tuples are handled element by element (nested ones recursively).
    timezone : str
        Timezone in which t_local is expressed, e.g., 'US/Eastern'

    Returns
    -------
    The UTC equivalent with the timezone information dropped.
    A list is returned whenever t_local is a list or a tuple.
    """
    if not isinstance(t_local, (list, tuple)):
        # Attach the local timezone, then tz_convert(None) shifts to UTC and strips the tz info
        t_localized = pd.to_datetime(t_local).tz_localize(timezone)
        return t_localized.tz_convert(None)
    #-----------------------------------
    return [convert_local_to_utc_time(t, timezone) for t in t_local]

def determine_timezone_and_convert_local_to_utc_time(t_local, unique_tz_offsets, 
                                                     timezone_aware_times=None, **kwargs):
    """
    Determine the US timezone from a set of UTC offsets, then convert t_local to UTC.

    As convert_local_to_utc_time works whether t_local is a single time or a list
    of times, this function does as well.

    Parameters
    ----------
    t_local : datetime-like, or list/tuple of datetime-likes
        Timezone-naive local time(s) to convert.
    unique_tz_offsets : list (or single offset) or None
        Preferred input.  Timezone offsets, which can be strings (e.g., '-05:00'),
        datetime.timedeltas, or pd./np. timedeltas.  Duplicates are allowed.
        Offsets should be complete hours (there do exist half-hour and 45-minute
        time zones in the world, but there shouldn't be any in AEP data).
        Can be set to None, in which case timezone_aware_times is used instead.
    timezone_aware_times : list of str, optional
        Timezone-aware datetime strings from which the offsets are extracted.
        Only used (and then required) when unique_tz_offsets is None.
    **kwargs
        pattern : regex used to extract the offset from each timezone-aware string.

    Returns
    -------
    t_local converted to (timezone-naive) UTC, see convert_local_to_utc_time.

    Raises
    ------
    AssertionError
        If both unique_tz_offsets and timezone_aware_times are None, an offset is of an
        unsupported type or not a whole number of hours, no timezone matches the offsets,
        or the offsets do not identify a single timezone.
    """
    #--------------------
    if unique_tz_offsets is None:
        assert(timezone_aware_times is not None)
        dflt_pattern = r'(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}:\d{2})([+-]\d{2}:\d{2})'
        pattern = kwargs.get('pattern', dflt_pattern)
        unique_tz_offsets = [Utilities_dt.extract_tz_from_tz_aware_dt_str(x, pattern=pattern) for x in timezone_aware_times]
        unique_tz_offsets = list(set(unique_tz_offsets))        
    elif isinstance(unique_tz_offsets, (str, datetime.timedelta, np.timedelta64)):
        # Allow a single offset to be supplied without wrapping it in a list
        unique_tz_offsets = [unique_tz_offsets]
    #--------------------
    # Convert every offset to pd.Timedelta; using a set guarantees uniqueness
    unq_tz_offsets = set()
    for x in unique_tz_offsets:
        if isinstance(x, (datetime.timedelta, np.timedelta64)):
            # pd.Timedelta is a datetime.timedelta subclass, so it is handled here too.
            # np.timedelta64 must be converted as well, since it has no total_seconds() method
            unq_tz_offsets.add(pd.to_timedelta(x))
        else:
            assert(isinstance(x, str))
            unq_tz_offsets.add(pd.to_timedelta(Utilities_dt.get_timedelta_from_timezoneoffset(x)))
    #--------------------
    assert(all(x.total_seconds()%3600==0 for x in unq_tz_offsets))
    unq_tz_offsets = [round(x.total_seconds()/3600) for x in unq_tz_offsets]
    found_tz = determine_us_timezone(unq_tz_offsets, assert_found=True)
    # A single offset can be compatible with two timezones, in which case a list is returned.
    # Such a list cannot be used to localize the times, so fail here with a clear message
    assert isinstance(found_tz, str), f'Timezone could not be uniquely determined from offsets (in hours) = {unq_tz_offsets}; candidates = {found_tz}'
    #--------------------
    return convert_local_to_utc_time(t_local, found_tz)

In [None]:
# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
# read_df = pd.read_csv(os.path.join(Utilities.get_local_data_dir(), r'sample_outages\outg_rec_nb_11751094\outg_rec_nb_11751094_2019_q4.csv')
# read_df = read_df[read_df['serialnumber']==880687439]
# read_df = read_df[(read_df['aep_derived_uom']=='KWH') & (read_df['aep_srvc_qlty_idntfr']=='TOTAL')]
# read_df = read_df[['endtimeperiod', 'endtimeperiod','aep_endtime_utc', 'timezoneoffset']]
# read_df = read_df.sort_values(by='aep_endtime_utc', ignore_index=True)
# read_df