In [None]:
from importlib import reload
#reload(Utilities)

import sys, os
import re

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns
from packaging import version
import itertools
from dateutil.parser import parse
from operator import itemgetter

from pmdarima import auto_arima
import statsmodels.api as sm
from statsmodels.tsa.stattools import acovf, acf, pacf, pacf_yw, pacf_ols
from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.graphics.tsaplots import month_plot, quarter_plot, seasonal_plot
from statsmodels.tsa.arima_model import ARMA, ARIMA, ARMAResults, ARIMAResults
from statsmodels.tsa.statespace.sarimax import SARIMAX

from arch import arch_model

from scipy.stats.mstats import trim

#---------------------------------------------------------------------
import pyodbc
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
# import constants for the days of the week
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
import Plot_Box_sns
import GrubbsTest
import DickeyFullerTest as dft

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
def get_index_level(index_name, df):
    """Return the integer position of the index level of ``df`` named ``index_name``.

    Parameters
    ----------
    index_name : hashable
        Name of the index level to look for.
    df : pandas.DataFrame
        Frame whose ``index.names`` are searched.

    Returns
    -------
    int
        Zero-based position of the matching level.

    Raises
    ------
    AssertionError
        If no level, or more than one level, carries ``index_name``.
    """
    # Collect every position whose level name matches the request
    matching_levels = [pos for pos, lvl_name in enumerate(df.index.names)
                       if lvl_name==index_name]
    # The name must not be ambiguous ...
    assert(len(matching_levels) < 2)
    found_level = matching_levels[-1] if matching_levels else None
    # ... and it must exist
    assert(found_level is not None)
    return found_level

def find_datetime_idx(df):
    """Find the (single) datetime level in the index of ``df``.

    A level counts as datetime when the value of the first row at that level is
    a ``datetime.datetime`` instance.  When ``df`` has no rows there is nothing
    to sample, so a level counts as datetime when its level values form a
    ``pandas.DatetimeIndex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index (single-level or MultiIndex) is inspected.

    Returns
    -------
    dict or None
        ``{'idx_level': level, 'idx_name': name}`` for the datetime level, or
        ``None`` if the index holds no datetime level.

    Raises
    ------
    AssertionError
        If more than one datetime level is found.
        NOTE: For now, multiple datetime levels are deliberately unsupported.
    """
    idx = df.index
    idx_names = list(idx.names)
    # A MultiIndex row is a tuple (even with a single level); a plain Index row is a scalar
    is_multi_idx = isinstance(idx, pd.MultiIndex)
    if len(idx) > 0:
        first_row = idx[0]
        sample_vals = list(first_row) if is_multi_idx else [first_row]
        assert(len(idx_names)==len(sample_vals))
        level_is_datetime = [isinstance(val, datetime.datetime) for val in sample_vals]
    else:
        # No row to sample from: decide from the type of each level instead
        level_is_datetime = [isinstance(idx.get_level_values(i), pd.DatetimeIndex)
                             for i in range(idx.nlevels)]
    #-------------------------
    datetime_idx = None
    for i, is_datetime in enumerate(level_is_datetime):
        if is_datetime:
            # Enforce that at most one datetime level exists
            assert(datetime_idx is None)
            datetime_idx = {'idx_level':i, 'idx_name':idx_names[i]}
    return datetime_idx

def find_in_df_columns_or_indices(df, names):
    """Determine where each entry of ``names`` lives in ``df``: columns or index levels.

    Names found in the index are further split into the datetime level (special
    case) and regular, non-datetime levels.  A level counts as datetime when the
    value of the first row at that level is a ``datetime.datetime`` instance; if
    ``df`` has no rows, the level's type (``pandas.DatetimeIndex``) is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search.
    names : iterable
        Column / index-level names to locate.  Columns take precedence when a
        name exists both as a column and as an index level.

    Returns
    -------
    dict
        ``{'names_in_cols': names_in_cols, 'names_in_idxs': names_in_idxs}`` where
            names_in_cols is a list of all names found in the columns of df
            names_in_idxs is a dictionary with:
                'datetime_idx' = None or {'idx_level':idx_level, 'idx_name':idx_name}
                                 if found to be datetime
                'regular_idxs' = list of non-datetime indices, where each element
                                 is a dict of the form {'idx_level':idx_level, 'idx_name':idx_name}

    Raises
    ------
    AssertionError
        If a name is found in neither the columns nor the index, or if more than
        one requested name resolves to a datetime index level.
        NOTE: For now, multiple datetime levels are deliberately unsupported.
    """
    #-------------------------
    names_in_idxs = {'datetime_idx':None, 'regular_idxs':[]}
    names_in_cols = []
    #-------------------------
    col_names = df.columns.tolist()
    idx_names = list(df.index.names)
    # A MultiIndex row is a tuple (even with a single level); a plain Index row is a scalar
    is_multi_idx = isinstance(df.index, pd.MultiIndex)
    has_rows = len(df.index) > 0
    sample_idx = df.index[0] if has_rows else None # Used for determining if datetime element exists
    #-------------------------
    for name in names:
        if name in col_names:
            names_in_cols.append(name)
        elif name in idx_names:
            idx_level = idx_names.index(name)
            if has_rows:
                sample_val = sample_idx[idx_level] if is_multi_idx else sample_idx
                is_datetime = isinstance(sample_val, datetime.datetime)
            else:
                # No row to sample from: decide from the type of the level instead
                is_datetime = isinstance(df.index.get_level_values(idx_level), pd.DatetimeIndex)
            if is_datetime:
                assert(names_in_idxs['datetime_idx'] is None)
                names_in_idxs['datetime_idx'] = {'idx_level':idx_level, 'idx_name':name}
            else:
                # Also reached for a single-level, non-datetime index, so that the
                # name is reported instead of being silently dropped
                regular_idx = {'idx_level':idx_level, 'idx_name':name}
                names_in_idxs['regular_idxs'].append(regular_idx)
        else:
            msg = f'Name: {name} NOT FOUND in df columns or indices'
            print(msg)
            # Raised explicitly so the check survives `python -O`
            raise AssertionError(msg)
    names_in_cols_and_idxs_dict = {'names_in_cols':names_in_cols, 'names_in_idxs':names_in_idxs}
    return names_in_cols_and_idxs_dict

def get_list_of_idx_level_name_value_dicts(list_of_idx_level_name_dicts, names_vals_dict):
    """Attach an ``'idx_value'`` to each index level/name dict.

    Parameters
    ----------
    list_of_idx_level_name_dicts : list of dict
        [{'idx_level':idx_level_a, 'idx_name': idx_name_a},
         {'idx_level':idx_level_b, 'idx_name': idx_name_b},
                            ...
         {'idx_level':idx_level_m, 'idx_name': idx_name_m}]
    names_vals_dict : dict
        {'idx_name_1':idx_val_1, 'idx_name_2':idx_val_2, ... , 'idx_name_n':idx_val_n}
        Must contain at least every 'idx_name' appearing in
        list_of_idx_level_name_dicts; additional keys have no effect.

    Returns
    -------
    list of dict
        One new dict per input entry, in the same order, of the form
        {'idx_level':idx_level_a, 'idx_name': idx_name_a, 'idx_value':idx_val_a}.
        Only the members of list_of_idx_level_name_dicts appear in the result.

    Raises
    ------
    AssertionError
        If an 'idx_name' is not a key of names_vals_dict.
    """
    def _with_value(level_name_entry):
        # Copy the entry and add the value looked up by its name
        lookup_name = level_name_entry['idx_name']
        assert(lookup_name in names_vals_dict)
        enriched_entry = dict(level_name_entry)
        enriched_entry['idx_value'] = names_vals_dict[lookup_name]
        return enriched_entry

    return [_with_value(entry) for entry in list_of_idx_level_name_dicts]

In [None]:
def build_idx_select_arg_list(list_of_idx_level_name_value_dicts, df_idx_names):
    """Build a per-level selection tuple from index level/name/value dicts.

    list_of_idx_level_name_value_dicts can include entries for all levels in df,
    but does not have to.  Any missing level is assigned slice(None), which is
    equivalent to no slicing on that level.  A supplied value of None is also
    converted to slice(None).

    Example
    -------
    list_of_idx_level_name_value_dicts = [{'idx_level':0, 'idx_name':'time_idx', 'idx_value':pd.to_datetime('2018-10-04 12:00:00')},
                                          {'idx_level':2, 'idx_name': 'prem_nb', 'idx_value':109826791}]
    with three index levels returns
    (pd.to_datetime('2018-10-04 12:00:00'), slice(None), 109826791)

    Parameters
    ----------
    list_of_idx_level_name_value_dicts : list of dict
        Each dict has keys 'idx_level', 'idx_name' and 'idx_value'.
        The list is NOT modified by this function.
    df_idx_names : list
        Names of all index levels of the target DataFrame, in level order.

    Returns
    -------
    tuple
        One selector per index level, ordered by level.

    Raises
    ------
    AssertionError
        If more entries than index levels are supplied, if an 'idx_name' is not
        in df_idx_names, or if an 'idx_level' is out of range.
    """
    # ----------------------------------------------------------------
    n_idx_levels_in_df = len(df_idx_names)
    assert(len(list_of_idx_level_name_value_dicts) <= n_idx_levels_in_df)
    assert(all(x['idx_name'] in df_idx_names for x in list_of_idx_level_name_value_dicts))
    #-------------------------
    # Work on a shallow copy so the caller's list is never extended with fillers
    level_dicts = list(list_of_idx_level_name_value_dicts)
    if len(level_dicts) < n_idx_levels_in_df:
        # Names already covered; kept up to date as fillers are added
        covered_names = [x['idx_name'] for x in level_dicts]
        for level,df_idx_name in enumerate(df_idx_names):
            if df_idx_name not in covered_names:
                level_dicts.append({'idx_level':level, 'idx_name': df_idx_name, 'idx_value':slice(None)})
                covered_names.append(df_idx_name)

    # Sanity checks on the (possibly extended) entries
    assert(all(x['idx_name'] in df_idx_names for x in level_dicts))
    assert(all(x['idx_level'] in list(range(n_idx_levels_in_df)) for x in level_dicts))

    # Get the index values on which to select, sorted by the index level
    level_to_value = {x['idx_level']:x['idx_value'] for x in level_dicts}
    idx_select_arg_list = [level_to_value[level] for level in range(n_idx_levels_in_df)]
    # Convert idx_select_arg_list to usable form (None means "take everything")
    idx_select_arg_list = tuple(x if x is not None else slice(None) for x in idx_select_arg_list)
    return idx_select_arg_list


def replace_cols_in_gpd_df_with_values_from_df_singlegroup(gpd_vals_dict, gpd_df, df, resample_freq, cols_to_replace):
    """Overwrite ``cols_to_replace`` for one group of ``gpd_df`` with the value found in ``df``.

    The rows of ``df`` belonging to the group described by ``gpd_vals_dict`` are
    selected (datetime index first, then other index levels, then columns).  Each
    column in ``cols_to_replace`` must hold a single unique value (or only NaNs)
    within that selection; this value is written into the rows of ``gpd_df``
    selected by ``gpd_vals_dict``.

    Parameters
    ----------
    gpd_vals_dict : dict
        Keys are index-level names of ``gpd_df``; values are the corresponding
        values identifying the group.  Every key must also be a column or an
        index level of ``df``.
        If all index levels of ``gpd_df`` are included, this is the same thing as
        replacing the column values row by row, which can be very time consuming.
        In most cases (indeed in the case for which this was designed) the time
        index can be excluded.
    gpd_df : pandas.DataFrame
        Grouped frame to update.  It is modified in place and also returned.
    df : pandas.DataFrame
        Source frame from which the replacement values are read.
    resample_freq : str
        Only used when ``gpd_vals_dict`` contains a datetime value: the width of
        the time window selected from ``df`` (a leading '1' is prepended when the
        string does not start with a digit).
        TODO!!!!!!!!!!!!!! if a datetime index is included, resample_freq MUST
        match the frequency used for gpd_df.
    cols_to_replace : list
        Columns (not index levels) of ``gpd_df`` to overwrite; the same names are
        read as columns from ``df``.

    Returns
    -------
    pandas.DataFrame
        ``gpd_df`` itself, after the replacement.

    Raises
    ------
    AssertionError
        If cols_to_replace contains an index level of gpd_df, if a key of
        gpd_vals_dict is not an index level of gpd_df, if the datetime index
        information of gpd_df and df disagrees, if the datetime index of df is not
        at level 0, or if a column to replace holds more than one unique value
        within the selected group.
    """
    # cols_to_replace should be columns in gpd_df, not indices
    # Enforce this
    tmp_dict = find_in_df_columns_or_indices(gpd_df, cols_to_replace)
    assert(tmp_dict['names_in_idxs']['datetime_idx'] is None)
    assert(len(tmp_dict['names_in_idxs']['regular_idxs'])==0)

    # All keys in gpd_vals_dict should be indices in gpd_df
    # Assert this
    # At the same time, find if any datetime indices supplied in gpd_vals_dict
    gpd_df_idx_names = list(gpd_df.index.names)
    gpd_datetime_idx=None
    for key in gpd_vals_dict:
        assert(key in gpd_df_idx_names)
        if isinstance(gpd_vals_dict[key], datetime.datetime):
            gpd_datetime_idx = {'idx_level': gpd_df_idx_names.index(key), 'idx_name': key}

    # Determine where in df the gpd_vals_dict.keys() are located
    names_in_cols_and_idxs_dict = find_in_df_columns_or_indices(df, gpd_vals_dict.keys())
    gpd_cols_in_df_cols = names_in_cols_and_idxs_dict['names_in_cols']
    gpd_cols_in_df_idxs = names_in_cols_and_idxs_dict['names_in_idxs']

    # If datetime index found in gpd_df, one should also be found in df
    # More generally, gpd_datetime_idx should equal gpd_cols_in_df_idxs['datetime_idx'],
    #   whether they equal a datetime idx, or both equal None
    assert(gpd_cols_in_df_idxs['datetime_idx']==gpd_datetime_idx)
    #------------------------------------------------------------------
    # First, apply selection via indices, then columns
    # If datetime index, apply first
    if gpd_datetime_idx is not None:
        # The methodology here also only will work if the time index is the first index
        # TODO THIS COULD BE CHANGED USING THE 'idx_level' info!
        assert(gpd_cols_in_df_idxs['datetime_idx']['idx_level']==0)
        t_slice_beg = gpd_vals_dict[gpd_datetime_idx['idx_name']]
        t_slice_end = t_slice_beg + pd.to_timedelta(resample_freq if resample_freq[0].isnumeric() else '1'+resample_freq)
        # NOTE(review): label-based .loc slicing includes both endpoints, so rows
        # stamped exactly at t_slice_end are selected too -- confirm this is intended
        df = df.loc[t_slice_beg:t_slice_end]
    # Now, other indices
    regular_idxs = gpd_cols_in_df_idxs['regular_idxs']
    if len(regular_idxs)>0:
        # Make sure entries in regular_idxs are in correct order
        regular_idxs = sorted(regular_idxs, key=lambda x: x['idx_level'])
        # Pair each regular index level with its group value, then build the
        # per-level selector (levels not supplied become slice(None))
        regular_list_of_idx_level_name_value_dicts = get_list_of_idx_level_name_value_dicts(regular_idxs, gpd_vals_dict)
        regular_idx_select_arg_list = build_idx_select_arg_list(regular_list_of_idx_level_name_value_dicts, list(df.index.names))

        df = df.loc(axis=0)[pd.IndexSlice[regular_idx_select_arg_list]]
    # ------------------------------------------------------------
    # Now, apply selection via columns
    # Start from an all-True boolean array (not a plain list: logical operations
    # between pandas objects and dtype-less sequences are deprecated in pandas)
    col_bool_mask = np.ones(df.shape[0], dtype=bool)
    for col in gpd_cols_in_df_cols:
        col_bool_mask = (col_bool_mask) & (df[col]==gpd_vals_dict[col])
    df = df.loc[col_bool_mask]
    # ------------------------------------------------------------
    # Now, df is reduced down to only the elements of interest
    # Need to grab the values for all columns in cols_to_replace
    # There should only be one unique value for each, let's enforce this with assert
    replace_dict = {}
    for col in cols_to_replace:
        value_counts = df[col].value_counts()
        n_unique = len(value_counts)
        # n_unique should be 1, with the only exception being
        # when a column is full of NaNs, in which case n_unique will be 0
        assert(n_unique==1 or n_unique==0)
        if n_unique==0:
            # All values should be NaNs
            assert(all(df[col].isna()))
            value = np.nan
        else:
            value = value_counts.index[0]
        assert(col not in replace_dict)
        replace_dict[col] = value
    # ------------------------------------------------------------
    # Finally, replace the value(s) in gpd_df with replace_dict
    list_of_idx_level_name_dicts_to_include = [{'idx_level': get_index_level(x, gpd_df), 'idx_name': x} for x in gpd_vals_dict]
    list_of_idx_level_name_value_dicts = get_list_of_idx_level_name_value_dicts(list_of_idx_level_name_dicts_to_include, gpd_vals_dict)
    idx_select_arg_list = build_idx_select_arg_list(list_of_idx_level_name_value_dicts, list(gpd_df.index.names))
    # itemgetter yields the replacement values in the same order as cols_to_replace
    gpd_df.loc[pd.IndexSlice[idx_select_arg_list], cols_to_replace] = itemgetter(*tuple(cols_to_replace))(replace_dict)

    # Could also return df if wanted for debugging/answer checking
    return gpd_df


# For each entry in gpd_df, I need to grab the values of the variables used for grouping.
# Then, I need to get that group from df_kwh_15T to find the values of all other columns to be set.
#
# Typically (but not always): 
#     the time_idx grouped will be the index of df_kwh_15T
#     any other grouping variables are columns in df_kwh_15T
# However, df_kwh_15T may instead have a multi-index that contains the grouping
# variables.  Any of these cases should work.

def replace_cols_in_gpd_df_with_values_from_df(gpd_df, df, resample_freq, cols_to_replace, exclude_time_idx=True):
    """Replace ``cols_to_replace`` in ``gpd_df`` group by group with values taken from ``df``.

    The groups are formed from the index levels of ``gpd_df`` (all levels, except
    possibly the datetime level) and each group is handed to
    replace_cols_in_gpd_df_with_values_from_df_singlegroup.

    Parameters
    ----------
    gpd_df : pandas.DataFrame
        Grouped frame to update.  An empty frame is returned unchanged.
    df : pandas.DataFrame
        Source frame from which the replacement values are read.
    resample_freq : str
        Forwarded unchanged to the single-group routine.
    cols_to_replace : list
        Columns of ``gpd_df`` to overwrite.
    exclude_time_idx : bool, default True
        When True, the datetime index level of ``gpd_df`` (if any) is left out of
        the grouping.  Not recommended to set exclude_time_idx=False unless dfs
        are small, as this will perform the replacement row-by-row;
        exclude_time_idx=True is far more efficient.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last single-group call (``gpd_df`` itself when
        there is nothing to do).

    TODO build in option to allow user to input the names of indices to be included
         Right now, all indices in gpd_df are included, except possibly the time index
    """
    # Nothing to group or replace in an empty frame
    if len(gpd_df.index)==0:
        return gpd_df
    gpd_datetime_idx = find_datetime_idx(gpd_df)
    # nlevels is valid for both a plain Index (1) and a MultiIndex, whereas
    # len(gpd_df.index[0]) is only meaningful when index entries are tuples
    n_idxs = gpd_df.index.nlevels
    level = [i for i in range(n_idxs)
             if not (exclude_time_idx and
                     gpd_datetime_idx is not None and
                     gpd_datetime_idx['idx_level']==i)]
    #-------------------------
    all_groups = list(gpd_df.groupby(level=level).groups.keys())
    # Grouping on a single level may give scalar keys; normalise every key to a
    # tuple so it can be zipped with the level names
    all_groups = [grp if isinstance(grp, tuple) else (grp,) for grp in all_groups]
    gpd_df_idx_names = list(gpd_df.index.names)
    idx_names_to_include = [gpd_df_idx_names[lvl] for lvl in level]
    assert(all(len(grp)==len(idx_names_to_include) for grp in all_groups))
    gpd_vals_dicts = [dict(zip(idx_names_to_include, grp)) for grp in all_groups]
    #-------------------------
    for gpd_vals_dict in gpd_vals_dicts:
        gpd_df = replace_cols_in_gpd_df_with_values_from_df_singlegroup(gpd_vals_dict=gpd_vals_dict,
                                                                        gpd_df=gpd_df,
                                                                        df=df,
                                                                        resample_freq=resample_freq,
                                                                        cols_to_replace=cols_to_replace)
    return gpd_df

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------

In [None]:
# read_df = pd.read_csv(os.path.join(Utilities.get_local_data_dir(), r'sample_outages\outg_rec_nb_11751094\outg_rec_nb_11751094_2019_q4.csv'))
# read_df = read_df[read_df['serialnumber']==880687439]
# read_df = read_df[(read_df['aep_derived_uom']=='KWH') & (read_df['aep_srvc_qlty_idntfr']=='TOTAL')]
# read_df = read_df[['endtimeperiod', 'endtimeperiod','aep_endtime_utc', 'timezoneoffset']]
# read_df = read_df.sort_values(by='aep_endtime_utc', ignore_index=True)
# read_df