In [1]:
%run AMINonVee.ipynb

In [2]:
from importlib import reload
#reload(Utilities)

import sys, os
import re

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns
from packaging import version
import itertools
from dateutil.parser import parse
from operator import itemgetter

from pmdarima import auto_arima
import statsmodels.api as sm
from statsmodels.tsa.stattools import acovf, acf, pacf, pacf_yw, pacf_ols
from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.graphics.tsaplots import month_plot, quarter_plot, seasonal_plot
from statsmodels.tsa.arima_model import ARMA, ARIMA, ARMAResults, ARIMAResults
from statsmodels.tsa.statespace.sarimax import SARIMAX

from arch import arch_model

from scipy.stats.mstats import trim

#---------------------------------------------------------------------
import pyodbc
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
# import constants for the days of the week
from matplotlib.dates import MO, TU, WE, TH, FR, SA, SU
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
import Plot_Box_sns
import GrubbsTest
import DickeyFullerTest as dft

In [None]:
# Silence every warning category for the remainder of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries imported above — consider narrowing it
# to the specific categories that are known to be noise.
import warnings
warnings.filterwarnings('ignore')

In [None]:
def build_mean_from_sum_and_counts(df, sum_x_col, n_counts_col, placement_col):
    """Write the row-wise mean (sum divided by count) into ``df[placement_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sum and count columns.  It is modified in place.
    sum_x_col : str
        Column containing sum(x).
    n_counts_col : str
        Column containing the number of entries n behind each sum.
    placement_col : str
        Column that receives sum(x)/n (created or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    totals = df[sum_x_col]
    n_entries = df[n_counts_col]
    df[placement_col] = totals.div(n_entries)
    return df

def build_std_from_mossom(df, sum_x_col, sum_x2_col, n_counts_col, placement_col, sample_std=True):
    """Write the standard deviation derived from sum(x), sum(x**2) and n into ``df[placement_col]``.

    mossom = Mean Of Squares minus Square Of Means:

        std_p**2 = bar{x**2} - (bar{x})**2
                 = (1/n)*sum(x**2) - (1/n**2)*(sum(x))**2
        std_s**2 = (n/(n-1))*std_p**2

    where std_p is the population standard deviation (n in the denominator)
    and std_s is the sample standard deviation (n-1 in the denominator).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the input columns.  It is modified in place.
    sum_x_col : str
        Column containing sum(x).
    sum_x2_col : str
        Column containing sum(x**2).
    n_counts_col : str
        Column containing the number of entries n.
    placement_col : str
        Column that receives the standard deviation (created or overwritten).
    sample_std : bool, default True
        If True return std_s, otherwise return std_p.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    With ``sample_std=True`` the factor n/(n-1) is not finite for n == 1, so
    such rows do not receive a finite value.
    """
    n_entries = df[n_counts_col]
    sum_x = df[sum_x_col]
    # Population variance: mean of squares minus square of the mean.
    var_p = df[sum_x2_col]/n_entries - sum_x*sum_x/(n_entries*n_entries)
    # The subtraction above can come out marginally below zero through
    # floating-point cancellation when the true variance is ~0; without the
    # clip np.sqrt would turn those rows into NaN.  NaN inputs stay NaN.
    var_p = var_p.clip(lower=0)
    df[placement_col] = np.sqrt(var_p)
    if sample_std:
        # Convert population std to sample std (Bessel's correction).
        df[placement_col] = df[placement_col]*np.sqrt(n_entries/(n_entries-1))
    return df

def build_mean_and_std_from_sum_x_x2_and_counts(df, sum_x_col, sum_x2_col, n_counts_col, 
                                                placement_col_mean, placement_col_std, 
                                                sample_std=True):
    """Add both the mean and the standard deviation computed from sum(x), sum(x**2) and n.

    Thin wrapper that calls build_mean_from_sum_and_counts followed by
    build_std_from_mossom on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the input columns.
    sum_x_col, sum_x2_col, n_counts_col : str
        Columns containing sum(x), sum(x**2) and the entry count n.
    placement_col_mean, placement_col_std : str
        Columns that receive the mean and the standard deviation.
    sample_std : bool, default True
        Forwarded to build_std_from_mossom to select sample (True) or
        population (False) standard deviation.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the two helper calls.
    """
    df = build_mean_from_sum_and_counts(df, sum_x_col, n_counts_col, placement_col_mean)
    # sample_std must be forwarded explicitly; otherwise the helper's own
    # default silently overrides the caller's choice.
    df = build_std_from_mossom(df, sum_x_col, sum_x2_col, n_counts_col, placement_col_std, 
                               sample_std=sample_std)
    return df

In [None]:
# df_agg_cir: aggregate of all meters on circuit ==> one entry per time stamp for circuit
# df_agg_xfmr: aggregate by xfmr on circuit ==> one entry per xfmr per time stamp for circuit
# df_agg2_xfmr_cir: aggregate of all transformers on circuit
#                   i.e., aggregate by xfmr followed by aggregate of circuit 
#                   ==> one entry per time stamp for circuit

In [None]:
# Build every CSV location from individual components so os.path.join supplies
# the separator; the previous hard-coded backslashes only resolved on Windows.
_new_method_dir = os.path.join(Utilities.get_local_data_dir(), 'sample_circuits', 'NewMethod')
_outg_dir_name = 'outg_rec_nb_11751094'
_outg_csv_name = 'outg_rec_nb_11751094_2020_q4.csv'

path_agg_cir = os.path.join(_new_method_dir, _outg_dir_name, _outg_csv_name)
path_agg_xfmr = os.path.join(_new_method_dir, 'GroupByXfmr', _outg_dir_name, _outg_csv_name)
#path_agg2_xfmr_cir = os.path.join(_new_method_dir, 'GroupByXfmr_v2', _outg_dir_name, _outg_csv_name)
path_agg2_xfmr_cir = os.path.join(_new_method_dir, 'GroupByXfmr_v3_NET', _outg_dir_name, _outg_csv_name)

In [None]:
# Circuit-level aggregate; only this frame is passed through
# remove_prepend_from_columns_in_df after loading.
df_agg_cir = Utilities_df.remove_prepend_from_columns_in_df(pd.read_csv(path_agg_cir))

# Per-transformer aggregate and the transformer-then-circuit aggregate are used as read.
df_agg_xfmr, df_agg2_xfmr_cir = (
    pd.read_csv(csv_path) for csv_path in (path_agg_xfmr, path_agg2_xfmr_cir)
)

In [None]:
# keep = ['sum_value_sum', 'sum_value_sq_sum', 'mean_value_mean', 'mean_value_std', 'sum_counts', 'sum_counts_including_null']
# drop = ['mean_value_sum', 'mean_value_sq_sum', 'sum_value_mean', 'sum_value_std', 'mean_counts', 'mean_counts_including_null']
# df_agg2_xfmr_cir = df_agg2_xfmr_cir.drop(columns=drop)

In [None]:
# One line per frame, emitted through a single print call.
print(
    f'df_agg_cir.shape       = {df_agg_cir.shape}',
    f'df_agg2_xfmr_cir.shape = {df_agg2_xfmr_cir.shape}',
    f'df_agg_xfmr.shape      = {df_agg_xfmr.shape}',
    sep='\n',
)

In [None]:
# Re-derive the transformer-then-circuit aggregate from the per-transformer frame.
gpby_cols = ['starttimeperiod', 'endtimeperiod', 'aep_endtime_utc', 'timezoneoffset', 
             'aep_derived_uom', 'aep_srvc_qlty_idntfr', 'aep_usage_dt']
agg_dict = {
    'value_sum': ['sum'],
    'value_sq_sum': ['sum'],
    'value_mean': ['mean'],
    'value_std': ['mean'],
    'counts': ['sum'],
    'counts_including_null': ['sum'],
}
df_agg2_xfmr_cir_v2 = df_agg_xfmr.groupby(gpby_cols).agg(agg_dict).reset_index()

# Flatten the (column, aggregation) MultiIndex into '<aggregation>_<column>' names;
# group-by keys carry an empty aggregation level and keep their plain name.
df_agg2_xfmr_cir_v2.columns = df_agg2_xfmr_cir_v2.columns.to_flat_index()
rename_dict = {
    (col_name, agg_name): (f'{agg_name}_{col_name}' if agg_name else col_name)
    for col_name, agg_name in df_agg2_xfmr_cir_v2.columns
}
df_agg2_xfmr_cir_v2 = df_agg2_xfmr_cir_v2.rename(columns=rename_dict)

In [None]:
# Column names shared by both pooled mean/std computations below.
sum_x_col, sum_x2_col, n_counts_col = 'sum_value_sum', 'sum_value_sq_sum', 'sum_counts'
placement_col_mean, placement_col_std = 'value_mean_pool', 'value_std_pool'
pool_cols = (sum_x_col, sum_x2_col, n_counts_col, placement_col_mean, placement_col_std)

df_agg2_xfmr_cir = build_mean_and_std_from_sum_x_x2_and_counts(df_agg2_xfmr_cir, *pool_cols)

df_agg2_xfmr_cir_v2 = build_mean_and_std_from_sum_x_x2_and_counts(df_agg2_xfmr_cir_v2, *pool_cols)

In [None]:
sort_by=['aep_endtime_utc', 'aep_derived_uom', 'aep_srvc_qlty_idntfr']
cols_to_compare = ['aep_endtime_utc', 'sum_value_sum', 'sum_value_sq_sum', 'mean_value_mean', 
                   'mean_value_std', 'sum_counts', 'sum_counts_including_null', 'value_mean_pool', 'value_std_pool']

In [None]:
# Saved transformer-then-circuit aggregate vs. the one rebuilt in this notebook.
approx_diffs = Utilities_df.get_dfs_diff_approx_ok(
    df_agg2_xfmr_cir, df_agg2_xfmr_cir_v2, sort_by=sort_by
)
print('EQUAL: df_agg2_xfmr_cir == df_agg2_xfmr_cir_v2'
      if len(approx_diffs)==0
      else 'NOT EQUAL: df_agg2_xfmr_cir != df_agg2_xfmr_cir_v2')

In [None]:

# Put all three frames in the same row order (fresh 0..n-1 index) so the
# column-wise comparisons that follow line up row by row.
df_agg_cir, df_agg2_xfmr_cir, df_agg2_xfmr_cir_v2 = (
    frame.sort_values(by=sort_by, ignore_index=True)
    for frame in (df_agg_cir, df_agg2_xfmr_cir, df_agg2_xfmr_cir_v2)
)

In [None]:
# Between df_agg_cir and others, the following columns should be equal
matching_cols_dict = {
    'value_sum':'sum_value_sum', 
    'value_sq_sum':'sum_value_sq_sum', 
    'value_mean':'value_mean_pool', 
    'value_std':'value_std_pool', 
    'counts':'sum_counts', 
    'counts_including_null':'sum_counts_including_null'
}

In [None]:
# Circuit aggregate, renamed to the transformer-aggregate column names, vs. the saved aggregate.
cir_expected_cols = df_agg_cir[matching_cols_dict.keys()].rename(columns=matching_cols_dict)
approx_diffs = Utilities_df.get_dfs_diff_approx_ok(cir_expected_cols, 
                                                   df_agg2_xfmr_cir[matching_cols_dict.values()])
print('EQUAL: df_agg_cir == df_agg2_xfmr_cir in expected columns'
      if len(approx_diffs)==0
      else 'NOT EQUAL: df_agg_cir != df_agg2_xfmr_cir in expected columns')

In [None]:
# Circuit aggregate, renamed to the transformer-aggregate column names, vs. the rebuilt aggregate.
cir_expected_cols = df_agg_cir[matching_cols_dict.keys()].rename(columns=matching_cols_dict)
approx_diffs = Utilities_df.get_dfs_diff_approx_ok(cir_expected_cols, 
                                                   df_agg2_xfmr_cir_v2[matching_cols_dict.values()])
print('df_agg_cir EQUALS df_agg2_xfmr_cir_v2 in expected columns'
      if len(approx_diffs)==0
      else 'df_agg_cir DOES NOT EQUAL df_agg2_xfmr_cir_v2 in expected columns')

In [None]:
df_agg_cir.head()

In [None]:
df_agg2_xfmr_cir.head()

In [None]:
df_agg2_xfmr_cir_v2.head()

In [None]:
# Path assembled from components so os.path.join supplies the separator
# (the previous backslash-separated literal only resolved on Windows).
tmp = pd.read_csv(os.path.join(Utilities.get_local_data_dir(), 'sample_circuits', 'NewMethod', 
                               'GroupByXfmr_v2', 'outg_rec_nb_11751094', 
                               'outg_rec_nb_11751094_2020_q4.csv'))

In [None]:
tmp.columns

In [None]:
tmp.head()

In [None]:
# Directory holding the per-quarter CSVs.  Assembled from components so
# os.path.join supplies the separator (the previous backslash-separated
# literal only resolved on Windows).
file_dir = os.path.join(Utilities.get_local_data_dir(), 'sample_circuits', 'NewMethod', 
                        'GroupByXfmr_v2', 'outg_rec_nb_11751094')
glob_pattern=r'outg_rec_nb_*q[0-9].csv'
value_cols = ['sum_value_sum', 'mean_value_sum', 'sum_value_sq_sum',
              'mean_value_sq_sum', 'sum_value_mean', 'mean_value_mean',
              'sum_value_std', 'mean_value_std', 'sum_counts', 'mean_counts',
              'sum_counts_including_null', 'mean_counts_including_null']
# Identifying columns followed by every value column; built from value_cols
# so the two lists cannot drift apart.
cols_of_interest = ['starttimeperiod', 'endtimeperiod', 'aep_endtime_utc', 'timezoneoffset',
                    'aep_derived_uom', 'aep_srvc_qlty_idntfr', 'aep_usage_dt'] + value_cols
combine_kwh_delivered_and_received=False
merge_and_groupby_cols=['aep_endtime_utc']
verbose=True

In [None]:
# # keep = ['sum_value_sum', 'sum_value_sq_sum', 'mean_value_mean', 'mean_value_std', 'sum_counts', 'sum_counts_including_null']
# # drop = ['mean_value_sum', 'mean_value_sq_sum', 'sum_value_mean', 'sum_value_std', 'mean_counts', 'mean_counts_including_null']

# file_dir = os.path.join(Utilities.get_local_data_dir(), r'sample_circuits\NewMethod\GroupByXfmr_v2\outg_rec_nb_11751094')
# glob_pattern=r'outg_rec_nb_*q[0-9].csv'
# value_cols = ['sum_value_sum', 'sum_value_sq_sum', 'mean_value_mean', 'mean_value_std', 
#               'sum_counts', 'sum_counts_including_null']
# cols_of_interest = ['starttimeperiod', 'endtimeperiod', 'aep_endtime_utc', 'timezoneoffset',
#                     'aep_derived_uom', 'aep_srvc_qlty_idntfr', 'aep_usage_dt',
#                     'sum_value_sum', 'sum_value_sq_sum', 'mean_value_mean', 'mean_value_std', 
#                     'sum_counts', 'sum_counts_including_null']
# combine_kwh_delivered_and_received=False
# merge_and_groupby_cols=['aep_endtime_utc']
# verbose=True

In [None]:
# Assemble the frames from the CSVs selected by file_dir/glob_pattern using the
# settings defined in the configuration cell.  The result is indexed with the
# keys 'kwh' and 'vlt' further down.
# NOTE(review): AMINonVee is not imported in the import cell; presumably it is
# brought into scope by the `%run AMINonVee.ipynb` line at the top — confirm.
kwh_vlt_dfs_xfmr_cir_dict = AMINonVee.assemble_kwh_vlt_dfs_from_saved_csvs(file_dir=file_dir, glob_pattern=glob_pattern, 
                                                                 value_cols=value_cols, 
                                                                 cols_of_interest=cols_of_interest, 
                                                                 combine_kwh_delivered_and_received=combine_kwh_delivered_and_received, 
                                                                 merge_and_groupby_cols=merge_and_groupby_cols, 
                                                                 verbose=verbose)

In [None]:
kwh_vlt_dfs_xfmr_cir_dict.keys()

In [None]:
df_xfmr_cir_kwh_15T = kwh_vlt_dfs_xfmr_cir_dict['kwh']
df_xfmr_cir_vlt_15T = kwh_vlt_dfs_xfmr_cir_dict['vlt']

In [None]:
df_xfmr_cir_kwh_15T[['aep_derived_uom', 'aep_srvc_qlty_idntfr', 'sum_value_sum', 'mean_value_sum', 'sum_value_sq_sum',
       'mean_value_sq_sum', 'sum_value_mean', 'mean_value_mean',
       'sum_value_std', 'mean_value_std', 'sum_counts', 'mean_counts',
       'sum_counts_including_null', 'mean_counts_including_null']]

In [None]:
# Column names for the pooled mean/std computation on the kWh frame.
sum_x_col, sum_x2_col, n_counts_col = 'sum_value_sum', 'sum_value_sq_sum', 'sum_counts'
placement_col_mean, placement_col_std = 'value_mean_pool', 'value_std_pool'

df_xfmr_cir_kwh_15T = build_mean_and_std_from_sum_x_x2_and_counts(
    df_xfmr_cir_kwh_15T,
    sum_x_col,
    sum_x2_col,
    n_counts_col,
    placement_col_mean,
    placement_col_std,
)


In [None]:
# keep = ['sum_value_sum', 'sum_value_sq_sum', 'mean_value_mean', 'mean_value_std', 'sum_counts', 'sum_counts_including_null']
# drop = ['mean_value_sum', 'mean_value_sq_sum', 'sum_value_mean', 'sum_value_std', 'mean_counts', 'mean_counts_including_null']
# df_xfmr_cir_kwh_15T = df_xfmr_cir_kwh_15T.drop(columns=drop)

In [None]:
df_xfmr_cir_kwh_15T.head()

In [None]:
def build_circuit_non_vals(df, non_val_cols, 
                           aep_srvc_qlty_idntfr_col='aep_srvc_qlty_idntfr', 
                           aep_srvc_qlty_idntfr_vals={'rec':'RECEIVED', 'del':'DELIVERED', 'tot':'TOTAL'}):
    """Collect the non-value columns for every index entry present in ``df``.

    The received, delivered and total subsets typically do not each contain
    the entire set of dates.  To simplify later merges, the non-value columns
    of the three subsets are combined with ``combine_first``, which keeps the
    first non-null element: delivered rows take precedence, then received,
    then total.  The caller can then merge using only the value columns and
    attach this frame at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``aep_srvc_qlty_idntfr_col`` and ``non_val_cols``.
    non_val_cols : list of str
        Columns to carry over.
    aep_srvc_qlty_idntfr_col : str
        Column holding the received/delivered/total label.
    aep_srvc_qlty_idntfr_vals : dict
        Maps 'rec', 'del' and 'tot' to the labels used in that column.

    Returns
    -------
    pandas.DataFrame
        ``non_val_cols`` for the union of the three subsets' index values.

    Raises
    ------
    AssertionError
        If the combined frame still contains a null value.
    """
    rec_label, del_label, tot_label = (
        aep_srvc_qlty_idntfr_vals[kind] for kind in ('rec', 'del', 'tot')
    )
    labels = df[aep_srvc_qlty_idntfr_col]
    #------------------------
    # Precedence order: delivered, then received, then total.
    non_vals_df = df.loc[labels == del_label, non_val_cols]
    for fallback_label in (rec_label, tot_label):
        fallback_rows = df.loc[labels == fallback_label, non_val_cols]
        non_vals_df = non_vals_df.combine_first(fallback_rows)
    #------------------------
    n_missing = non_vals_df.isna().sum().sum()
    assert n_missing == 0 #TODO should this assert remain?
    return non_vals_df

In [None]:
def get_ciruit_total_vals_method2(df, non_val_cols, 
                                  value_cols_dict = {'value_sum_cir_col':'value_sum_cir', 
                                                     'counts_cir_col':'counts_cir', 
                                                     'value_mean_cir_col':'value_mean_cir'}, 
                                  remove_intermediate_cols=True, 
                                  aep_srvc_qlty_idntfr_col='aep_srvc_qlty_idntfr', 
                                  aep_srvc_qlty_idntfr_vals={'rec':'RECEIVED', 'del':'DELIVERED', 'tot':'TOTAL'}, 
                                  maintain_original_col_order=True):
    """Collapse the delivered/received/total rows of each index entry into one net row.

    For every index value the function computes

        net sum    = total + delivered - received
        net counts = max(received counts, delivered counts + total counts)
        net mean   = net sum / net counts

    A missing delivered, received or total row contributes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.  Within each of the three label subsets the index must be
        unique (asserted).
    non_val_cols : list of str
        Non-value columns to carry over via build_circuit_non_vals.
    value_cols_dict : dict
        Must contain the keys 'value_sum_cir_col', 'counts_cir_col' and
        'value_mean_cir_col', mapping to the corresponding column names in df.
    remove_intermediate_cols : bool
        Drop the '_del'/'_rec'/'_tot' and '_del_plus_tot' helper columns.
    aep_srvc_qlty_idntfr_col : str
        Column holding the received/delivered/total label.
    aep_srvc_qlty_idntfr_vals : dict
        Maps 'rec', 'del' and 'tot' to the labels used in that column.
    maintain_original_col_order : bool
        Move the columns that also exist in df to the front, in df's order.

    Returns
    -------
    pandas.DataFrame
        One row per index value, labelled with the 'tot' label.
    """
    # TODO How to enforce assert(len(aep_srvc_qlty_idntfrs)<=3)
    #---------------------------------------------------
    expected_keys = ['counts_cir_col', 'value_mean_cir_col', 'value_sum_cir_col']
    assert set(expected_keys).issubset(value_cols_dict.keys())
    value_cols = list(value_cols_dict.values())
    value_sum_cir_col = value_cols_dict['value_sum_cir_col']
    counts_cir_col = value_cols_dict['counts_cir_col']
    value_mean_cir_col = value_cols_dict['value_mean_cir_col']
    #---------------------------------------------------
    df_rec = df[df[aep_srvc_qlty_idntfr_col]==aep_srvc_qlty_idntfr_vals['rec']]
    df_del = df[df[aep_srvc_qlty_idntfr_col]==aep_srvc_qlty_idntfr_vals['del']]
    df_tot = df[df[aep_srvc_qlty_idntfr_col]==aep_srvc_qlty_idntfr_vals['tot']]
    #---------------------------------------------------
    # This essentially enforces assert(len(aep_srvc_qlty_idntfrs)<=3):
    # each subset may hold at most one row per index value.
    assert(df_rec.shape[0]==df_rec.index.nunique())
    assert(df_del.shape[0]==df_del.index.nunique())
    assert(df_tot.shape[0]==df_tot.index.nunique())
    #---------------------------------------------------
    return_df = df_del[value_cols].merge(df_rec[value_cols], 
                                         left_index=True, right_index=True, 
                                         how='outer', suffixes=('_del', '_rec'))
    # A suffix argument would do nothing in the second merge: return_df now holds
    # only '<col>_del' and '<col>_rec' columns while df_tot still has the plain
    # names, so nothing overlaps.  The '_tot' tag is therefore added with .rename.
    return_df = return_df.merge(df_tot[value_cols], left_index=True, right_index=True, how='outer')
    return_df = return_df.rename(columns = {x:f'{x}_tot' for x in value_cols})
    # The outer merges guarantee a delivered, received and total cell for every
    # index value; where one did not exist in the original the cell is NaN, so
    # replace those with 0.
    return_df = return_df.fillna(0)
    #---------------------------------------------------
    # List of newly created columns, needed if remove_intermediate_cols==True
    new_col_tags = ['_del', '_rec', '_tot']
    new_cols = [value_col+tag for value_col in value_cols for tag in new_col_tags]
    new_cols.append(f"{counts_cir_col}_del_plus_tot")
    #---------------------------------------------------
    return_df[f"{counts_cir_col}_del_plus_tot"] = (return_df[f"{counts_cir_col}_del"] +
                                                   return_df[f"{counts_cir_col}_tot"])
    return_df[counts_cir_col] = return_df[[f"{counts_cir_col}_rec", 
                                           f"{counts_cir_col}_del_plus_tot"]].max(axis=1)
    return_df[value_sum_cir_col] = (return_df[f"{value_sum_cir_col}_tot"] +
                                    return_df[f"{value_sum_cir_col}_del"] -
                                    return_df[f"{value_sum_cir_col}_rec"])
    return_df[value_mean_cir_col] = return_df[value_sum_cir_col]/return_df[counts_cir_col]
    return_df[counts_cir_col]=return_df[counts_cir_col].astype('int64')
    #---------------------------------------------------
    if remove_intermediate_cols:
        return_df = return_df.drop(columns=new_cols)
    #---------------------------------------------------
    non_vals_df = build_circuit_non_vals(df, non_val_cols, 
                                         aep_srvc_qlty_idntfr_col=aep_srvc_qlty_idntfr_col, 
                                         aep_srvc_qlty_idntfr_vals=aep_srvc_qlty_idntfr_vals)
    assert(non_vals_df.shape[0]==return_df.shape[0])
    return_df = return_df.merge(non_vals_df, how='inner', left_index=True, right_index=True)
    # Label the net rows with the caller's 'tot' label rather than a literal,
    # so a custom aep_srvc_qlty_idntfr_vals mapping is honoured.
    return_df[aep_srvc_qlty_idntfr_col] = aep_srvc_qlty_idntfr_vals['tot']
    if maintain_original_col_order:
        cols_from_og = [x for x in df.columns if x in return_df.columns]
        return_df = Utilities_df.move_cols_to_front(return_df, cols_from_og)
    return return_df

In [None]:
df_xfmr_cir_kwh_15T.columns

In [None]:
#TODO why 'aep_srvc_qlty_idntfr' not in non_val_cols?
non_val_cols = ['starttimeperiod', 'endtimeperiod', 'aep_endtime_utc', 'timezoneoffset',
                'aep_derived_uom', 'aep_usage_dt', 'aep_endtime_utc_from_timestamp', 
                'starttimeperiod_utc', 'endtimeperiod_utc']
value_cols_dict = {'value_sum_cir_col':'sum_value_sum', 
                   'counts_cir_col':'sum_counts', 
                   'value_mean_cir_col':'mean_value_sum'}

In [None]:
df_xfmr_cir_kwh_15T_2 = get_ciruit_total_vals_method2(df_xfmr_cir_kwh_15T, non_val_cols, 
                                                      value_cols_dict=value_cols_dict, remove_intermediate_cols=True)

In [None]:
df_xfmr_cir_kwh_15T_2.head()

In [None]:
def build_net_counts_for_del_rec_tot(merged_df, col_name_base, 
           col_tags = {'rec':'_rec', 'del':'_del', 'tot':'_tot'}, 
           remove_intermediate_cols=True):
    """Derive the net count column from its received/delivered/total variants.

    The net count is max(received, delivered + total), stored as int64 in
    ``merged_df[col_name_base]``.  The helper column
    '<col_name_base>_del_plus_tot' and the net column are added to
    ``merged_df`` in place.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame containing the three tagged columns.
    col_name_base : str
        Base column name; the tagged columns are base + tag.
    col_tags : dict
        Maps 'rec', 'del' and 'tot' to the suffix of the respective column.
    remove_intermediate_cols : bool
        If True, return a frame without the three tagged columns and the
        helper column.

    Returns
    -------
    pandas.DataFrame
    """
    rec_name = col_name_base+col_tags['rec']
    del_name = col_name_base+col_tags['del']
    tot_name = col_name_base+col_tags['tot']
    helper_name = f"{col_name_base}_del_plus_tot"
    #---------------------------------------------------
    merged_df[helper_name] = merged_df[del_name].add(merged_df[tot_name])
    row_max = merged_df[[rec_name, helper_name]].max(axis=1)
    merged_df[col_name_base] = row_max
    merged_df[col_name_base] = merged_df[col_name_base].astype('int64')
    #---------------------------------------------------
    if not remove_intermediate_cols:
        return merged_df
    return merged_df.drop(columns=[rec_name, del_name, tot_name, helper_name])

def build_net_value_for_del_rec_tot(merged_df, col_name_base, 
                                    col_tags = {'rec':'_rec', 'del':'_del', 'tot':'_tot'}, 
                                    remove_intermediate_cols=True):
    """Derive the net value column from its received/delivered/total variants.

    The net value is total + delivered - received, stored in
    ``merged_df[col_name_base]`` (added to ``merged_df`` in place).

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame containing the three tagged columns.
    col_name_base : str
        Base column name; the tagged columns are base + tag.
    col_tags : dict
        Maps 'rec', 'del' and 'tot' to the suffix of the respective column.
    remove_intermediate_cols : bool
        If True, return a frame without the three tagged columns.

    Returns
    -------
    pandas.DataFrame
    """
    rec_name = col_name_base+col_tags['rec']
    del_name = col_name_base+col_tags['del']
    tot_name = col_name_base+col_tags['tot']
    #---------------------------------------------------
    gross = merged_df[tot_name].add(merged_df[del_name])
    merged_df[col_name_base] = gross.sub(merged_df[rec_name])
    #---------------------------------------------------
    if not remove_intermediate_cols:
        return merged_df
    return merged_df.drop(columns=[rec_name, del_name, tot_name])


def get_ciruit_total_vals(df, non_val_cols, 
                          value_cols_dict_list = [
                              {'value_col':'sum_value_sum', 'counts_col':'sum_counts', 'mean_col':'mean_value_sum'}, 
                              {'value_col':'sum_value_sq_sum', 'counts_col':'sum_counts', 'mean_col':'mean_value_sq_sum'}, 
                              {'value_col':'sum_value_mean', 'counts_col':'sum_counts', 'mean_col':'mean_value_mean'},
                              {'value_col':'sum_value_std', 'counts_col':'sum_counts', 'mean_col':'mean_value_std'}
                          ], 
                          remove_intermediate_cols=True, 
                          aep_srvc_qlty_idntfr_col='aep_srvc_qlty_idntfr', 
                          aep_srvc_qlty_idntfr_vals={'rec':'RECEIVED', 'del':'DELIVERED', 'tot':'TOTAL'}, 
                          maintain_original_col_order=True):
    """Collapse the delivered/received/total rows of each index entry into one net row.

    For every entry of ``value_cols_dict_list`` the net value is computed by
    build_net_value_for_del_rec_tot, the net counts by
    build_net_counts_for_del_rec_tot, and - when both 'mean_col' and
    'counts_col' are given - the mean by build_mean_from_sum_and_counts.
    A missing delivered, received or total row contributes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.  Within each of the three label subsets the index must be
        unique (asserted).
    non_val_cols : list of str
        Non-value columns to carry over via build_circuit_non_vals.
    value_cols_dict_list : list of dict
        Each dict needs 'value_col'; 'counts_col' and 'mean_col' are optional.
        Entries lacking either of the optional keys get no mean column.
    remove_intermediate_cols : bool
        Forwarded to the net-count/net-value helpers.
    aep_srvc_qlty_idntfr_col : str
        Column holding the received/delivered/total label.
    aep_srvc_qlty_idntfr_vals : dict
        Maps 'rec', 'del' and 'tot' to the labels used in that column.
    maintain_original_col_order : bool
        Move the columns that also exist in df to the front, in df's order.

    Returns
    -------
    pandas.DataFrame
        One row per index value, labelled with the 'tot' label.
    """
    #--------------------------------------------------- 
    value_cols = [x['value_col'] for x in value_cols_dict_list]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # resulting column order is the same on every run (a set is not ordered).
    counts_cols = list(dict.fromkeys(x['counts_col'] for x in value_cols_dict_list 
                                     if x.get('counts_col', None) is not None))
    merge_cols = value_cols + counts_cols
    #---------------------------------------------------
    df_rec = df[df[aep_srvc_qlty_idntfr_col]==aep_srvc_qlty_idntfr_vals['rec']]
    df_del = df[df[aep_srvc_qlty_idntfr_col]==aep_srvc_qlty_idntfr_vals['del']]
    df_tot = df[df[aep_srvc_qlty_idntfr_col]==aep_srvc_qlty_idntfr_vals['tot']]
    #---------------------------------------------------
    # This essentially enforces assert(len(aep_srvc_qlty_idntfrs)<=3)
    # -- each line ensures that every date in the subset has exactly one entry
    assert(df_rec.shape[0]==df_rec.index.nunique())
    assert(df_del.shape[0]==df_del.index.nunique())
    assert(df_tot.shape[0]==df_tot.index.nunique())
    #---------------------------------------------------
    return_df = df_del[merge_cols].merge(df_rec[merge_cols], 
                                         left_index=True, right_index=True, 
                                         how='outer', suffixes=('_del', '_rec'))
    # A suffix argument would do nothing in the second merge: return_df now holds
    # only '<col>_del' and '<col>_rec' columns while df_tot still has the plain
    # names, so nothing overlaps.  The '_tot' tag is therefore added with .rename.
    return_df = return_df.merge(df_tot[merge_cols], left_index=True, right_index=True, how='outer')
    return_df = return_df.rename(columns = {x:f'{x}_tot' for x in merge_cols})
    # The outer merges guarantee a delivered, received and total cell for every
    # index value; where one did not exist in the original the cell is NaN, so
    # replace those with 0.
    return_df = return_df.fillna(0)
    #---------------------------------------------------
    col_tags = {'rec':'_rec', 'del':'_del', 'tot':'_tot'}
    for counts_col in counts_cols:
        return_df = build_net_counts_for_del_rec_tot(return_df, col_name_base=counts_col, 
                                                     col_tags = col_tags, 
                                                     remove_intermediate_cols=remove_intermediate_cols)
    for value_col in value_cols:
        return_df = build_net_value_for_del_rec_tot(return_df, col_name_base=value_col, 
                                                    col_tags = col_tags, 
                                                    remove_intermediate_cols=remove_intermediate_cols)
    for cols_dict in value_cols_dict_list:
        if (cols_dict.get('mean_col', None) is None or 
            cols_dict.get('counts_col', None) is None):
            continue
        return_df = build_mean_from_sum_and_counts(return_df, cols_dict['value_col'], cols_dict['counts_col'], cols_dict['mean_col'])
    #---------------------------------------------------
    non_vals_df = build_circuit_non_vals(df, non_val_cols, 
                                         aep_srvc_qlty_idntfr_col=aep_srvc_qlty_idntfr_col, 
                                         aep_srvc_qlty_idntfr_vals=aep_srvc_qlty_idntfr_vals)
    assert(non_vals_df.shape[0]==return_df.shape[0])
    return_df = return_df.merge(non_vals_df, how='inner', left_index=True, right_index=True)
    # Label the net rows with the caller's 'tot' label rather than a literal,
    # so a custom aep_srvc_qlty_idntfr_vals mapping is honoured.
    return_df[aep_srvc_qlty_idntfr_col] = aep_srvc_qlty_idntfr_vals['tot']
    if maintain_original_col_order:
        cols_from_og = [x for x in df.columns if x in return_df.columns]
        return_df = Utilities_df.move_cols_to_front(return_df, cols_from_og)
    return return_df

In [None]:
# Scratch cell: replays the argument set-up and the first steps of
# get_ciruit_total_vals at module level so the intermediate frames can be
# inspected.
# NOTE(review): this rebinds the globals non_val_cols and value_cols_dict; the
# 'value_mean_cir_col' entry here ('sum_value_mean') differs from the one
# assigned in the earlier cell ('mean_value_sum'), so later cells see this
# version — confirm that is intended.
df = df_xfmr_cir_kwh_15T.copy()

#TODO why 'aep_srvc_qlty_idntfr' not in non_val_cols?
non_val_cols = ['starttimeperiod', 'endtimeperiod', 'aep_endtime_utc', 'timezoneoffset',
                'aep_derived_uom', 'aep_usage_dt', 'aep_endtime_utc_from_timestamp', 
                'starttimeperiod_utc', 'endtimeperiod_utc']
value_cols_dict = {'value_sum_cir_col':'sum_value_sum', 
                   'counts_cir_col':'sum_counts', 
                   'value_mean_cir_col':'sum_value_mean'}
remove_intermediate_cols=True
aep_srvc_qlty_idntfr_col='aep_srvc_qlty_idntfr'
aep_srvc_qlty_idntfr_vals={'rec':'RECEIVED', 'del':'DELIVERED', 'tot':'TOTAL'}
maintain_original_col_order=True

# The subsets are taken from df_xfmr_cir_kwh_15T directly, not from the copy `df` made above.
df_rec = df_xfmr_cir_kwh_15T[df_xfmr_cir_kwh_15T['aep_srvc_qlty_idntfr']=='RECEIVED']
df_del = df_xfmr_cir_kwh_15T[df_xfmr_cir_kwh_15T['aep_srvc_qlty_idntfr']=='DELIVERED']
df_tot = df_xfmr_cir_kwh_15T[df_xfmr_cir_kwh_15T['aep_srvc_qlty_idntfr']=='TOTAL']
#---------------------------------------------------
# This essentially enforces assert(len(aep_srvc_qlty_idntfrs)<=3)
# -- each line ensures that every date in the subset has exactly one entry
assert(df_rec.shape[0]==df_rec.index.nunique())
assert(df_del.shape[0]==df_del.index.nunique())
assert(df_tot.shape[0]==df_tot.index.nunique())

In [None]:
df_xfmr_cir_kwh_15T.columns

In [None]:
df_xfmr_cir_kwh_15T.columns

In [None]:
df_xfmr_cir_kwh_15T.columns

In [None]:
we = ['sum_value_sum', 'mean_value_sum', 
      'sum_value_sq_sum', 'mean_value_sq_sum', 
      'sum_value_mean', 'mean_value_mean',
      'sum_value_std', 'mean_value_std', 
      'sum_counts', 'mean_counts',
      'sum_counts_including_null', 'mean_counts_including_null']

In [None]:
# def build_mean_from_sum_and_counts(df, sum_x_col, n_counts_col, placement_col):
#     df[placement_col] = df[sum_x_col]/df[n_counts_col]
#     return df

In [None]:
# def get_ciruit_total_vals(df, non_val_cols, 
#                                  value_cols_dict = {'value_sum_cir_col':'value_sum_cir', 
#                                                     'counts_cir_col':'counts_cir', 
#                                                     'value_mean_cir_col':'value_mean_cir'}, 
#                                  remove_intermediate_cols=True, 
#                                  aep_srvc_qlty_idntfr_col='aep_srvc_qlty_idntfr', 
#                                  aep_srvc_qlty_idntfr_vals={'rec':'RECEIVED', 'del':'DELIVERED', 'tot':'TOTAL'}, 
#                                  maintain_original_col_order=True):

In [None]:
# value_cols = [
#     'sum_value_sum', 'sum_counts', 'mean_value_sum', 
#     'sum_value_sq_sum', 'sum_counts', 'mean_value_sq_sum', 
#     'sum_value_mean', 'sum_counts', 'mean_value_mean',
#     'sum_value_std', 'sum_counts', 'mean_value_std', 
#     'sum_counts', 'sum_counts', 'mean_counts',
#     'sum_counts_including_null', 'sum_counts', 'mean_counts_including_null'
# ]

In [None]:
# value_cols = [
#     'sum_value_sum', 'sum_counts', 'mean_value_sum', 
#     'sum_value_sq_sum', 'sum_counts', 'mean_value_sq_sum', 
#     'sum_value_mean', 'sum_counts', 'mean_value_mean',
#     'sum_value_std', 'sum_counts', 'mean_value_std'
# ]

In [None]:
# value_cols = [
#     ['sum_value_sum', 'sum_counts', 'mean_value_sum'], 
#     ['sum_value_sq_sum', 'sum_counts', 'mean_value_sq_sum'], 
#     ['sum_value_mean', 'sum_counts', 'mean_value_mean'],
#     ['sum_value_std', 'sum_counts', 'mean_value_std']
# ]

In [None]:
df_xfmr_cir_kwh_15T.loc['2020-01-01 05:15:00']

In [None]:
df_xfmr_cir_kwh_15T.columns

In [None]:
'sum_value_sum'
'sum_xfmr sum_cir value'

In [None]:
# value_cols_dict_list = [
#     {'value_col':'sum_value_sum', 'counts_col':'sum_counts', 'mean_col':'mean_value_sum'}, 
#     {'value_col':'sum_value_sq_sum', 'counts_col':'sum_counts', 'mean_col':'mean_value_sq_sum'}, 
#     {'value_col':'sum_value_mean', 'counts_col':'sum_counts', 'mean_col':'mean_value_mean'},
#     {'value_col':'sum_value_std', 'counts_col':'sum_counts', 'mean_col':'mean_value_std'}
# ]

value_cols_dict_list = [
    {'value_col':'sum_value_sum', 'counts_col':'sum_counts', 'mean_col':'mean_value_sum'}
]

In [None]:
# Collapse the kWh frame to one net row per index value.
# NOTE(review): the result is bound to the same name as the input, so this
# cell is not idempotent — running it a second time feeds the already
# collapsed frame back into get_ciruit_total_vals.
df_xfmr_cir_kwh_15T = get_ciruit_total_vals(df_xfmr_cir_kwh_15T, non_val_cols, 
                                            value_cols_dict_list=value_cols_dict_list, 
                                            remove_intermediate_cols=True)

In [None]:
df_xfmr_cir_kwh_15T.columns

In [None]:
df_xfmr_cir_kwh_15T_2.columns

In [None]:
df_xfmr_cir_kwh_15T.equals(df_xfmr_cir_kwh_15T_2)

In [None]:
# freqs=['H', '4H', 'D', 'MS']
# other_grouper_cols = []
# build_agg_dfs = True
# time_col_for_agg = 'endtimeperiod_utc'
# agg_cols =
# df_key = 'df'
# df_agg_key = 'df_agg'

# df_xfmr_cir_kwh_dict = get_resampled_dfs

In [None]:
df.columns

In [None]:
df