In [None]:
from importlib import reload
#reload(Utilities)
# NOTE: To reload a class that was imported as, e.g.,
#   from module import class
# one must call, in this order:
#   1. import module
#   2. reload(module)
#   3. from module import class

import sys, os
import re
from pathlib import Path
import json
import pickle

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version
import copy

import itertools

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
import matplotlib.colors as mcolors
import matplotlib.cm as cm #e.g. for cmap=cm.jet
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
from MeterPremise import MeterPremise
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from MECPODf import MECPODf
from MECPOAn import MECPOAn
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
from OutageDAQ import OutageDataInfo as ODI
from OutageMdlrPrep import OutageMdlrPrep
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_General
import Plot_Box_sns
import Plot_Hist
import Plot_Bar
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Run configuration for this notebook.
#
# Previously used configuration (kept for reference):
#   run_date = '20240906'
#   date_0   = '2023-04-01'
#   date_1   = '2024-08-31'
#-------------------------
# Date of data acquisition (value is written as YYYYMMDD)
run_date = '20250514'
#-------------------------
# Lower and upper limits for end events
date_0, date_1 = '2024-07-01', '2024-09-30'
#-------------------------
# Dataset to process. Other values used in this notebook's history: 'otbl', 'prbl'
dataset = 'outg'
#-------------------------

In [None]:
# Resolve the input directories for the configured run and verify that they exist.
ODI.assert_dataset(dataset)
#-------------------------
# Sub-directory named after the date window: both limits with their dashes removed, joined by '_'
date_pd_subdir = '_'.join(date_limit.replace('-', '') for date_limit in (date_0, date_1))
_data_base_dir_parts = [
    Utilities.get_local_data_dir(),
    r'dovs_and_end_events_data',
    run_date,
    date_pd_subdir,
    ODI.get_subdir(dataset),
]
data_base_dir = os.path.join(*_data_base_dir_parts)
#-------------------------
assert os.path.isdir(data_base_dir)
#-------------------------
# An alternative location, os.path.join(data_base_dir, 'EndEvents'), was used here previously
files_dir    = os.path.join(data_base_dir, 'end_events_method', 'EndEvents')
naming_tag   = ODI.get_naming_tag(dataset)
is_no_outage = ODI.get_is_no_outage(dataset)
#-------------------------
assert os.path.isdir(files_dir)
#-------------------------
# Echo the resolved settings; labels are left-justified to the width of the longest one
for _label, _value in (
    ('data_base_dir', data_base_dir),
    ('files_dir',     files_dir),
    ('naming_tag',    naming_tag),
    ('is_no_outage',  is_no_outage),
):
    print(f'{_label:<13} = {_value}')

In [None]:
# Currently, grp_by_col is expected to be 'outg_rec_nb', 'trsf_pole_nb', or ['outg_rec_nb', 'trsf_pole_nb'].
#   In practice, 'outg_rec_nb' alone will probably not be run again; expect ['outg_rec_nb', 'trsf_pole_nb'] going forward.

In [None]:
# Settings consumed by the rcpo-building cells further down, followed by sanity checks on them.
batch_size                    = 1000
grp_by_col                    = ['outg_rec_nb', 'trsf_pole_nb']
outg_rec_nb_col               = 'outg_rec_nb'
#-------------------------
file_path_glob                = r'end_events_[0-9]*.csv'
file_path_regex               = None

assert_all_cols_equal         = False #Seems new EndEvents have curr_acct_cls_cd as well...
include_normalize_by_nSNs     = True
# (sic) the spelling matches the keyword passed to build_rcpx_df_from_EndEvents_in_csvs below
inclue_zero_counts            = True
return_multiindex_outg_reason = False
return_normalized_separately  = False

# 0-31, 1-6, 6-11, 11-16, 16-21, 21-26, 26-31
days_min_outg_td_window       = 0
days_max_outg_td_window       = 31

xfmr_equip_typ_nms_of_interest = ['TRANSFORMER, OH', 'TRANSFORMER, UG']

# True only when normalization is requested and is not returned separately
normalize_by_nSNs_included = bool(include_normalize_by_nSNs and not return_normalized_separately)

#--------------------------------------------------
if naming_tag is None:
    naming_tag=''

# only set up for outg_rec_nb or trsf_pole_nb currently
_allowed_grp_by_cols = [
    'outg_rec_nb',
    ['outg_rec_nb', 'trsf_pole_nb'],
    'trsf_pole_nb',
    ['trsf_pole_nb', 'no_outg_rec_nb'],
]
assert grp_by_col in _allowed_grp_by_cols, f'Unsupported grp_by_col: {grp_by_col!r}'

# Not possible to have outg_rec_nb for the no-outage case!
# The check is done on the individual grouping columns. The previous version compared the
# list grp_by_col against a tuple, which is never equal, so the check could not fail.
if is_no_outage:
    _grp_by_cols_as_list = [grp_by_col] if isinstance(grp_by_col, str) else list(grp_by_col)
    assert 'outg_rec_nb' not in _grp_by_cols_as_list, (
        f"grp_by_col={grp_by_col!r} includes 'outg_rec_nb', which is unavailable when is_no_outage is True"
    )

In [None]:
# Pickle caching switches: either build the DataFrames and save them, or read saved ones back.
save_dfs_to_pkl   = True
read_dfs_from_pkl = False
assert not (save_dfs_to_pkl and read_dfs_from_pkl) # Should never both read and write!

# Output sub-directory name: 'rcpo_dfs' plus a suffix chosen by the grouping in use
_suffix_by_grouping = (
    (['outg_rec_nb', 'trsf_pole_nb'],    '_GRP_BY_OUTG_AND_XFMR'),
    ('trsf_pole_nb',                     '_GRP_BY_XFMR'),
    ('outg_rec_nb',                      '_GRP_BY_OUTG'),
    (['trsf_pole_nb', 'no_outg_rec_nb'], '_GRP_BY_NO_OUTG_AND_XFMR'),
)
_grouping_suffix = next(
    (suffix for grouping, suffix in _suffix_by_grouping if grp_by_col == grouping),
    None
)
save_subdir_pkls = 'rcpo_dfs'
if _grouping_suffix is None:
    # Unknown grouping
    assert(0)
else:
    save_subdir_pkls += _grouping_suffix
#-----
save_dir_base_pkls = os.path.join(data_base_dir, 'end_events_method', save_subdir_pkls)
_td_window_subdir  = f'outg_td_window_{days_min_outg_td_window}_to_{days_max_outg_td_window}_days'
save_dir_pkls      = os.path.join(save_dir_base_pkls, _td_window_subdir)
#-----
# Only create the directory when something will actually be written to it
if save_dfs_to_pkl:
    if not os.path.exists(save_dir_pkls):
        os.makedirs(save_dir_pkls)

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
# TODO NEED TO BE AUTOMATED
# Positions of the outage record number and transformer pole number within the rcpo_df index
outg_rec_nb_idx_lvl, trsf_pole_nbs_idx_lvl = 0, 1

# Location of the transformer pole numbers, given as ('index', <index level name>)
trsf_pole_nbs_loc = ('index', 'trsf_pole_nb')

# Keys handed to the rcpo_df <-> time_infos_df merge in later cells
rcpo_df_to_time_infos_on = [('index', 'outg_rec_nb')]
time_infos_to_rcpo_df_on = ['index']

# Keys handed to the rcpo_df <-> premise-numbers merge in later cells
rcpo_df_to_PNs_on = [('index', 'trsf_pole_nb')]
PNs_to_rcpo_df_on = ['index']

# Merge type passed along with the keys above
how = 'left'

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [None]:
# No meter-premise DataFrame (or column mapping for it) is supplied to the rcpo build below
mp_df = mp_df_cols = None

In [None]:
# if is_no_outage:
#     mp_df      = None
#     mp_df_cols = None
# else:
#     # mp_df_path = os.path.join(dovs_and_end_events_data_dir, r'df_mp_dupls_dropped.csv')
#     # mp_df = pd.read_csv(mp_df_path, dtype=str)
#     mp_df_path  = os.path.join(data_base_dir, r'df_mp_outg_full.pkl')
#     mp_df       = pd.read_pickle(mp_df_path)
#     merge_on_mp = ['mfr_devc_ser_nbr', 'prem_nb', 'OUTG_REC_NB']
#     mp_df_cols = dict(
#         serial_number_col = 'mfr_devc_ser_nbr', 
#         prem_nb_col       = 'prem_nb', 
#         trsf_pole_nb_col  = 'trsf_pole_nb', 
#         outg_rec_nb_col   = 'OUTG_REC_NB'
#     )
#     # Below ensures there is only one entry per 'meter' (meter here is defined by a unique grouping of merge_on_mp)
#     if any(mp_df.groupby(merge_on_mp).size()>1):
#         print('Resolving uniqueness violators')
#         mp_df = Utilities_df.resolve_uniqueness_violators(
#             df                      = mp_df, 
#             groupby_cols            = merge_on_mp, 
#             gpby_dropna             = False,
#             run_nan_groups_separate = True
#         )
#     assert(not any(mp_df.groupby(merge_on_mp).size()>1))

In [None]:
# Obtain rcpo_df_OG and ede_typeid_to_reason_df_OG: either rebuild them from the end-events
# CSV files (optionally pickling the results) or read back previously pickled copies.
_rcpo_df_OG_pkl_path                 = os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_OG.pkl')
_ede_typeid_to_reason_df_OG_pkl_path = os.path.join(save_dir_pkls, f'ede_typeid_to_reason{naming_tag}_df_OG.pkl')

if read_dfs_from_pkl:
    rcpo_df_OG                 = pd.read_pickle(_rcpo_df_OG_pkl_path)
    ede_typeid_to_reason_df_OG = pd.read_pickle(_ede_typeid_to_reason_df_OG_pkl_path)
else:
    start = time.time()
    # Full keyword set for the build; values come from the configuration cells above
    _build_rcpx_kwargs = {
        'files_dir'                      : files_dir,
        'mp_df'                          : mp_df,
        'file_path_glob'                 : file_path_glob,
        'file_path_regex'                : file_path_regex,
        'min_outg_td_window'             : datetime.timedelta(days=days_min_outg_td_window),
        'max_outg_td_window'             : datetime.timedelta(days=days_max_outg_td_window),
        'build_ede_typeid_to_reason_df'  : True,
        'batch_size'                     : batch_size,
        'cols_and_types_to_convert_dict' : None,
        'to_numeric_errors'              : 'coerce',
        'assert_all_cols_equal'          : assert_all_cols_equal,
        'include_normalize_by_nSNs'      : include_normalize_by_nSNs,
        'inclue_zero_counts'             : inclue_zero_counts,
        'return_multiindex_outg_reason'  : return_multiindex_outg_reason,
        'return_normalized_separately'   : return_normalized_separately,
        'verbose'                        : True,
        'n_update'                       : 1,
        'grp_by_cols'                    : grp_by_col,
        'outg_rec_nb_col'                : outg_rec_nb_col,
        'trsf_pole_nb_col'               : 'trsf_pole_nb',
        'addtnl_dropna_subset_cols'      : None,
        'is_no_outage'                   : is_no_outage,
        'prem_nb_col'                    : 'aep_premise_nb',
        'serial_number_col'              : 'serialnumber',
        'include_prem_nbs'               : True,
        'set_faulty_mp_vals_to_nan'      : False,
        'correct_faulty_mp_vals'         : False,
        'trust_sql_grouping'             : True,
        'drop_gpd_for_sql_appendix'      : True,
        'mp_df_cols'                     : mp_df_cols,
        'make_all_columns_lowercase'     : True,
    }
    rcpo_df_OG, ede_typeid_to_reason_df_OG = OutageMdlrPrep.build_rcpx_df_from_EndEvents_in_csvs(**_build_rcpx_kwargs)
    # Elapsed wall-clock seconds for the build
    print(time.time()-start)
    #-------------------------
    if save_dfs_to_pkl:
        rcpo_df_OG.to_pickle(_rcpo_df_OG_pkl_path)
        ede_typeid_to_reason_df_OG.to_pickle(_ede_typeid_to_reason_df_OG_pkl_path)

In [None]:
# #TODO DELETE ME!!!!!!!!!
# rcpo_df_OG = pd.read_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_OG.pkl'))
# ede_typeid_to_reason_df_OG = pd.read_pickle(os.path.join(save_dir_pkls, f'ede_typeid_to_reason{naming_tag}_df_OG.pkl'))

In [None]:
# Work on a copy so that rcpo_df_OG itself is left as built/loaded
rcpo_df = rcpo_df_OG.copy()
# Inverted form of ede_typeid_to_reason_df_OG, used by the icpo conversions further down
reason_to_ede_typeid_df = AMIEndEvents.invert_ede_typeid_to_reason_df(
    ede_typeid_to_reason_df_OG
)

In [None]:
#-------------------------
# NOTE: Need mp_df_curr and mp_df_hist separate for functionality, so one cannot simply use mp_df loaded earlier.
# NOTE: drop_approx_duplicates=False below. These will be dropped later
#-----
if grp_by_col != 'outg_rec_nb':
    # Any grouping other than plain 'outg_rec_nb': look meters up by transformer pole number
    _trsf_pole_nbs_in_rcpo = rcpo_df.index.get_level_values(trsf_pole_nbs_idx_lvl).tolist()
    mp_df_curr_hist = MeterPremise.build_mp_df_curr_hist_for_xfmrs(
        _trsf_pole_nbs_in_rcpo,
        drop_approx_duplicates=False
    )
else:
    # Plain 'outg_rec_nb' grouping: look meters up by outage record number
    _outg_rec_nbs_in_rcpo = rcpo_df.index.get_level_values(outg_rec_nb_idx_lvl).tolist()
    mp_df_curr_hist = DOVSOutages.build_mp_df_curr_hist_for_outgs(outg_rec_nbs=_outg_rec_nbs_in_rcpo)

#-------------------------
if is_no_outage:
    # Build no_outg_time_infos_df, which has prem_nbs indices and t_min, t_max (and possible summary_path) columns
    # This is where the time information for each premise number comes from
    paths = Utilities.find_all_paths(base_dir=files_dir, glob_pattern=file_path_glob)
    # One summary file per end-events CSV found above
    _summary_paths = [AMIEndEvents.find_summary_file_from_csv(csv_path) for csv_path in paths]

    no_outg_time_infos_df = MECPOAn.get_bsln_time_interval_infos_df_from_summary_files(
        summary_paths           = _summary_paths,
        output_prem_nbs_col     = 'prem_nbs',
        output_t_min_col        = 't_min',
        output_t_max_col        = 't_max',
        make_addtnl_groupby_idx = True,
        include_summary_paths   = True,
        date_only               = False,
        date_col                = 'aep_event_dt'
    )

In [None]:
df_mp_install_time_col = 'inst_ts'
df_mp_removal_time_col = 'rmvl_ts'

# Make sure all dates are datetime objects, not e.g., strings.
# The same check-and-convert is applied to the current and the historical frame in turn.
_mp_time_cols = [df_mp_install_time_col, df_mp_removal_time_col]
for _mp_key in ('mp_df_curr', 'mp_df_hist'):
    _both_already_datetime = all(
        is_datetime64_dtype(mp_df_curr_hist[_mp_key][time_col]) for time_col in _mp_time_cols
    )
    if _both_already_datetime:
        continue
    # If one isn't, chances are both are not (and no harm in converting both either way)
    mp_df_curr_hist[_mp_key] = Utilities_df.convert_col_types(
        df                  = mp_df_curr_hist[_mp_key],
        cols_and_types_dict = {time_col: datetime.datetime for time_col in _mp_time_cols}
    )

In [None]:
# Outage datasets only: attach outage information to rcpo_df, keyed on its index.
# NOTE(review): this rebinds rcpo_df from itself, so re-running the cell without first
#   re-running the cell that resets rcpo_df would append to an already-appended frame -- confirm.
if not is_no_outage:
    rcpo_df = DOVSOutages.append_outg_info_to_df(df=rcpo_df, outg_rec_nb_idfr='index')
    # The result is expected to carry two column levels ...
    assert rcpo_df.columns.nlevels == 2
    # ... with the appended outage columns grouped under this level-0 value
    outg_cols_lvl_0_val = 'outg_dummy_lvl_0'
    assert outg_cols_lvl_0_val in rcpo_df.columns.get_level_values(0)

In [None]:
#-------------------------
# I don't think I want to do the removal on current, only hist!
# This is because current is used for get_SNs_andor_PNs_for_xfmrs
# e.g., I was missing some PNs because maybe a new meter was installed after rcpo_df['DT_OFF_TS_FULL'].max()
#   So, in all likelihood that was an appropriate meter entry in historical, but this was excluded because
#   there wasn't an entry in current that passed the cuts below
#-----
# NOTE(review): outg_cols_lvl_0_val is only assigned when `not is_no_outage`, so this cell
#   raises NameError on a fresh kernel for a no-outage dataset -- confirm intended handling.
_mp_hist_df   = mp_df_curr_hist['mp_df_hist']
_outg_off_ts  = rcpo_df[(outg_cols_lvl_0_val, 'DT_OFF_TS_FULL')]
# Keep rows removed after the earliest off-time, or never removed ...
_removed_late_or_never = (_mp_hist_df['rmvl_ts'] > _outg_off_ts.min()) | _mp_hist_df['rmvl_ts'].isna()
# ... and installed no later than the latest off-time
_installed_in_time     = _mp_hist_df['inst_ts'] <= _outg_off_ts.max()
mp_df_curr_hist['mp_df_hist'] = _mp_hist_df[_removed_late_or_never & _installed_in_time]

In [None]:
# Column names used in the meter-premise frames
df_mp_serial_number_col = 'mfr_devc_ser_nbr'
df_mp_prem_nb_col       = 'prem_nb'
df_mp_install_time_col  = 'inst_ts'
df_mp_removal_time_col  = 'rmvl_ts'
df_mp_trsf_pole_nb_col  = 'trsf_pole_nb'
#-------------------------
# Default arguments for the approximate-duplicate removal, built for the column names above
_df_mp_col_names = dict(
    df_mp_serial_number_col = df_mp_serial_number_col,
    df_mp_prem_nb_col       = df_mp_prem_nb_col,
    df_mp_install_time_col  = df_mp_install_time_col,
    df_mp_removal_time_col  = df_mp_removal_time_col,
    df_mp_trsf_pole_nb_col  = df_mp_trsf_pole_nb_col,
)
dflt_args_drop_approx_mp_duplicates = MeterPremise.get_dflt_args_drop_approx_mp_duplicates(**_df_mp_col_names)
# No overrides are supplied (to_supplmnt_dict=None), only the defaults
drop_approx_duplicates_args = Utilities.supplement_dict_with_default_values(
    to_supplmnt_dict    = None,
    default_values_dict = dflt_args_drop_approx_mp_duplicates,
    extend_any_lists    = False,
    inplace             = True
)
#-----
# Same de-duplication for the historical frame first, then the current one
for _mp_key in ('mp_df_hist', 'mp_df_curr'):
    mp_df_curr_hist[_mp_key] = MeterPremise.drop_approx_mp_duplicates(
        mp_df = mp_df_curr_hist[_mp_key],
        **drop_approx_duplicates_args
    )

In [None]:
# Persist the filtered/de-duplicated meter-premise frames next to the other pickles
if save_dfs_to_pkl:
    for _mp_key, _pkl_suffix in (('mp_df_hist', 'hist'), ('mp_df_curr', 'curr')):
        _mp_pkl_path = os.path.join(save_dir_pkls, f'mp{naming_tag}_df_{_pkl_suffix}.pkl')
        mp_df_curr_hist[_mp_key].to_pickle(_mp_pkl_path)

In [None]:
# mp_df_curr_hist = {}
# mp_df_curr_hist['mp_df_hist'] = pd.read_pickle(os.path.join(save_dir_pkls, f'mp{naming_tag}_df_hist.pkl'))
# mp_df_curr_hist['mp_df_curr'] = pd.read_pickle(os.path.join(save_dir_pkls, f'mp{naming_tag}_df_curr.pkl'))

In [None]:
# Time information per outage, taken from rcpo_df while it still carries the appended outage columns
time_infos_df = OutageMdlrPrep.get_outg_time_infos_df(
    rcpo_df                       = rcpo_df,
    outg_rec_nb_idx_lvl           = outg_rec_nb_idx_lvl,
    times_relative_to_off_ts_only = True,
    td_for_left                   = None,
    td_for_right                  = None
)

# Strip every column that is not present in rcpo_df_OG (i.e. whatever was added since the copy was made)
og_cols        = rcpo_df_OG.columns
outg_info_cols = list(set(rcpo_df.columns) - set(og_cols))
rcpo_df        = rcpo_df.drop(columns=outg_info_cols)

In [None]:
# Outage-level pipeline. Runs only when grp_by_col is exactly the string 'outg_rec_nb'
# (so it is skipped when grp_by_col is a list). Builds the raw and normalized rcpo frames
# plus their icpo conversions and pickles them, or reloads them from pickle when
# read_dfs_from_pkl is True.
if grp_by_col=='outg_rec_nb':
    assert(not is_no_outage)
    if not read_dfs_from_pkl:
        start=time.time()
        #-------------------------
        # Raw counts: the 'counts' level-0 columns of the wide frame, with that level dropped
        rcpo_df_raw  = MECPODf.project_level_0_columns_from_rcpo_wide(
            rcpo_df_wide = rcpo_df, 
            level_0_val  = 'counts', 
            droplevel    = True
        )
        #-----
        rcpo_df_raw = MECPODf.add_outage_active_SNs_to_rcpo_df(
            rcpo_df                    = rcpo_df_raw, 
            set_outage_nSNs            = True, 
            include_outage_premise_nbs = True, 
            df_mp_curr                 = mp_df_curr_hist['mp_df_curr'], 
            df_mp_hist                 = mp_df_curr_hist['mp_df_hist']
        )
        #-----
        rcpo_df_raw = MECPODf.add_active_prim_SNs_to_rcpo_df(
            rcpo_df                             = rcpo_df_raw, 
            direct_SNs_in_outgs_df              = None, 
            outg_rec_nb_col                     = 'index', 
            prim_SNs_col                        = 'direct_serial_numbers', 
            set_prim_nSNs                       = True, 
            sort_SNs                            = True, 
            build_direct_SNs_in_outgs_df_kwargs = {}, 
            mp_df_curr                          = mp_df_curr_hist['mp_df_curr'], 
            mp_df_hist                          = mp_df_curr_hist['mp_df_hist']
        )
        #-------------------------
        # Normalized counts as delivered in the wide frame ('counts_norm' level-0 columns)
        rcpo_df_norm = MECPODf.project_level_0_columns_from_rcpo_wide(
            rcpo_df_wide = rcpo_df, 
            level_0_val  = 'counts_norm', 
            droplevel    = True
        )
        # Two further normalizations derived from the raw frame
        rcpo_df_norm_by_outg_nSNs = MECPODf.build_rcpo_df_norm_by_outg_active_nSNs(
            rcpo_df_raw, 
            df_mp_curr = mp_df_curr_hist['mp_df_curr'], 
            df_mp_hist = mp_df_curr_hist['mp_df_hist']
        )
        rcpo_df_norm_by_prim_nSNs = MECPODf.build_rcpo_df_norm_by_prim_active_nSNs(
            rcpo_df_raw                         = rcpo_df_raw, 
            direct_SNs_in_outgs_df              = None, 
            outg_rec_nb_col                     = 'index', 
            prim_nSNs_col                       = '_prim_nSNs', 
            prim_SNs_col                        = '_prim_SNs', 
            other_SNs_col_tags_to_ignore        = ['_SNs', '_nSNs', '_prem_nbs', '_nprem_nbs'], 
            drop_prim_nSNs_eq_0                 = True, 
            new_level_0_val                     = 'counts_norm_by_prim_nSNs', 
            remove_SNs_cols                     = False, 
            build_direct_SNs_in_outgs_df_kwargs = dict(equip_typ_nms_of_interest=xfmr_equip_typ_nms_of_interest), 
            df_mp_curr                          = mp_df_curr_hist['mp_df_curr'], 
            df_mp_hist                          = mp_df_curr_hist['mp_df_hist']
        )
        #-------------------------
        # Convert each rcpo frame to its icpo counterpart; the normalized ones also name their counts column
        icpo_df_raw = MECPODf.convert_rcpo_to_icpo_df(
            rcpo_df                 = rcpo_df_raw, 
            reason_to_ede_typeid_df = reason_to_ede_typeid_df, 
            is_norm                 = False
        )

        icpo_df_norm = MECPODf.convert_rcpo_to_icpo_df(
            rcpo_df                 = rcpo_df_norm, 
            reason_to_ede_typeid_df = reason_to_ede_typeid_df, 
            is_norm                 = True, 
            counts_col              = '_nSNs'
        )

        icpo_df_norm_by_outg_nSNs = MECPODf.convert_rcpo_to_icpo_df(
            rcpo_df                 = rcpo_df_norm_by_outg_nSNs, 
            reason_to_ede_typeid_df = reason_to_ede_typeid_df, 
            is_norm                 = True, 
            counts_col              = '_outg_nSNs'
        )

        icpo_df_norm_by_prim_nSNs = MECPODf.convert_rcpo_to_icpo_df(
            rcpo_df                 = rcpo_df_norm_by_prim_nSNs, 
            reason_to_ede_typeid_df = reason_to_ede_typeid_df, 
            is_norm                 = True, 
            counts_col              = '_prim_nSNs'
        )
        #-------------------------
        # Elapsed wall-clock seconds for the whole build
        print(time.time()-start)
        #-------------------------
        if save_dfs_to_pkl:
            #-------------------------
            rcpo_df_raw.to_pickle(              os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_raw.pkl'))
            rcpo_df_norm.to_pickle(             os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_norm.pkl'))
            rcpo_df_norm_by_outg_nSNs.to_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_norm_by_outg_nSNs.pkl'))
            rcpo_df_norm_by_prim_nSNs.to_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_norm_by_prim_nSNs.pkl'))
            #-----
            icpo_df_raw.to_pickle(              os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_raw.pkl'))
            icpo_df_norm.to_pickle(             os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_norm.pkl'))
            icpo_df_norm_by_outg_nSNs.to_pickle(os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_norm_by_outg_nSNs.pkl'))
            icpo_df_norm_by_prim_nSNs.to_pickle(os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_norm_by_prim_nSNs.pkl'))
            #-------------------------
    else:
        # Reload the same eight frames from the files written by the save branch above
        rcpo_df_raw               = pd.read_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_raw.pkl'))
        rcpo_df_norm              = pd.read_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_norm.pkl'))
        rcpo_df_norm_by_outg_nSNs = pd.read_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_norm_by_outg_nSNs.pkl'))
        rcpo_df_norm_by_prim_nSNs = pd.read_pickle(os.path.join(save_dir_pkls, f'rcpo{naming_tag}_df_norm_by_prim_nSNs.pkl'))
        #-----
        icpo_df_raw               = pd.read_pickle(os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_raw.pkl'))
        icpo_df_norm              = pd.read_pickle(os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_norm.pkl'))
        icpo_df_norm_by_outg_nSNs = pd.read_pickle(os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_norm_by_outg_nSNs.pkl'))
        icpo_df_norm_by_prim_nSNs = pd.read_pickle(os.path.join(save_dir_pkls, f'icpo{naming_tag}_df_norm_by_prim_nSNs.pkl'))

In [None]:
#TODO!!!!!
# if grp_by_col=='trsf_pole_nb':
# Transformer-level pipeline: builds (or reloads) the raw and normalized rcpo frames and
# their icpo conversions.
# NOTE(review): this cell is not guarded by grp_by_col, so it also runs for the plain
#   'outg_rec_nb' grouping and rebinds rcpo_df_raw / rcpo_df_norm / icpo_df_raw / icpo_df_norm
#   (and rewrites their pickles) produced by the outage-level cell -- confirm that is intended.
def _xfmr_pkl_path(prefix, suffix):
    # Pickle location for one output frame: <prefix><naming_tag>_df_<suffix>.pkl inside save_dir_pkls
    return os.path.join(save_dir_pkls, f'{prefix}{naming_tag}_df_{suffix}.pkl')

# File-name suffixes of the three frames kept per family (rcpo and icpo), in save/load order
_xfmr_pkl_suffixes = ('raw', 'norm', 'norm_by_xfmr_nSNs')

if read_dfs_from_pkl:
    rcpo_df_raw, rcpo_df_norm, rcpo_df_norm_by_xfmr_nSNs = [
        pd.read_pickle(_xfmr_pkl_path('rcpo', suffix)) for suffix in _xfmr_pkl_suffixes
    ]
    #-----
    icpo_df_raw, icpo_df_norm, icpo_df_norm_by_xfmr_nSNs = [
        pd.read_pickle(_xfmr_pkl_path('icpo', suffix)) for suffix in _xfmr_pkl_suffixes
    ]
else:
    start=time.time()
    #-------------------------
    # Arguments common to both OutageMdlrPrep calls below
    _xfmr_shared_kwargs = dict(
        trsf_pole_nbs_loc        = trsf_pole_nbs_loc,
        df_mp_curr               = mp_df_curr_hist['mp_df_curr'],
        df_mp_hist               = mp_df_curr_hist['mp_df_hist'],
        time_infos_df            = time_infos_df,
        rcpo_df_to_time_infos_on = rcpo_df_to_time_infos_on,
        time_infos_to_rcpo_df_on = time_infos_to_rcpo_df_on,
        how                      = how,
        rcpo_df_to_PNs_on        = rcpo_df_to_PNs_on,
        PNs_to_rcpo_df_on        = PNs_to_rcpo_df_on,
    )
    #-------------------------
    # Raw counts: the 'counts' level-0 columns, then the transformer SN/PN columns added on
    rcpo_df_raw = MECPODf.project_level_0_columns_from_rcpo_wide(
        rcpo_df_wide=rcpo_df, level_0_val='counts', droplevel=True
    )
    #-----
    rcpo_df_raw = OutageMdlrPrep.add_xfmr_active_SNs_to_rcpo_df(
        rcpo_df                                = rcpo_df_raw,
        set_xfmr_nSNs                          = True,
        include_active_xfmr_PNs                = True,
        addtnl_get_active_SNs_for_xfmrs_kwargs = dict(assert_all_trsf_pole_nbs_found=False),
        xfmr_SNs_col                           = '_xfmr_SNs',
        xfmr_nSNs_col                          = '_xfmr_nSNs',
        xfmr_PNs_col                           = '_xfmr_PNs',
        xfmr_nPNs_col                          = '_xfmr_nPNs',
        **_xfmr_shared_kwargs
    )
    #-------------------------
    # Normalized counts as delivered in the wide frame
    rcpo_df_norm = MECPODf.project_level_0_columns_from_rcpo_wide(
        rcpo_df_wide=rcpo_df, level_0_val='counts_norm', droplevel=True
    )
    #-----
    # The transformer-normalized build relies on the columns added to rcpo_df_raw above
    assert '_xfmr_nSNs' in rcpo_df_raw.columns
    assert '_xfmr_SNs' in rcpo_df_raw.columns
    rcpo_df_norm_by_xfmr_nSNs = OutageMdlrPrep.build_rcpo_df_norm_by_xfmr_active_nSNs(
        rcpo_df_raw                            = rcpo_df_raw,
        xfmr_nSNs_col                          = '_xfmr_nSNs',
        xfmr_SNs_col                           = '_xfmr_SNs',
        other_SNs_col_tags_to_ignore           = ['_SNs', '_nSNs', '_prem_nbs', '_nprem_nbs', '_xfmr_PNs', '_xfmr_nPNs'],
        drop_xfmr_nSNs_eq_0                    = True,
        new_level_0_val                        = 'counts_norm_by_xfmr_nSNs',
        remove_SNs_cols                        = False,
        addtnl_get_active_SNs_for_xfmrs_kwargs = dict(assert_all_trsf_pole_nbs_found=False),
        **_xfmr_shared_kwargs
    )
    #-------------------------
    # icpo counterparts; the normalized ones also name their counts column
    icpo_df_raw = MECPODf.convert_rcpo_to_icpo_df(
        rcpo_df=rcpo_df_raw, reason_to_ede_typeid_df=reason_to_ede_typeid_df, is_norm=False
    )
    icpo_df_norm = MECPODf.convert_rcpo_to_icpo_df(
        rcpo_df=rcpo_df_norm, reason_to_ede_typeid_df=reason_to_ede_typeid_df, is_norm=True, counts_col='_nSNs'
    )
    icpo_df_norm_by_xfmr_nSNs = MECPODf.convert_rcpo_to_icpo_df(
        rcpo_df=rcpo_df_norm_by_xfmr_nSNs, reason_to_ede_typeid_df=reason_to_ede_typeid_df, is_norm=True, counts_col='_xfmr_nSNs'
    )
    #-------------------------
    # Elapsed wall-clock seconds for the whole build
    print(time.time()-start)
    #-------------------------
    if save_dfs_to_pkl:
        _xfmr_frames_to_save = (
            ('rcpo', (rcpo_df_raw, rcpo_df_norm, rcpo_df_norm_by_xfmr_nSNs)),
            ('icpo', (icpo_df_raw, icpo_df_norm, icpo_df_norm_by_xfmr_nSNs)),
        )
        for _prefix, _frames in _xfmr_frames_to_save:
            for _suffix, _frame in zip(_xfmr_pkl_suffixes, _frames):
                _frame.to_pickle(_xfmr_pkl_path(_prefix, _suffix))

In [None]:
# Always raises AssertionError (unless Python runs with -O), halting a "Run All" at this point.
# NOTE(review): the cells below look like scratch/debugging work -- confirm before removing the stop.
assert(0)

In [None]:
# Inspection only: displays the current value of the save switch
save_dfs_to_pkl

In [None]:
# Inspection only: displays the directory the pickles are written to / read from
save_dir_pkls

In [None]:
# Scratch: rebinds rcpo_df_raw to a fresh projection of the 'counts' level-0 columns of
# rcpo_df (replacing the frame built earlier) and displays it.
rcpo_df_raw  = MECPODf.project_level_0_columns_from_rcpo_wide(rcpo_df, 'counts', droplevel=True)
rcpo_df_raw

In [None]:
# Scratch: same projection as above, then display only the rows whose index level 1 equals
# the hard-coded value below (presumably a single trsf_pole_nb under investigation -- confirm).
rcpo_df_raw  = MECPODf.project_level_0_columns_from_rcpo_wide(rcpo_df, 'counts', droplevel=True)
rcpo_df_raw[rcpo_df_raw.index.get_level_values(1)=='1871406714001']

In [None]:
# Scratch: look up the active SNs for the rows of rcpo_df_raw whose index level 1 equals the
# hard-coded value below, using the same merge settings as the main pipeline.
# NOTE(review): get_active_SNs_for_xfmrs_in_rcpo_df is called as a bare name but is neither
#   imported nor defined in the visible notebook; it presumably lives on one of the imported
#   project classes (e.g. OutageMdlrPrep) -- confirm and qualify the call, otherwise this
#   raises NameError on a fresh kernel.
addtnl_get_active_SNs_for_xfmrs_kwargs=dict(assert_all_trsf_pole_nbs_found=False)
get_active_SNs_for_xfmrs_kwargs = dict(
    rcpo_df=rcpo_df_raw[rcpo_df_raw.index.get_level_values(1)=='1871406714001'], 
    trsf_pole_nbs_loc=trsf_pole_nbs_loc, 
    df_mp_curr=mp_df_curr_hist['mp_df_curr'],
    df_mp_hist=mp_df_curr_hist['mp_df_hist'],  
    time_infos_df=time_infos_df, 
    rcpo_df_to_time_infos_on=rcpo_df_to_time_infos_on, 
    time_infos_to_rcpo_df_on=time_infos_to_rcpo_df_on, 
    how=how, 
    rcpo_df_to_PNs_on=rcpo_df_to_PNs_on, 
    PNs_to_rcpo_df_on=PNs_to_rcpo_df_on, 
    return_prem_nbs_col='_xfmr_PNs', 
    return_SNs_col='_xfmr_SNs'
)
# Additional kwargs override the base ones on key collision (later entries win in the merge)
if addtnl_get_active_SNs_for_xfmrs_kwargs is not None:
    get_active_SNs_for_xfmrs_kwargs = {**get_active_SNs_for_xfmrs_kwargs, 
                                       **addtnl_get_active_SNs_for_xfmrs_kwargs}
active_SNs_df = get_active_SNs_for_xfmrs_in_rcpo_df(**get_active_SNs_for_xfmrs_kwargs)
assert(isinstance(active_SNs_df, pd.DataFrame))

In [None]:
# Scratch: same lookup as the single-transformer cell, but over the full rcpo_df_raw.
# NOTE(review): get_active_SNs_for_xfmrs_in_rcpo_df is called as a bare name but is neither
#   imported nor defined in the visible notebook; it presumably lives on one of the imported
#   project classes (e.g. OutageMdlrPrep) -- confirm and qualify the call, otherwise this
#   raises NameError on a fresh kernel.
addtnl_get_active_SNs_for_xfmrs_kwargs=dict(assert_all_trsf_pole_nbs_found=False)
get_active_SNs_for_xfmrs_kwargs = dict(
    rcpo_df=rcpo_df_raw, 
    trsf_pole_nbs_loc=trsf_pole_nbs_loc, 
    df_mp_curr=mp_df_curr_hist['mp_df_curr'],
    df_mp_hist=mp_df_curr_hist['mp_df_hist'],  
    time_infos_df=time_infos_df, 
    rcpo_df_to_time_infos_on=rcpo_df_to_time_infos_on, 
    time_infos_to_rcpo_df_on=time_infos_to_rcpo_df_on, 
    how=how, 
    rcpo_df_to_PNs_on=rcpo_df_to_PNs_on, 
    PNs_to_rcpo_df_on=PNs_to_rcpo_df_on, 
    return_prem_nbs_col='_xfmr_PNs', 
    return_SNs_col='_xfmr_SNs'
)
# Additional kwargs override the base ones on key collision (later entries win in the merge)
if addtnl_get_active_SNs_for_xfmrs_kwargs is not None:
    get_active_SNs_for_xfmrs_kwargs = {**get_active_SNs_for_xfmrs_kwargs, 
                                       **addtnl_get_active_SNs_for_xfmrs_kwargs}
active_SNs_df = get_active_SNs_for_xfmrs_in_rcpo_df(**get_active_SNs_for_xfmrs_kwargs)
assert(isinstance(active_SNs_df, pd.DataFrame))

In [None]:
# Scratch: rebuild rcpo_df_raw from a copy of the original frame (rcpo_df_OG) rather than from rcpo_df
rcpo_df_raw  = MECPODf.project_level_0_columns_from_rcpo_wide(rcpo_df_OG.copy(), 'counts', droplevel=True)

In [None]:
# Scratch: stage notebook-level variables under what look like a function's parameter names,
# so that the function-body code in the following cells can be run line by line
# (presumably the body of the active-SNs lookup used above -- confirm).
# NOTE: the first line rebinds the notebook-level rcpo_df to a copy of rcpo_df_raw.
rcpo_df=rcpo_df_raw.copy()
# The `x=x` lines below are no-ops apart from raising NameError if the name is undefined
trsf_pole_nbs_loc=trsf_pole_nbs_loc
df_mp_curr=mp_df_curr_hist['mp_df_curr']
df_mp_hist=mp_df_curr_hist['mp_df_hist']
time_infos_df=time_infos_df
rcpo_df_to_time_infos_on=rcpo_df_to_time_infos_on
time_infos_to_rcpo_df_on=time_infos_to_rcpo_df_on
how=how
rcpo_df_to_PNs_on=rcpo_df_to_PNs_on
PNs_to_rcpo_df_on=PNs_to_rcpo_df_on

addtnl_mp_df_curr_cols=None
addtnl_mp_df_hist_cols=None

# Names of the output columns for premise numbers and serial numbers
return_prem_nbs_col='_xfmr_PNs'
return_SNs_col='_xfmr_SNs'

assert_all_trsf_pole_nbs_found=False
# Column names in the meter-premise frames and in time_infos_df
df_mp_serial_number_col='mfr_devc_ser_nbr'
df_mp_prem_nb_col='prem_nb'
df_mp_install_time_col='inst_ts'
df_mp_removal_time_col='rmvl_ts'
df_mp_trsf_pole_nb_col='trsf_pole_nb'
t_min_col='t_min'
t_max_col='t_max'

In [None]:
#--------------------------------------------------
# Reduce time_infos_df to the two time columns needed below
assert(t_min_col in time_infos_df.columns and 
       t_max_col in time_infos_df.columns)
# .copy() so the temporary column added below is written to an independent frame, not to a slice
#   of the original (avoids SettingWithCopyWarning and any accidental write-through)
time_infos_df = time_infos_df[[t_min_col, t_max_col]].copy()
#-----
# Remove any duplicates from time_infos_df
# The index is temporarily included as a column so rows are only considered duplicates when the
#   index AND the time values agree
tmp_col = Utilities.generate_random_string()
time_infos_df[tmp_col] = time_infos_df.index
time_infos_df = time_infos_df.drop_duplicates()
time_infos_df = time_infos_df.drop(columns=[tmp_col])
#--------------------------------------------------
# trsf_pole_nbs_loc can be a string or tuple/list
#   'index'                -> level 0
#   'index_{lvl}'          -> level lvl
#   ('index', level_name)  -> level with the given name
# First, find trsf_pole_nbs_idx_lvl, then trsf_pole_nbs
assert(Utilities.is_object_one_of_types(trsf_pole_nbs_loc, [str, list, tuple]))
if isinstance(trsf_pole_nbs_loc, str):
    assert(trsf_pole_nbs_loc.startswith('index'))
    if trsf_pole_nbs_loc=='index':
        trsf_pole_nbs_idx_lvl = 0
    else:
        # Raw string (no invalid-escape warning) and \d+ so an empty level, e.g. 'index_', fails
        #   the assertion below instead of reaching int('')
        trsf_pole_nbs_idx_lvl = re.findall(r'index_(\d+)', trsf_pole_nbs_loc)
        assert(len(trsf_pole_nbs_idx_lvl)==1)
        trsf_pole_nbs_idx_lvl=trsf_pole_nbs_idx_lvl[0]
        trsf_pole_nbs_idx_lvl=int(trsf_pole_nbs_idx_lvl)
else:
    assert(len(trsf_pole_nbs_loc)==2)
    assert(trsf_pole_nbs_loc[0]=='index')
    assert(trsf_pole_nbs_loc[1] in rcpo_df.index.names)
    trsf_pole_nbs_idx_lvl = rcpo_df.index.names.index(trsf_pole_nbs_loc[1])
#---------------
# These two lines must run for BOTH locator forms; previously they sat inside the else-branch, leaving
#   trsf_pole_nbs undefined (or stale from an earlier run) whenever trsf_pole_nbs_loc was a string
assert(trsf_pole_nbs_idx_lvl < rcpo_df.index.nlevels)
trsf_pole_nbs = rcpo_df.index.get_level_values(trsf_pole_nbs_idx_lvl).tolist()
#--------------------------------------------------
#-------------------------
# Columns every meter premise DataFrame must contain
necessary_mp_cols = [df_mp_serial_number_col, df_mp_prem_nb_col, df_mp_install_time_col, df_mp_removal_time_col]

In [None]:
#-------------------------
# Column sanity checks:
#   df_mp_curr needs necessary_mp_cols (['mfr_devc_ser_nbr', 'prem_nb', 'inst_ts', 'rmvl_ts'] by default)
#     plus the transformer pole number column
#   df_mp_hist needs only necessary_mp_cols
assert all(col in df_mp_curr.columns for col in necessary_mp_cols+[df_mp_trsf_pole_nb_col])
assert all(col in df_mp_hist.columns for col in necessary_mp_cols)
#-------------------------
# PNs_for_xfmrs: indexed by trsf_pole_nbs, with each element being the list of PNs for that xfmr
get_PNs_for_xfmrs_kwargs = {
    'trsf_pole_nbs': trsf_pole_nbs,
    'include_SNs': False,
    'include_PNs': True,
    'trsf_pole_nb_col': df_mp_trsf_pole_nb_col,
    'serial_number_col': df_mp_serial_number_col,
    'prem_nb_col': df_mp_prem_nb_col,
    'return_SNs_col': None,  # SNs are not requested
    'return_PNs_col': return_prem_nbs_col,
    'assert_all_trsf_pole_nbs_found': assert_all_trsf_pole_nbs_found,
    'mp_df': None,  # NOTE: no meter premise DataFrame is supplied here
    'return_mp_df_also': False,
}
PNs_for_xfmrs = MeterPremise.get_SNs_andor_PNs_for_xfmrs(**get_PNs_for_xfmrs_kwargs)

# OG

In [None]:
# Spot check: is this (hard-coded) transformer pole number among those pulled from rcpo_df?
'41840850B40133' in trsf_pole_nbs

In [None]:
# Spot check: entry of PNs_for_xfmrs for the same hard-coded transformer pole number
PNs_for_xfmrs.loc['41840850B40133']

In [None]:
# Start again from a fresh copy so the merges in the following cell are not applied twice on re-run
rcpo_df=rcpo_df_raw.copy()

In [None]:
#-------------------------
# Attach the time information to rcpo_df first ...
rcpo_df_w_time_infos = merge_rcpo_and_df(
    rcpo_df=rcpo_df,
    df_2=time_infos_df,
    rcpo_df_on=rcpo_df_to_time_infos_on,
    df_2_on=time_infos_to_rcpo_df_on,
    how=how
)
#-----
# ... then attach the premise numbers found for each transformer
rcpo_df = merge_rcpo_and_df(
    rcpo_df=rcpo_df_w_time_infos,
    df_2=PNs_for_xfmrs,
    rcpo_df_on=rcpo_df_to_PNs_on,
    df_2_on=PNs_to_rcpo_df_on,
    how=how
)

In [None]:
# Select a single (hard-coded) index entry of rcpo_df to step through by hand.
# Alternative entries are kept commented out for quick switching.
# idx_i = ('12070597', '1860360762758')
# idx_i = ('12048233', '1839488704233')
# idx_i = ('12334688', '41840850B40133')
idx_i = ('12061939', '41830829A30166')
row_i = rcpo_df.loc[idx_i]

In [None]:
# row_i

In [None]:
# Every index level of rcpo_df must be named, as the names are used as column names further down
rcpo_idx_names = [*rcpo_df.index.names]
assert all(idx_name is not None for idx_name in rcpo_idx_names)

In [None]:
# OG logic for a single row of rcpo_df (idx_i, row_i)
# active_SNs_df_i will have indices equal to premise numbers and value equal to lists
#   of active SNs for each PN
# Purpose of making idx_names_w_vals a list of tuples, instead of a dict, is to ensure the correct order is maintained
#   Dicts usually return the correct order, but this is not guaranteed
if len(rcpo_idx_names)==1:
    assert(rcpo_df.index.nlevels==1)
    idx_names_w_vals = [(rcpo_idx_names[0], idx_i)]
else:
    # The level holding the transformer pole numbers is renamed to df_mp_trsf_pole_nb_col
    idx_names_w_vals = [((rcpo_idx_names[i] if i!=trsf_pole_nbs_idx_lvl else df_mp_trsf_pole_nb_col), idx_i[i]) 
                        for i in range(len(idx_i))]
PNs_i=row_i[return_prem_nbs_col]
dt_0_i=row_i[t_min_col]
dt_1_i=row_i[t_max_col]
#-----
# See NOTEs above regarding t_min/t_max being empty
# In such a case, it is simply impossible (with the summary files currently generated) to access
#   the date over which the data would have been run, if any events existed.
#   In future versions, this information will be included in the summary files!
# I don't want to completely exclude these (by e.g., setting dt_0_i=pd.Timestamp.min and 
#   dt_1_i=pd.Timestamp.max), so I will simply include the meters which are active TODAY.
# This obviously is not correct, but this occurrence is rare (only happening when every single meter
#   on a transformer had no events during the time period) and this crude approximation will be fine.
if Utilities.is_object_one_of_types(dt_0_i, [list, np.ndarray]):
    assert(len(dt_0_i)==0)
    # I believe if this happens for one it should happen for both...
    assert(Utilities.is_object_one_of_types(dt_1_i, [list, np.ndarray]) and len(dt_1_i)==0)
    dt_0_i=pd.Timestamp.today()
if Utilities.is_object_one_of_types(dt_1_i, [list, np.ndarray]):
    assert(len(dt_1_i)==0)
    # I believe if this happens for one it should happen for both...
    # But, dt_0_i changed already above, so must check row_i[t_min_col] instead!
    assert(Utilities.is_object_one_of_types(row_i[t_min_col], [list, np.ndarray]) and len(row_i[t_min_col])==0)
    dt_1_i=pd.Timestamp.today()
# No premise numbers for this row (NaN or empty list) ==> nothing to look up
if((not isinstance(PNs_i, list) and pd.isna(PNs_i)) or 
   len(PNs_i)==0):
    active_SNs_df_i = pd.DataFrame()
else:
    active_SNs_df_i = MeterPremise.get_active_SNs_for_PNs_at_datetime_interval(
        PNs=PNs_i,
        df_mp_curr=df_mp_curr, 
        df_mp_hist=df_mp_hist, 
        dt_0=dt_0_i,
        dt_1=dt_1_i,
        output_index=None,
        output_groupby=[df_mp_prem_nb_col], 
        include_prems_wo_active_SNs_when_groupby=True, 
        assert_all_PNs_found=False
    )
    active_SNs_df_i=active_SNs_df_i.reset_index()
if active_SNs_df_i.shape[0]==0:
    # Nothing found: build a single placeholder row (NaN premise number, empty list of SNs)
    # NOTE: Statement order matters here.  The frame has no rows until the one-element list is
    #       assigned, which is what creates the single row.
    active_SNs_df_i[df_mp_prem_nb_col] = np.nan
    active_SNs_df_i[df_mp_serial_number_col] = [[]] 
    for name,val in idx_names_w_vals:
        active_SNs_df_i[name] = val
    active_SNs_df_i = active_SNs_df_i.set_index([x[0] for x in idx_names_w_vals])
else:
    # Tag every row with the rcpo_df index values, flatten the per-premise SN lists, then collect
    #   all SNs and PNs into lists grouped by those index values
    for name,val in idx_names_w_vals:
        active_SNs_df_i[name] = val
    active_SNs_df_i = active_SNs_df_i.explode(df_mp_serial_number_col)
    active_SNs_df_i = Utilities_df.consolidate_df(
        df=active_SNs_df_i, 
        groupby_cols=[x[0] for x in idx_names_w_vals], 
        cols_shared_by_group=None, 
        cols_to_collect_in_lists=[df_mp_serial_number_col, df_mp_prem_nb_col], 
        include_groupby_cols_in_output_cols=False, 
        allow_duplicates_in_lists=False, 
        recover_uniqueness_violators=True, 
        rename_cols=None, 
        verbose=False
    )

In [None]:
# Display the OG result for the selected row
active_SNs_df_i

In [None]:
# Serial numbers collected in the first row of the OG result
active_SNs_df_i.iloc[0]['mfr_devc_ser_nbr']

In [None]:
# Premise numbers collected in the first row of the OG result
active_SNs_df_i.iloc[0]['prem_nb']

In [None]:
# For comparison: premise numbers listed in PNs_for_xfmrs for the (hard-coded) transformer of idx_i
PNs_for_xfmrs.loc['41830829A30166']['_xfmr_PNs']

# NEW

In [None]:
# NEW approach: start again from a fresh copy of the raw frame
rcpo_df=rcpo_df_raw.copy()
# Maps each rcpo_df index level (by position) to the mp_df column it is matched against
rcpo_idxs_to_mp_df={0:'OUTG_REC_NB', 1:'trsf_pole_nb'}

In [None]:
#-------------------------
# Join together rcpo_df and time_infos_df
# NOTE: Unlike the OG cells, PNs_for_xfmrs is NOT merged in here
# NOTE: rcpo_df is overwritten, so re-running this cell without re-running the previous one
#       merges a second time
rcpo_df = merge_rcpo_and_df(
    rcpo_df=rcpo_df, 
    df_2=time_infos_df, 
    rcpo_df_on=rcpo_df_to_time_infos_on,
    df_2_on=time_infos_to_rcpo_df_on, 
    how=how
)

In [None]:
# Every index level of rcpo_df must be named
rcpo_idx_names = [*rcpo_df.index.names]
assert all(idx_name is not None for idx_name in rcpo_idx_names)

In [None]:
# Select the same (hard-coded) index entry as used for the OG walk-through.
# Alternative entries are kept commented out for quick switching.
# idx_i = ('12070597', '1860360762758')
# idx_i = ('12048233', '1839488704233')
# idx_i = ('12334688', '41840850B40133')
idx_i = ('12061939', '41830829A30166')
row_i = rcpo_df.loc[idx_i]

In [None]:
# Time interval for the selected row
dt_0_i=row_i[t_min_col]
dt_1_i=row_i[t_max_col]

In [None]:
# The install/removal columns of mp_df must hold datetime objects (not e.g., strings)
if not (is_datetime64_dtype(mp_df[df_mp_install_time_col]) and 
        is_datetime64_dtype(mp_df[df_mp_removal_time_col])):
    # When one column needs converting the other most likely does too, and converting an
    #   already-correct column does no harm
    mp_df = Utilities_df.convert_col_types(
        df=mp_df, 
        cols_and_types_dict=dict.fromkeys(
            [df_mp_install_time_col, df_mp_removal_time_col], 
            datetime.datetime
        )
    )

In [None]:
# Build one single-slicer per rcpo_df index level, matching the mp_df column mapped to that level
#   against the corresponding value of idx_i
mp_df_slicer = DFSlicer()
if len(rcpo_idx_names)==1:
    # Single-level index: idx_i is the value itself, not a tuple
    assert(rcpo_df.index.nlevels==1)
    idx_names_w_vals = [(rcpo_idx_names[0], idx_i)]
    mp_df_slicer.add_single_slicer({'column': rcpo_idxs_to_mp_df[0], 'value': idx_i})
else:
    # The level holding the transformer pole numbers goes by df_mp_trsf_pole_nb_col
    idx_names_w_vals = [
        (df_mp_trsf_pole_nb_col if i==trsf_pole_nbs_idx_lvl else rcpo_idx_names[i], idx_i[i]) 
        for i in range(len(idx_i))
    ]
    assert(len(rcpo_idx_names)==len(idx_i))
    for i_idx in range(len(rcpo_idx_names)):
        mp_df_slicer.add_single_slicer({'column': rcpo_idxs_to_mp_df[i_idx], 'value': idx_i[i_idx]})

In [None]:
# Inspect the first slicer (column and value it matches)
print(mp_df_slicer.single_slicers[0].column)
print(mp_df_slicer.single_slicers[0].value)

In [None]:
# Inspect the second slicer (column and value it matches)
print(mp_df_slicer.single_slicers[1].column)
print(mp_df_slicer.single_slicers[1].value)

In [None]:
# Apply all slicers to mp_df to get the meter premise entries for the selected row, and display them
mp_df_i = mp_df_slicer.perform_slicing(mp_df)
mp_df_i

In [None]:
# Cross-check: all mp_df entries for the (hard-coded) transformer, regardless of outage
print(mp_df[mp_df['trsf_pole_nb']=='41830829A30166'].shape)
mp_df[mp_df['trsf_pole_nb']=='41830829A30166']

In [None]:
# NOTE(review): dovs_outgs is assigned in a later cell (the DOVSOutages(...) construction); unless it
#   is also defined earlier in the notebook, this display only works with out-of-order execution
dovs_outgs

In [None]:
# Build a DOVSOutages object for two hard-coded outage record numbers, requesting premise information
# The DataFrame is constructed by running a SQL query built with DOVSOutages_SQL.build_sql_std_outage
# NOTE(review): 'contstruct_df_args' looks misspelled; presumably it matches the DOVSOutages signature -- confirm
dovs_outgs = DOVSOutages(                 
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True, 
    build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
    build_sql_function_kwargs=dict(
        outg_rec_nbs=['12061939', '12415998'], 
        include_premise=True
    )
)

In [None]:
# Unique premise numbers recorded for outage 12061939
dovs_outgs.df[dovs_outgs.df['OUTG_REC_NB']=='12061939']['PREMISE_NB'].unique().tolist()

In [None]:
# Unique premise numbers recorded for outage 12415998
dovs_outgs.df[dovs_outgs.df['OUTG_REC_NB']=='12415998']['PREMISE_NB'].unique().tolist()

In [None]:
# Number of mp_df entries per outage for the (hard-coded) transformer
mp_df[mp_df['trsf_pole_nb']=='41830829A30166']['OUTG_REC_NB'].value_counts()

In [None]:
# Keep only meters active over the whole interval: installed on/before dt_0_i and removed after
#   dt_1_i (a missing removal time is treated as never removed)
installed_by_dt_0 = mp_df_i[df_mp_install_time_col] <= dt_0_i
removed_after_dt_1 = mp_df_i[df_mp_removal_time_col].fillna(pd.Timestamp.max) > dt_1_i
mp_df_i = mp_df_i[installed_by_dt_0 & removed_after_dt_1]
mp_df_i

In [None]:
# Compare NEW (mp_df_i) against OG (first row of active_SNs_df_i)
# Each printed value is the number of entries found by only one of the two methods (0 ==> agreement)
print(len(set(mp_df_i['mfr_devc_ser_nbr'].tolist()).symmetric_difference(set(active_SNs_df_i.iloc[0]['mfr_devc_ser_nbr']))))
print(len(set(mp_df_i['prem_nb'].tolist()).symmetric_difference(set(active_SNs_df_i.iloc[0]['prem_nb']))))

In [None]:
# Unique active serial numbers and premise numbers for the selected row (NEW approach)
SNs_i = mp_df_i[df_mp_serial_number_col].unique().tolist()
PNs_i = mp_df_i[df_mp_prem_nb_col].unique().tolist()

In [None]:
# Display the active serial numbers found with the NEW approach
SNs_i

In [None]:
# Display the active premise numbers found with the NEW approach
PNs_i

In [None]:
# All mp_df entries for one (hard-coded) serial number under investigation
mp_df[mp_df['mfr_devc_ser_nbr']=='883040556']

In [None]:
# Build the meter premise data directly from the transformer pole numbers in rcpo_df_OG's index
# The result is indexed below with the keys 'mp_df_curr' and 'mp_df_hist'
mp_df_curr_hist_BUILT = MeterPremise.build_mp_df_curr_hist_for_xfmrs(rcpo_df_OG.index.get_level_values(trsf_pole_nbs_idx_lvl).tolist())

In [None]:
# Entries for the (hard-coded) serial number in the freshly built 'mp_df_curr'
mp_df_curr_hist_BUILT['mp_df_curr'][mp_df_curr_hist_BUILT['mp_df_curr']['mfr_devc_ser_nbr']=='883040556']

In [None]:
# Entries for the (hard-coded) serial number in the freshly built 'mp_df_hist'
mp_df_curr_hist_BUILT['mp_df_hist'][mp_df_curr_hist_BUILT['mp_df_hist']['mfr_devc_ser_nbr']=='883040556']

In [None]:
# Build a one-entry MultiIndex from idx_i carrying the level names of rcpo_df
# NOTE: The bare pd.MultiIndex(...) constructor expects levels AND codes, so passing the index tuple
#       directly raises; from_tuples is the constructor intended for this use
pd.MultiIndex.from_tuples([idx_i], names=rcpo_idx_names)

In [None]:
# idx_i as a list of its per-level values
list(idx_i)

In [None]:
# One-entry MultiIndex built from idx_i (level names not set)
pd.MultiIndex.from_tuples([idx_i])

In [None]:
# Experiment: empty one-row DataFrame indexed by idx_i (the column name is a throwaway placeholder)
pd.DataFrame(index=pd.MultiIndex.from_tuples([idx_i]), columns=['Fuck'])

In [None]:
# Display the index level names of rcpo_df
rcpo_idx_names

In [None]:
# Display the (name, value) pairs built for idx_i
idx_names_w_vals

In [None]:
# (name, value) pairs for the index entry being processed, kept as a list of tuples so the
#   order of the levels is preserved
if len(rcpo_idx_names)!=1:
    # The level holding the transformer pole numbers goes by df_mp_trsf_pole_nb_col
    idx_names_w_vals = [
        (df_mp_trsf_pole_nb_col if i==trsf_pole_nbs_idx_lvl else rcpo_idx_names[i], idx_i[i]) 
        for i in range(len(idx_i))
    ]
else:
    assert(rcpo_df.index.nlevels==1)
    idx_names_w_vals = [(rcpo_idx_names[0], idx_i)]
# NOTE(review): row_i must contain return_prem_nbs_col, i.e., it must come from an rcpo_df which
#   has been merged with PNs_for_xfmrs
PNs_i = row_i[return_prem_nbs_col]
dt_0_i = row_i[t_min_col]
dt_1_i = row_i[t_max_col]
#-----
# t_min/t_max may be empty (an empty list/array) when every meter on the transformer had no
#   events during the time period.  With the summary files currently generated, the dates over
#   which the data would have been run cannot be recovered in that case (future versions of the
#   summary files will include this information).
# Rather than excluding such rows completely (e.g., by using pd.Timestamp.min and pd.Timestamp.max),
#   the meters active TODAY are used.  This is not correct, but the occurrence is rare and the
#   crude approximation is acceptable.
t_min_is_listlike = Utilities.is_object_one_of_types(dt_0_i, [list, np.ndarray])
t_max_is_listlike = Utilities.is_object_one_of_types(dt_1_i, [list, np.ndarray])
if t_min_is_listlike or t_max_is_listlike:
    # If this happens for one, it is expected to happen for both, and both must be empty
    assert(t_min_is_listlike and t_max_is_listlike)
    assert(len(dt_0_i)==0 and len(dt_1_i)==0)
    dt_0_i = pd.Timestamp.today()
    dt_1_i = pd.Timestamp.today()

In [None]:
# Only query for active SNs when the row actually has premise numbers (not NaN, not an empty list)
if (isinstance(PNs_i, list) or not pd.isna(PNs_i)) and len(PNs_i)!=0:
    active_SNs_df_i = MeterPremise.get_active_SNs_for_PNs_at_datetime_interval(
        PNs=PNs_i,
        df_mp_curr=df_mp_curr,
        df_mp_hist=df_mp_hist,
        dt_0=dt_0_i,
        dt_1=dt_1_i,
        output_index=None,
        output_groupby=[df_mp_prem_nb_col],
        include_prems_wo_active_SNs_when_groupby=True,
        assert_all_PNs_found=False,
        drop_approx_duplicates=True,
        # Not currently used:
        #   drop_approx_duplicates_args=dict(addtnl_groupby_cols=['OUTG_REC_NB'])
    ).reset_index()
else:
    active_SNs_df_i = pd.DataFrame()

In [None]:
# Display the result obtained with drop_approx_duplicates=True
active_SNs_df_i

In [None]:
# Display the index of rcpo_df
rcpo_df.index

In [None]:
# Experiment: type returned by .loc for a full (hard-coded) index tuple on an empty frame indexed
#   like rcpo_df (the column names are throwaway placeholders)
type(pd.DataFrame(index=rcpo_df.index, columns=['FUCK', 'BALLS']).loc[('12048233', '1839488704233')])

In [None]:
# Replaces get_active_SNs_for_xfmrs_OLD, but should probably build get_active_SNs_for_xfmrs
#  which accepts a list of trsf_pole_nbs instead of rcpo_df, which this function can use
def get_active_SNs_for_xfmrs_in_rcpo_df_NEW(
    rcpo_df, 
    trsf_pole_nbs_loc, 
    df_mp, 
    time_infos_df, 
    rcpo_idxs_to_df_mp={0:'OUTG_REC_NB', 1:'trsf_pole_nb'}, 
    rcpo_df_to_time_infos_on = [('index', 'outg_rec_nb')], 
    time_infos_to_rcpo_df_on = ['index'], 
    how='left', 
    return_SNs_col='SNs', 
    return_prem_nbs_col='prem_nbs', 
    assert_all_trsf_pole_nbs_found=True, 
    df_mp_serial_number_col='mfr_devc_ser_nbr', 
    df_mp_prem_nb_col='prem_nb', 
    df_mp_install_time_col='inst_ts', 
    df_mp_removal_time_col='rmvl_ts', 
    df_mp_trsf_pole_nb_col='trsf_pole_nb', 
    t_min_col='t_min', 
    t_max_col='t_max'
):
    r"""
    For each index entry of rcpo_df, find the serial numbers (SNs) and premise numbers (PNs) of the
    meters in df_mp which were active over the time interval [t_min, t_max] associated with the entry.

    Unlike the version using separate current/historical meter premise DataFrames, a single df_mp is
    used here, which must already contain one column for each index level of rcpo_df (see
    rcpo_idxs_to_df_mp).

    A meter is taken to be active when it was installed on or before t_min AND it was removed
    after t_max (a missing removal time is treated as never removed).

    rcpo_df:
        DataFrame whose index identifies the entries to process.  All index levels must be named and
        the index must not contain duplicates.

    trsf_pole_nbs_loc:
        Directs where the transformer pole numbers are located
        This should identify an index (w/ level)
        Set equal to 'index' for normal DFs, or when trsf_pole_nbs are in level 0 of index.
        For a DF with MultiIndex index, there are two options:
            i.  Set equal to f'index_{idx_level}' for a DF with MutliIndex index, where idx_level
                is an int identifying the level in which the trsf_pole_nbs reside
            ii. Set equal to the tuple ('index', trsf_pole_nbs_idx_name), where trsf_pole_nbs_idx_name is
            the name of the index level in which the trsf_pole_nbs reside.
        NOTE: In this version the value is only validated; the matching against df_mp is driven by
              rcpo_idxs_to_df_mp.

    df_mp:
        Meter premise DataFrame.  Must contain df_mp_serial_number_col, df_mp_prem_nb_col,
        df_mp_install_time_col, df_mp_removal_time_col and df_mp_trsf_pole_nb_col.
        The install/removal columns are converted to datetime if they are not already.

    time_infos_df:
        DataFrame containing t_min_col and t_max_col, merged with rcpo_df via merge_rcpo_and_df using
        rcpo_df_to_time_infos_on, time_infos_to_rcpo_df_on and how.

    rcpo_idxs_to_df_mp:
        Dict mapping each index level of rcpo_df (by position) to the df_mp column whose value must
        match that level's value.

    return_SNs_col/return_prem_nbs_col:
        Names of the columns in the returned DataFrame.

    assert_all_trsf_pole_nbs_found:
        Currently unused; kept so existing calls continue to work.

    Returns:
        pd.DataFrame with the index of rcpo_df (after merging with time_infos_df) and the columns
        [return_SNs_col, return_prem_nbs_col], each element being a list.  Entries for which nothing
        is found (or only NaN values are found) are empty lists.
    """
    #--------------------------------------------------
    # At a bare minimum, df_mp must have the following columns (checked up front so a missing column
    #   gives a clear failure instead of a KeyError further down)
    necessary_mp_cols = [df_mp_serial_number_col, df_mp_prem_nb_col, df_mp_install_time_col, df_mp_removal_time_col]
    assert(all([x in df_mp.columns for x in necessary_mp_cols+[df_mp_trsf_pole_nb_col]]))
    #--------------------------------------------------
    # Make sure df_mp_install_time_col and df_mp_removal_time_col are datetime objects, not e.g., strings
    if(not is_datetime64_dtype(df_mp[df_mp_install_time_col]) or 
       not is_datetime64_dtype(df_mp[df_mp_removal_time_col])):
        # If one isn't, chances are both are not (and no harm in converting both either way)
        df_mp = Utilities_df.convert_col_types(
            df=df_mp, 
            cols_and_types_dict={
                df_mp_install_time_col:datetime.datetime, 
                df_mp_removal_time_col:datetime.datetime
            }
        )
    #--------------------------------------------------
    assert(t_min_col in time_infos_df.columns and 
           t_max_col in time_infos_df.columns)
    # .copy() so the temporary column below is written to an independent frame and not to a slice of
    #   the caller's DataFrame (avoids SettingWithCopyWarning)
    time_infos_df = time_infos_df[[t_min_col, t_max_col]].copy()
    #-----
    # Remove any duplicates from time_infos_df
    # The index is temporarily included as a column so rows are only considered duplicates when the
    #   index AND the time values agree
    tmp_col = Utilities.generate_random_string()
    time_infos_df[tmp_col] = time_infos_df.index
    time_infos_df = time_infos_df.drop_duplicates()
    time_infos_df = time_infos_df.drop(columns=[tmp_col])
    #--------------------------------------------------
    # trsf_pole_nbs_loc can be a string or tuple/list
    # The parsed level is not needed below, but the input is still validated as before
    assert(Utilities.is_object_one_of_types(trsf_pole_nbs_loc, [str, list, tuple]))
    if isinstance(trsf_pole_nbs_loc, str):
        assert(trsf_pole_nbs_loc.startswith('index'))
        if trsf_pole_nbs_loc!='index':
            # Raw string (no invalid-escape warning) and \d+ so an empty level, e.g. 'index_',
            #   fails the assertion instead of reaching int('')
            trsf_pole_nbs_idx_lvl = re.findall(r'index_(\d+)', trsf_pole_nbs_loc)
            assert(len(trsf_pole_nbs_idx_lvl)==1)
            trsf_pole_nbs_idx_lvl = int(trsf_pole_nbs_idx_lvl[0])
    else:
        assert(len(trsf_pole_nbs_loc)==2)
        assert(trsf_pole_nbs_loc[0]=='index')
        assert(trsf_pole_nbs_loc[1] in rcpo_df.index.names)
    #--------------------------------------------------
    # Join together rcpo_df and time_infos_df
    rcpo_df = merge_rcpo_and_df(
        rcpo_df=rcpo_df, 
        df_2=time_infos_df, 
        rcpo_df_on=rcpo_df_to_time_infos_on,
        df_2_on=time_infos_to_rcpo_df_on, 
        how=how
    )
    #--------------------------------------------------
    rcpo_idx_names = list(rcpo_df.index.names)
    assert(not any([x is None for x in rcpo_idx_names]))
    # Each index entry must receive exactly one result
    assert(rcpo_df.index.is_unique)
    n_idx_lvls = len(rcpo_idx_names)
    #-----
    # Fallback time used when t_min/t_max are empty (see below); evaluated once so that every
    #   affected row uses the same instant
    today = pd.Timestamp.today()
    #-----
    def _empty_if_all_nan(vals):
        # [] and [NaN, ...] both become []; any list containing a real value is returned untouched
        return vals if any(not pd.isna(x) for x in vals) else []
    #-----
    # Results are collected in plain lists and the output frame is built once at the end, instead of
    #   assigning list values cell-by-cell through .loc
    SNs_coll = []
    PNs_coll = []
    for idx_i, row_i in rcpo_df.iterrows():
        dt_0_i=row_i[t_min_col]
        dt_1_i=row_i[t_max_col]
        #-----
        # t_min/t_max may be empty (an empty list/array).
        # In such a case, it is simply impossible (with the summary files currently generated) to access
        #   the date over which the data would have been run, if any events existed.
        #   In future versions, this information will be included in the summary files!
        # I don't want to completely exclude these (by e.g., setting dt_0_i=pd.Timestamp.min and 
        #   dt_1_i=pd.Timestamp.max), so I will simply include the meters which are active TODAY.
        # This obviously is not correct, but this occurrence is rare (only happening when every single meter
        #   on a transformer had no events during the time period) and this crude approximation will be fine.
        t_min_is_listlike = Utilities.is_object_one_of_types(dt_0_i, [list, np.ndarray])
        t_max_is_listlike = Utilities.is_object_one_of_types(dt_1_i, [list, np.ndarray])
        if t_min_is_listlike or t_max_is_listlike:
            # If this happens for one it should happen for both, and both must be empty
            assert(t_min_is_listlike and t_max_is_listlike)
            assert(len(dt_0_i)==0 and len(dt_1_i)==0)
            dt_0_i = today
            dt_1_i = today
        #-------------------------
        # Select the entries of df_mp matching every index level of the current row
        df_mp_slicer = DFSlicer()
        if n_idx_lvls==1:
            # Single-level index: idx_i is the value itself, not a tuple
            df_mp_slicer.add_single_slicer(dict(column=rcpo_idxs_to_df_mp[0], value=idx_i))
        else:
            assert(len(idx_i)==n_idx_lvls)
            for i_lvl, idx_val in enumerate(idx_i):
                df_mp_slicer.add_single_slicer(dict(column=rcpo_idxs_to_df_mp[i_lvl], value=idx_val))
        df_mp_i = df_mp_slicer.perform_slicing(df_mp)
        #-----
        # Keep only meters active over the whole interval
        df_mp_i = df_mp_i[(df_mp_i[df_mp_install_time_col] <= dt_0_i) & 
                          (df_mp_i[df_mp_removal_time_col].fillna(pd.Timestamp.max) > dt_1_i)]
        #-------------------------
        SNs_coll.append(_empty_if_all_nan(df_mp_i[df_mp_serial_number_col].unique().tolist()))
        PNs_coll.append(_empty_if_all_nan(df_mp_i[df_mp_prem_nb_col].unique().tolist()))
    #-------------------------
    # dtype=object keeps each list as a single element of its column
    active_SNs_df = pd.DataFrame(
        {
            return_SNs_col:      pd.Series(SNs_coll, index=rcpo_df.index, dtype=object), 
            return_prem_nbs_col: pd.Series(PNs_coll, index=rcpo_df.index, dtype=object)
        }, 
        columns=[return_SNs_col, return_prem_nbs_col]
    )
    #-------------------------
    return active_SNs_df

In [None]:
rcpo_df_raw = MECPODf.project_level_0_columns_from_rcpo_wide(rcpo_df_OG.copy(), 'counts', droplevel=True)

# Split mp_df in two: entries with a transformer pole number stand in for 'mp_df_curr', and those
#   without one stand in for 'mp_df_hist'
has_trsf_pole_nb = mp_df['trsf_pole_nb'].notna()
mp_df_curr_hist = {
    'mp_df_curr': mp_df[has_trsf_pole_nb].copy(),
    'mp_df_hist': mp_df[~has_trsf_pole_nb].copy(),
}

# Together, the two pieces must account for every row of mp_df
assert(len(mp_df_curr_hist['mp_df_curr'])+len(mp_df_curr_hist['mp_df_hist'])==len(mp_df))

In [None]:
# Time the OG implementation
start = time.time()
# Optional overrides which are layered on top of the base keyword arguments built below
addtnl_get_active_SNs_for_xfmrs_kwargs = {'assert_all_trsf_pole_nbs_found': False}
get_active_SNs_for_xfmrs_kwargs = {
    'rcpo_df': rcpo_df_raw,
    'trsf_pole_nbs_loc': trsf_pole_nbs_loc,
    'df_mp_curr': mp_df_curr_hist['mp_df_curr'],
    'df_mp_hist': mp_df_curr_hist['mp_df_hist'],
    'time_infos_df': time_infos_df,
    'rcpo_df_to_time_infos_on': rcpo_df_to_time_infos_on,
    'time_infos_to_rcpo_df_on': time_infos_to_rcpo_df_on,
    'how': how,
    'rcpo_df_to_PNs_on': rcpo_df_to_PNs_on,
    'PNs_to_rcpo_df_on': PNs_to_rcpo_df_on,
    'return_prem_nbs_col': '_xfmr_PNs',
    'return_SNs_col': '_xfmr_SNs',
}
if addtnl_get_active_SNs_for_xfmrs_kwargs is not None:
    # On key collisions the additional kwargs take precedence
    get_active_SNs_for_xfmrs_kwargs = dict(
        get_active_SNs_for_xfmrs_kwargs,
        **addtnl_get_active_SNs_for_xfmrs_kwargs
    )
active_SNs_df = get_active_SNs_for_xfmrs_in_rcpo_df(**get_active_SNs_for_xfmrs_kwargs)
assert isinstance(active_SNs_df, pd.DataFrame)
# Elapsed wall-clock time in seconds
print(time.time()-start)

In [None]:
# Time the NEW implementation
start = time.time()
# Optional overrides which are layered on top of the base keyword arguments built below
addtnl_get_active_SNs_for_xfmrs_kwargs = {'assert_all_trsf_pole_nbs_found': False}
# NOTE: rcpo_df_to_PNs_on and PNs_to_rcpo_df_on are not passed to the NEW function
get_active_SNs_for_xfmrs_kwargs = {
    'rcpo_df': rcpo_df_raw,
    'trsf_pole_nbs_loc': trsf_pole_nbs_loc,
    'df_mp': mp_df,
    'time_infos_df': time_infos_df,
    'rcpo_df_to_time_infos_on': rcpo_df_to_time_infos_on,
    'time_infos_to_rcpo_df_on': time_infos_to_rcpo_df_on,
    'how': how,
    'return_prem_nbs_col': '_xfmr_PNs',
    'return_SNs_col': '_xfmr_SNs',
}
if addtnl_get_active_SNs_for_xfmrs_kwargs is not None:
    # On key collisions the additional kwargs take precedence
    get_active_SNs_for_xfmrs_kwargs = dict(
        get_active_SNs_for_xfmrs_kwargs,
        **addtnl_get_active_SNs_for_xfmrs_kwargs
    )
active_SNs_df_NEW = get_active_SNs_for_xfmrs_in_rcpo_df_NEW(**get_active_SNs_for_xfmrs_kwargs)
assert isinstance(active_SNs_df_NEW, pd.DataFrame)
# Elapsed wall-clock time in seconds
print(time.time()-start)

In [None]:
# Display the OG result
active_SNs_df

In [None]:
# Display the NEW result
active_SNs_df_NEW

In [None]:
# First comparison of OG vs NEW, before the lists and the index are sorted in the cells below
active_SNs_df.equals(active_SNs_df_NEW)

In [None]:
# Sort the list held in each element so the OG/NEW comparison does not depend on list order
for df_i in (active_SNs_df, active_SNs_df_NEW):
    for col_i in ('_xfmr_SNs', '_xfmr_PNs'):
        df_i[col_i] = df_i[col_i].apply(sorted)

In [None]:
# Compare again, now that the lists in each element are sorted
active_SNs_df.equals(active_SNs_df_NEW)

In [None]:
# Put both results in the same row order before comparing
active_SNs_df=active_SNs_df.sort_index()
active_SNs_df_NEW=active_SNs_df_NEW.sort_index()

In [None]:
# Compare again, now that both the lists and the indices are sorted
active_SNs_df.equals(active_SNs_df_NEW)

In [None]:
# Utilities_df.get_dfs_diff(active_SNs_df, active_SNs_df_NEW)

In [None]:
# Display the OG result after sorting
active_SNs_df

In [None]:
# Display the NEW result after sorting
active_SNs_df_NEW

In [None]:
# Do the two results have identical indices (element-wise, in the same order)?
all(active_SNs_df.index==active_SNs_df_NEW.index)

In [None]:
# Walk through the entries one-by-one, printing both SN lists, and stop at the first disagreement
for idx in active_SNs_df.index:
    print(idx)
    SNs_are_equal = active_SNs_df.loc[idx]['_xfmr_SNs']==active_SNs_df_NEW.loc[idx]['_xfmr_SNs']
    print(SNs_are_equal)
    print(active_SNs_df.loc[idx]['_xfmr_SNs'])
    print(active_SNs_df_NEW.loc[idx]['_xfmr_SNs'])
    print()
    assert(active_SNs_df.loc[idx]['_xfmr_SNs']==active_SNs_df_NEW.loc[idx]['_xfmr_SNs'])

In [None]:
# All mp_df entries for a (hard-coded) set of serial numbers under investigation
mp_df[mp_df['mfr_devc_ser_nbr'].isin(['881479764', '883040489', '883040491', '883040492', '883040505', '883040506', '883040507', '883040508', '883040553', '883040554', '883040555', '883040556'])]