In [None]:
from importlib import reload
#reload(Utilities)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

import sys, os
import re
import string

from pathlib import Path
import json
import pickle

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version
import copy
from functools import reduce

import itertools

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
import matplotlib.colors as mcolors
import matplotlib.cm as cm #e.g. for cmap=cm.jet
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
from MeterPremise import MeterPremise
from EEMSP import EEMSP
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from MECPODf import MECPODf
from MECPOAn import MECPOAn
from MECPOCollection import MECPOCollection
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_General
import Plot_Box_sns
import Plot_Hist
import Plot_Bar
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import preprocessing

import tensorflow as tf
from tensorflow import keras

import scipy

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
def build_full_data_dfs_v2(full_data_df):
    r"""
    Stack the per-time-period column groups of full_data_df on top of one another.

    full_data_df:
        DataFrame whose level-0 column values are time period labels containing 'NN-MM Days'
          (two-digit day bounds), together with the group 'is_outg'.

    For each time period, the columns belonging to that period plus 'is_outg' are extracted, 
      level 0 of the columns is dropped, and a column 'events_period' equal to the midpoint 
      0.5*(NN+MM) of the period is appended.
    The per-period DFs (which must all have identical column labels) are then concatenated 
      row-wise, and 'events_period' and 'is_outg' are handed to 
      Utilities_df.move_cols_to_either_end with to_front=False.

    NOTE: Building the final DF with this method (calling get_merged_cpo_dfs first) causes each index
            in the final DF to be repeated the same number of times.
          If it had been built straight from the individual DFs themselves (without calling 
            get_merged_cpo_dfs), this would not be the case, and in general the indices would have 
            different numbers of repetitions.
            e.g., if a group only has events in the 01-05 Days period, that group's index would only 
                  occur once in the final DF, whereas if a group has events in all 6 periods, that 
                  group's index would be repeated 6 times.
    """
    #-------------------------
    period_pattern = r'(\d{2})-(\d{2}) Days'
    period_labels = [
        lvl_0_val 
        for lvl_0_val in full_data_df.columns.get_level_values(0).unique() 
        if lvl_0_val!='is_outg'
    ]
    #-------------------------
    period_dfs = []
    for period_label in period_labels:
        # Exactly one 'NN-MM Days' match, made up of exactly two captured bounds, is expected
        matches = re.findall(period_pattern, period_label)
        assert(len(matches)==1)
        bounds = matches[0]
        assert(len(bounds)==2)
        period_midpoint = 0.5*(float(bounds[0])+float(bounds[1]))
        #-----
        period_df = full_data_df[[period_label, 'is_outg']].copy().droplevel(0, axis=1)
        period_df['events_period'] = period_midpoint
        period_dfs.append(period_df)
    #-------------------------
    # Neighboring per-period DFs must carry exactly the same column labels before stacking
    for prev_df, curr_df in zip(period_dfs, period_dfs[1:]):
        assert(len(set(prev_df).symmetric_difference(curr_df))==0)
    stacked_df = pd.concat(period_dfs)
    stacked_df = Utilities_df.move_cols_to_either_end(stacked_df, ['events_period' ,'is_outg'], to_front=False)
    #-------------------------
    return stacked_df

In [None]:
class DumbClassifier(BaseEstimator):
    r"""
    A dumb baseline classifier which ignores its input and always predicts the 
    negative class (False, i.e., 0).
    """
    def fit(self, X, y=None):
        # Nothing to learn
        return self

    def predict(self, X):
        # One False prediction per sample, returned with shape (len(X), 1)
        n_samples = len(X)
        return np.zeros(shape=(n_samples, 1), dtype=bool)
    

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, x_min=None, x_max=None):
    r"""
    Plot precision (blue dashed) and recall (green solid) as functions of the decision threshold 
    using the pyplot state interface, then set the x-limits to (x_min, x_max) and draw a legend.

    The final element of precisions and of recalls is dropped before plotting, i.e., both are 
    expected to be one element longer than thresholds.
    """
    curves = (
        (precisions, 'b--', 'Precision'), 
        (recalls,    'g-',  'Recall'), 
    )
    for values, fmt, label in curves:
        plt.plot(thresholds, values[:-1], fmt, label=label)
    plt.xlim(x_min, x_max)
    plt.legend()
    
    
def plot_roc_curve(fpr, tpr, label=None):
    r"""
    Plot tpr versus fpr (linewidth=2, optionally labelled) using the pyplot state interface, 
    together with the dashed black diagonal running from (0,0) to (1,1).
    """
    diagonal = [0,1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(diagonal, diagonal, 'k--') # Dashed diagonal

In [None]:
def get_optimal_n_reasons_per_plot(n_reasons_per_plot_init, n_reasons):
    r"""
    Find a good n_reasons_per_plot value given an approximate initial value of n_reasons_per_plot.
    -----
    Given n_reasons_per_plot_init and n_reasons, n_plots can be found.
    In the simple method, one would then simply put n_reasons_per_plot_init in the first n_plots-1 plots, and 
      whatever remains in the final plot.  This could lead to a very small number of reasons in the final plot.
    This function takes the calculated n_plots and adjusts n_reasons_per_plot_init to avoid this asymmetry in the final plot.
    
    e.g., suppose n_reasons_per_plot_init=20 and n_reasons=66
      In this case, n_plots = 4
      Simple method:
        First 3 plots have 20 reasons each, final plot has 6 reasons.
      This method:
        First 3 plots have 17 reasons each, final plot has 15 reasons.

    n_reasons_per_plot_init:
        Approximate (maximum) number of reasons per plot.  Must be > 0.
    n_reasons:
        Total number of reasons to be distributed over the plots.
        If n_reasons<=0 there is nothing to distribute and n_reasons_per_plot_init is returned unchanged.

    Returns the adjusted (integer) number of reasons per plot, which is never larger than n_reasons_per_plot_init.
    """
    #-------------------------
    assert(n_reasons_per_plot_init>0)
    # Nothing to distribute; also avoids dividing by n_plots=0 below
    if n_reasons<=0:
        return n_reasons_per_plot_init
    #-------------------------
    # NOTE: The initial value supplied by the caller must be used here (not a like-named global)
    n_plots = np.ceil(n_reasons/n_reasons_per_plot_init).astype(int)
    n_reasons_per_plot_opt = np.ceil(n_reasons/n_plots).astype(int)
    return n_reasons_per_plot_opt

In [None]:
def get_X_entries_by_binary_confusion_matrix_result(X, y_actl, y_pred):
    r"""
    Split the rows of the feature collection X according to the cell of the binary confusion 
    matrix they fall into.

    X:
        Feature vectors; must support boolean-mask selection (X[mask]).
    y_actl, y_pred:
        Actual and predicted (binary) y-values, compared element-wise against 1 and 0.
        Positive (==1) means outage, negative (==0) means no outage.

    Returns a dict with keys:
        X_tp: are outages,     predicted as outages
        X_tn: are not outages, predicted as not outages
        X_fp: are not outages, predicted as outages
        X_fn: are outages,     predicted as not outages
    """
    #-------------------------
    actl_pos = (y_actl==1)
    actl_neg = (y_actl==0)
    pred_pos = (y_pred==1)
    pred_neg = (y_pred==0)
    #-------------------------
    masks = dict(
        X_tp = actl_pos & pred_pos, 
        X_tn = actl_neg & pred_neg, 
        X_fp = actl_neg & pred_pos, 
        X_fn = actl_pos & pred_neg
    )
    #-------------------------
    return {key:X[mask] for key,mask in masks.items()}

In [None]:
def convert_X_by_confusion_results_to_dfs(
    X_by_confusion_result, 
    full_data_df, 
    run_PCA, 
    run_scaler, 
    pca=None
):
    r"""
    X_tp, X_tn, X_fp, X_fn should be pd.DataFrames unless run_PCA==True or run_scaler==True.
    If not DFs, convert them.

    X_by_confusion_result:
        Dict with exactly the keys 'X_tp', 'X_tn', 'X_fp', 'X_fn'.
        All four values must be of the same type and have the same number of columns.
    full_data_df:
        Only used when run_PCA==False (and the Xs are not DFs).  The column labels of the resulting 
          DFs are taken to be full_data_df.columns[:-1] (i.e., all columns except the last).
    run_PCA:
        If True (and the Xs are not DFs), columns are named pca_comp_0, pca_comp_1, ...
    run_scaler:
        Whether a scaler was run.  Only used to check that non-DF input is expected.
    pca:
        Object exposing n_components_ (e.g., a fitted sklearn PCA).  Only needed when run_PCA==True 
          and the Xs are not DFs.
        For backward compatibility, if pca is None a module-level (notebook-level) variable named 
          pca is used when one exists; a NameError is raised when neither is available.

    Returns a new dict with keys 'X_tp', 'X_tn', 'X_fp', 'X_fn' whose values are pd.DataFrames.
    """
    #-------------------------
    expected_keys = ['X_tp', 'X_tn', 'X_fp', 'X_fn']
    assert(len(set(X_by_confusion_result.keys()).symmetric_difference(set(expected_keys)))==0)
    Xs = {key:X_by_confusion_result[key] for key in expected_keys}
    X_tp = Xs['X_tp']
    #-------------------------
    assert(all(type(X_i)==type(X_tp) for X_i in Xs.values()))
    assert(all(X_i.shape[1]==X_tp.shape[1] for X_i in Xs.values()))
    n_cols = X_tp.shape[1]
    # X_tp, X_tn, X_fp, X_fn should be pd.DataFrames unless run_PCA==True or run_scaler==True
    if not isinstance(X_tp, pd.DataFrame):
        assert(run_PCA or run_scaler)
        if run_PCA:
            if pca is None:
                # Previously this function silently relied on a global named pca; keep that working
                pca = globals().get('pca', None)
            if pca is None:
                raise NameError(
                    "run_PCA==True but no PCA object was supplied via the pca argument "
                    "and no global named 'pca' is defined"
                )
            assert(n_cols==pca.n_components_)
            df_cols = [f'pca_comp_{i}' for i in range(pca.n_components_)]
        else:
            assert(n_cols==len(full_data_df.columns[:-1]))
            df_cols = full_data_df.columns[:-1]
        #----------
        Xs = {key:pd.DataFrame(X_i, columns=df_cols) for key,X_i in Xs.items()}
    #-------------------------
    return Xs

In [None]:
def draw_X_by_binary_confusion_result(
    fig_num, 
    X_tp_w_args=None, 
    X_fn_w_args=None, 
    X_tn_w_args=None, 
    X_fp_w_args=None, 
    n_reasons_per_plot=20, 
    n_reasons_total_to_plot=None, 
    reason_order=None, 
    n_x=2, 
    include_xtick_labels_legend=False, 
    optimize_n_reasons_per_plot=True
):
    r"""
    Draw side-by-side bar plots of the reasons (columns) of the true-positive, false-negative, 
    true-negative, and false-positive DFs, split over multiple subplots.

    X_tp_w_args, X_fn_w_args, X_tn_w_args, X_fp_w_args:
        Each may be None, a pd.DataFrame, or a list/tuple of length 2: (pd.DataFrame, dict of plot arguments).
        At least one must not be None.
        All supplied DFs must have the same number of columns.
        The plot arguments are supplemented with per-category defaults (label, colors, hatching) via
          Utilities.supplement_dict_with_default_values.  The caller's list/tuple and dict are not modified.
    n_reasons_per_plot:
        Approximate number of reasons per subplot (see optimize_n_reasons_per_plot).
    n_reasons_total_to_plot:
        Total number of reasons to plot.  If None, the number of columns of the first supplied DF is used.
    reason_order:
        Order in which reasons are plotted.  If None, the column order of the first supplied DF is used.
    n_x:
        Number of subplot columns.
    include_xtick_labels_legend:
        If True, a text box mapping the integer x-tick labels to the reasons is added to the figure.
    optimize_n_reasons_per_plot:
        If True (default, which reproduces the previous behavior), n_reasons_per_plot is adjusted by 
          get_optimal_n_reasons_per_plot so the final subplot is not left with only a few reasons.

    Returns fig, axs
    """
    #-------------------------
    # At least one must not be None!
    assert(
        X_tp_w_args is not None or
        X_fn_w_args is not None or
        X_tn_w_args is not None or
        X_fp_w_args is not None
    )
    #-------------------------
    dflt_X_tp_args = dict(label='X_tp', color='green')
    dflt_X_fn_args = dict(label='X_fn', edgecolor='green', fill=False, hatch='//')
    dflt_X_tn_args = dict(label='X_tn', color='red')
    dflt_X_fp_args = dict(label='X_fp', edgecolor='red', fill=False, hatch='//')
    #-------------------------
    Xs_w_args =    [     X_tp_w_args,    X_fn_w_args,    X_tn_w_args,    X_fp_w_args]
    dflt_Xs_args = [dflt_X_tp_args, dflt_X_fn_args, dflt_X_tn_args, dflt_X_fp_args]
    #-------------------------
    dfs_w_args = []
    for X_w_args, dflt_args in zip(Xs_w_args, dflt_Xs_args):
        if X_w_args is None:
            continue
        assert(Utilities.is_object_one_of_types(X_w_args, [pd.DataFrame, list, tuple]))
        if isinstance(X_w_args, pd.DataFrame):
            X_w_args = [X_w_args, dict()]
        assert(len(X_w_args)==2 and 
               isinstance(X_w_args[0], pd.DataFrame) and
               isinstance(X_w_args[1], dict)
              )
        # Work on a copy of the plot arguments and build a fresh [df, args] pair.
        #   - Item assignment on the input would fail for tuples (which are explicitly allowed above)
        #   - The caller's list/dict should not be altered as a side effect of plotting
        plot_args = Utilities.supplement_dict_with_default_values(
            to_supplmnt_dict=dict(X_w_args[1]), 
            default_values_dict=dflt_args, 
            inplace=True
        )
        dfs_w_args.append([X_w_args[0], plot_args])
    #-------------------------
    # Make sure all DFs have same number of reasons (i.e., all have same number of columns)
    for i in range(1, len(dfs_w_args)):
        assert(dfs_w_args[i-1][0].shape[1]==dfs_w_args[i][0].shape[1])
    #-----
    if n_reasons_total_to_plot is None:
        n_reasons=dfs_w_args[0][0].shape[1]
    else:
        n_reasons=n_reasons_total_to_plot
    #-----
    # NOTE: Previously the condition here tested the function object get_optimal_n_reasons_per_plot itself, 
    #       which is always truthy.  The explicit flag keeps that default while allowing it to be disabled.
    if optimize_n_reasons_per_plot:
        n_reasons_per_plot = get_optimal_n_reasons_per_plot(n_reasons_per_plot, n_reasons)
    n_y = np.ceil(n_reasons/(n_reasons_per_plot*n_x)).astype(int)
    #-------------------------
    if reason_order is None:
        reason_order = dfs_w_args[0][0].columns.tolist()
    reason_idxs = Utilities.get_batch_idx_pairs(n_reasons, n_reasons_per_plot)
    fig, axs = Plot_General.default_subplots(n_x=n_x, n_y=n_y, fig_num=fig_num, return_flattened_axes=True)
    #-----
    # NOTE: If n_x > 1, then len(axs) can be greater than len(reason_idxs)
    #       When this occurs, the final row of plots will have one or more empty axes
    assert(len(axs)>=len(reason_idxs))
    for i_plot in range(len(reason_idxs)):
        idx_i_0 = reason_idxs[i_plot][0]
        idx_i_1 = reason_idxs[i_plot][1]
        #-----
        Plot_Bar.plot_multiple_barplots(
            ax=axs[i_plot], 
            dfs_w_args=dfs_w_args, 
            order=reason_order[idx_i_0:idx_i_1], 
            draw_side_by_side=True, 
            replace_xtick_labels_with_ints=True, 
            xtick_ints_offset=idx_i_0, 
            tick_args=[dict(axis='x', labelrotation=90, labelsize=15), 
                       dict(axis='y', labelsize=15)], 
            draw_legend=True
        )
    #--------------------------------------------------
    if include_xtick_labels_legend:
        # Integer x-tick labels start at 1 and follow reason_order
        xtick_elements = reason_order[:n_reasons]
        xtick_rename_dict = {xtick_el:i+1 for i,xtick_el in enumerate(xtick_elements)}
        subplot_layout_params = Plot_General.get_subplot_layout_params(fig)
        xtick_labels_legend_textbox_kwargs = dict(
            fig=fig, 
            xtick_rename_dict=xtick_rename_dict, 
            text_x_pos=1.02*subplot_layout_params['right'], 
            text_y_pos=subplot_layout_params['top'], 
            n_chars_per_line=30, 
            multi_line_offset=None, 
            new_org_separator=': ', 
            fontsize=10, 
            ha='left', 
            va='top',
            n_lines_between_entries=1, 
            n_cols=2, 
            col_padding = 0.01
        )
        Plot_General.generate_xtick_labels_legend_textbox(
            **xtick_labels_legend_textbox_kwargs
        )  
    #--------------------------------------------------
    return fig,axs

# ---------------------------------------------------------------------------------------------------

In [None]:
def get_stat_sig_reasons(outg_df, no_outg_df, p_val_lim=0.05, equal_var=False):
    r"""
    Returns the list of columns (taken from outg_df.columns, in order) for which an independent 
    two-sample t-test (scipy.stats.ttest_ind) between outg_df and no_outg_df yields a p-value 
    below p_val_lim.

    equal_var is forwarded to scipy.stats.ttest_ind.
    """
    #-------------------------
    def _p_value(reason):
        # p-value of the t-test comparing this column between the two DFs
        return scipy.stats.ttest_ind(outg_df[reason], no_outg_df[reason], equal_var=equal_var).pvalue
    #-------------------------
    return [reason for reason in outg_df.columns if _p_value(reason) < p_val_lim]


def project_stat_sig_reasons(outg_df, no_outg_df, p_val_lim=0.05, equal_var=False):
    r"""
    Restrict both DFs to the columns found by get_stat_sig_reasons, i.e., those which are 
    statistically different between outg_df and no_outg_df.

    Returns the pair (outg_df projection, no_outg_df projection).
    """
    #-------------------------
    ttest_kwargs = dict(p_val_lim=p_val_lim, equal_var=equal_var)
    keep_cols = get_stat_sig_reasons(outg_df=outg_df, no_outg_df=no_outg_df, **ttest_kwargs)
    #-------------------------
    outg_df_proj = outg_df[keep_cols]
    no_outg_df_proj = no_outg_df[keep_cols]
    return outg_df_proj, no_outg_df_proj

# ---------------------------------------------------------------------------------------------------

In [None]:
def build_train_test_by_date(
    df_outage, 
    df_no_outage, 
    outg_rec_nb_idfr, 
    train_dates,
    test_dates=None, 
    y_col = ('is_outg', 'is_outg'), 
    random_state = None
):
    r"""
    Build X_train, X_test, y_train, y_test where the outage data are split by outage date.

    For now, the df_outage train/test split is determined by date (DT_OFF_TS of the associated outage, 
      queried through DOVSOutages), and df_no_outage is split randomly using the same test fraction 
      as was found for df_outage.
    This is mainly because getting the time info of the no outage case is a hassle, and all no outage 
      data are currently from 2021 (I believe)

    train_dates:
        [date_0, date_1]; outages with date_0 <= DT_OFF_TS < date_1 form the training set.
    test_dates:
        [date_0, date_1]; outages with date_0 <= DT_OFF_TS < date_1 form the testing set.
        If None, the test window runs from train_dates[1] through the latest DT_OFF_TS found (inclusive).
    y_col:
        Column holding the target; all other columns are returned as features.
    random_state:
        Passed to train_test_split and to the row shuffling (DataFrame.sample).
    """
    #--------------------------------------------------
    # One outg_rec_nb per row of df_outage (needed for the row selection below)
    outg_rec_nbs = DOVSOutages.get_outg_rec_nbs_from_df(df=df_outage, idfr=outg_rec_nb_idfr)
    assert(len(df_outage)==len(outg_rec_nbs)) # Important in ensuring proper selection below
    #--------------------------------------------------
    dovs_outgs = DOVSOutages(
        df_construct_type=DFConstructType.kRunSqlQuery, 
        contstruct_df_args=None, 
        init_df_in_constructor=True,
        build_sql_function=DOVSOutages_SQL.build_sql_outage, 
        build_sql_function_kwargs=dict(
            outg_rec_nbs = outg_rec_nbs.unique().tolist(), 
            field_to_split = 'outg_rec_nbs', 
            cols_of_interest=['OUTG_REC_NB', 'DT_OFF_TS']
        ), 
        build_consolidated=False
    )
    dovs_outgs_df = dovs_outgs.get_df()
    #--------------------------------------------------
    dt_off = dovs_outgs_df['DT_OFF_TS']
    if test_dates is None:
        test_dates = [train_dates[1], dt_off.max() + pd.Timedelta('1 second')]

    in_train = (dt_off >= train_dates[0]) & (dt_off < train_dates[1])
    in_test  = (dt_off >= test_dates[0])  & (dt_off < test_dates[1])

    # No entry should be contained in both train and test sets!
    assert(not (in_train & in_test).any())

    # Split up df_outage according to the train/test classification of the associated outages
    train_outg_rec_nbs = dovs_outgs_df.loc[in_train, 'OUTG_REC_NB']
    test_outg_rec_nbs  = dovs_outgs_df.loc[in_test,  'OUTG_REC_NB']
    df_outage_train = df_outage[outg_rec_nbs.isin(train_outg_rec_nbs)]
    df_outage_test  = df_outage[outg_rec_nbs.isin(test_outg_rec_nbs)]

    # Relative test size found for the outages, to be used for splitting df_no_outage
    n_train = df_outage_train.shape[0]
    n_test  = df_outage_test.shape[0]
    test_size = 1.0-n_train/(n_train+n_test)

    # Make the split for no outage
    df_no_outage_train, df_no_outage_test = train_test_split(df_no_outage, test_size=test_size, random_state=random_state)

    # Combine outage and no outage, and randomize the order of the rows
    assert(all(df_outage_train.columns==df_no_outage_train.columns))
    assert(all(df_outage_test.columns==df_no_outage_test.columns))
    df_train = pd.concat([df_outage_train, df_no_outage_train]).sample(frac=1, random_state=random_state)
    df_test  = pd.concat([df_outage_test,  df_no_outage_test]).sample(frac=1, random_state=random_state)

    # Split X and y
    train_feature_cols = [col for col in df_train.columns if col!=y_col]
    test_feature_cols  = [col for col in df_test.columns if col!=y_col]
    #--------------------------------------------------
    return df_train[train_feature_cols], df_test[test_feature_cols], df_train[y_col], df_test[y_col]



# NOTE: get_cpo_df_subset_by_outg_date (commented out below) is to be replaced by 
#       get_cpx_outg_df_subset_by_outg_datetime.
# TODO: get_cpo_df_subsets_by_outg_season probably needs to be updated as well.
# def get_cpo_df_subset_by_outg_date(
#     cpo_df, 
#     date_0,
#     date_1, 
#     outg_rec_nb_idfr='index', 
#     return_notin_also=False
# ):
#     r"""
#     Returns the subset of cpo_df whose associated outages are within [date_0, date_1)
#     """
#     #-------------------------
#     if not isinstance(date_0, datetime.datetime):
#         date_0 = pd.to_datetime(date_0)
#     if not isinstance(date_1, datetime.datetime):
#         date_1 = pd.to_datetime(date_1)
#     assert(isinstance(date_0, datetime.datetime))
#     assert(isinstance(date_1, datetime.datetime))
#     #-------------------------
#     outg_rec_nbs = MECPODf.get_outg_rec_nbs_from_cpo_df(cpo_df=cpo_df, idfr=outg_rec_nb_idfr)
#     assert(len(cpo_df)==len(outg_rec_nbs)) # Important in ensuring proper selection towards end of function
#     outg_rec_nbs_unq = outg_rec_nbs.unique().tolist()
#     #-------------------------
#     build_sql_function = DOVSOutages_SQL.build_sql_outage
#     build_sql_function_kwargs = dict(
#         outg_rec_nbs = outg_rec_nbs_unq, 
#         field_to_split = 'outg_rec_nbs', 
#         cols_of_interest=['OUTG_REC_NB', 'DT_OFF_TS']
#     )
#     dovs_outgs = DOVSOutages(
#         df_construct_type=DFConstructType.kRunSqlQuery, 
#         contstruct_df_args=None, 
#         init_df_in_constructor=True,
#         build_sql_function=build_sql_function, 
#         build_sql_function_kwargs=build_sql_function_kwargs, 
#         build_consolidated=False
#     )
#     dovs_outgs_df = dovs_outgs.get_df()
#     #-------------------------
#     subset_outg_rec_nbs = dovs_outgs_df.loc[(dovs_outgs_df['DT_OFF_TS'] >= date_0) & 
#                                             (dovs_outgs_df['DT_OFF_TS'] < date_1)]['OUTG_REC_NB'].unique().tolist()
#     cpo_df_subset = cpo_df[outg_rec_nbs.isin(subset_outg_rec_nbs)].copy()
#     #-------------------------
#     if not return_notin_also:
#         return cpo_df_subset
#     else:
#         cpo_df_notin = cpo_df[~outg_rec_nbs.isin(subset_outg_rec_nbs)].copy()
#         return cpo_df_subset, cpo_df_notin

# MOVED TO OutageModeler!!!!!!!!!!!!!!!!!!!!!!!!!!
def get_cpx_outg_df_subset_by_outg_datetime(
    cpx_outg_df, 
    date_0,
    date_1, 
    outg_rec_nb_idfr='index', 
    return_notin_also=False
):
    r"""
    Returns the subset of cpx_outg_df whose associated outages are within [date_0, date_1)

    date_0, date_1:
        Lower (inclusive) and upper (exclusive) bounds.  Either may be None, meaning unbounded on that side.
        Values which are not datetime.datetime instances are converted with pd.to_datetime.
        If both are None, cpx_outg_df itself is returned (no copy is made and no query is run).
    outg_rec_nb_idfr:
        Identifies where the outg_rec_nbs are located in cpx_outg_df.
    return_notin_also:
        If True, a pair (subset, complement) is returned.
        When both dates are None the complement is an empty pd.DataFrame().

    The outage off-time used for the selection is DT_OFF_TS_FULL = DOV.DT_ON_TS - DOV.STEP_DRTN_NB/(60*24), 
      obtained via DOVSOutages.get_outg_info_for_df.
    """
    #-------------------------
    # No restriction at all: everything is in the subset, nothing is outside of it
    if date_0 is None and date_1 is None:
        if return_notin_also:
            return cpx_outg_df, pd.DataFrame()
        return cpx_outg_df
    #-------------------------
    # A single missing bound is replaced by the extreme representable timestamp
    date_0 = pd.Timestamp.min if date_0 is None else date_0
    date_1 = pd.Timestamp.max if date_1 is None else date_1
    #-------------------------
    date_0, date_1 = [
        date_i if isinstance(date_i, datetime.datetime) else pd.to_datetime(date_i) 
        for date_i in (date_0, date_1)
    ]
    assert(isinstance(date_0, datetime.datetime))
    assert(isinstance(date_1, datetime.datetime))
    #-------------------------
    df_off_df = DOVSOutages.get_outg_info_for_df(
        df=cpx_outg_df, 
        outg_rec_nb_idfr=outg_rec_nb_idfr, 
        contstruct_df_args=None, 
        build_sql_function=DOVSOutages_SQL.build_sql_outage, 
        build_sql_function_kwargs=dict(
            datetime_col='DT_OFF_TS_FULL', 
            cols_of_interest=[
                'OUTG_REC_NB', 
                dict(field_desc=f"DOV.DT_ON_TS - DOV.STEP_DRTN_NB/(60*24)", 
                     alias='DT_OFF_TS_FULL', table_alias_prefix=None)
            ]
        ), 
        set_outg_rec_nb_as_index=True
    )
    #-------------------------
    outg_rec_nbs = MECPODf.get_outg_rec_nbs_from_cpo_df(cpo_df=cpx_outg_df, idfr=outg_rec_nb_idfr)
    assert(len(cpx_outg_df)==len(outg_rec_nbs)) # Important in ensuring proper selection below
    #-----
    # df_off_df is indexed by outg_rec_nb, so the index of the in-window rows gives the wanted outages
    in_window = (df_off_df['DT_OFF_TS_FULL'] >= date_0) & (df_off_df['DT_OFF_TS_FULL'] < date_1)
    subset_outg_rec_nbs = df_off_df.loc[in_window].index.unique().tolist()
    is_in_subset = outg_rec_nbs.isin(subset_outg_rec_nbs)
    cpx_outg_df_subset = cpx_outg_df[is_in_subset].copy()
    #-------------------------
    if return_notin_also:
        return cpx_outg_df_subset, cpx_outg_df[~is_in_subset].copy()
    return cpx_outg_df_subset
    
#TODO PLACE IN MECPODf.py, probably near get_rcpo_df_subset_by_mjr_mnr_causes
def get_cpo_df_subsets_by_outg_season(
    cpo_df, 
    outg_rec_nb_idfr='index'
):
    r"""
    Returns a dict whose keys are seasons and values are cpo_df subsets whose associated outages occur in the given season.

    The season of an outage is determined from the month of its DT_OFF_TS (queried through DOVSOutages):
        spring: Mar-May, summer: Jun-Aug, autumn: Sep-Nov, winter: Dec-Feb

    cpo_df:
        DF from which the outg_rec_nbs are extracted (one per row) and which is split by season.
    outg_rec_nb_idfr:
        Identifies where the outg_rec_nbs are located in cpo_df.

    Asserts that every queried outage can be assigned a season and that every row of cpo_df lands in 
      exactly one of the returned subsets.
    """
    #-------------------------
    outg_rec_nbs = MECPODf.get_outg_rec_nbs_from_cpo_df(cpo_df=cpo_df, idfr=outg_rec_nb_idfr)
    assert(len(cpo_df)==len(outg_rec_nbs)) # Important in ensuring proper selection towards end of function
    outg_rec_nbs_unq = outg_rec_nbs.unique().tolist()
    #-------------------------
    build_sql_function = DOVSOutages_SQL.build_sql_outage
    build_sql_function_kwargs = dict(
        outg_rec_nbs = outg_rec_nbs_unq, 
        field_to_split = 'outg_rec_nbs', 
        cols_of_interest=['OUTG_REC_NB', 'DT_OFF_TS']
    )
    dovs_outgs = DOVSOutages(
        df_construct_type=DFConstructType.kRunSqlQuery, 
        contstruct_df_args=None, 
        init_df_in_constructor=True,
        build_sql_function=build_sql_function, 
        build_sql_function_kwargs=build_sql_function_kwargs, 
        build_consolidated=False
    )
    dovs_outgs_df = dovs_outgs.get_df()
    #-------------------------
    seasons = dict(
        spring=[3,4,5], 
        summer=[6,7,8], 
        autumn=[9,10,11], 
        winter=[12,1,2], 
    )
    # NOTE: Seasons are selected directly through month masks.  Creating a NaN-filled (float) 'season'
    #       column and then writing strings into it relies on silent upcasting, which is deprecated in pandas.
    outg_months = dovs_outgs_df['DT_OFF_TS'].dt.month
    # Make sure all entries can be assigned a season (fails, e.g., for missing DT_OFF_TS)
    all_months = [month for months in seasons.values() for month in months]
    assert(outg_months.isin(all_months).all())
    #-------------------------
    # Split up cpo_df according to seasons
    return_dict = {}
    for season,months in seasons.items():
        season_outg_rec_nbs = dovs_outgs_df.loc[outg_months.isin(months), 'OUTG_REC_NB']
        return_dict[season] = cpo_df[outg_rec_nbs.isin(season_outg_rec_nbs)]
    assert(sum(cpo_df_i.shape[0] for cpo_df_i in return_dict.values())==cpo_df.shape[0])
    #-------------------------
    return return_dict



def append_season_col_to_df(
    df, 
    date_col, 
    placement_col='season', 
    seasons=None, 
    assert_all_classified=True
):
    r"""
    Add a column to df containing the season in which each entry of date_col falls.
    NOTE: df is altered in place (and also returned).

    df:
        DataFrame containing date_col.
    date_col:
        Column of datetime-like values (must support the .dt accessor).
    placement_col:
        Name of the column in which the season is placed.
    seasons:
        Dict with keys equal to season names and values equal to lists of month numbers (1-12).
        If a month appears in multiple seasons, the season occurring last in the dict wins.
        Default:
            spring=[3,4,5], summer=[6,7,8], autumn=[9,10,11], winter=[12,1,2]
    assert_all_classified:
        If True, assert that no entry of placement_col is left unclassified (NaN).

    Returns df
    """
    #-------------------------
    if seasons is None:
        seasons = dict(
            spring=[3,4,5], 
            summer=[6,7,8], 
            autumn=[9,10,11], 
            winter=[12,1,2], 
        )
    #-------------------------
    # Initialize with object dtype so the string season names can be written without relying on
    #   silent upcasting of a float64 column (deprecated in pandas).  Unclassified rows remain NaN.
    df[placement_col] = np.full(df.shape[0], np.nan, dtype=object)
    months_srs = df[date_col].dt.month
    for season,months in seasons.items():
        df.loc[months_srs.isin(months), placement_col] = season
    #-------------------------
    if assert_all_classified:
        assert(df[placement_col].isna().sum()==0)
    #-------------------------
    return df


def get_df_subsets_by_seasons(
    df, 
    date_col, 
    seasons=None, 
    assert_all_classified=False
):
    r"""
    Split df into subsets according to the season in which each entry of date_col falls.

    df:
        DataFrame containing date_col.
    date_col:
        Column of datetime-like values (must support the .dt accessor).
    seasons:
        Dict with keys equal to season names and values equal to lists of month numbers (1-12).
        Default:
            spring=[3,4,5], summer=[6,7,8], autumn=[9,10,11], winter=[12,1,2]
    assert_all_classified:
        If True, assert that the sizes of the subsets add up to the number of rows in df.

    Returns a dict with keys equal to the season names and values equal to the df subsets.
    """
    #-------------------------
    if seasons is None:
        seasons = dict(
            spring=[3,4,5], 
            summer=[6,7,8], 
            autumn=[9,10,11], 
            winter=[12,1,2], 
        )
    #-------------------------
    entry_months = df[date_col].dt.month
    subsets_by_season = {}
    for season_name,season_months in seasons.items():
        assert(season_name not in subsets_by_season)
        subsets_by_season[season_name] = df[entry_months.isin(season_months)]
    #-------------------------
    if assert_all_classified:
        n_classified = sum(subset.shape[0] for subset in subsets_by_season.values())
        assert(n_classified==df.shape[0])
    #-------------------------
    return subsets_by_season


def append_season_col_from_time_infos_to_df(
    df, 
    time_infos_df, 
    date_col, 
    placement_col='season', 
    seasons=None, 
    missing_tolerance=0.05
):
    r"""
    Determine the season of each entry of time_infos_df (from date_col) and left-merge the result 
    onto df by index.

    NOTE: If df.columns is a MultiIndex, the function will pad placement_col by prepending levels with values=''

    df:
        DataFrame to which the season column is appended (a new DF is returned).
    time_infos_df:
        DataFrame containing date_col, sharing index values with df.  Its index must be unique.
    date_col, placement_col, seasons:
        Passed to append_season_col_to_df (which is run on a copy of time_infos_df).
    missing_tolerance:
        Maximum allowed fraction of rows of df left without a season after the merge.
        If exceeded, the fraction is printed and an assertion fails.
    """
    #-------------------------
    # Don't want any duplicate entries in time_infos_df, this could lead to additional rows being added to df
    # NOTE: Assertion below is slightly too strong.  To be absolutely correct, only those indices which are also 
    #       contained in df need to be unique in time_infos_df.
    assert(time_infos_df.index.nunique()==time_infos_df.shape[0])
    #-------------------------
    # Build a single-column DF holding only the season
    season_df = append_season_col_to_df(
        df=time_infos_df.copy(), 
        date_col=date_col, 
        placement_col=placement_col, 
        seasons=seasons, 
        assert_all_classified=False
    )[[placement_col]]
    #-------------------------
    # season_df.columns must have the same number of levels as df.columns so pd.merge doesn't complain
    #   Needed extra levels are prepended with values = ''
    assert(season_df.columns.nlevels<=df.columns.nlevels)
    n_lvls_to_add = df.columns.nlevels-season_df.columns.nlevels
    for _ in range(n_lvls_to_add):
        season_df = Utilities_df.prepend_level_to_MultiIndex(season_df, '', axis=1)
    # The padding may have changed the label of the season column; grab the current one
    assert(season_df.shape[1]==1)
    placement_col = season_df.columns[0]
    #-------------------------
    df_w_season = pd.merge(
        df, 
        season_df, 
        how='left', 
        left_index=True, 
        right_index=True
    )
    #-------------------------
    n_missing = df_w_season[placement_col].isna().sum()
    missing_pct = n_missing/df.shape[0]
    if missing_pct>missing_tolerance:
        print(f'missing_pct = {missing_pct} > missing_tolerance = {missing_tolerance}')
        assert(0)
    #-------------------------
    return df_w_season


def get_df_subsets_by_seasons_from_time_infos(
    df, 
    time_infos_df, 
    date_col, 
    seasons=None, 
    missing_tolerance=0.05
):
    r"""
    Split df into subsets by season, where the season of each row is determined from date_col 
    of time_infos_df (matched to df by index).

    A temporary season column with a randomly generated name is appended via 
      append_season_col_from_time_infos_to_df, used for the selection, and dropped from the 
      returned subsets.

    seasons:
        Dict with keys equal to season names and values equal to lists of month numbers (1-12).
        Default:
            spring=[3,4,5], summer=[6,7,8], autumn=[9,10,11], winter=[12,1,2]
    missing_tolerance:
        Passed to append_season_col_from_time_infos_to_df.

    Returns a dict with keys equal to the season names and values equal to the df subsets.
    """
    #-------------------------
    if seasons is None:
        seasons = dict(
            spring=[3,4,5], 
            summer=[6,7,8], 
            autumn=[9,10,11], 
            winter=[12,1,2], 
        )
    #-------------------------
    tmp_season_col = Utilities.generate_random_string()
    df_w_season = append_season_col_from_time_infos_to_df(
        df=df, 
        time_infos_df=time_infos_df, 
        date_col=date_col, 
        placement_col=tmp_season_col, 
        seasons=seasons, 
        missing_tolerance=missing_tolerance
    )
    # Note: append_season_col_from_time_infos_to_df alters the column label if necessary to match the 
    #       number of levels in df.columns, so the full label must be looked up.  Exactly one is expected.
    found_idxs = Utilities_df.find_idxs_in_highest_order_of_columns(df_w_season, tmp_season_col)
    assert(len(found_idxs)==1)
    tmp_season_col = df_w_season.columns[found_idxs[0]]
    #-------------------------
    season_srs = df_w_season[tmp_season_col]
    subsets_by_season = {}
    for season_name,_ in seasons.items():
        assert(season_name not in subsets_by_season)
        subsets_by_season[season_name] = df_w_season[season_srs==season_name].drop(columns=[tmp_season_col])
    #-------------------------
    return subsets_by_season

In [None]:
# MOVED TO OutageModeler!
def merge_cpx_df_w_time_infos(
    cpx_df, 
    time_infos_df, 
    time_infos_drop_dupls_subset=['index', 't_min'], 
    dummy_lvl_base_name = 'dummy_lvl'
):
    r"""
    Left-merge time_infos_df onto cpx_df using their indices.

    cpx_df and time_infos_df must have same indices (same set of index level names; the level 
      order of time_infos_df is aligned to that of cpx_df if it differs).
    Typically, these are no_outg_rec_nb and trsf_pole_nb.
    
    time_infos_drop_dupls_subset:
        None, or a list/tuple of columns used to drop duplicates from time_infos_df before merging.
        Since data are collected over multiple files, sometimes a (no-)outage event is split over multiple files
          with, e.g., different PNs.  So, duplicates must be dropped.
        Typically, index needs to be included as well (index usually comprised of no_outg_rec_nb and trsf_pole_nb).
        To include the index, put the string 'index' in the collection; the index levels are then
          considered together with the remaining columns.
        NOTE: The argument is never modified (neither the default list nor one supplied by the caller).

    dummy_lvl_base_name:
        If cpx_df has MultiIndex columns with more levels than time_infos_df, levels named
          f'{dummy_lvl_base_name}_{i}' are prepended to the columns of time_infos_df until the number 
          of levels match.

    Returns:
        cpx_df with the columns of (de-duplicated) time_infos_df merged on (how='left').
    """
    #-------------------------
    if time_infos_drop_dupls_subset is not None:
        assert(Utilities.is_object_one_of_types(time_infos_drop_dupls_subset, [list, tuple]))
        if 'index' in time_infos_drop_dupls_subset:
            # Build a NEW list without 'index' instead of calling .remove on the input.
            # Calling .remove would permanently alter the shared default list (so a second call 
            #   would no longer consider the index) as well as any list handed in by the caller, 
            #   and would fail outright for tuples.
            subset_cols = [x for x in time_infos_drop_dupls_subset if x!='index']
            idx_names   = list(time_infos_df.index.names)
            time_infos_df = time_infos_df.reset_index().drop_duplicates(
                subset=idx_names+subset_cols
            ).set_index(idx_names)
        else:
            time_infos_df = time_infos_df.drop_duplicates(subset=time_infos_drop_dupls_subset)
    #-------------------------
    # In order to merge, cpx_df and time_infos_df must have same number of levels in columns
    if cpx_df.columns.nlevels>1:
        n_levels_to_add = cpx_df.columns.nlevels - time_infos_df.columns.nlevels
        #-----
        for i_new_lvl in range(n_levels_to_add):
            # With each iteration, prepending a new level from n_levels_to_add-1 to 0
            i_level_val = f'{dummy_lvl_base_name}_{(n_levels_to_add-1)-i_new_lvl}'
            time_infos_df = Utilities_df.prepend_level_to_MultiIndex(
                df=time_infos_df, 
                level_val=i_level_val, 
                level_name=None, 
                axis=1
            )
    assert(cpx_df.columns.nlevels==time_infos_df.columns.nlevels)
    #-------------------------
    # Apparently, pd.merge is smart enough to match index level names, so the following isn't strictly necessary!
    # However, it doesn't hurt, and is good in practice
    assert(len(set(cpx_df.index.names).symmetric_difference(set(time_infos_df.index.names)))==0)
    if time_infos_df.index.names!=cpx_df.index.names:
        time_infos_df = time_infos_df.reset_index().set_index(cpx_df.index.names)
    #-----
    cpx_df_wt = pd.merge(
        cpx_df, 
        time_infos_df, 
        how='left', 
        left_index=True, 
        right_index=True
    )
    #-------------------------
    return cpx_df_wt


# MOVED TO OutageModeler!
def get_cpx_baseline_df_subset_by_datetime(
    cpx_bsln_df, 
    bsln_time_infos_df, 
    date_0,
    date_1,
    bsln_time_infos_time_col='t_min', 
    return_notin_also=False, 
    merge_time_info_to_cpx_bsln_df=False
):
    r"""
    Select the entries of cpx_bsln_df whose time, as found in bsln_time_infos_df, satisfies
      date_0 <= time < date_1.

    cpx_bsln_df and bsln_time_infos_df must have same indices.
    Typically, these are no_outg_rec_nb and trsf_pole_nb.
    
    NOTE: Have found merging can be taxing (from memory standpoint) when DFs are large.
          Hence why default merge_time_info_to_cpx_bsln_df=False

    date_0/date_1:
        Lower (inclusive) and upper (exclusive) bounds.
        Anything which is not a datetime.datetime is converted with pd.to_datetime.
        If one is None, it is replaced by pd.Timestamp.min/pd.Timestamp.max (i.e., no bound on that side).
        If both are None, no selection is done and cpx_bsln_df itself is returned.

    bsln_time_infos_time_col:
        Column in bsln_time_infos_df containing the time to compare against date_0/date_1.

    return_notin_also:
        If True, a tuple (subset, not_in_subset) is returned instead of only the subset.
        When both dates are None, not_in_subset is an empty pd.DataFrame().

    merge_time_info_to_cpx_bsln_df:
        If False, the selection is done through the index and the returned DataFrames are copies 
          of rows of cpx_bsln_df.
        If True, the time column is first merged onto cpx_bsln_df (see merge_cpx_df_w_time_infos) and 
          the returned DataFrames are slices of the merged DataFrame (i.e., they contain the time column).
    """
    #-------------------------
    # Nothing to select on: hand back the input DataFrame
    if date_0 is None and date_1 is None:
        if not return_notin_also:
            return cpx_bsln_df
        else:
            return cpx_bsln_df, pd.DataFrame()
    #-------------------------
    if date_0 is None:
        date_0 = pd.Timestamp.min
    #-----
    if date_1 is None:
        date_1 = pd.Timestamp.max
    #-------------------------
    if not isinstance(date_0, datetime.datetime):
        date_0 = pd.to_datetime(date_0)
    if not isinstance(date_1, datetime.datetime):
        date_1 = pd.to_datetime(date_1)
    assert(isinstance(date_0, datetime.datetime))
    assert(isinstance(date_1, datetime.datetime))
    #-------------------------
    assert(len(set(cpx_bsln_df.index.names).symmetric_difference(set(bsln_time_infos_df.index.names)))==0)
    if bsln_time_infos_df.index.names!=cpx_bsln_df.index.names:
        bsln_time_infos_df = bsln_time_infos_df.reset_index().set_index(cpx_bsln_df.index.names)
    #-------------------------
    if bsln_time_infos_df.columns.nlevels>1 and not Utilities.is_object_one_of_types(bsln_time_infos_time_col, [list, tuple]):
        bsln_time_infos_time_col = Utilities_df.find_single_col_in_multiindex_cols(
            df=bsln_time_infos_df, 
            col=bsln_time_infos_time_col
        )
    assert(bsln_time_infos_time_col in bsln_time_infos_df.columns.tolist())
    #-------------------------
    bsln_time_infos_df = bsln_time_infos_df[[bsln_time_infos_time_col]]
    # Since collected over multiple files, sometimes a no-outage 'event' is split over multiple files
    #   with, e.g., different PNs.  So, duplicates must be dropped
    # Need to call reset_index because otherwise only bsln_time_infos_time_col will be considered, whereas
    #   here a duplicate must have same no_outg_rec_nb, trsf_pole_nb, and bsln_time_infos_time_col!
    time_infos_idx_names = list(bsln_time_infos_df.index.names)
    bsln_time_infos_df = bsln_time_infos_df.reset_index().drop_duplicates().set_index(time_infos_idx_names)
    #-------------------------
    # There should be an entry in bsln_time_infos_df for each in cpx_bsln_df
    # The reverse is NOT true: If no events found for specific timeframe, no entries will exist in cpx_bsln_df
    assert(len(set(cpx_bsln_df.index).difference(set(bsln_time_infos_df.index)))==0)
    #----------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------
    if not merge_time_info_to_cpx_bsln_df:
        subset_idxs = bsln_time_infos_df.loc[
            (bsln_time_infos_df[bsln_time_infos_time_col] >= date_0) &
            (bsln_time_infos_df[bsln_time_infos_time_col] < date_1)
        ].index.unique().tolist()
        in_subset = cpx_bsln_df.index.isin(subset_idxs)
        cpx_bsln_df_subset = cpx_bsln_df[in_subset].copy()
        #-------------------------
        if not return_notin_also:
            return cpx_bsln_df_subset
        else:
            cpx_bsln_df_notin = cpx_bsln_df[~in_subset].copy()
            return cpx_bsln_df_subset, cpx_bsln_df_notin
    #----------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------
    else:
        cpx_bsln_df_wt = merge_cpx_df_w_time_infos(
            cpx_df=cpx_bsln_df, 
            time_infos_df=bsln_time_infos_df, 
            time_infos_drop_dupls_subset=['index', bsln_time_infos_time_col]
        )
        #-------------------------
        # Merging will add dummy levels if needed, so adjust bsln_time_infos_time_col if needed
        if not bsln_time_infos_time_col in cpx_bsln_df_wt.columns.tolist():
            bsln_time_infos_time_col = Utilities_df.find_single_col_in_multiindex_cols(
                df=cpx_bsln_df_wt, 
                col=bsln_time_infos_time_col
            )
        assert(bsln_time_infos_time_col in cpx_bsln_df_wt.columns.tolist())
        #-------------------------
        in_window = (
            (cpx_bsln_df_wt[bsln_time_infos_time_col] >= date_0) &
            (cpx_bsln_df_wt[bsln_time_infos_time_col] < date_1)
        )
        cpx_bsln_df_wt_subset = cpx_bsln_df_wt.loc[in_window]
        if not return_notin_also:
            return cpx_bsln_df_wt_subset
        else:
            cpx_bsln_df_wt_notin = cpx_bsln_df_wt.loc[~in_window]
            return cpx_bsln_df_wt_subset, cpx_bsln_df_wt_notin

In [None]:
def build_df_eemsp(conn_eemsp, trsf_pole_nbs, batch_size=1000, verbose=True, n_update=10):
    r"""
    Query EEMSP for the given transformer pole numbers in batches and return the stacked results.

    conn_eemsp:
        Connection object handed to pd.read_sql_query.

    trsf_pole_nbs:
        Sliceable collection of transformer pole numbers.
        Each batch trsf_pole_nbs[i_beg:i_end] is given to EEMSP.build_sql_eemsp_oracle to build the query.

    batch_size:
        Maximum number of pole numbers per query.

    verbose/n_update:
        If verbose, print the batch configuration and a progress line every n_update batches.

    Returns:
        pd.DataFrame with the results of all batches concatenated (the index is not reset, so it 
          restarts with each batch).  If trsf_pole_nbs is empty, an empty pd.DataFrame is returned.
    """
    n_batches = int(np.ceil(len(trsf_pole_nbs)/batch_size))
    if verbose:
        print(f'n_trsf_pole_nbs = {len(trsf_pole_nbs)}')
        print(f'batch_size = {batch_size}')
        print(f'n_batches = {n_batches}')
    #-----
    # Collect the batch results and concatenate once at the end.
    # Concatenating inside the loop re-copies everything accumulated so far with each iteration.
    batch_dfs = []
    for i in range(n_batches):
        if verbose and (i+1)%n_update==0:
            print(f'{i+1}/{n_batches}')
        i_beg = i*batch_size
        i_end = min((i+1)*batch_size, len(trsf_pole_nbs))
        sql_eemsp_i = EEMSP.build_sql_eemsp_oracle(trsf_pole_nbs[i_beg:i_end])
        df_eemsp_i = pd.read_sql_query(sql_eemsp_i, conn_eemsp)
        #-----
        # All batches are expected to come back with identical columns
        if len(batch_dfs)>0:
            assert(list(df_eemsp_i.columns)==list(batch_dfs[0].columns))
        batch_dfs.append(df_eemsp_i)
    #-----
    if len(batch_dfs)==0:
        return pd.DataFrame()
    return pd.concat(batch_dfs)

# ---------------------------------------------------------------------------------------------------

In [None]:
# MOVED TO OutageModeler!
def set_target_val_1_by_idx(
    df,
    val_1_idxs,
    remove_others_from_outages=False, 
    target_col=('is_outg', 'is_outg'), 
    from_outg_col=('from_outg', 'from_outg')
):
    r"""
    Flag the entries of df whose index is found in val_1_idxs with target value 1, and all others with 0.

    NOTE: The target column of the input DataFrame is overwritten IN PLACE.

    df:
        pd.DataFrame object OR a list of such objects.
        For a list, the function is applied to each element and a list of results is returned.
    
    val_1_idxs:
        Collection of index values whose target should be 1.
        It may contain values not present in df; only the intersection with df.index is used.
          
    remove_others_from_outages:
        If True, rows having df[from_outg_col]==1 and df[target_col]==0 are excluded from the returned DataFrame.
        Useful when a subset of the outages serves as the target and all other outages should be 
          removed from the data.
        
    target_col:
        Column receiving the 0/1 target values.
    
    from_outg_col:
        Only used if remove_others_from_outages==True
        
    Returns:
        df (or, if remove_others_from_outages, the filtered selection of df), or a list thereof.
    """
    #----------------------------------------------------------------------------------------------------
    assert(Utilities.is_object_one_of_types(df, [pd.DataFrame, list]))
    if isinstance(df, list):
        # Handle each DataFrame of the list separately, with identical settings
        return [
            set_target_val_1_by_idx(
                df                         = df_i, 
                val_1_idxs                 = val_1_idxs,
                remove_others_from_outages = remove_others_from_outages, 
                target_col                 = target_col, 
                from_outg_col              = from_outg_col
            ) 
            for df_i in df
        ]
    #----------------------------------------------------------------------------------------------------
    # Start with every target equal to 0, then raise to 1 those found in val_1_idxs
    df[target_col] = 0
    idxs_to_flag = list(set(df.index).intersection(set(val_1_idxs)))
    df.loc[idxs_to_flag, target_col] = 1
    #-------------------------
    if not remove_others_from_outages:
        return df
    #-------------------------
    # Exclude entries from the outages collection which were not flagged as target==1.
    # NOTE: Selecting with a boolean mask (instead of collecting indices and calling .drop) 
    #       remains safe should duplicate indices ever occur.
    is_unflagged_outg = (df[from_outg_col]==1) & (df[target_col]==0)
    return df[~is_unflagged_outg]

# MOVED TO OutageModeler!
def ensure_target_val_1_min_pct(
    df,
    min_pct,
    target_col=('is_outg', 'is_outg'), 
    random_state=None, 
    assert_success=True, 
    return_discarded=False
):
    r"""
    Make sure the collection of entries with target value==1 comprises at least min_pct of the overall collection.
    If the percentage is below min_pct, entries are randomly removed from the target value==0 collection until 
      the desired percentage is reached; the result is then shuffled.
    If the percentage is already at or above min_pct, df is returned untouched.
    
    min_pct:
        Should be between 0 and 100! (not, e.g., 0 and 1)

    target_col:
        Column of df holding the target values.
        Only rows with value 1 or 0 are kept when down-sampling occurs.

    random_state:
        Passed to pd.DataFrame.sample for both the down-sampling and the final shuffle.

    assert_success:
        If True, assert that the returned DataFrame satisfies min_pct.

    return_discarded:
        If True, return a tuple (result, discarded) where discarded holds the target==0 rows which 
          were removed.
        When nothing needed to be removed, discarded is an empty DataFrame with the columns of df.
        If False, only result is returned.
    """
    #-------------------------
    n_1 = (df[target_col]==1).sum()
    pct = 100*n_1/df.shape[0]
    #-------------------------
    if pct >= min_pct:
        # Nothing to remove.  Keep the return shape consistent with return_discarded so callers
        #   unpacking (result, discarded) do not break on this path.
        if return_discarded:
            return df, df.iloc[0:0]
        return df
    #-------------------------
    # Need to determine how many entries from target==0 to keep 
    #   Define the number of target==1 values to be n_1 and the needed number of
    #     target==0 values to be n_0
    #   One must solve for n_0 in the following:  100*n_1/(n_1+n_0)=min_pct
    #   ==> n_0 = (100-min_pct)*n_1/min_pct
    n_0 = int(np.floor((100-min_pct)*n_1/min_pct))
    #-------------------------
    df_1 = df[df[target_col]==1]
    df_0 = df[df[target_col]==0]
    df_0_sub = df_0.sample(n=n_0, replace=False, random_state=random_state)
    #-------------------------
    # Join df_1 and df_0_sub and randomize order
    return_df = pd.concat([df_1, df_0_sub])
    return_df = return_df.sample(frac=1, random_state=random_state)
    #-------------------------
    # Ensure operation was successful
    if assert_success:
        pct = 100*(return_df[target_col]==1).sum()/return_df.shape[0]
        assert(pct>=min_pct)
    #-------------------------
    if return_discarded:
        df_0_discarded = df_0[~df_0.index.isin(df_0_sub.index)]
        assert(df_0_sub.shape[0]+df_0_discarded.shape[0]==df_0.shape[0])
        return return_df, df_0_discarded
    else:
        return return_df

# ---------------------------------------------------------------------------------------------------

In [None]:
# MOVED TO OutageModeler
from sklearn.model_selection import GroupShuffleSplit
def train_test_split_df_by_outage(
    df, 
    outg_rec_nb_idfr, 
    test_size, 
    random_state=None
    
):
    r"""
    Train-test split of df in which the outage (outg_rec_nb) defines the group.
    All entries belonging to a given outage end up together, either all in train or all in test,
      never divided between the two.
    e.g., if outage 1 affects 5 transformers, those 5 transformers are all placed in train or all placed 
          in test; a 3/2 division between train and test cannot happen.

    Uses a single GroupShuffleSplit split with the given test_size and random_state.
          
    outg_rec_nb_idfr:
        This directs from where the outg_rec_nbs will be retrieved.
        This should be a string, list, or tuple.
        If the outg_rec_nbs are located in a column, idfr should simply be the column
            - Single index columns --> simple string
            - MultiIndex columns   --> appropriate tuple to identify column
        If the outg_rec_nbs are located in the index:
            - Single level index --> simple string 'index' or 'index_0'
            - MultiIndex index:  --> 
                - string f'index_{level}', where level is the index level containing the outg_rec_nbs
                - tuple of length 2, with 0th element ='index' and 1st element = idx_level_name where
                    idx_level_name is the name of the index level containing the outg_rec_nbs 

    Returns:
        (df_train, df_test), both copies of the corresponding rows of df.
    """
    #-------------------------
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    # One group label per row of df (hence unique_only=False)
    row_groups = DOVSOutages.get_outg_rec_nbs_list_from_df(
        df          = df, 
        idfr        = outg_rec_nb_idfr, 
        unique_only = False
    )
    #-----
    train_idxs, test_idxs = next(splitter.split(df, groups=row_groups))
    #-----
    df_train = df.iloc[train_idxs].copy()
    df_test  = df.iloc[test_idxs].copy()
    #-------------------------
    # Sanity check: no outage may show up on both sides of the split
    outgs_in_train = set(DOVSOutages.get_outg_rec_nbs_list_from_df(
        df          = df_train, 
        idfr        = outg_rec_nb_idfr, 
        unique_only = True
    ))
    outgs_in_test = set(DOVSOutages.get_outg_rec_nbs_list_from_df(
        df          = df_test, 
        idfr        = outg_rec_nb_idfr, 
        unique_only = True
    ))
    assert(outgs_in_train.isdisjoint(outgs_in_test))
    #-------------------------
    return df_train, df_test

# MOVED TO Utilities_df!
def train_test_split_df_group(
    X,
    y, 
    groups, 
    test_size, 
    random_state=None
    
):
    r"""
    Group-aware train-test split of X and y.
    All entries sharing a group label stay together (either all in train or all in test) and are
      never divided across train/test.
    Uses a single GroupShuffleSplit split with the given test_size and random_state.
          
    NOTE: list input          --> np.ndarray output
          np.ndarray input    --> np.ndarray output
          pd.DataFrame (X)/pd.Series (y) input --> pd.DataFrame/pd.Series output

    Returns:
        X_train, X_test, y_train, y_test
    """
    #-------------------------
    # X and y are expected to be np.ndarrays, lists, or pd.DataFrame/pd.Series (respectively)
    assert(Utilities.is_object_one_of_types(X, [np.ndarray, list, pd.DataFrame]))
    assert(Utilities.is_object_one_of_types(y, [np.ndarray, list, pd.Series]))
    # next(split) would fail on a length mismatch anyway, but checking here makes the problem 
    #   easier to locate and debug if it ever happens
    assert(len(X)==len(y))
    #-------------------------
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idxs, test_idxs = next(splitter.split(X, y, groups=groups))
    #-------------------------
    # Lists cannot be indexed by an array of positions, so convert them to np.ndarrays
    X = np.array(X) if isinstance(X, list) else X
    y = np.array(y) if isinstance(y, list) else y
    #-------------------------
    # pandas objects need positional .iloc; np.ndarrays are indexed directly
    X_picker = X.iloc if isinstance(X, pd.DataFrame) else X
    y_picker = y.iloc if isinstance(y, pd.Series)    else y
    #-------------------------
    return X_picker[train_idxs], X_picker[test_idxs], y_picker[train_idxs], y_picker[test_idxs]

In [None]:
def classify_data_by_binary_confusion_matrix_result(
    data_df, 
    y_pred, 
    y_col=('is_outg', 'is_outg')
):
    r"""
    Label every row of data_df with its binary confusion matrix result.

    data_df:
        DataFrame containing the true values (0 or 1) in column y_col.

    y_pred:
        Predicted values (0 or 1), one per row of data_df.

    Returns:
        pd.Series named 'confusion_result' whose index matches that of data_df and whose values are
          'TP' (true=1, pred=1), 'TN' (true=0, pred=0), 'FP' (true=0, pred=1) or 'FN' (true=1, pred=0).
        An assertion fails if any row does not fall into one of the four categories.
    """
    #-------------------------
    assert(y_col in data_df.columns)
    assert(data_df.shape[0]==len(y_pred))
    #-------------------------
    # Small working frame with flat column names: true value, prediction, and result label
    work_df = data_df[[y_col]].copy()
    work_df.columns = ['y']
    work_df['y_pred'] = y_pred
    work_df['confusion_result'] = ''
    #-------------------------
    true_is_1 = work_df['y']      == 1
    true_is_0 = work_df['y']      == 0
    pred_is_1 = work_df['y_pred'] == 1
    pred_is_0 = work_df['y_pred'] == 0
    #-----
    masks_by_label = {
        'TP' : true_is_1 & pred_is_1, 
        'TN' : true_is_0 & pred_is_0, 
        'FP' : true_is_0 & pred_is_1, 
        'FN' : true_is_1 & pred_is_0, 
    }
    for label_i, mask_i in masks_by_label.items():
        work_df.loc[mask_i, 'confusion_result'] = label_i
    #-------------------------
    # Every row must have received a label
    assert(work_df['confusion_result'].isin(['TP', 'TN', 'FP', 'FN']).all())
    return work_df['confusion_result']


def get_df_subset_by_binary_confusion_matrix_result(
    data_df, 
    y_pred, 
    y_col=('is_outg', 'is_outg')
):
    r"""
    Get subsets of data_df for each confusion matrix result (TP, TN, FP, FN)

    data_df, y_pred, y_col:
        Passed to classify_data_by_binary_confusion_matrix_result, which labels each row.

    Returns:
        dict with keys 'TP', 'TN', 'FP', 'FN', each value being a copy of the rows of data_df 
          carrying that label.
    """
    #-------------------------
    cnfsn_res_srs = classify_data_by_binary_confusion_matrix_result(
        data_df=data_df, 
        y_pred=y_pred, 
        y_col=y_col
    )
    assert(data_df.index.equals(cnfsn_res_srs.index))
    #-------------------------
    # Select rows by POSITION using boolean masks.
    # Selecting by index label (data_df.loc[labels]) would pull in every row sharing a label, 
    #   giving wrong/duplicated rows whenever data_df contains duplicate index values.
    cnfsn_res_vals = cnfsn_res_srs.to_numpy()
    return {
        cnfsn_res_i : data_df.loc[cnfsn_res_vals==cnfsn_res_i].copy() 
        for cnfsn_res_i in ('TP', 'TN', 'FP', 'FN')
    }

In [None]:
def get_outg_rec_nb_value_counts(
    df, 
    outg_rec_nb_idfr
):
    r"""
    Count how many times each outg_rec_nb occurs in df.

    The outg_rec_nbs are retrieved with DOVSOutages.get_outg_rec_nbs_from_df using outg_rec_nb_idfr, 
      and the result of calling .value_counts() on them is returned.
    """
    #-------------------------
    outg_rec_nbs = DOVSOutages.get_outg_rec_nbs_from_df(
        df   = df, 
        idfr = outg_rec_nb_idfr
    )
    #-------------------------
    return outg_rec_nbs.value_counts()

def get_n_trsf_poles_per_outg(
    df, 
    outg_rec_nb_idfr, 
    trsf_pole_nb_idfr
):
    r"""
    Count the distinct transformer pole numbers found for each outage.
    Returns a series object whose index contains the outg_rec_nbs and whose values equal the number
      of unique (outg_rec_nb, trsf_pole_nb) pairs for that outage.
      
    IMPORTANT: One should use the full dataset, not, e.g. results broken up by confusion matrix results (e.g., df_TP)
               The latter will not work because transformers belonging to the same outage may have different
                 predictions, meaning some can be TP and others FN.
               Thus, using df_TP would incorrectly underestimate the number of transformers per outage!
      
    outg_rec_nb_idfr/trsf_pole_nb_idfr:
        These direct from where the outg_rec_nbs and trsf_pole_nbs will be retrieved.
        Both are resolved with DOVSOutages.get_outg_rec_nbs_from_df, which is expected to return
          a pd.Index or a pd.Series.
        !!! Explanations below for outg_rec_nbs_idfr, but hold true for trsf_pole_nb_idfr as well !!!
        This should be a string, list, or tuple.
        If the outg_rec_nbs are located in a column, idfr should simply be the column
            - Single index columns --> simple string
            - MultiIndex columns   --> appropriate tuple to identify column
        If the outg_rec_nbs are located in the index:
            - Single level index --> simple string 'index' or 'index_0'
            - MultiIndex index:  --> 
                - string f'index_{level}', where level is the index level containing the outg_rec_nbs
                - tuple of length 2, with 0th element ='index' and 1st element = idx_level_name where
                    idx_level_name is the name of the index level containing the outg_rec_nbs 
    """
    #-------------------------
    outg_vals = DOVSOutages.get_outg_rec_nbs_from_df(df=df, idfr=outg_rec_nb_idfr)
    pole_vals = DOVSOutages.get_outg_rec_nbs_from_df(df=df, idfr=trsf_pole_nb_idfr)
    #-----
    assert(Utilities.is_object_one_of_types(outg_vals, [pd.Index, pd.Series]))
    assert(Utilities.is_object_one_of_types(pole_vals, [pd.Index, pd.Series]))
    #-------------------------
    assert(len(outg_vals)==len(pole_vals))
    n_entries = len(outg_vals)
    #--------------------------------------------------
    # Case 1: both are pd.Series --> pair them up by joining on their (shared) index
    if isinstance(outg_vals, pd.Series) and isinstance(pole_vals, pd.Series):
        pairs_df = pd.merge(outg_vals, pole_vals, left_index=True, right_index=True, how='inner')
        assert(pairs_df.shape[0]==n_entries)
        # The 0th column holds the outg_rec_nbs
        return pairs_df.drop_duplicates().iloc[:,0].value_counts()
    #--------------------------------------------------
    # Case 2: one pd.Index and one pd.Series
    # Before pairing them up, make sure the pd.Index object is found as an index (in exactly
    #   one level of) the pd.Series object
    both_are_idxs = isinstance(outg_vals, pd.Index) and isinstance(pole_vals, pd.Index)
    if not both_are_idxs:
        if isinstance(outg_vals, pd.Index):
            assert(isinstance(pole_vals, pd.Series))
            idx_obj, srs_obj = outg_vals, pole_vals
        else:
            assert(isinstance(outg_vals, pd.Series))
            assert(isinstance(pole_vals, pd.Index))
            idx_obj, srs_obj = pole_vals, outg_vals
        #-----
        n_matching_lvls = sum(
            1 
            for lvl_i in range(srs_obj.index.nlevels) 
            if all(srs_obj.index.get_level_values(lvl_i)==idx_obj)
        )
        assert(n_matching_lvls==1)
    #--------------------------------------------------
    # Case 2 (after check) and Case 3 (both pd.Index): house the two in a new pd.DataFrame
    pairs_df = pd.DataFrame({
        'outg_rec_nb'  : outg_vals, 
        'trsf_pole_nb' : pole_vals
    })
    return pairs_df.drop_duplicates()['outg_rec_nb'].value_counts()


def get_outgs_w_single_xfmr(
    df, 
    outg_rec_nb_idfr, 
    trsf_pole_nb_idfr
):
    r"""
    Returns a list of the outg_rec_nbs in df for which get_n_trsf_poles_per_outg reports exactly 
      one transformer pole number (i.e., outages which affect a single transformer).
    
    IMPORTANT: One should use the full dataset, not, e.g. results broken up by confusion matrix results (e.g., df_TP)
               The latter will not work because transformers belonging to the same outage may have different
                 predictions, meaning one can be tp and another fn.
               Thus, using df_TP would incorrectly see such a situation as an outage with a single transformer!
      
    outg_rec_nb_idfr/trsf_pole_nb_idfr:
        These direct from where the outg_rec_nbs and trsf_pole_nbs will be retrieved.
        Both are handed unchanged to get_n_trsf_poles_per_outg.
        !!! Explanations below for outg_rec_nbs_idfr, but hold true for trsf_pole_nb_idfr as well !!!
        This should be a string, list, or tuple.
        If the outg_rec_nbs are located in a column, idfr should simply be the column
            - Single index columns --> simple string
            - MultiIndex columns   --> appropriate tuple to identify column
        If the outg_rec_nbs are located in the index:
            - Single level index --> simple string 'index' or 'index_0'
            - MultiIndex index:  --> 
                - string f'index_{level}', where level is the index level containing the outg_rec_nbs
                - tuple of length 2, with 0th element ='index' and 1st element = idx_level_name where
                    idx_level_name is the name of the index level containing the outg_rec_nbs 
    """
    #-------------------------
    xfmr_counts = get_n_trsf_poles_per_outg(
        df                = df, 
        outg_rec_nb_idfr  = outg_rec_nb_idfr, 
        trsf_pole_nb_idfr = trsf_pole_nb_idfr
    )
    #-------------------------
    # Keep only the outages whose count is exactly 1; the outg_rec_nbs are the index of the counts
    has_single_xfmr = xfmr_counts==1
    single_xfmr_counts = xfmr_counts.loc[has_single_xfmr]
    return single_xfmr_counts.index.tolist()

In [None]:
# MOVED to Utilities_df!!!!!!!
# def get_idfr_loc(
#     df, 
#     idfr
# ):
#     r"""
#     Returns the identifier and whether or not it was found in index
#     If idfr found in columns, essentially just returns idfr
#     If idfr found in index, returns the index level where it was found
    
#     idfr:
#         This should be a string, list, or tuple.
#         If column, idfr should simply be the column
#             - Single index columns --> simple string
#             - MultiIndex columns   --> appropriate tuple to identify column
#         If in the index:
#             - Single level index --> simple string 'index' or 'index_0'
#             - MultiIndex index:  --> 
#                 - string f'index_{level}', where level is the index level containing the outg_rec_nbs
#                 - tuple of length 2, with 0th element ='index' and 1st element = idx_level_name where
#                     idx_level_name is the name of the index level containing the outg_rec_nbs 
#     """
#     #-------------------------
#     assert(Utilities.is_object_one_of_types(idfr, [str, list, tuple]))
#     # NOTE: pd doesn't like checking for idfr in df.columns if idfr is a list.  It is fine checking when
#     #       it is a tuple, as tuples can represent columns.  Therefore, if idfr is a list, convert to tuple
#     #       as this will fix the issue below and have no effect elsewhere.
#     if isinstance(idfr, list):
#         idfr = tuple(idfr)
#     if idfr in df.columns:
#         return idfr, False
#     #-------------------------
#     # If not in the columns (because return from function not executed above), outg_rec_nbs must be in the indices!
#     # The if/else block below determines idfr_idx_lvl
#     if isinstance(idfr, str):
#         assert(idfr.startswith('index'))
#         if idfr=='index':
#             idfr_idx_lvl=0
#         else:
#             idfr_idx_lvl = re.findall(r'index_(\d*)', idfr)
#             assert(len(idfr_idx_lvl)==1)
#             idfr_idx_lvl=idfr_idx_lvl[0]
#             idfr_idx_lvl=int(idfr_idx_lvl)
#     else:
#         assert(len(idfr)==2)
#         assert(idfr[0]=='index')
#         idx_level_name = idfr[1]
#         assert(idx_level_name in df.index.names)
#         # Need to also make sure idx_level_name only occurs once, so no ambiguity!
#         assert(df.index.names.count(idx_level_name)==1)
#         idfr_idx_lvl = df.index.names.index(idx_level_name)
#     #-------------------------
#     assert(idfr_idx_lvl < df.index.nlevels)
#     return (idfr_idx_lvl, True)

In [None]:
def get_direct_xfmrs_from_df(
    df, 
    outgs_w_single_xfmr=None, 
    xfmr_equip_typ_nms_of_interest=None, 
    outg_rec_nb_idfr='index_0', 
    trsf_pole_nb_idfr='index_1', 
    dovs_location_id_col=('outg_dummy_lvl_0', 'LOCATION_ID'), 
    dovs_equip_typ_nm_col=('outg_dummy_lvl_0', 'EQUIP_TYP_NM'), 
    return_indirect=False
):
    r"""
    Slice df down to the direct transformers (or, if return_indirect==True, the indirect transformers).

    Direct transformers must always have their pole number (trsf_pole_nb) equal to the DOVS location ID (LOCATION_ID).
    An additional constraint on the equipment type name (EQUIP_TYP_NM in DOVS) may be enforced by setting the
      xfmr_equip_typ_nms_of_interest parameters (e.g., xfmr_equip_typ_nms_of_interest = ['TRANSFORMER, OH', 'TRANSFORMER, UG'])
    Transformers from outages affecting only a single transformer may also be considered as direct by setting the
      outgs_w_single_xfmr parameter (using, e.g., the get_outgs_w_single_xfmr method).
    See LOGIC EXPLANATIONS below

    df:
        DataFrame to be sliced.
        The input df is NOT altered by this function (any index renaming is done on a shallow copy).

    outgs_w_single_xfmr: 
        Should be a list containing outg_rec_nbs.
        Can be obtained, e.g., from the get_outgs_w_single_xfmr method.
        IMPORTANT: When obtaining outgs_w_single_xfmr, one should use the full dataset, not, e.g. results broken up by confusion 
                     matrix results (e.g., df_TP).
                   The latter will not work because transformers belonging to the same outage may have different
                     predictions, meaning one can be tp and another fn.
                   Thus, using df_TP would incorrectly see such a situation as an outage with a single transformer!
                   SHORT: Even if using this function on the subset df_TP, one should obtain outgs_w_single_xfmr from df.

    xfmr_equip_typ_nms_of_interest:
        If not None, list of EQUIP_TYP_NM values which a direct transformer must additionally have.

    outg_rec_nb_idfr/trsf_pole_nb_idfr:
        Identifiers for where the outg_rec_nbs/trsf_pole_nbs live in df (a column, or an index level, e.g., 'index_0').
        These are resolved using Utilities_df.get_idfr_loc.

    dovs_location_id_col/dovs_equip_typ_nm_col:
        Columns in df housing the DOVS LOCATION_ID and EQUIP_TYP_NM values.

    return_indirect:
        If true, instead of returning the direct transformers, the indirect will be returned.
        Implemented simply using the apply_not parameter in DFSlicer (i.e., use the logic defined below to obtain
          slicing_booleans, but slice with ~slicing_booleans instead)

    Returns:
        The object returned by DFSlicer.combine_slicers_and_perform_slicing.
        NOTE: If either identifier is found in the index, the slicing is performed on df.reset_index(), where the
              identified index levels have first been given randomly generated names.

    ----- LOGIC EXPLANATIONS -----
    If outgs_w_single_xfmr is None and xfmr_equip_typ_nms_of_interest is None, the logic is as follows:
        trsf_pole_nb==LOCATION_ID
    -----
    If outgs_w_single_xfmr and xfmr_equip_typ_nms_of_interest are supplied, the logic is as follows:
        (trsf_pole_nb==LOCATION_ID & EQUIP_TYP_NM in xfmr_equip_typ_nms_of_interest) | OUTG_REC_NB in outgs_w_single_xfmr
    -----
    If outgs_w_single_xfmr is supplied and xfmr_equip_typ_nms_of_interest is None, the logic is as follows:
        trsf_pole_nb==LOCATION_ID | OUTG_REC_NB in outgs_w_single_xfmr
    -----
    If xfmr_equip_typ_nms_of_interest is supplied and outgs_w_single_xfmr is None, the logic is as follows:
        trsf_pole_nb==LOCATION_ID & EQUIP_TYP_NM in xfmr_equip_typ_nms_of_interest
    -----
    NOTE: If needed, one could easily include a parameter, e.g., join_outgs_w_single_xfmr, to allow one to join
            outgs_w_single_xfmr with 'or' (default) or 'and'
    """
    #-------------------------
    # Each *_idfr_loc is used below as a pair (location, is_in_index)
    outg_rec_nb_idfr_loc  = Utilities_df.get_idfr_loc(df, outg_rec_nb_idfr)
    trsf_pole_nb_idfr_loc = Utilities_df.get_idfr_loc(df, trsf_pole_nb_idfr)

    # If either is found in the index, reset_index will need to be called for DFSlicer to be used
    if outg_rec_nb_idfr_loc[1] or trsf_pole_nb_idfr_loc[1]:
        # Work on a shallow copy so the index level names of the caller's df are left untouched.
        # Without this, the random names set below would leak out to the caller, breaking any later use
        #   of name-based identifiers (e.g., ('index', 'outg_rec_nb')) on the same DataFrame.
        df = df.copy(deep=False)
        #-----
        if outg_rec_nb_idfr_loc[1]:
            # Random name guards against a clash with existing columns when reset_index is called
            df.index = df.index.set_names(Utilities.generate_random_string(), level=outg_rec_nb_idfr_loc[0])
            outg_rec_nb_idfr_loc = (df.index.names[outg_rec_nb_idfr_loc[0]], False)
        #-----
        if trsf_pole_nb_idfr_loc[1]:
            df.index = df.index.set_names(Utilities.generate_random_string(), level=trsf_pole_nb_idfr_loc[0])
            trsf_pole_nb_idfr_loc = (df.index.names[trsf_pole_nb_idfr_loc[0]], False)
        #-----
        df = df.reset_index()
    #-------------------------
    assert(outg_rec_nb_idfr_loc[0] in df.columns)
    assert(trsf_pole_nb_idfr_loc[0] in df.columns)
    #--------------------------------------------------
    # First slicer: trsf_pole_nb==LOCATION_ID (and, optionally, EQUIP_TYP_NM in xfmr_equip_typ_nms_of_interest)
    slicer = DFSlicer(
        single_slicers = [
            dict(
                column=dovs_location_id_col, 
                value=df[trsf_pole_nb_idfr_loc[0]], 
                comparison_operator='=='
            )
        ]
    )
    #-----
    if xfmr_equip_typ_nms_of_interest is not None:
        slicer.add_single_slicer(
            dict(
                column=dovs_equip_typ_nm_col, 
                value=xfmr_equip_typ_nms_of_interest, 
                comparison_operator='isin'
            )
        )
    #-------------------------
    slicers = [slicer]
    #-------------------------
    if outgs_w_single_xfmr is not None:
        # Need a second slicer, since this will be added with | (whereas first two
        #   are combined with &)
        slicer_2 = DFSlicer(
            single_slicers = [
                dict(
                    column=outg_rec_nb_idfr_loc[0], 
                    value=outgs_w_single_xfmr, 
                    comparison_operator='isin'
                )
            ]
        )
        slicers.append(slicer_2)
    #--------------------------------------------------
    return_df = DFSlicer.combine_slicers_and_perform_slicing(
        df=df, 
        slicers=slicers, 
        join_slicers='or', 
        apply_not=return_indirect
    )
    return return_df

In [None]:
def get_direct_xfmrs_in_tp_fn_from_df(
    data_df, 
    y_pred, 
    y_col=('is_outg', 'is_outg'), 
    outgs_w_single_xfmr=None, 
    xfmr_equip_typ_nms_of_interest=None, 
    outg_rec_nb_idfr='index_0', 
    trsf_pole_nb_idfr='index_1'
):
    r"""
    Returns the subset of direct (and indirect) transformers from data_df in the true positive (tp) and 
      false negative (fn) groups.
    Return object is dictionary with keys = TP_dir, TP_indir, FN_dir, FN_indir
    NOTE: Can only return results for TP and FN because the idea of a direct transformer only exists
            for real outages (no such analog in baseline data)
    NOTE: If a group (TP or FN) is empty after the outage info is appended, both of its entries in the 
            returned dictionary are empty pd.DataFrame objects.

    Direct transformers must always have their pole number (trsf_pole_nb) equal to the DOVS location ID (LOCATION_ID).
    An additional constraint on the equipment type name (EQUIP_TYP_NM in DOVS) may be enforced by setting the
      xfmr_equip_typ_nms_of_interest parameters (e.g., xfmr_equip_typ_nms_of_interest = ['TRANSFORMER, OH', 'TRANSFORMER, UG'])
    Transformers from outages affecting only a single transformer may also be considered as direct by setting the
      outgs_w_single_xfmr parameter (using, e.g., the get_outgs_w_single_xfmr method).
    See the LOGIC EXPLANATIONS section in the documentation of get_direct_xfmrs_from_df

    data_df/y_pred/y_col:
        Passed directly to get_df_subset_by_binary_confusion_matrix_result, from which the 'TP' and 'FN' 
          subsets are taken.

    outgs_w_single_xfmr: 
        Should be a list containing outg_rec_nbs.
        Can be obtained, e.g., from the get_outgs_w_single_xfmr method.
        IMPORTANT: When obtaining outgs_w_single_xfmr, one should use the full dataset, not, e.g. results broken up by confusion 
                     matrix results (e.g., df_TP).
                   The latter will not work because transformers belonging to the same outage may have different
                     predictions, meaning one can be tp and another fn.
                   Thus, using df_TP would incorrectly see such a situation as an outage with a single transformer!
                   SHORT: Even if using this function on the subset df_TP, one should obtain outgs_w_single_xfmr from df.

    xfmr_equip_typ_nms_of_interest/outg_rec_nb_idfr/trsf_pole_nb_idfr:
        Passed directly to get_direct_xfmrs_from_df (outg_rec_nb_idfr is also used when appending the outage info).
    """
    #--------------------------------------------------
    data_df_by_cnfsn_res = get_df_subset_by_binary_confusion_matrix_result(
        data_df=data_df, 
        y_pred=y_pred, 
        y_col=y_col
    )
    #--------------------------------------------------
    def _split_into_dir_and_indir(df_subset):
        # Helper: append DOVS outage info to a single confusion-matrix subset and return (direct, indirect)
        df_subset = DOVSOutages.append_outg_info_to_df(
            df=df_subset, 
            outg_rec_nb_idfr=outg_rec_nb_idfr, 
            build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
        )
        #-----
        if df_subset.shape[0]==0:
            return pd.DataFrame(), pd.DataFrame()
        #-----
        # The DOVS columns are located only for non-empty subsets, and separately for each subset, so an
        #   empty TP subset cannot prevent the FN subset from being processed (and vice versa)
        dovs_location_id_col = Utilities_df.find_single_col_in_multiindex_cols(
            df=df_subset, 
            col='LOCATION_ID'
        )
        dovs_equip_typ_nm_col = Utilities_df.find_single_col_in_multiindex_cols(
            df=df_subset, 
            col='EQUIP_TYP_NM'
        )
        assert(dovs_location_id_col in df_subset.columns and dovs_equip_typ_nm_col in df_subset.columns)
        #-----
        # Same call twice, differing only in return_indirect (False ==> direct, True ==> indirect)
        df_dir, df_indir = [
            get_direct_xfmrs_from_df(
                df=df_subset, 
                outgs_w_single_xfmr=outgs_w_single_xfmr, 
                xfmr_equip_typ_nms_of_interest=xfmr_equip_typ_nms_of_interest, 
                outg_rec_nb_idfr=outg_rec_nb_idfr, 
                trsf_pole_nb_idfr=trsf_pole_nb_idfr, 
                dovs_location_id_col=dovs_location_id_col, 
                dovs_equip_typ_nm_col=dovs_equip_typ_nm_col, 
                return_indirect=return_indirect_i
            )
            for return_indirect_i in [False, True]
        ]
        return df_dir, df_indir
    #--------------------------------------------------
    data_df_tp_dir, data_df_tp_indir = _split_into_dir_and_indir(data_df_by_cnfsn_res['TP'])
    data_df_fn_dir, data_df_fn_indir = _split_into_dir_and_indir(data_df_by_cnfsn_res['FN'])
    #--------------------------------------------------
    return_dict = dict(
        TP_dir   = data_df_tp_dir, 
        TP_indir = data_df_tp_indir, 
        FN_dir   = data_df_fn_dir, 
        FN_indir = data_df_fn_indir, 
    )
    return return_dict


def get_n_direct_xfmrs_in_tp_fn(
    data_df, 
    y_pred, 
    y_col=('is_outg', 'is_outg'), 
    outgs_w_single_xfmr=None, 
    xfmr_equip_typ_nms_of_interest=None, 
    outg_rec_nb_idfr='index_0', 
    trsf_pole_nb_idfr='index_1'
):
    r"""
    Counts the direct and indirect transformers in the true positive (tp) and false negative (fn) groups.
    This is a thin wrapper around get_direct_xfmrs_in_tp_fn_from_df: all arguments are forwarded unchanged,
      and each DataFrame in the returned dictionary is replaced by its number of rows (.shape[0]).

    Return object is dictionary with keys = TP_dir, TP_indir, FN_dir, FN_indir (values are row counts)
    NOTE: Can only return results for TP and FN because the idea of a direct transformer only exists
            for real outages (no such analog in baseline data)

    Direct transformers must always have their pole number (trsf_pole_nb) equal to the DOVS location ID (LOCATION_ID).
    An additional constraint on the equipment type name (EQUIP_TYP_NM in DOVS) may be enforced by setting the
      xfmr_equip_typ_nms_of_interest parameters (e.g., xfmr_equip_typ_nms_of_interest = ['TRANSFORMER, OH', 'TRANSFORMER, UG'])
    Transformers from outages affecting only a single transformer may also be considered as direct by setting the
      outgs_w_single_xfmr parameter (using, e.g., the get_outgs_w_single_xfmr method).

    outgs_w_single_xfmr: 
        Should be a list containing outg_rec_nbs.
        Can be obtained, e.g., from the get_outgs_w_single_xfmr method.
        IMPORTANT: One should obtain outgs_w_single_xfmr from the full dataset, not from a confusion matrix
                     subset (e.g., df_TP).
                   Transformers belonging to the same outage may have different predictions (one tp, another fn),
                     so a subset can incorrectly make an outage look like it has a single transformer!
    """
    #--------------------------------------------------
    forwarded_kwargs = dict(
        data_df                        = data_df, 
        y_pred                         = y_pred, 
        y_col                          = y_col, 
        outgs_w_single_xfmr            = outgs_w_single_xfmr, 
        xfmr_equip_typ_nms_of_interest = xfmr_equip_typ_nms_of_interest, 
        outg_rec_nb_idfr               = outg_rec_nb_idfr, 
        trsf_pole_nb_idfr              = trsf_pole_nb_idfr
    )
    xfmr_dfs_by_group = get_direct_xfmrs_in_tp_fn_from_df(**forwarded_kwargs)
    #--------------------------------------------------
    # Explicit key list keeps the order (and the set) of returned keys fixed
    group_keys = ['TP_dir', 'TP_indir', 'FN_dir', 'FN_indir']
    return {group_key_i: xfmr_dfs_by_group[group_key_i].shape[0] for group_key_i in group_keys}

# ---------------------------------------------------------------------------------------------------
# Time info related methods
# ---------------------------------------------------------------------------------------------------

In [None]:
def standardize_baseline_time_infos_df(
    bsln_time_infos_df
):
    r"""
    Put a baseline time infos DataFrame into the standard form:
        index   = ['outg_rec_nb', 'trsf_pole_nb']  (where outg_rec_nb is taken from no_outg_rec_nb)
        columns = ['t_min', 't_max']

    bsln_time_infos_df:
        Must contain 'no_outg_rec_nb', 'trsf_pole_nb', 't_min', and 't_max', each either as a column or as a 
          named index level (an AssertionError is raised otherwise).
        Any other columns/index levels are dropped from the output.
        The input DataFrame is not altered.
    """
    #-------------------------
    idx_cols  = ['no_outg_rec_nb', 'trsf_pole_nb']
    time_cols = ['t_min', 't_max']
    #-------------------------
    # reset_index returns a new DataFrame, with all index levels moved into the columns
    flat_df = bsln_time_infos_df.reset_index()
    assert(set(idx_cols + time_cols).issubset(set(flat_df.columns)))
    #-------------------------
    return (
        flat_df
        .set_index(idx_cols)[time_cols]
        .rename_axis(index=['outg_rec_nb', 'trsf_pole_nb'])
    )

In [None]:
def build_baseline_time_infos_df(
    ede_data_dirs_bsln,
    standardize=True, 
    save_path=None
):
    r"""
    Build the baseline time infos DataFrame from a collection of data directories.

    For each directory in ede_data_dirs_bsln, MECPOAn.get_bsln_time_interval_infos_df_for_data_in_dir is called 
      (with make_addtnl_groupby_idx=True and include_summary_paths=False), and the results are concatenated.

    The index levels of the concatenated DataFrame must be 'no_outg_rec_nb' and 'trsf_pole_nb', plus, optionally,
      'is_first_after_outg' (an AssertionError is raised otherwise).
    The index levels are put in the order ['no_outg_rec_nb', 'trsf_pole_nb'(, 'is_first_after_outg')]

    standardize:
        If True, the result is passed through standardize_baseline_time_infos_df before being saved/returned.

    save_path:
        If truthy, the final DataFrame is pickled to this path (using DataFrame.to_pickle).
    """
    #-------------------------
    dfs_by_dir = [
        MECPOAn.get_bsln_time_interval_infos_df_for_data_in_dir(
            data_dir=ede_dir_i, 
            make_addtnl_groupby_idx=True, 
            include_summary_paths=False
        )
        for ede_dir_i in ede_data_dirs_bsln
    ]
    time_infos_df = pd.concat(dfs_by_dir)
    #-------------------------
    # Index names should be 'trsf_pole_nb', 'no_outg_rec_nb', and possibly 'is_first_after_outg' 
    #   (typically in that order, but not necessarily)
    found_idx_names   = set(time_infos_df.index.names)
    allowed_idx_names = {'trsf_pole_nb', 'no_outg_rec_nb', 'is_first_after_outg'}
    assert(found_idx_names.issubset(allowed_idx_names))
    #-----
    n_idx_levels = time_infos_df.index.nlevels
    assert(n_idx_levels in (2, 3))
    #-----
    # Desired order of the index levels in the output
    desired_idx_order = ['no_outg_rec_nb', 'trsf_pole_nb']
    if n_idx_levels==3:
        desired_idx_order = desired_idx_order + ['is_first_after_outg']
    assert(found_idx_names == set(desired_idx_order))
    #-----
    # Only rebuild the index if the levels are not already in the desired order
    if list(time_infos_df.index.names) != desired_idx_order:
        time_infos_df = time_infos_df.reset_index().set_index(desired_idx_order)
    #-------------------------
    if standardize:
        time_infos_df = standardize_baseline_time_infos_df(
            bsln_time_infos_df = time_infos_df
        )
    #-------------------------
    if save_path:
        time_infos_df.to_pickle(save_path)
    #-------------------------
    return time_infos_df

In [None]:
# USE build_baseline_time_infos_df with standardize=False instead (exactly the same)
def build_no_outg_time_infos_df(
    ede_data_dirs_no_outg,
    save_path=None
):
    r"""
    Old name, kept so existing callers continue to work.
    Equivalent to calling build_baseline_time_infos_df with standardize=False, which is exactly what is done here 
      (instead of maintaining a second copy of the same code).

    ede_data_dirs_no_outg:
        Collection of data directories, passed to build_baseline_time_infos_df as ede_data_dirs_bsln.

    save_path:
        Passed to build_baseline_time_infos_df (if truthy, the final DataFrame is pickled to this path).

    Returns the non-standardized time infos DataFrame, whose index levels are 
      ['no_outg_rec_nb', 'trsf_pole_nb'(, 'is_first_after_outg')]
    """
    #-------------------------
    # standardize=False is essential: this function never applied standardize_baseline_time_infos_df
    return build_baseline_time_infos_df(
        ede_data_dirs_bsln = ede_data_dirs_no_outg, 
        standardize        = False, 
        save_path          = save_path
    )

In [None]:
# MOVED TO MECPOAn
def build_outg_time_infos_df(
    rcpx_df, 
    outg_rec_nb_idfr=('index', 'outg_rec_nb'), 
    dummy_col_levels_prefix='dummy_lvl_',     
):
    r"""
    Build the time infos DataFrame for the outage data in rcpx_df.

    The columns appended to (a copy of) rcpx_df by DOVSOutages.append_outg_dt_off_ts_full_to_df 
      (called with include_dt_on_ts=True) are kept, and all of the original columns of rcpx_df are dropped.
    The appended columns must include DT_OFF_TS_FULL and DT_ON_TS, which are renamed to t_min and t_max, respectively.
    If the resulting columns have two levels, level 0 (which must contain a single unique value) is dropped.

    rcpx_df:
        Input DataFrame (not altered, as a copy is handed to DOVSOutages.append_outg_dt_off_ts_full_to_df).

    outg_rec_nb_idfr/dummy_col_levels_prefix:
        Passed directly to DOVSOutages.append_outg_dt_off_ts_full_to_df.

    NOTE: The appended columns are returned in the order in which they appear in the DataFrame returned by 
            DOVSOutages.append_outg_dt_off_ts_full_to_df.
          (Previously, the order came from a set difference, and was therefore not guaranteed to be
            reproducible from one run to the next.)
    """
    #-------------------------
    og_cols = set(rcpx_df.columns.tolist())
    #-------------------------
    time_infos_df_outg = DOVSOutages.append_outg_dt_off_ts_full_to_df(
        df=rcpx_df.copy(), 
        outg_rec_nb_idfr=outg_rec_nb_idfr, 
        dummy_col_levels_prefix=dummy_col_levels_prefix, 
        include_dt_on_ts=True
    )
    #-------------------------
    # Keep only the newly appended columns, preserving their order in the DataFrame.
    # dict.fromkeys removes any repeated labels while keeping first-seen order (a set would lose the order)
    time_info_cols = list(dict.fromkeys(
        col_i for col_i in time_infos_df_outg.columns.tolist() 
        if col_i not in og_cols
    ))
    time_infos_df_outg = time_infos_df_outg[time_info_cols]
    if time_infos_df_outg.columns.nlevels>1:
        assert(time_infos_df_outg.columns.nlevels==2)
        assert(time_infos_df_outg.columns.get_level_values(0).nunique()==1)
        time_infos_df_outg.columns = time_infos_df_outg.columns.droplevel(0)
    #-------------------------
    assert(len(set(['DT_OFF_TS_FULL', 'DT_ON_TS']).difference(set(time_infos_df_outg.columns)))==0)
    time_infos_df_outg = time_infos_df_outg.rename(columns={
        'DT_OFF_TS_FULL':'t_min', 
        'DT_ON_TS':'t_max'
    })
    #-------------------------
    return time_infos_df_outg

In [None]:
def build_baseline_time_infos_df_for_eemsp(
    bsln_time_infos_df
):
    r"""
    Put a baseline time infos DataFrame into the form expected for use with EEMSP:
        index   = ['outg_rec_nb', 'trsf_pole_nb']  (where outg_rec_nb is taken from no_outg_rec_nb)
        columns = ['t_min', 't_max']

    This is exactly the operation performed by standardize_baseline_time_infos_df, so the work is handed off 
      to that function (instead of maintaining a second copy of the same code).

    bsln_time_infos_df:
        Must contain 'no_outg_rec_nb', 'trsf_pole_nb', 't_min', and 't_max', each either as a column or as a 
          named index level (an AssertionError is raised otherwise).
        The input DataFrame is not altered.
    """
    #-------------------------
    return standardize_baseline_time_infos_df(
        bsln_time_infos_df = bsln_time_infos_df
    )

In [None]:
# USE build_baseline_time_infos_df_for_eemsp instead (exactly the same)
def build_no_outg_time_infos_df_for_eemsp(
    no_outg_time_infos_df
):
    r"""
    Old name, kept so existing callers continue to work.
    Exactly the same as build_baseline_time_infos_df_for_eemsp, to which the work is handed off 
      (instead of maintaining a second copy of the same code).

    no_outg_time_infos_df:
        Must contain 'no_outg_rec_nb', 'trsf_pole_nb', 't_min', and 't_max', each either as a column or as a 
          named index level.

    Returns a DataFrame with index = ['outg_rec_nb', 'trsf_pole_nb'] and columns = ['t_min', 't_max']
    """
    #-------------------------
    return build_baseline_time_infos_df_for_eemsp(
        bsln_time_infos_df = no_outg_time_infos_df
    )

In [None]:
def build_time_infos_df_for_eemsp(
    time_infos_df_outg, 
    time_infos_df_otbl, 
    time_infos_df_prbl
):
    r"""
    Stack the time infos DataFrames into a single DataFrame with columns ['t_min', 't_max'].

    time_infos_df_outg/time_infos_df_otbl:
        Both are required.
        Each must have index names equal to ['outg_rec_nb', 'trsf_pole_nb'] and must contain the columns 
          't_min' and 't_max' (an AssertionError is raised otherwise).

    time_infos_df_prbl:
        May be None, in which case only time_infos_df_outg and time_infos_df_otbl are combined.
        If not None, it must satisfy the same requirements as the other two, and is appended last.

    Only the 't_min' and 't_max' columns of each input are kept.
    """
    #-------------------------
    expected_idx_names = ['outg_rec_nb', 'trsf_pole_nb']
    time_cols          = ['t_min', 't_max']
    #-------------------------
    input_dfs = [time_infos_df_outg, time_infos_df_otbl]
    if time_infos_df_prbl is not None:
        input_dfs.append(time_infos_df_prbl)
    #-------------------------
    # Validate each input and reduce it down to the time columns
    reduced_dfs = []
    for input_df_i in input_dfs:
        assert(list(input_df_i.index.names) == expected_idx_names)
        assert(set(time_cols).issubset(set(input_df_i.columns)))
        reduced_dfs.append(input_df_i[time_cols])
    #-------------------------
    # outg and otbl are combined first, then prbl (if supplied) is appended to the result
    time_infos_df = pd.concat(reduced_dfs[:2])
    for reduced_df_i in reduced_dfs[2:]:
        time_infos_df = pd.concat([time_infos_df, reduced_df_i])
    #-------------------------
    return time_infos_df

# ---------------------------------------------------------------------------------------------------
# EEMSP
# ---------------------------------------------------------------------------------------------------

In [None]:
# # BOTH FUNCTIONS MOVED TO EEMSP.py!!!!!
# def reduce1_eemsp_for_outg_trsf_i(
#     time_infos_df_i, 
#     df_eemsp, 
#     outg_rec_nb_idfr, 
#     trsf_pole_nb_idfr, 
#     dt_min_col, 
#     dt_max_col, 
#     eemsp_location_nb_col = 'LOCATION_NB', 
#     eemsp_install_dt_col  = 'INSTALL_DT', 
#     eemsp_removal_dt_col  = 'REMOVAL_DT', 
#     return_eemsp_outg_rec_nb_col = 'OUTG_REC_NB_TO_MERGE'
# ):
#     r"""
#     For a particular (outg_rec_nb, trsf_pole_nb) group, find the corresponding EEMSP entries.
#     Typically, EEMSP will have multiple entries for each transformer, so the point of this function
#       is to find the correct entries at the time of the outage.
#     After this procedure, all entries will be time-appropriate, but there may still be multiple entries
#       for a particular (outg_rec_nb, trsf_pole_nb) group.
#     Multiple entries will occur, e.g., if there are multiple transformers on the particular trsf_pole_nb
#       (at this point, I do not know how to determine to which transformer a meter is connected, only the
#       trsf_pole_nb).
#     For reductions down to one entry, see reduce2_eemsp_for_outg_trsf_i
      
#     NOTE: For this case, the outg_rec_nb_idfr/trsf_pole_nb_idfr should be equal to a column.
#           One cannot use, e.g., 'index_0'
#           This function is not really meant to be used on its own, and the correct formatting of time_infos_df_i
#             is taken care of by the calling function
#     """
#     #-------------------------
#     assert(isinstance(time_infos_df_i, pd.Series))
#     #-------------------------
#     outg_rec_nb  = time_infos_df_i[outg_rec_nb_idfr]
#     trsf_pole_nb = time_infos_df_i[trsf_pole_nb_idfr]
#     dt_min       = time_infos_df_i[dt_min_col]
#     if dt_max_col is not None:
#         dt_max = time_infos_df_i[dt_max_col]
#     else:
#         dt_max = dt_min
#     #-------------------------
#     df_eemsp_i = df_eemsp[df_eemsp[eemsp_location_nb_col]==trsf_pole_nb]
#     #-----
#     df_eemsp_i = df_eemsp_i[
#         (df_eemsp_i[eemsp_install_dt_col] <= dt_min) & 
#         (df_eemsp_i[eemsp_removal_dt_col].fillna(pd.Timestamp.max) > dt_max)
#     ]
#     df_eemsp_i = df_eemsp_i.drop_duplicates()
#     #-------------------------    
#     df_eemsp_i[return_eemsp_outg_rec_nb_col] = outg_rec_nb
#     #-------------------------
#     return df_eemsp_i


# def reduce1_eemsp_for_outg_trsf(
#     time_infos_df, 
#     df_eemsp, 
#     outg_rec_nb_idfr  = 'index_0', 
#     trsf_pole_nb_idfr = 'index_1', 

#     dt_min_col = ('dummy_lvl_0', 'DT_OFF_TS_FULL'), 
#     dt_max_col = None, 
    
#     eemsp_location_nb_col = 'LOCATION_NB', 
#     eemsp_install_dt_col  = 'INSTALL_DT', 
#     eemsp_removal_dt_col  = 'REMOVAL_DT', 
#     return_eemsp_outg_rec_nb_col = 'OUTG_REC_NB_TO_MERGE', 
#     verbose=True,
#     n_update=1000
# ):
#     r"""
#     For each (outg_rec_nb, trsf_pole_nb) group, find the corresponding EEMSP entries.
#     Typically, EEMSP will have multiple entries for each transformer/pole, so the point of this function
#       is to find the correct entries at the time of the outage.
#     After this procedure, all entries will be time-appropriate, but there may still be multiple entries
#       for a particular (outg_rec_nb, trsf_pole_nb) group.
#     Multiple entries will occur, e.g., if there are multiple transformers on the particular trsf_pole_nb
#       (at this point, I do not know how to determine to which transformer a meter is connected, only the
#       trsf_pole_nb).
#     For reductions down to one entry, see reduce2_eemsp_for_outg_trsf_i
    
#     If the outg_rec_nbs/trsf_pole_nbs are stored in the indices, and the indices are named,
#       one can simply supply the corresponding names.
#     Otherwise, one can always supply, e.g., index_0 and index_1

#     If only a single time is to be used (e.g., the outage starting time), set only dt_min_col
#     If two times (e.g., outage starting and stopping) set both dt_min_col and dt_max_col
#     """
#     #----------------------------------------------------------------------------------------------------
#     #----------------------------------------------------------------------------------------------------
#     # 1. First, determine where exactly outg_rec_nb and trsf_pole_nb are located in time_infos_df.
#     # 2. If in indices, call reset_index()
#     # 3. Set grp_by_cols, and make sure each combination of outg_rec_nb, trsf_pole_nb 
#     #    has a single datetime entry.
#     #-------------------------
#     outg_rec_nb_idfr_loc  = Utilities_df.get_idfr_loc(time_infos_df, outg_rec_nb_idfr)
#     trsf_pole_nb_idfr_loc = Utilities_df.get_idfr_loc(time_infos_df, trsf_pole_nb_idfr)
#     #--------------------------------------------------
#     # If outg_rec_nbs in index (i.e., outg_rec_nb_idfr_loc[1]==True), grab the index name and set 
#     #   outg_rec_nb_idfr equal to it.
#     #   If no index name, give it a random name
#     # If outg_rec_nbs in a column (i.e., outg_rec_nb_idfr_loc[1]==False), simply set outg_rec_nb_idfr
#     #   equal to it (i.e., set equal to outg_rec_nb_idfr_loc[0])
#     if outg_rec_nb_idfr_loc[1]:
#         if time_infos_df.index.names[outg_rec_nb_idfr_loc[0]]:
#             outg_rec_nb_idfr = time_infos_df.index.names[outg_rec_nb_idfr_loc[0]]
#         else:
#             tmp_outg_rec_nb_name = 'outg_rec_nb_'+Utilities.generate_random_string(str_len=4)
#             time_infos_df.index = time_infos_df.index.set_names(tmp_outg_rec_nb_name, level=outg_rec_nb_idfr_loc[0])
#             outg_rec_nb_idfr = tmp_outg_rec_nb_name
#     else:
#         outg_rec_nb_idfr = outg_rec_nb_idfr_loc[0]
#     #-------------------------
#     # Do the same thing for trsf_pole_nbs
#     if trsf_pole_nb_idfr_loc[1]:
#         if time_infos_df.index.names[trsf_pole_nb_idfr_loc[0]]:
#             trsf_pole_nb_idfr = time_infos_df.index.names[trsf_pole_nb_idfr_loc[0]]
#         else:
#             tmp_trsf_pole_nb_name = 'trsf_pole_nb_'+Utilities.generate_random_string(str_len=4)
#             time_infos_df.index = time_infos_df.index.set_names(tmp_trsf_pole_nb_name, level=trsf_pole_nb_idfr_loc[0])
#             trsf_pole_nb_idfr = tmp_trsf_pole_nb_name
#     else:
#         trsf_pole_nb_idfr = trsf_pole_nb_idfr_loc[0]
#     #--------------------------------------------------
#     grp_by_cols = [outg_rec_nb_idfr, trsf_pole_nb_idfr, dt_min_col]
#     if dt_max_col is None:
#         dt_max_col = dt_min_col
#     else:
#         grp_by_cols.append(dt_max_col)
#     #-------------------------
#     # Even if outg_rec_nbs and/or trsf_pole_nbs are in the index, the above methods ensure the indices will be 
#     #   named in such a case.
#     # Therefore, one can use .groupby with the index names or columns
#     # NOTE: Each combination of outg_rec_nb, trsf_pole_nb should only have a single datetime entry!
#     #       The assertion below enforces that
#     n_groups = time_infos_df.groupby(grp_by_cols).ngroups
#     assert(
#         n_groups == time_infos_df.groupby([outg_rec_nb_idfr, trsf_pole_nb_idfr]).ngroups
#     )    
    
#     #----------------------------------------------------------------------------------------------------
#     #----------------------------------------------------------------------------------------------------
#     # Form groups_df, which will have columns housing the outg_rec_nb, trsf_pole_nb, and the time info (one
#     #   additional column if dt_max_col is None, two additional columns if not None)
#     #-------------------------
#     #--------------------------------------------------
#     # Using the groupby method is much easier, but also much slower!
#     #   group by method: groups = list(time_infos_df.groupby(grp_by_cols).groups.keys())
#     # So, instead, I will use .value_counts method after calling reset_index()
#     # This is annoying because if outg_rec_nb or trsf_pole_nb in index, I must call reset_index
#     # Furthermore, if time_infos_df has MultiIndex columns, the identifiers will change from strings to tuples
#     # In such a case, pandas still is able to grab time_infos_df[[outg_rec_nb_idfr, trsf_pole_nb_idfr]]
#     #   i.e., is smart enough to find, e.g., ['outg_rec_nb', 'trsf_pole_nb'], even though the columns
#     #   are technically [('outg_rec_nb', ''), ('trsf_pole_nb', '')]
#     # But, pandas is NOT smart enough to grab time_infos_df[grp_by_cols] 
#     #   i.e., too dumb to find, e.g., ['outg_rec_nb', 'trsf_pole_nb', ('dummy_lvl_0', 'DT_OFF_TS_FULL')] when
#     #   the columns are technically  [('outg_rec_nb', ''), ('trsf_pole_nb', ''), ('dummy_lvl_0', 'DT_OFF_TS_FULL')]
#     #-------------------------
#     if outg_rec_nb_idfr_loc[1] and time_infos_df.columns.nlevels>1:
#         outg_rec_nb_idfr = tuple([outg_rec_nb_idfr] + ['' for _ in range(time_infos_df.columns.nlevels-1)])
#         grp_by_cols[0] = outg_rec_nb_idfr
#     #-----
#     if trsf_pole_nb_idfr_loc[1] and time_infos_df.columns.nlevels>1:
#         trsf_pole_nb_idfr = tuple([trsf_pole_nb_idfr] + ['' for _ in range(time_infos_df.columns.nlevels-1)])
#         grp_by_cols[1] = trsf_pole_nb_idfr
#     #-------------------------
#     groups_df = time_infos_df.reset_index()[grp_by_cols].value_counts()
#     groups_df=groups_df.reset_index().drop(columns='count')
#     assert(groups_df.shape[0]==n_groups)
    
#     #----------------------------------------------------------------------------------------------------
#     #----------------------------------------------------------------------------------------------------
#     # Chop down size of df_eemsp first
#     # Get rid of any with install date after last time or removal date before first time
#     df_eemsp=df_eemsp[~(
#         (df_eemsp[eemsp_install_dt_col]>time_infos_df[dt_max_col].max()) |
#         (df_eemsp[eemsp_removal_dt_col]<time_infos_df[dt_min_col].min())
#     )]
#     df_eemsp = df_eemsp.drop_duplicates()
#     #----------------------------------------------------------------------------------------------------
#     #----------------------------------------------------------------------------------------------------
#     # Iterate through groups_df, and grab the appropriate entries from df_eemsp
#     eemsp_dfs = []
#     for i,(idx_i, df_i) in enumerate(groups_df.iterrows()):
#         if verbose and i%n_update==0:
#             print(f"{i} of {groups_df.shape[0]}")
#         eemsp_df_i = reduce1_eemsp_for_outg_trsf_i(
#             time_infos_df_i=df_i, 
#             df_eemsp=df_eemsp, 
#             outg_rec_nb_idfr=outg_rec_nb_idfr, 
#             trsf_pole_nb_idfr=trsf_pole_nb_idfr, 
#             dt_min_col=dt_min_col, 
#             dt_max_col=dt_max_col, 
#             eemsp_location_nb_col=eemsp_location_nb_col, 
#             eemsp_install_dt_col= eemsp_install_dt_col, 
#             eemsp_removal_dt_col= eemsp_removal_dt_col, 
#             return_eemsp_outg_rec_nb_col=return_eemsp_outg_rec_nb_col
#         )
#         eemsp_dfs.append(eemsp_df_i)
#     return_df_eemsp = pd.concat(eemsp_dfs)
#     return return_df_eemsp

# BEG OLD
The new versions still need to be implemented in other notebooks, so the old versions are temporarily kept.

In [None]:
def remove_ambiguous_from_df_eemsp_OLD(
    df_eemsp, 
    outg_rec_nb_col='OUTG_REC_NB_TO_MERGE', 
    location_nb_col='LOCATION_NB'
):
    r"""
    Keep only the rows of df_eemsp whose (outg_rec_nb_col, location_nb_col) pair occurs exactly once;
      every pair with multiple entries is removed.

    df_eemsp:
        pd.DataFrame containing outg_rec_nb_col and location_nb_col.
    outg_rec_nb_col/location_nb_col:
        The two key columns.  Each is passed through Utilities_df.find_single_col_in_multiindex_cols, 
          so a simple string may be given even when df_eemsp has MultiIndex columns.

    Returns:
        A new pd.DataFrame with the two key columns first (they pass through set_index/reset_index) and
          rows ordered as the surviving pairs are ordered by value_counts.
    """
    #-------------------------
    # Resolve the key columns (string ==> full column identifier when columns are MultiIndex)
    key_cols = [
        Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=col_i)
        for col_i in (outg_rec_nb_col, location_nb_col)
    ]
    #-------------------------
    # Number of rows for each key pair; only pairs with a single row are unambiguous
    pair_counts = df_eemsp[key_cols].value_counts()
    unique_pairs = pair_counts[pair_counts==1].index
    #-------------------------
    # Select the unambiguous pairs by label, then restore the keys as regular columns
    return df_eemsp.set_index(key_cols).loc[unique_pairs].reset_index()


def reduce2_eemsp_for_outg_trsf_i_OLD(
    df_eemsp_i, 
    agg_dict, 
    return_idx_order, 
    mult_strategy='agg', 
    include_n_eemsp=True
):
    r"""
    Helper function for reduce2_eemsp_for_outg_trsf_OLD: collapses the rows of a single group
      into one pd.Series.
    In practice the only mult_strategy expected here is 'agg' ('first' is handled in 
      reduce2_eemsp_for_outg_trsf_OLD with the nth() function, and 'exclude' should never reach
      this function).

    df_eemsp_i:
        pd.DataFrame holding the rows of one group.
    agg_dict:
        Dict handed to df_eemsp_i.agg when the group has more than one row and mult_strategy=='agg'.
    return_idx_order:
        Labels (and their order) of the returned series; applied with reindex.
    include_n_eemsp:
        If True, the number of rows in df_eemsp_i is stored under 'n_eemsp' in the returned series.
    """
    #-------------------------
    assert(mult_strategy in ['agg', 'first'])
    #-------------------------
    n_rows = df_eemsp_i.shape[0]
    # Aggregation is only needed when there is more than one row to combine; a lone row
    #   (or the 'first' strategy) simply takes the first row.
    if n_rows!=1 and mult_strategy=='agg':
        reduced_srs = df_eemsp_i.agg(agg_dict)
    else:
        reduced_srs = df_eemsp_i.iloc[0]
    #-------------------------
    reduced_srs = reduced_srs.reindex(index=return_idx_order)
    if include_n_eemsp:
        reduced_srs['n_eemsp'] = n_rows
    #-------------------------
    return reduced_srs


def reduce2_eemsp_for_outg_trsf_OLD(
    df_eemsp, 
    mult_strategy='agg', 
    include_n_eemsp=True, 
    outg_rec_nb_col='OUTG_REC_NB_TO_MERGE', 
    location_nb_col='LOCATION_NB', 
    numeric_cols = ['KVA_SIZE'], 
    dt_cols = ['INSTALL_DT', 'REMOVAL_DT'], 
    ignore_cols = ['SERIAL_NB'], 
    cat_cols_as_strings=True
):
    r"""
    To be run after reduce1_eemsp_for_outg_trsf_i!!!!!
    The intent of this function is to reduce df_eemsp down to one row per outg_rec_nb_col/location_nb_col group

    df_eemsp:
        pd.DataFrame to be reduced.  A copy is made internally, so the input frame is not altered.

    mult_strategy:
        Dictates how (outg_rec_nb, trsf_pole_nb) groups with multiple EEMSP entries will be handled.
        Can be 'agg', 'first', or 'exclude'
        'agg':
            For (outg_rec_nb, trsf_pole_nb) groups with multiple entries, aggregate
            (mean for numeric_cols and dt_cols, unique values for categorical columns)
        'first':
            For (outg_rec_nb, trsf_pole_nb) groups with multiple entries, take the first row after
            sorting by dt_cols in descending order
        'exclude':
            Exclude (outg_rec_nb, trsf_pole_nb) groups with multiple entries

    include_n_eemsp:
        If True and mult_strategy is 'agg' or 'first', a 'n_eemsp' column with the number of rows in each
          group is included in the output.  Not used when mult_strategy=='exclude'.

    outg_rec_nb_col/location_nb_col:
        The columns defining the groups.  A simple string may be given when df_eemsp has MultiIndex columns.

    numeric_cols/dt_cols:
        Columns to be treated as numeric/datetime.  Columns not already of the appropriate dtype are 
          converted via Utilities_df.convert_col_type.
        These lists are copied internally; neither the caller's lists nor the defaults are modified.

    ignore_cols:
        Columns left out of the result.  Those not found in df_eemsp are silently skipped.

    cat_cols_as_strings:
        Categorical columns can either be aggregated as (sorted) lists of unique values (cat_cols_as_strings==False),
          or as strings (cat_cols_as_strings==True)
        If cat_cols_as_strings==True, a given string is simply the (sorted) lists of unique values joined by commas

    Returns:
        pd.DataFrame with one row per group.
    """
    #-------------------------
    assert(mult_strategy in ['agg', 'first', 'exclude'])
    #-------------------------
    # Copy below probably not necessary
    df_eemsp = df_eemsp.copy()
    #-------------------------
    # The entries of numeric_cols/dt_cols are overwritten below with their resolved identifiers.
    # Work on private copies so the default lists (shared between calls) and any list supplied 
    #   by the caller are never modified.
    numeric_cols = list(numeric_cols)
    dt_cols      = list(dt_cols)
    #-------------------------
    # Make sure outg_rec_nb_col, location_nb_col, and columns in numeric_cols/dt_cols are present in df_eemsp
    # NOTE: If simple string given for columns whereas df_eemsp has MultiIndex columns, the
    #       following should remedy such a situation by changing string to appropriate tuple
    outg_rec_nb_col = Utilities_df.find_single_col_in_multiindex_cols(
        df=df_eemsp, 
        col=outg_rec_nb_col
    )
    #-------------------------
    location_nb_col = Utilities_df.find_single_col_in_multiindex_cols(
        df=df_eemsp, 
        col=location_nb_col
    )
    #-------------------------
    for i,col in enumerate(numeric_cols):
        if col not in df_eemsp.columns:
            numeric_cols[i] = Utilities_df.find_single_col_in_multiindex_cols(
                df=df_eemsp, 
                col=col
            )
        #-----    
        if not is_numeric_dtype(df_eemsp[numeric_cols[i]].dtype):
            df_eemsp = Utilities_df.convert_col_type(
                df=df_eemsp, 
                column=numeric_cols[i], 
                to_type=float
            )
    #-------------------------
    for i,col in enumerate(dt_cols):
        if col not in df_eemsp.columns:
            dt_cols[i] = Utilities_df.find_single_col_in_multiindex_cols(
                df=df_eemsp, 
                col=col
            )
        #-----    
        if not is_datetime64_dtype(df_eemsp[dt_cols[i]].dtype):
            df_eemsp = Utilities_df.convert_col_type(
                df=df_eemsp, 
                column=dt_cols[i], 
                to_type=datetime.datetime
            )
    #-------------------------
    # ignore_cols may or may not actually be contained in df_eemsp, hence the need for try/except.
    # Only ordinary exceptions are treated as "column not found"; KeyboardInterrupt/SystemExit propagate.
    found_ignore_cols = []
    for col in (ignore_cols or []):
        try:
            found_ignore_cols.append(
                Utilities_df.find_single_col_in_multiindex_cols(
                    df=df_eemsp, 
                    col=col
                )
            )
        except Exception:
            continue
    ignore_cols = found_ignore_cols
    #-------------------------
    # All columns not in numeric_cols + dt_cols will be considered categorical
    cat_cols = [
        x for x in df_eemsp.columns 
        if x not in numeric_cols+dt_cols+ignore_cols+[outg_rec_nb_col, location_nb_col]
    ]
    #--------------------------------------------------
    # Build the aggregation dictionary and aggregate
    # NOTE: The string 'mean' is used instead of np.mean.  pandas already replaces np.mean by its own
    #       mean inside agg, and pandas>=2.1 warns that this replacement will go away.
    numeric_dict = {k:'mean' for k in numeric_cols}
    #-----
    dt_dict      = {k:'mean' for k in dt_cols}
    #-----
    # For whatever reason, using list(set(x)) in this case (using dictionary
    #   with column keys and function values) returns a list of all the unique
    #   characters in the column.
    # Instead, I must use list(set(x.tolist())) 
    cat_dict     = {k:lambda x: natsorted(list(set(x.tolist()))) for k in cat_cols}
    if cat_cols_as_strings:
        # NOTE: Need .astype(str) below because Python apparently only likes joining lists of strings!
        cat_dict = {k:lambda x: ', '.join(natsorted(list(set(x.astype(str).tolist())))) for k in cat_cols}
    #-------------------------
    agg_dict = (numeric_dict | dt_dict | cat_dict)
    #-------------------------
    # Within a group the key columns are constant, so simply take the first value
    agg_dict[outg_rec_nb_col] = lambda x: x.tolist()[0]
    agg_dict[location_nb_col] = lambda x: x.tolist()[0]
    #-------------------------
    return_idx_order = (
        [outg_rec_nb_col, location_nb_col] + 
        natsorted(numeric_dict.keys()) + 
        natsorted(dt_dict.keys()) + 
        natsorted(cat_dict.keys())
    )
    assert(len(set(agg_dict.keys()).symmetric_difference(set(return_idx_order)))==0)
    #-------------------------
    if mult_strategy=='exclude':
        if ignore_cols:
            df_eemsp = df_eemsp.drop(columns=ignore_cols)
        return_df = remove_ambiguous_from_df_eemsp_OLD(
            df_eemsp=df_eemsp, 
            outg_rec_nb_col=outg_rec_nb_col, 
            location_nb_col=location_nb_col
        )
    else:
        if mult_strategy=='first':
            if ignore_cols:
                df_eemsp = df_eemsp.drop(columns=ignore_cols)
            if dt_cols:
                # Descending sort ==> the row with the latest dt_cols values is first in each group
                df_eemsp = df_eemsp.sort_values(by=dt_cols, ascending=False)
            # NOTE: I want to use nth() below, NOT first()
            #       Apparently, first doesn't grab the first row of each group, 
            #         it returns the first non-null entry of each column (so, if null values
            #         exist, this will be a mixture of 2 or more rows)
            return_df = df_eemsp.groupby(
                [outg_rec_nb_col, location_nb_col], 
                as_index=False, 
                group_keys=False, 
                dropna=False
            ).nth(0)
            #----------
            if include_n_eemsp:
                n_eemsp_df = df_eemsp.groupby(
                    [outg_rec_nb_col, location_nb_col], 
                    as_index=False, 
                    group_keys=False, 
                    dropna=False
                ).size()
                assert('size' in n_eemsp_df.columns)
                n_eemsp_df = n_eemsp_df.rename(columns={'size':'n_eemsp'})
                #-----
                tmp_shape = return_df.shape
                return_df = pd.merge(
                    return_df, 
                    n_eemsp_df, 
                    left_on= [outg_rec_nb_col, location_nb_col], 
                    right_on=[outg_rec_nb_col, location_nb_col], 
                    how='inner'
                )
                #-----
                # Merge should add exactly one column (n_eemsp) and no rows
                assert(return_df.shape[0]==tmp_shape[0])
                assert(return_df.shape[1]==tmp_shape[1]+1)
            
        else:
            assert(mult_strategy=='agg')
            return_df = df_eemsp.groupby(
                [outg_rec_nb_col, location_nb_col], 
                as_index=False, 
                group_keys=False, 
                dropna=False
            ).apply(
                lambda x: reduce2_eemsp_for_outg_trsf_i_OLD(
                    df_eemsp_i=x, 
                    agg_dict=agg_dict, 
                    return_idx_order=return_idx_order, 
                    mult_strategy=mult_strategy, 
                    include_n_eemsp=include_n_eemsp
                )
            )
    #-------------------------
    return return_df

In [None]:
def _resolve_merge_idfr_OLD(df, idfr_loc, tmp_name_prefix):
    r"""
    Helper for merge_rcpx_with_eemsp_OLD: determine the column name an identifier will have once
      reset_index has been called on df.

    idfr_loc:
        Result of Utilities_df.get_idfr_loc for the identifier.  It is used as (location, found_in_index), 
          where location is an index level number when found_in_index is truthy and a column otherwise.
    tmp_name_prefix:
        Prefix of the random name given to the index level if it is unnamed.

    Returns: df, identifier
        df is a new object (never the input modified in place) when an index level had to be named.
    """
    loc         = idfr_loc[0]
    is_in_index = idfr_loc[1]
    #-------------------------
    if not is_in_index:
        return df, loc
    #-------------------------
    level_name = df.index.names[loc]
    if level_name:
        return df, level_name
    #-------------------------
    # Unnamed index level: give it a random, identifiable name.
    # set_axis returns a new object, so the index of the DataFrame handed in by the caller is untouched.
    tmp_name = tmp_name_prefix+Utilities.generate_random_string(str_len=4)
    df = df.set_axis(df.index.set_names(tmp_name, level=loc), axis=0)
    return df, tmp_name


def merge_rcpx_with_eemsp_OLD(
    df_rcpx, 
    df_eemsp, 
    outg_rec_nb_idfr_rcpx ='index_0', 
    trsf_pole_nb_idfr_rcpx='index_1', 
    outg_rec_nb_idfr_eemsp='OUTG_REC_NB_TO_MERGE', 
    location_nb_idfr_eemsp='LOCATION_NB', 
    set_index=True, 
    drop_eemsp_merge_cols=True
):
    r"""
    Inner merge of df_rcpx with df_eemsp, matching (outg_rec_nb_idfr_rcpx, trsf_pole_nb_idfr_rcpx) in df_rcpx
      to (outg_rec_nb_idfr_eemsp, location_nb_idfr_eemsp) in df_eemsp.
    Neither input DataFrame has its index renamed or reset in place; new objects are used for that.

    outg_rec_nb_idfr_rcpx/trsf_pole_nb_idfr_rcpx/outg_rec_nb_idfr_eemsp/location_nb_idfr_eemsp:
        Identifiers locating the merge fields; each is resolved with Utilities_df.get_idfr_loc and may
          live in the columns or in the index.
        NOTE(review): the defaults 'index_0'/'index_1' presumably denote index levels 0/1 - confirm
          against Utilities_df.get_idfr_loc.

    set_index:
        If True, the index of return_df will be set equal to the contents of 
          outg_rec_nb_idfr_rcpx and trsf_pole_nb_idfr_rcpx

    drop_eemsp_merge_cols:
        If True, the df_eemsp merge columns are dropped from return_df (unless they are the same columns
          as those used for df_rcpx, in which case only one copy exists after the merge).

    Raises:
        AssertionError if any merge column cannot be found, or if the dtypes of the merge columns 
          differ between df_rcpx and df_eemsp.
    """
    #-------------------------
    outg_rec_nb_idfr_rcpx_loc  = Utilities_df.get_idfr_loc(df_rcpx, outg_rec_nb_idfr_rcpx)
    trsf_pole_nb_idfr_rcpx_loc = Utilities_df.get_idfr_loc(df_rcpx, trsf_pole_nb_idfr_rcpx)
    #-----
    outg_rec_nb_idfr_eemsp_loc = Utilities_df.get_idfr_loc(df_eemsp, outg_rec_nb_idfr_eemsp)
    location_nb_idfr_eemsp_loc = Utilities_df.get_idfr_loc(df_eemsp, location_nb_idfr_eemsp)
    #----------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------
    # If either outg_rec_nb_idfr_rcpx or trsf_pole_nb_idfr_rcpx found in index, must call reset_index()
    # Reason: If one did not call reset_index and instead used, e.g., left_index=True, the actual values
    #           in the indices will not be included in the merged df
    # If found in the index, use name of index level as identifier.  If no name exists, give it a random one
    # If found in the columns, the column returned by get_idfr_loc is used (also when the other
    #   identifier is in the index)
    df_rcpx, outg_rec_nb_idfr_rcpx  = _resolve_merge_idfr_OLD(df_rcpx, outg_rec_nb_idfr_rcpx_loc,  'outg_rec_nb_')
    df_rcpx, trsf_pole_nb_idfr_rcpx = _resolve_merge_idfr_OLD(df_rcpx, trsf_pole_nb_idfr_rcpx_loc, 'trsf_pole_nb_')
    if outg_rec_nb_idfr_rcpx_loc[1] or trsf_pole_nb_idfr_rcpx_loc[1]:
        df_rcpx = df_rcpx.reset_index()
    #----------------------------------------------------------------------
    # Same for outg_rec_nb_idfr_eemsp_loc, location_nb_idfr_eemsp_loc with df_eemsp
    df_eemsp, outg_rec_nb_idfr_eemsp = _resolve_merge_idfr_OLD(df_eemsp, outg_rec_nb_idfr_eemsp_loc, 'outg_rec_nb_')
    df_eemsp, location_nb_idfr_eemsp = _resolve_merge_idfr_OLD(df_eemsp, location_nb_idfr_eemsp_loc, 'location_nb_')
    if outg_rec_nb_idfr_eemsp_loc[1] or location_nb_idfr_eemsp_loc[1]:
        df_eemsp = df_eemsp.reset_index()
    #----------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------
    # Make sure the necessary merge columns are found in the DFs
    # NOTE: An explicit raise is used (instead of print + assert(0)) so the check is still enforced
    #       when Python is run with assertions disabled.
    for arg_name_i, idfr_i, df_name_i, df_i in [
        ('outg_rec_nb_idfr_rcpx',  outg_rec_nb_idfr_rcpx,  'df_rcpx',  df_rcpx), 
        ('trsf_pole_nb_idfr_rcpx', trsf_pole_nb_idfr_rcpx, 'df_rcpx',  df_rcpx), 
        ('outg_rec_nb_idfr_eemsp', outg_rec_nb_idfr_eemsp, 'df_eemsp', df_eemsp), 
        ('location_nb_idfr_eemsp', location_nb_idfr_eemsp, 'df_eemsp', df_eemsp)
    ]:
        if idfr_i not in df_i.columns:
            raise AssertionError(f"{arg_name_i} = {idfr_i} not in {df_name_i}.columns!")
    #----------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------
    # In order to merge, df_rcpx and df_eemsp must have the same number of levels of columns
    if df_rcpx.columns.nlevels > df_eemsp.columns.nlevels:
        n_levels_to_add = df_rcpx.columns.nlevels - df_eemsp.columns.nlevels
        #-----
        df_eemsp = Utilities_df.prepend_levels_to_MultiIndex(
            df=df_eemsp, 
            n_levels_to_add=n_levels_to_add, 
            dummy_col_levels_prefix='EEMSP_'
        )
        #-----
        # Get new MultiIndex versions of outg_rec_nb_idfr_eemsp and location_nb_idfr_eemsp
        outg_rec_nb_idfr_eemsp = Utilities_df.find_single_col_in_multiindex_cols(
            df=df_eemsp, 
            col=outg_rec_nb_idfr_eemsp
        )
        #-------------------------
        location_nb_idfr_eemsp = Utilities_df.find_single_col_in_multiindex_cols(
            df=df_eemsp, 
            col=location_nb_idfr_eemsp
        )
    elif df_rcpx.columns.nlevels < df_eemsp.columns.nlevels:
        n_levels_to_add = df_eemsp.columns.nlevels - df_rcpx.columns.nlevels
        #-----
        df_rcpx = Utilities_df.prepend_levels_to_MultiIndex(
            df=df_rcpx, 
            n_levels_to_add=n_levels_to_add, 
            dummy_col_levels_prefix='RCPX_'
        )
        #-----
        # Get new MultiIndex versions of outg_rec_nb_idfr_rcpx and trsf_pole_nb_idfr_rcpx
        outg_rec_nb_idfr_rcpx = Utilities_df.find_single_col_in_multiindex_cols(
            df=df_rcpx, 
            col=outg_rec_nb_idfr_rcpx
        )
        #-------------------------
        trsf_pole_nb_idfr_rcpx = Utilities_df.find_single_col_in_multiindex_cols(
            df=df_rcpx, 
            col=trsf_pole_nb_idfr_rcpx
        )
    #----------------------------------------------------------------------------------------------------
    #----------------------------------------------------------------------------------------------------
    # Merge
    # The merge columns must have identical dtypes on both sides
    assert(
        df_rcpx[[outg_rec_nb_idfr_rcpx, trsf_pole_nb_idfr_rcpx]].dtypes.tolist() ==
        df_eemsp[[outg_rec_nb_idfr_eemsp, location_nb_idfr_eemsp]].dtypes.tolist()
    )
    #-----
    return_df = pd.merge(
        df_rcpx, 
        df_eemsp, 
        left_on=[outg_rec_nb_idfr_rcpx, trsf_pole_nb_idfr_rcpx], 
        right_on=[outg_rec_nb_idfr_eemsp, location_nb_idfr_eemsp], 
        how='inner'
    )
    #-------------------------
    if drop_eemsp_merge_cols:
        # NOTE: If rcpx and eemsp columns are the same, after merge only one will remain, and 
        #       do not want to drop in such a case
        cols_to_drop=[]
        if outg_rec_nb_idfr_eemsp != outg_rec_nb_idfr_rcpx:
            cols_to_drop.append(outg_rec_nb_idfr_eemsp)
        if location_nb_idfr_eemsp != trsf_pole_nb_idfr_rcpx:
            cols_to_drop.append(location_nb_idfr_eemsp)
        #-----
        if cols_to_drop:
            return_df = return_df.drop(columns=cols_to_drop)
    
    #-------------------------
    if set_index:
        return_df = return_df.set_index([outg_rec_nb_idfr_rcpx, trsf_pole_nb_idfr_rcpx])
    #-------------------------
    return return_df

# END OLD

In [None]:
# MOVED TO EEMSP.py
def remove_ambiguous_from_df_eemsp(
    df_eemsp, 
    grp_by_cols=['OUTG_REC_NB_TO_MERGE', 'LOCATION_NB']
):
    r"""
    Keep only the rows of df_eemsp belonging to groups (as defined by grp_by_cols) with exactly one entry;
      any group with multiple entries is removed.

    grp_by_cols:
        Columns defining the groups.  Each is passed through Utilities_df.find_single_col_in_multiindex_cols, 
          so simple strings may be given even when df_eemsp has MultiIndex columns.

    Returns:
        A new pd.DataFrame with grp_by_cols as the leading columns (they pass through 
          set_index/reset_index) and rows ordered as the surviving groups are ordered by value_counts.
    """
    #-------------------------
    # Resolve each grouping column to its full identifier in df_eemsp
    key_cols = []
    for col_i in grp_by_cols:
        key_cols.append(
            Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=col_i)
        )
    #-------------------------
    # Size of every group; those of size one are unambiguous
    group_sizes = df_eemsp[key_cols].value_counts()
    singleton_keys = group_sizes[group_sizes==1].index
    #-------------------------
    # Select the unambiguous groups by label, then restore the keys as regular columns
    return df_eemsp.set_index(key_cols).loc[singleton_keys].reset_index()

In [None]:
# MOVED TO EEMSP.py
def reduce2_eemsp_for_outg_trsf_i(
    df_eemsp_i, 
    agg_dict, 
    return_idx_order, 
    mult_strategy='agg', 
    include_n_eemsp=True
):
    r"""
    Helper function for reduce2_eemsp_for_outg_trsf: turns the rows of one group into a single pd.Series.
    In practice the only mult_strategy expected here is 'agg' ('first' is handled in 
      reduce2_eemsp_for_outg_trsf with the nth() function, and 'exclude' should never reach this function).

    df_eemsp_i:
        pd.DataFrame holding the rows of one group.
    agg_dict:
        Dict handed to df_eemsp_i.agg when the group has multiple rows and mult_strategy=='agg'.
    return_idx_order:
        Labels (and their order) of the returned series; applied with reindex.
    include_n_eemsp:
        If True, the number of rows in df_eemsp_i is stored under 'n_eemsp' in the returned series.
    """
    #-------------------------
    assert(mult_strategy in ['agg', 'first'])
    #-------------------------
    group_size = df_eemsp_i.shape[0]
    # A single row needs no aggregation, and the 'first' strategy takes the first row regardless
    take_first_row = (group_size==1) or (mult_strategy!='agg')
    srs_out = df_eemsp_i.iloc[0] if take_first_row else df_eemsp_i.agg(agg_dict)
    #-------------------------
    srs_out = srs_out.reindex(index=return_idx_order)
    if include_n_eemsp:
        srs_out['n_eemsp'] = group_size
    #-------------------------
    return srs_out

In [None]:
# MOVED TO EEMSP.py
def reduce2_eemsp_for_outg_trsf(
    df_eemsp, 
    mult_strategy='agg', 
    include_n_eemsp=True, 
    grp_by_cols=['OUTG_REC_NB_TO_MERGE', 'LOCATION_NB'], 
    numeric_cols = ['KVA_SIZE'], 
    dt_cols = ['INSTALL_DT', 'REMOVAL_DT'], 
    ignore_cols = ['SERIAL_NB'], 
    cat_cols_as_strings=True
):
    r"""
    To be run after reduce1_eemsp_for_outg_trsf_i!!!!!
    The intent of this function is to reduce df_eemsp down to one row per group (typically location_nb_col
      and/or outg_rec_nb_col)

    df_eemsp:
        pd.DataFrame to be reduced.  A copy is made internally, so the input frame is not altered.

    mult_strategy:
        Dictates how groups (as defined by grp_by_cols) with multiple EEMSP entries will be handled.
        Can be 'agg', 'first', or 'exclude'
        'agg':
            For groups with multiple entries, aggregate
            (mean for numeric_cols and dt_cols, unique values for categorical columns)
        'first':
            For groups with multiple entries, take the first row after sorting by dt_cols 
            in descending order
        'exclude':
            Exclude groups with multiple entries

    include_n_eemsp:
        If True and mult_strategy is 'agg' or 'first', a 'n_eemsp' column with the number of rows in each
          group is included in the output.  Not used when mult_strategy=='exclude'.

    grp_by_cols:
        The columns defining the groups.  Simple strings may be given when df_eemsp has MultiIndex columns.

    numeric_cols/dt_cols:
        Columns to be treated as numeric/datetime.  Columns not already of the appropriate dtype are 
          converted via Utilities_df.convert_col_type.

    ignore_cols:
        Columns left out of the result.  Those not found in df_eemsp are silently skipped.

    cat_cols_as_strings:
        Categorical columns can either be aggregated as (sorted) lists of unique values (cat_cols_as_strings==False),
          or as strings (cat_cols_as_strings==True)
        If cat_cols_as_strings==True, a given string is simply the (sorted) lists of unique values joined by commas

    Returns:
        pd.DataFrame with one row per group.
    """
    #-------------------------
    assert(mult_strategy in ['agg', 'first', 'exclude'])
    #-------------------------
    # Copy below probably not necessary
    df_eemsp = df_eemsp.copy()
    #-------------------------
    # Make sure grp_by_cols and columns in numeric_cols/dt_cols are present in df_eemsp
    # NOTE: If simple string given for columns whereas df_eemsp has MultiIndex columns, the
    #       following should remedy such a situation by changing string to appropriate tuple
    # NOTE: New lists are built, so the (mutable) default arguments are never modified
    grp_by_cols  = [Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=x) for x in grp_by_cols]
    numeric_cols = [Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=x) for x in numeric_cols]
    dt_cols      = [Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=x) for x in dt_cols]
    #-------------------------
    # Make sure numeric_cols are numeric types and dt_cols are datetime
    for col in numeric_cols:
        if not is_numeric_dtype(df_eemsp[col].dtype):
            df_eemsp = Utilities_df.convert_col_type(
                df=df_eemsp, 
                column=col, 
                to_type=float
            )
    #-----
    for col in dt_cols:
        if not is_datetime64_dtype(df_eemsp[col].dtype):
            df_eemsp = Utilities_df.convert_col_type(
                df=df_eemsp, 
                column=col, 
                to_type=datetime.datetime
            )
    #-------------------------
    # ignore_cols may or may not actually be contained in df_eemsp, hence the need for try/except.
    # Only ordinary exceptions are treated as "column not found"; KeyboardInterrupt/SystemExit propagate.
    found_ignore_cols = []
    for col in (ignore_cols or []):
        try:
            found_ignore_cols.append(
                Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=col)
            )
        except Exception:
            continue
    ignore_cols = found_ignore_cols
    #-------------------------
    # All columns not in numeric_cols + dt_cols will be considered categorical
    cat_cols = [
        x for x in df_eemsp.columns 
        if x not in numeric_cols+dt_cols+ignore_cols+grp_by_cols
    ]
    #--------------------------------------------------
    # Build the aggregation dictionary and aggregate
    # NOTE: The string 'mean' is used instead of np.mean.  pandas already replaces np.mean by its own
    #       mean inside agg, and pandas>=2.1 warns that this replacement will go away.
    numeric_dict = {k:'mean' for k in numeric_cols}
    #-----
    dt_dict      = {k:'mean' for k in dt_cols}
    #-----
    # For whatever reason, using list(set(x)) in this case (using dictionary
    #   with column keys and function values) returns a list of all the unique
    #   characters in the column.
    # Instead, I must use list(set(x.tolist())) 
    cat_dict     = {k:lambda x: natsorted(list(set(x.tolist()))) for k in cat_cols}
    if cat_cols_as_strings:
        # NOTE: Need .astype(str) below because Python apparently only likes joining lists of strings!
        cat_dict = {k:lambda x: ', '.join(natsorted(list(set(x.astype(str).tolist())))) for k in cat_cols}
    #-------------------------
    agg_dict = (numeric_dict | dt_dict | cat_dict)
    #-------------------------
    # Within a group the grouping columns are constant, so simply take the first value
    for col_i in grp_by_cols:
        agg_dict[col_i] = lambda x: x.tolist()[0]
    #-------------------------
    return_idx_order = (
        grp_by_cols + 
        natsorted(numeric_dict.keys()) + 
        natsorted(dt_dict.keys()) + 
        natsorted(cat_dict.keys())
    )
    assert(len(set(agg_dict.keys()).symmetric_difference(set(return_idx_order)))==0)
    #-------------------------
    if mult_strategy=='exclude':
        if ignore_cols:
            df_eemsp = df_eemsp.drop(columns=ignore_cols)
        return_df = remove_ambiguous_from_df_eemsp(
            df_eemsp=df_eemsp, 
            grp_by_cols=grp_by_cols
        )
    else:
        if mult_strategy=='first':
            if ignore_cols:
                df_eemsp = df_eemsp.drop(columns=ignore_cols)
            if dt_cols:
                # Descending sort ==> the row with the latest dt_cols values is first in each group
                df_eemsp = df_eemsp.sort_values(by=dt_cols, ascending=False)
            # NOTE: I want to use nth() below, NOT first()
            #       Apparently, first doesn't grab the first row of each group, 
            #         it returns the first non-null entry of each column (so, if null values
            #         exist, this will be a mixture of 2 or more rows)
            return_df = df_eemsp.groupby(
                grp_by_cols, 
                as_index=False, 
                group_keys=False, 
                dropna=False
            ).nth(0)
            #----------
            if include_n_eemsp:
                n_eemsp_df = df_eemsp.groupby(
                    grp_by_cols, 
                    as_index=False, 
                    group_keys=False, 
                    dropna=False
                ).size()
                assert('size' in n_eemsp_df.columns)
                n_eemsp_df = n_eemsp_df.rename(columns={'size':'n_eemsp'})
                #-----
                tmp_shape = return_df.shape
                return_df = pd.merge(
                    return_df, 
                    n_eemsp_df, 
                    left_on= grp_by_cols, 
                    right_on=grp_by_cols, 
                    how='inner'
                )
                #-----
                # Merge should add exactly one column (n_eemsp) and no rows
                assert(return_df.shape[0]==tmp_shape[0])
                assert(return_df.shape[1]==tmp_shape[1]+1)
            
        else:
            assert(mult_strategy=='agg')
            return_df = df_eemsp.groupby(
                grp_by_cols, 
                as_index=False, 
                group_keys=False, 
                dropna=False
            ).apply(
                lambda x: reduce2_eemsp_for_outg_trsf_i(
                    df_eemsp_i=x, 
                    agg_dict=agg_dict, 
                    return_idx_order=return_idx_order, 
                    mult_strategy=mult_strategy, 
                    include_n_eemsp=include_n_eemsp
                )
            )
    #-------------------------
    return return_df

In [None]:
# MOVED to Utilities_df!!!!!!!
# def prep_df_for_merge(
#     df, 
#     merge_on, 
#     inplace=False
# ):
#     r"""
#     Helper function for new version of merge_rcpx_with_eemsp.
#     This will locate where the merge_on identifiers are in df, i.e., whether in the columns or the index.
#     If any are in index, .reset_index() will be called on df.
#         All index levels are ensured to have an identifiable name before reset_index is called
        
#     Returns: df, merge_on
#         Where df may possibly be modified (reset_index called) and merge_on will contain the full and 
#           correct columns to merge on.
#     """
#     #-------------------------
#     if not inplace:
#         df = df.copy()
#     #-------------------------
#     assert(isinstance(merge_on, list))
#     #-------------------------
#     merge_on_locs = [get_idfr_loc(df, x) for x in merge_on]
#     #-------------------------
#     # Determine whether any of the data to be used in the merge come from the index
#     # Reason: When merging, it is easier to have all the merging fields in the index of both
#     #           dfs or in the columns of both dfs, but not a mixture.
#     #         e.g., if left_index=True and right_on=[col_1, ..., col_m], the merged df will have
#     #           index equal to [0, n-1], i.e., the index of df_1 will not be included in final result    
#     merge_needs_idx = any([True if x[1]==True else False for x in merge_on_locs])
    
#     #-------------------------
#     # If any merge_on are found in index, .reset_index() is going to be called.
#     # To make life easier (by making the indices traceable/identifiable), make sure all index levels have a name
#     # If an index level does not have a name, it will be named f'idx_{level}'
#     # Before calling reset_index, if any of the index level names is already found in the columns (probably shouldn't 
#     #   happen, but can) all of the index names will be given an random suffix
#     if merge_needs_idx:
#         # Name any unnamed
#         df.index.names = [name_i if name_i else f'idx_{i}' for i,name_i in enumerate(df.index.names)]
#         # Add random suffix if needed
#         if any([name_i in df.columns for name_i in df.index.names]):
#             rand_pfx = Utilities.generate_random_string(str_len=4)
#             df.index.names = [f"{name_i}_{rand_pfx}" for name_i in df.index.names]
#         #-------------------------
#         # Update merge_on using merge_on_locs:
#         #   NOTE: Those with merge_on_locs[i][1]==True are found in index, others are not
#         #   For those found in columns, the column (i.e., merge_on_locs[i][0]) is simply used
#         #   For those found in index, the index level (=merge_on_locs[i][0]) name is used
#         merge_on = [df.index.names[x[0]] if x[1]==True else x[0] 
#                     for x in merge_on_locs]
#         #-------------------------
#         # As promised, reset the index
#         df = df.reset_index()
#     else:
#         merge_on = [x[0] for x in merge_on_locs]

#     # If df has MultiIndex columns, make sure merge_on contains the full MutliIndex column name
#     #     When df has MultiIndex columns and merge_needs_idx was run above, this procedure is
#     #       explicitly necessary (e.g., when n_levels=2, after reset_index, idx_1 ==> (idx_1, '') column)
#     #     This will also be necessary if user lazilly inputs merge_col_i when, e.g., in reality the 
#     #       full columns is (merge_needs_idx, '')
#     merge_on = [Utilities_df.find_single_col_in_multiindex_cols(df=df, col=x) for x in merge_on]

#     # Make sure all columns are found
#     assert(all([x in df.columns.tolist() for x in merge_on]))
    
#     #-------------------------
#     return df, merge_on

In [None]:
#MOVED TO EEMSP.py
def merge_rcpx_with_eemsp(
    df_rcpx, 
    df_eemsp, 
    merge_on_rcpx, 
    merge_on_eems,
    set_index=True, 
    drop_eemsp_merge_cols=True
):
    r"""
    Inner-merge df_rcpx with df_eemsp and return the merged pd.DataFrame.
    Both inputs are copied first, so the DataFrames held by the caller are not altered here.
    
    df_rcpx/df_eemsp:
        The left/right DataFrames to be merged.
        If their columns have a different number of levels, levels are prepended to the shallower
          one (via Utilities_df.prepend_levels_to_MultiIndex) so the two can be merged.
    
    merge_on_rcpx/merge_on_eems:
        These should both be lists of equal length.
        Pairs to be merged should have the same index between the two lists.
        The dtypes of the paired merge columns must be identical (enforced by assertion).
    
    set_index:
        If True, the index of return_df will be set to merge_on_rcpx.
        Index names of the form (name, '', ..., '') are simplified to name.
        
    drop_eemsp_merge_cols:
        If True, each EEMSP merge column whose name differs from that of its RCPX partner is dropped 
          from return_df (when the names agree only one column survives the merge, and it is kept).
    """
    #-------------------------
    # Work on copies so the DataFrames outside of this function are left untouched
    df_rcpx  = df_rcpx.copy()
    df_eemsp = df_eemsp.copy()
    
    #-------------------------
    # merge_on_rcpx and merge_on_eems must be lists of the same length
    assert isinstance(merge_on_rcpx, list) and isinstance(merge_on_eems, list), \
        "merge_on_rcpx and merge_on_eems must both be lists"
    assert len(merge_on_rcpx)==len(merge_on_eems), \
        "merge_on_rcpx and merge_on_eems must have the same length"
    
    #-------------------------
    # Call reset_index if needed and identify full/true merge_on values
    #   NOTE(review): exact behavior lives in Utilities_df.prep_df_for_merge, which is not visible here
    df_rcpx, merge_on_rcpx = Utilities_df.prep_df_for_merge(
        df=df_rcpx, 
        merge_on=merge_on_rcpx, 
        inplace=True
    )
    #-----
    df_eemsp, merge_on_eems = Utilities_df.prep_df_for_merge(
        df=df_eemsp, 
        merge_on=merge_on_eems, 
        inplace=True
    )
    #----------------------------------------------------------------------------------------------------
    # In order to merge, df_rcpx and df_eemsp must have the same number of levels of columns
    # Whichever is shallower gets levels prepended, after which its merge_on values must be 
    #   re-resolved to the new (deeper) column names
    n_levels_rcpx  = df_rcpx.columns.nlevels
    n_levels_eemsp = df_eemsp.columns.nlevels
    if n_levels_rcpx > n_levels_eemsp:
        df_eemsp = Utilities_df.prepend_levels_to_MultiIndex(
            df=df_eemsp, 
            n_levels_to_add=n_levels_rcpx - n_levels_eemsp, 
            dummy_col_levels_prefix='EEMSP_'
        )
        #-----
        # Get new MultiIndex versions of merge_on_eems
        merge_on_eems = [
            Utilities_df.find_single_col_in_multiindex_cols(df=df_eemsp, col=x) 
            for x in merge_on_eems
        ]
    elif n_levels_rcpx < n_levels_eemsp:
        df_rcpx = Utilities_df.prepend_levels_to_MultiIndex(
            df=df_rcpx, 
            n_levels_to_add=n_levels_eemsp - n_levels_rcpx, 
            dummy_col_levels_prefix='RCPX_'
        )
        #-----
        # Get new MultiIndex versions of merge_on_rcpx
        merge_on_rcpx = [
            Utilities_df.find_single_col_in_multiindex_cols(df=df_rcpx, col=x) 
            for x in merge_on_rcpx
        ]
    #----------------------------------------------------------------------------------------------------
    # Merge
    #   The paired merge columns are required to share dtypes before merging
    assert (
        df_rcpx[merge_on_rcpx].dtypes.tolist() ==
        df_eemsp[merge_on_eems].dtypes.tolist()
    ), "dtypes of merge_on_rcpx columns do not match those of merge_on_eems columns"
    #-----
    return_df = pd.merge(
        df_rcpx, 
        df_eemsp, 
        left_on=merge_on_rcpx, 
        right_on=merge_on_eems, 
        how='inner'
    )
    #-------------------------
    if drop_eemsp_merge_cols:
        # NOTE: If rcpx and eemsp columns are the same, after merge only one will remain, and 
        #       do not want to drop in such a case!
        cols_to_drop = [
            eems_col 
            for rcpx_col, eems_col in zip(merge_on_rcpx, merge_on_eems) 
            if eems_col != rcpx_col
        ]
        #-----
        if cols_to_drop:
            return_df = return_df.drop(columns=cols_to_drop)
    #-------------------------
    if set_index:
        return_df = return_df.set_index(merge_on_rcpx)
        #-----
        # Resolve any ugly resulting index names, e.g., [('outg_rec_nb', ''), ('trsf_pole_nb', '')]
        #   i.e., if name_i is a tuple where 0th element is not empty but all other are, then change
        #         name to 0th element
        return_df.index.names = [
            name_i[0] 
            if (isinstance(name_i, tuple) and name_i[0] and not any(name_i[1:])) 
            else name_i 
            for name_i in return_df.index.names
        ]
    #-------------------------
    return return_df

# ---------------------------------------------------------------------------------------------------
# PLOTTING AND POST-PROCESSING
# ---------------------------------------------------------------------------------------------------

In [None]:
from itertools import product
def get_outg_confusion_matrix_text_colors(
    cmd
):
    r"""
    Determine the text color to use in each cell of a 2x2 confusion matrix display.
    Basically taken from sklearn.metrics.ConfusionMatrixDisplay.plot
    Colors are needed so my additional text matches what's provided by sklearn method.
    
    SHOULD BE CALLED AFTER sklearn.metrics.ConfusionMatrixDisplay.plot!
      (cmd.im_, which supplies the colormap, is read here)
      
    cmd:
        Object exposing confusion_matrix (2x2 array) and im_.cmap (callable colormap)
        
    Returns a (2,2) np.ndarray of dtype=object where element [i,j] is the color for cell [i,j]:
        cmd.im_.cmap(1.0) if the cell value is below the midpoint between the matrix min and max, 
        cmd.im_.cmap(0) otherwise.
    """
    #-------------------------
    # NOTE: Named conf_mat (not cm) to avoid shadowing the module-level matplotlib.cm alias
    conf_mat = cmd.confusion_matrix
    assert conf_mat.shape==(2,2), "Expected a 2x2 confusion matrix"
    #-------------------------
    cmap_min, cmap_max = cmd.im_.cmap(0), cmd.im_.cmap(1.0)
    thresh = (conf_mat.max() + conf_mat.min()) / 2.0
    #-------------------------
    # Filled element-by-element since each color may itself be a tuple (e.g., RGBA)
    colors = np.empty((2,2), dtype=object)
    for i in range(2):
        for j in range(2):
            colors[i,j] = cmap_max if conf_mat[i, j] < thresh else cmap_min
    #-------------------------
    return colors

In [None]:
def draw_outg_confusion_matrix(
    y, 
    y_pred, 
    title=None, 
    normalize=None, 
    scientific=True, 
    ax=None, 
    text_kw=dict(fontsize='xx-large'), 
    n_dir_indir_tp_fn=None
):
    r"""
    Visualize confusion matrix for outages using sklearn.metrics.ConfusionMatrixDisplay
    Summary counts and accuracy/precision/recall are written to the right of the matrix, and each 
      cell is annotated with its TN/FP/FN/TP label.
    Returns the ConfusionMatrixDisplay object.
    
    y/y_pred:
        True and predicted labels.
        y must support .shape[0] and .sum() (y.sum() is reported as the number of outages).
        
    title:
        Title for the axes.
        
    normalize:
        Passed through to confusion_matrix.
        
    scientific:
        If True, cell values are drawn with format '.3e' and the side text counts with '{:.4e}'.
        If False, cell values are drawn with format '' and counts are printed as-is 
          (the outage percentage is printed with '{:.4f}').
          
    ax:
        Axes to draw on, passed through to ConfusionMatrixDisplay.plot.
        
    text_kw:
        Passed through to ConfusionMatrixDisplay.plot.
        Its 'fontsize' entry (default 'xx-large' if absent or if text_kw is None) is also used for 
          the TN/FP/FN/TP labels.
        NOTE: The default dict is shared between calls, but is never mutated in this function.
    
    n_dir_indir_tp_fn:
        If included, plot the number of direct and indirect transformers in the FN and TP cells.
        n_dir_indir_tp_fn should be a dict with keys = ['FN_dir', 'FN_indir', 'TP_dir', 'TP_indir'] and
          values equal to the associated number of transformers
    """
    #-------------------------
    cmd = ConfusionMatrixDisplay(
        confusion_matrix(y, y_pred, normalize=normalize), 
        display_labels=['No Outg.','Outage']
    )
    #-----
    cmd.plot(values_format='.3e' if scientific else '', ax=ax, text_kw=text_kw)
    ax = cmd.ax_
    #-------------------------
    ax.set_xlabel('Predicted', fontsize='xx-large')
    ax.set_ylabel('True', fontsize='xx-large')
    ax.set_title(title, fontsize='xx-large', fontweight='semibold')
    #-------------------------
    if scientific:
        txt_fmt     = '{:.4e}'
        pct_txt_fmt = txt_fmt
    else:
        txt_fmt     = '{}'
        pct_txt_fmt = '{:.4f}'
    #-------------------------
    # Summary text, placed to the right of the matrix
    #   Positions are in axes coordinates (transform=ax.transAxes)
    n_entries = y.shape[0]
    n_outages = y.sum()
    side_text = [
        (0.9, "# Entries:    " + txt_fmt.format(n_entries)), 
        (0.8, "# Outages:  " + txt_fmt.format(n_outages)), 
        (0.7, "# Baseline:  " + txt_fmt.format(n_entries-n_outages)), 
        (0.6, "% Outages:  " + pct_txt_fmt.format(100*n_outages/n_entries)), 
        #-----
        (0.4, "ACCURACY:  {:.4f}".format(accuracy_score(y, y_pred))), 
        (0.3, "PRECISION:  {:.4f}".format(precision_score(y, y_pred))), 
        (0.2, "RECALL:       {:.4f}".format(recall_score(y, y_pred))), 
    ]
    for y_pos_i, txt_i in side_text:
        ax.text(1.5, y_pos_i, txt_i, fontsize=14, transform=ax.transAxes)
    #--------------------------------------------------
    # Include TN, FP, FN, TP labels
    # NOTES:
    #   Text stored in cmd.text_ is stored in row-major fashion
    #   Therefore, when plotting the value, the y-value corresponds to the 0th
    #     index and the x-value corresponds to the 1st index
    #
    #   Axes are defined with limits:
    #     x_lim = (-0.5, 1.5)
    #     y_lim = (1.5, -0.5)
    #   The axes are defined such that: 
    #     top-left corner     = (-0.5, -0.5)
    #     bottom-right corner = (1.5, 1.5) 
    #-----
    colors = get_outg_confusion_matrix_text_colors(cmd)
    # text_kw=None is acceptable to ConfusionMatrixDisplay.plot, so guard the lookup here
    cat_fontsize = (text_kw or {}).get('fontsize', 'xx-large')
    #-----
    # Each entry: (label, x, y, (row, col) of the cell in the confusion matrix)
    cell_labels = [
        ('TN', 0,   -0.25, (0,0)), 
        ('FP', 1.0, -0.25, (0,1)), 
        ('FN', 0,    0.75, (1,0)), 
        ('TP', 1.0,  0.75, (1,1)), 
    ]
    for label_i, x_i, y_i, (row_i, col_i) in cell_labels:
        ax.text(
            x_i, y_i, label_i, 
            ha='center', va='center', color=colors[row_i, col_i], fontweight='bold', fontsize=cat_fontsize
        )
    #--------------------------------------------------
    if n_dir_indir_tp_fn is not None:
        needed_keys = {'FN_dir', 'FN_indir', 'TP_dir', 'TP_indir'}
        assert needed_keys.issubset(n_dir_indir_tp_fn.keys()), \
            "n_dir_indir_tp_fn must contain keys 'FN_dir', 'FN_indir', 'TP_dir', 'TP_indir'"
        #----------
        # FN occupies cell (row=1, col=0) and TP occupies cell (row=1, col=1)
        for cat_i, x_i, (row_i, col_i) in [('FN', 0, (1,0)), ('TP', 1.0, (1,1))]:
            ax.text(
                x_i, 1.25,  "#Dir = " + txt_fmt.format(n_dir_indir_tp_fn[f'{cat_i}_dir']), 
                ha='center', va='center', fontsize='medium', color=colors[row_i, col_i]
            )
            ax.text(
                x_i, 1.375, "#Indir = " + txt_fmt.format(n_dir_indir_tp_fn[f'{cat_i}_indir']), 
                ha='center', va='center', fontsize='medium', color=colors[row_i, col_i]
            )
    #--------------------------------------------------
    return cmd