In [None]:
# Execute the companion notebook inside this kernel so that the names it defines become available here.
# NOTE(review): presumably this is where build_active_MP_for_outages_df (called further down, but
#   never imported in this notebook) comes from -- confirm.
%run ./check_DOVS_METHODS.ipynb

In [None]:
from importlib import reload
#reload(Utilities)
#reload(clm)

import sys, os
import re

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns
from packaging import version

import copy

import itertools

import pyodbc
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
import CommonLearningMethods as clm
#-----
from MeterPremise import MeterPremise
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
import Utilities_dt
from Utilities_df import DFConstructType
import Plot_General
import Plot_Box_sns
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer

In [None]:
# Connection object handed to pd.read_sql_query below when the DOVS outage query is run.
conn_outages = Utilities.get_utldb01p_oracle_connection()

In [None]:
# Excel workbook whose 'Scorecard' sheet is loaded below.
# NOTE(review): absolute, user-specific path -- it must be edited before running on another machine.
outgs_file_from_mico = r'C:\Users\s346557\Documents\LocalData\dovs_check\forMico2\2023-04-08 to 04-15 Reviews (1).xlsx'
# Padding subtracted from each outage's earliest start date and added to its latest end date when
#   the 'Min Start Date' / 'Max End Date' matching window is built below.
expand_time = pd.Timedelta('1 day')

In [None]:
#-------------------------
# Load the 'Scorecard' sheet; keep the raw frame untouched and work on a copy.
mico_df_raw = pd.read_excel(outgs_file_from_mico, sheet_name='Scorecard')
mico_df = mico_df_raw.copy()
#-----
# For now, keep only the following columns:
mico_cols_to_keep = [
    'Outage #', 
    'Outage Start DT', 
    'Adj Outage Start DT', 
    'Outage End DT',
    'Adj Outage End DT', 
    'Circuit Name',
    'Step CMI'
]
mico_df = mico_df[mico_cols_to_keep]

#-------------------------
# Currently, outage numbers have -1, -2, etc. appended.
# I believe an outage number will have such multiple rows when the outage affects more than one circuit.
# In the DOVS database, these will be split into separate outg_rec_nbs
#-----
# I will instead merge the data via the outage number and circuit name, so remove the -1, -2, etc. from 
#   the 'Outage #', store the result in 'OUTAGE_NB' (to be consistent with DOVS), and drop 'Outage #'
# NOTE: The pattern must be a raw string; '\d' inside a normal string literal is an invalid escape
#       sequence (a warning today, slated to become an error in a future Python release).
mico_df['OUTAGE_NB'] = mico_df['Outage #'].apply(lambda x: re.sub(r'(\d*)-\d*', r'\1', x))
mico_df = mico_df.drop(columns=['Outage #'])

#-------------------------
# Each outage can also have multiple rows corresponding to the power recover steps
# Aggregate the steps into a single row for each outage
mico_df = mico_df.groupby(
    ['OUTAGE_NB', 'Circuit Name'], 
    dropna=False, 
    as_index=False, 
    group_keys=False
).agg({
    'Outage Start DT':     'min', 
    'Adj Outage Start DT': 'min', 
    'Outage End DT':       'max', 
    'Adj Outage End DT':   'max', 
    'Step CMI':            'sum'
})

# At this point, each outage (unique combination of 'OUTAGE_NB' and 'Circuit Name') should
#   correspond to a single row
# NOTE: dropna=False must be used here as well.  The aggregation above keeps groups whose key contains
#       NaN, so counting groups with the default dropna=True would under-count and trip the assertion
#       even though the data are consistent.
assert(mico_df.shape[0] == mico_df.groupby(['OUTAGE_NB', 'Circuit Name'], dropna=False).ngroups)

#-------------------------
# Date window used to match DOVS records: the earliest (adjusted or unadjusted) start date and the
#   latest (adjusted or unadjusted) end date, each padded by expand_time.
mico_df['Min Start Date'] = mico_df[['Outage Start DT', 'Adj Outage Start DT']].min(axis=1).dt.date - expand_time
mico_df['Max End Date']   = mico_df[['Outage End DT',   'Adj Outage End DT'  ]].max(axis=1).dt.date + expand_time

In [None]:
start = time.time()

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# NOTE: One OUTAGE_NB may map onto several OUTG_REC_NBs!
#       This appears to happen when an outage touches more than one GIS_CRCT_NB; each GIS_CRCT_NB
#         is then given its own OUTG_REC_NB.
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Step 1: pull every DOVS row whose OUTAGE_NB appears in mico_df.
# OUTAGE_NBs are reused for unrelated outages over the years (a different effect from the
#   one-to-many OUTG_REC_NB mapping noted above), so several candidate entries per OUTAGE_NB
#   are expected.  The date windows from mico_df are used below to pick the correct ones.
outage_nbs_to_query = mico_df['OUTAGE_NB'].unique().tolist()
query_date_range = [mico_df['Min Start Date'].min(), mico_df['Max End Date'].max()]
std_outage_sql = DOVSOutages_SQL.build_sql_std_outage(
    mjr_mnr_cause=None, 
    include_premise=True, 
    outage_nbs=outage_nbs_to_query, 
    date_range=query_date_range, 
    MJR_CAUSE_CD=None, 
    DEVICE_CD=None, 
    INTRPTN_TYP_CD=None, 
    CURR_REC_STAT_CD=None, 
    select_cols_DOVS_PREMISE_DIM=['CIRCT_NM']
)
sql_using_outage_nbs = std_outage_sql.get_sql_statement()
#-----
read_dtypes = {
    'CI_NB':np.int32, 
    'CMI_NB':np.float64, 
    'OUTG_REC_NB':np.int32
}
df_using_outage_nbs = pd.read_sql_query(sql_using_outage_nbs, conn_outages, dtype=read_dtypes)
#-------------------------
# Step 2: attach each outage's date window (matching on outage number and circuit name), keep only
#   the DOVS rows falling inside that window, then drop the helper columns again.
mico_window_cols = ['OUTAGE_NB', 'Circuit Name', 'Min Start Date', 'Max End Date']
df_using_outage_nbs = df_using_outage_nbs.merge(
    mico_df[mico_window_cols], 
    how='inner', 
    left_on=['OUTAGE_NB', 'CIRCT_NM'], 
    right_on=['OUTAGE_NB', 'Circuit Name']
)
off_within_window = df_using_outage_nbs['DT_OFF_TS_FULL'].dt.date >= df_using_outage_nbs['Min Start Date']
on_within_window  = df_using_outage_nbs['DT_ON_TS'].dt.date       <= df_using_outage_nbs['Max End Date']
df_using_outage_nbs = (
    df_using_outage_nbs[off_within_window & on_within_window]
    .drop(columns=['Min Start Date', 'Max End Date', 'Circuit Name'])
)

print(time.time()-start)

In [None]:
# Inspect the DOVS rows that survived the outage-number / circuit-name / date-window matching above.
df_using_outage_nbs

In [None]:
# From here on the matched DOVS rows are referred to as df_outage.
# NOTE: This is a plain assignment (no copy), so both names refer to the same DataFrame object.
df_outage = df_using_outage_nbs
# All distinct outage record numbers present in the matched rows.
outg_rec_nbs_all = df_outage['OUTG_REC_NB'].unique().tolist()
#-----
print(f"df_outage.shape = {df_outage.shape}")
print(f"# OUTG_REC_NBs  = {df_outage['OUTG_REC_NB'].nunique()}")

In [None]:
start=time.time()
#-----
# Build the meter-premise data for the premises listed in df_outage.
# (An alternative, build_active_MP_for_xfmrs_in_outages_df, was previously called here with the same
#  df_outage/prem_nb_col/is_slim/addtnl_* arguments; it is no longer used.)
mp_addtnl_cols = ['technology_tx', 'state_cd']
df_mp_outg_OG = build_active_MP_for_outages_df(
    df_outage=df_outage, 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    addtnl_mp_df_curr_cols=list(mp_addtnl_cols), 
    addtnl_mp_df_hist_cols=list(mp_addtnl_cols), 
    assert_all_PNs_found=False
)
#-----
print(f'Time for build_active_MP_for_outages_df: {time.time()-start}')
start=time.time()
#-----
# Install/removal timestamps must be proper datetimes before the duplicate reduction below
for ts_col in ['inst_ts', 'rmvl_ts']:
    df_mp_outg_OG[ts_col] = pd.to_datetime(df_mp_outg_OG[ts_col])
#-------------------------
# Reduce approximate duplicates, working on a copy so df_mp_outg_OG is left as built
drop_dups_kwargs = dict(
    mp_df = df_mp_outg_OG.copy(), 
    fuzziness=pd.Timedelta('1 hour'), 
    assert_single_overlap=True, 
    addtnl_groupby_cols=['OUTG_REC_NB', 'technology_tx', 'state_cd'], 
    gpby_dropna=False
)
df_mp_outg = MeterPremise.drop_approx_mp_duplicates(**drop_dups_kwargs)
#-----
print(f'Time for drop_approx_mp_duplicates: {time.time()-start}')

In [None]:
# Some premises are listed in DOVS are simply not found in AMI
print(f"#PNs DOVS: {df_outage['PREMISE_NB'].nunique()}")
print(f"#PNs AMI:  {df_mp_outg['prem_nb'].nunique()}")

In [None]:
set(df_mp_outg['prem_nb'].unique().tolist()).difference(set(df_outage['PREMISE_NB'].unique().tolist()))

In [None]:
set(df_outage['PREMISE_NB'].unique().tolist()).difference(set(df_mp_outg['prem_nb'].unique().tolist()))

In [None]:
# Really only want one entry per meter (here, meter being a mfr_devc_ser_nbr/prem_nb combination)
# ALthough drop_duplicates was used, multiple entries could still exist if, e.g., a meter has two
#   non-fuzzy-overlapping intervals
assert(all(df_mp_outg[['mfr_devc_ser_nbr', 'prem_nb', 'OUTG_REC_NB']].value_counts()==1))

# # Simple-minded (if assertion fails): Let's just keep the one with the most recent install date
# df_mp_outg = df_mp_outg.iloc[df_mp_outg.reset_index().groupby(['mfr_devc_ser_nbr', 'prem_nb', 'OUTG_REC_NB'])['inst_ts'].idxmax()]
# assert(all(df_mp_outg[['mfr_devc_ser_nbr', 'prem_nb', 'OUTG_REC_NB']].value_counts()==1))

In [None]:
df_mp_outg.groupby(['OUTG_REC_NB']).apply(lambda x: 100*(x[x['technology_tx']=='AMI'].shape[0]/x.shape[0]))

In [None]:
df_mp_outg

In [None]:
df_outage

In [None]:
# Combine the DOVS rows with the meter-premise rows, pairing records on outage record number and
#   premise number.  Copies of both frames are passed in because inplace=True is requested.
# NOTE(review): exact join type / column handling is decided inside DOVSOutages.merge_df_outage_with_mp,
#   which is not visible here -- confirm.
check_df = DOVSOutages.merge_df_outage_with_mp(
    df_outage=df_outage.copy(), 
    df_mp=df_mp_outg.copy(), 
    merge_on_outg=['OUTG_REC_NB', 'PREMISE_NB'], 
    merge_on_mp=['OUTG_REC_NB', 'prem_nb'], 
    cols_to_include_mp=None, 
    drop_cols = None, 
    rename_cols=None, 
    inplace=True
)

In [None]:
# Row counts of the two inputs and of the merged result, for a quick sanity check
print(df_outage.shape[0])
print(df_mp_outg.shape[0])
print(check_df.shape[0])

In [None]:
# Does the state reported in the DOVS columns agree with the state in the meter-premise columns for every row?
check_df['STATE_ABBR_TX'].equals(check_df['state_cd'])

In [None]:
# Row-level agreement/disagreement counts.
# NOTE: With element-wise comparison a missing value never compares equal, so rows where a state is
#       NaN are counted under '!=' even if both sides are NaN.
print(f"STATE_ABBR_TX==state_cd: {(check_df['STATE_ABBR_TX']==check_df['state_cd']).sum()}")
print(f"STATE_ABBR_TX!=state_cd: {(check_df['STATE_ABBR_TX']!=check_df['state_cd']).sum()}")

In [None]:
# Show the disagreeing rows (state columns only)
check_df[check_df['STATE_ABBR_TX']!=check_df['state_cd']][['STATE_ABBR_TX', 'state_cd']]

In [None]:
df1 = check_df.groupby(['OUTG_REC_NB', 'OUTAGE_NB']).apply(lambda x: 100*(x[x['technology_tx']=='AMI'].shape[0]/x.shape[0])).to_frame(name='%AMI')
df1

In [None]:
# Some premises are listed in DOVS are simply not found in AMI
print(f"#PNs DOVS:            {df_outage['PREMISE_NB'].nunique()}")
print(f"#PNs AMI using xfmrs: {df_mp_outg['prem_nb'].nunique()}")

In [None]:
df2 = check_df.groupby(['OUTG_REC_NB', 'OUTAGE_NB']).apply(
    lambda x: len(set(df_outage[df_outage['OUTG_REC_NB']==x.name[0]]['PREMISE_NB'].unique().tolist()).difference(set(x['prem_nb'].unique().tolist())))
).to_frame(name='# PNs missing')
df2

In [None]:
df3 = check_df.groupby(['OUTG_REC_NB', 'OUTAGE_NB']).apply(
    lambda x: df_outage[df_outage['OUTG_REC_NB']==x.name[0]]['PREMISE_NB'].nunique()
).to_frame(name='# PNs total')
df3

In [None]:
df12 = pd.merge(df1, df2, how='inner', left_index=True, right_index=True)
df12

In [None]:
df123 = pd.merge(df12, df3, how='inner', left_index=True, right_index=True)
df123

In [None]:
(df123['%AMI']==0).sum()

In [None]:
df123.sort_values(by='%AMI', ascending=False)['%AMI'].tolist()

In [None]:
df123

In [None]:
# Minimum AMI percentage an outage must reach to be kept.
# NOTE: With the threshold at 0 every outage in df123 whose %AMI is not NaN passes.
min_pct_ami = 0
outg_rec_nbs_to_keep = df123[df123['%AMI']>=min_pct_ami].reset_index()['OUTG_REC_NB'].tolist()

In [None]:
# Number of DOVS premise numbers with no counterpart in the meter-premise data
len(set(df_outage['PREMISE_NB'].unique().tolist()).difference(set(df_mp_outg['prem_nb'].unique().tolist())))

In [None]:
# DOVS row count before and after restricting to the outages being kept
print(df_outage.shape[0])
print(df_outage[df_outage['OUTG_REC_NB'].isin(outg_rec_nbs_to_keep)].shape[0])

In [None]:
# Merge the retained DOVS outages with the meter-premise data, pairing records on outage record
#   number and premise number.
# Both inputs are handed over as copies (exactly as is done when check_df is built) because
#   inplace=True is requested: this keeps df_outage and df_mp_outg untouched, so the cell can be
#   re-run safely.
# NOTE(review): whether inplace=True can modify df_mp depends on DOVSOutages.merge_df_outage_with_mp,
#   which is not visible here; the copy is defensive -- confirm.
df_outage_w_mp = DOVSOutages.merge_df_outage_with_mp(
    df_outage=df_outage[df_outage['OUTG_REC_NB'].isin(outg_rec_nbs_to_keep)].copy(), 
    df_mp=df_mp_outg.copy(), 
    merge_on_outg=['OUTG_REC_NB', 'PREMISE_NB'], 
    merge_on_mp=['OUTG_REC_NB', 'prem_nb'], 
    cols_to_include_mp=None, 
    drop_cols = None, 
    rename_cols=None, 
    inplace=True
)

In [None]:
# Number of distinct outage records after merging with the meter-premise data
df_outage_w_mp['OUTG_REC_NB'].nunique()

In [None]:
df_outage_w_mp

In [None]:
# Slimmed-down version of the merged frame.
# NOTE(review): presumably one row per outage with premise numbers gathered into lists (the SQL builder
#   arguments further down use is_df_consolidated=True and a 'PREMISE_NBS' column mapping) -- confirm
#   against DOVSOutages.consolidate_df_outage.
df_outage_w_mp_slim = DOVSOutages.consolidate_df_outage(df_outage_w_mp)

In [None]:
df_outage_w_mp_slim

In [None]:
# Add the search-time information used for the AMI queries, with a half window of 24 hours.
# NOTE: The result is assigned back to the same name, so re-running this cell operates on the
#       output of its previous run.
df_outage_w_mp_slim = DOVSOutages.set_search_time_in_outage_df(
    df_outage=df_outage_w_mp_slim, 
    search_time_half_window=pd.Timedelta('24 hours')
)

In [None]:
df_outage_w_mp_slim

In [None]:
# Number of distinct outage records in the consolidated frame
df_outage_w_mp_slim['OUTG_REC_NB'].nunique()

## AMI NonVee

In [None]:
#-------------------------
usg_split_to_CTEs=True
df_construct_type=DFConstructType.kRunSqlQuery
contstruct_df_args_ami=None
addtnl_groupby_cols=['OUTG_REC_NB', 'trsf_pole_nb']

cols_of_interest_ami = TableInfos.AMINonVee_TI.std_columns_of_interest
batch_size=10
verbose=True
n_update=1

In [None]:
ami_sql_function_kwargs = dict(
    cols_of_interest=cols_of_interest_ami, 
    df_outage=df_outage_w_mp_slim, 
    split_to_CTEs=usg_split_to_CTEs, 
    join_mp_args=False, 
    df_args = dict(
        addtnl_groupby_cols=addtnl_groupby_cols, 
        mapping_to_ami={'PREMISE_NBS':'premise_nbs'}, 
        is_df_consolidated=True
    ), 
    field_to_split='df_outage', 
    field_to_split_location_in_kwargs=['df_outage'], 
    save_and_dump=True,  
    sort_coll_to_split=True,
    batch_size=batch_size, verbose=verbose, n_update=n_update
)
# addtnl_ami_sql_function_kwargs = dict(
#     build_sql_function_kwargs=dict(opco=opcos)
# )
# ami_sql_function_kwargs = {**ami_sql_function_kwargs, 
#                            **addtnl_ami_sql_function_kwargs}


save_args = dict(
    save_to_file=True, 
    save_dir = r'C:\Users\s346557\Documents\LocalData\dovs_check\forMico2\AMINonVee', 
    save_name=r'ami_nonvee.csv', 
    index=True
)

In [None]:
start=time.time()
ami_nonvee = AMINonVee(
    df_construct_type=df_construct_type, 
    contstruct_df_args = contstruct_df_args_ami, 
    build_sql_function=AMINonVee_SQL.build_sql_usg_for_outages, 
    build_sql_function_kwargs=ami_sql_function_kwargs, 
    init_df_in_constructor=True, 
    save_args=save_args
)
build_time = time.time()-start
print(build_time)

## AMI End Events

In [None]:
#-------------------------
usg_split_to_CTEs=True
df_construct_type=DFConstructType.kRunSqlQuery
contstruct_df_args_end_events=None
addtnl_groupby_cols=['OUTG_REC_NB', 'trsf_pole_nb']

cols_of_interest_end_dev_event = TableInfos.AMIEndEvents_TI.std_columns_of_interest
batch_size=10
verbose=True
n_update=1

In [None]:
end_events_sql_function_kwargs = dict(
    cols_of_interest=cols_of_interest_end_dev_event, 
    df_outage=df_outage_w_mp_slim, 
    split_to_CTEs=usg_split_to_CTEs, 
    join_mp_args=False, 
    df_args = dict(
        addtnl_groupby_cols=addtnl_groupby_cols, 
        mapping_to_ami={'PREMISE_NBS':'premise_nbs'}, 
        is_df_consolidated=True
    ), 
    field_to_split='df_outage', 
    field_to_split_location_in_kwargs=['df_outage'], 
    save_and_dump=True, 
    sort_coll_to_split=True,
    batch_size=batch_size, verbose=verbose, n_update=n_update
)
# addtnl_end_events_sql_function_kwargs = dict(
#     build_sql_function_kwargs=dict(opco=opcos)
# )
# end_events_sql_function_kwargs = {**end_events_sql_function_kwargs, 
#                                   **addtnl_end_events_sql_function_kwargs}

end_events_save_args = dict(
    save_to_file=True, 
    save_dir = r'C:\Users\s346557\Documents\LocalData\dovs_check\forMico2\EndEvents', 
    save_name=r'end_events.csv', 
    index=True
)

In [None]:
start=time.time()
end_events = AMIEndEvents(
    df_construct_type=df_construct_type, 
    contstruct_df_args = contstruct_df_args_end_events, 
    build_sql_function=AMIEndEvents_SQL.build_sql_end_events_for_outages, 
    build_sql_function_kwargs=end_events_sql_function_kwargs, 
    init_df_in_constructor=True, 
    save_args=end_events_save_args
)
end_events_build_time = time.time()-start

In [None]:
# Deliberate stop: this assertion always fails, so a top-to-bottom run halts here before reaching
#   the cells below.
# NOTE(review): presumably everything that follows is development scratch work -- confirm.
assert(0)

In [None]:
def find_overlap_intervals_in_df(
    df, 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    fuzziness, 
    int_idxs=True
):
    r"""
    Given a pd.DataFrame df with intervals defined by starting values and ending values in columns ovrlp_intrvl_0_col
      and ovrlp_intrvl_1_col, respectively, find the reduced set of overlap intervals.
    Two intervals are merged into the same overlap interval whenever the later one begins no more than fuzziness
      after the (running) end of the earlier one.
    A missing (NaN/NaT) end value is treated as an open-ended interval, i.e., as extending to infinity.
    
    Returns a list of dict objects, one for each overlap interval, ordered by starting value.
        The keys for each dict object are 'min_val', 'max_val', and 'idxs'.
        By default (when int_idxs==True) 'idxs' are the integer positions, WITHIN THE df PASSED IN, of the rows
          included in the overlap (i.e., they can be used directly with df.iloc, regardless of whether or not df
          was sorted by the caller).
        If int_idxs==False, the index labels of df are instead used.
        An empty df gives an empty list.
          
    NOTE: The function/operation calling this method should ensure all values are appropriate, meaning ensure that for each row
            row_i[ovrlp_intrvl_1_col] > row_i[ovrlp_intrvl_0_col]
          This method will not attempt to remedy any incorrect values, but will simply assert this is true
    """
    #-------------------------
    # No rows means no intervals (and nothing below can be evaluated on an empty frame)
    if df.shape[0]==0:
        return []
    #-------------------------
    # Make sure fuzziness is compatible with ovrlp_intrvl_0(1)_col
    # This is only a diagnostic: the message is printed and execution continues, so that any failure
    #   surfaces at the comparison which actually needs the addition.
    try:
        _ = df.iloc[0][ovrlp_intrvl_0_col]+fuzziness
        _ = df.iloc[0][ovrlp_intrvl_1_col]+fuzziness
    except Exception:
        print(f'''
        In find_overlap_intervals_in_df: Incompatible fuzziness type
            type(fuzziness) = {type(fuzziness)}
            df[ovrlp_intrvl_0_col].dtype = {df[ovrlp_intrvl_0_col].dtype}
            df[ovrlp_intrvl_1_col].dtype = {df[ovrlp_intrvl_1_col].dtype}
        CRASH IMMINENT!
        ''')
    #-------------------------
    # The second element of each interval must be greater than the first.
    # NOTE: The second element can be missing (NaT for times, NaN otherwise), meaning an open-ended interval.
    #       Any comparison against a missing value evaluates to False, so the single-line assertion
    #           assert(all(df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]))
    #         would fail for open-ended intervals, which is unwanted.
    #       Therefore, two assertions are used: one to ensure the start values contain no missing values, and one
    #         to ensure that either the end value is missing or start<end.
    assert(all(df[ovrlp_intrvl_0_col].notna()))
    assert(all((df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]) | (df[ovrlp_intrvl_1_col].isna())))
    #-------------------------
    # Sort the intervals, as is necessary for this procedure.
    # The index is first replaced by 0..n-1 so that, after sorting, it records the integer position each
    #   row had in the df passed in by the caller.
    df_srtd = df.reset_index(drop=True).sort_values(by=[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])
    og_positions = df_srtd.index.tolist()
    if int_idxs:
        row_ids = og_positions
    else:
        row_ids = df.index[og_positions].tolist()
    begs = df_srtd[ovrlp_intrvl_0_col].tolist()
    ends = df_srtd[ovrlp_intrvl_1_col].tolist()
    #-------------------------
    # The first overlap interval is seeded with the first (i.e., earliest starting) row
    current_end = ends[0]
    overlaps = [dict(min_val=begs[0], max_val=current_end, idxs=[row_ids[0]])]
    #-------------------------
    # Walk through the remaining rows, either extending the most recent overlap interval or starting a new one
    for row_id, beg, end in zip(row_ids[1:], begs[1:], ends[1:]):
        if beg > current_end+fuzziness:
            # beg after current end (with fuzziness buffer), so new interval needed
            # NOTE: beg > current_end+fuzziness evaluates to False whenever current_end is missing, 
            #       which is the desired functionality (nothing can begin after an open-ended interval).
            overlaps.append(dict(min_val=beg, max_val=end, idxs=[row_id]))
            current_end = end
        else:
            # beg <= current_end+fuzziness, so overlap
            # The beginning of overlaps[-1] remains the same, but its end becomes the larger of current_end and end, 
            #   where a missing end counts as infinity.
            # NOTE: max() cannot be used blindly, as comparisons with missing values are always False, and no
            #       explicit missing value can be assigned here because the function must work for various
            #       data types (NaT vs NaN).  Hence, the missing value already in hand is kept/propagated.
            if pd.isna(current_end):
                # Already open-ended, remains open-ended
                pass
            elif pd.isna(end):
                current_end = end
            else:
                current_end = max(current_end, end)
            overlaps[-1]['max_val'] = current_end
            overlaps[-1]['idxs'].append(row_id)
    #-------------------------
    return overlaps

In [None]:
def consolidate_df_group_according_to_fuzzy_overlap_intervals(
    df_i, 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    gpd_cols, 
    fuzziness, 
    assert_single_overlap=False, 
    maintain_original_cols=False, 
    enforce_list_cols_for_1d_df=True, 
    allow_duplicates_in_lists=False, 
    allow_NaNs_in_lists=False
):
    r"""
    Reduce a single group df_i to one row per (fuzzy) overlap interval.
    This is for the specific case of a df group: it is expected, and enforced, that each of the gpd_cols
      contains at most a single unique value in df_i.
    The intervals are defined by ovrlp_intrvl_0_col (start) and ovrlp_intrvl_1_col (end), and are reduced using
      find_overlap_intervals_in_df with the given fuzziness.
    All columns outside of gpd_cols+[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col] are collected into lists, and the
      same lists are attached to every returned row.
    The input df_i is never modified.
      
    gpd_cols:
        For the typical case where this function is used inside of a groupby().apply(lambda x:) function, the
          gpd_cols should match those input into groupby (for a typical use case, see 
          Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals)
        Each of these columns should contain a single unique value.
        This input is needed so the function knows which columns to collect in lists (those outside of gpd_cols+
          [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])
        A single (non-list) value is accepted and wrapped in a list.
          
    assert_single_overlap:
        If True, fail (after printing diagnostics) unless exactly one overlap interval is found.
        
    maintain_original_cols:
        If True, the returned columns are restricted to, and ordered as, the columns of df_i.
        Also, if True and df_i has at most one row (and there are columns to collect), df_i is returned
          without going through the overlap procedure (see enforce_list_cols_for_1d_df).
          
    enforce_list_cols_for_1d_df:
        Only relevant for the short-cut described under maintain_original_cols.  If True, the values of the
          columns to collect are wrapped in lists so the form agrees with that of multi-row groups.
          
    allow_duplicates_in_lists, allow_NaNs_in_lists:
        Select which Utilities_df aggregation function is used to build the lists.
        
    Returns a pd.DataFrame with one row per overlap interval.
    """
    #-------------------------
    if not isinstance(gpd_cols, list):
        gpd_cols = [gpd_cols]
    ovrlp_cols = [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col]
    #-------------------------
    assert(len(set(gpd_cols).intersection(set(ovrlp_cols)))==0)
    assert((df_i[gpd_cols].nunique()<=1).all())
    cols_to_collect_in_lists = [x for x in df_i.columns.tolist() if x not in gpd_cols+ovrlp_cols]
    #-------------------------
    og_cols = df_i.columns
    #-------------------------
    if df_i.shape[0]<=1 and maintain_original_cols and len(cols_to_collect_in_lists)>0: 
        # Reason for maintain_original_cols:
        #   If this is used in .groupby, it is important for all to have same shape/labelling.
        #   So, if maintain_original_cols is False, in order for a df_i with df_i.shape[0]==1 to fit
        #     into the .groupby procedure, it must go through the steps below to ensure it has the same
        #     form as the other groups.
        if enforce_list_cols_for_1d_df:
            # Make all cols_to_collect_in_lists into lists (by construction, these never include
            #   ovrlp_intrvl_0_col or ovrlp_intrvl_1_col).
            # Work on a copy so the caller's frame (e.g., a group handed over by groupby.apply) is not altered.
            df_i = df_i.copy()
            df_i[cols_to_collect_in_lists] = df_i[cols_to_collect_in_lists].apply(lambda x: [x.tolist()])
        return df_i
    #-------------------------
    if len(cols_to_collect_in_lists)>0:
        # Collect the values from cols_to_collect_in_lists, and remove from df_i
        if allow_NaNs_in_lists:
            if allow_duplicates_in_lists:
                agg_func = Utilities_df.agg_func_list
            else:
                agg_func = Utilities_df.agg_func_unq_list
        else:
            if allow_duplicates_in_lists:
                agg_func = Utilities_df.agg_func_list_dropna
            else:
                agg_func = Utilities_df.agg_func_unq_list_dropna
        list_cols_df = df_i.groupby(gpd_cols, as_index=False, group_keys=False)[cols_to_collect_in_lists].agg(lambda x: agg_func(x))
        assert(list_cols_df.shape[0]==1)
        #-----
        # NOTE: drop returns a new frame, so the caller's df_i is left intact
        df_i = df_i.drop(columns=cols_to_collect_in_lists)
    #-------------------------
    overlaps = find_overlap_intervals_in_df(
        df=df_i, 
        ovrlp_intrvl_0_col=ovrlp_intrvl_0_col, 
        ovrlp_intrvl_1_col=ovrlp_intrvl_1_col, 
        fuzziness=fuzziness, 
        int_idxs=True
    )
    #-------------------------
    if assert_single_overlap and len(overlaps)!=1:
        print(f'assert_single_overlap and len(overlaps)={len(overlaps)}')
        print(overlaps)
        print(f'df_i.head():\n{df_i.head()}')
        assert(0)
    #-------------------------
    # The number of overlaps must be less than or equal to the shape of df_i
    assert(len(overlaps) <= df_i.shape[0])

    # Already know (assertion at top of function) that all columns are uniform except for [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col]
    # Therefore, for return df, just grab as many rows as needed from df_i, and replace the [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col]
    #   values with those from overlaps
    return_df = df_i.iloc[0:len(overlaps)].copy()
    assert(return_df.shape[0]==len(overlaps))
    #-------------------------
    # Determine the index positions of ovrlp_intrvl_0_col and ovrlp_intrvl_1_col, as these will be needed
    #   below to set to values in the consolidated DF using iloc
    ovrlp_intrvl_0_col_idx = Utilities_df.find_idxs_in_highest_order_of_columns(return_df, ovrlp_intrvl_0_col)
    assert(len(ovrlp_intrvl_0_col_idx)==1)
    ovrlp_intrvl_0_col_idx=ovrlp_intrvl_0_col_idx[0]
    #-----
    ovrlp_intrvl_1_col_idx = Utilities_df.find_idxs_in_highest_order_of_columns(return_df, ovrlp_intrvl_1_col)
    assert(len(ovrlp_intrvl_1_col_idx)==1)
    ovrlp_intrvl_1_col_idx=ovrlp_intrvl_1_col_idx[0]    
    #----------
    for i_row, overlap_i in enumerate(overlaps):
        return_df.iloc[i_row, ovrlp_intrvl_0_col_idx] = overlap_i['min_val']
        return_df.iloc[i_row, ovrlp_intrvl_1_col_idx] = overlap_i['max_val']
    #-------------------------
    if len(cols_to_collect_in_lists)>0:
        # Add back on the list_cols_df
        # NOTE: list_cols_df has exactly one row (asserted above) and every row of return_df carries the same
        #       gpd_cols values, so merging against the single row attaches the lists to each overlap row.
        #       Merging against multiple stacked copies of list_cols_df would instead match each of the n rows
        #       in return_df to each of the n copies, returning n*n rows.
        return_df = return_df.merge(
            list_cols_df, 
            left_on=gpd_cols, right_on=gpd_cols, how='left'
        )
    #-------------------------
    if maintain_original_cols:
        assert(len(set(og_cols).difference(set(return_df.columns)))==0)
        return_df = return_df[og_cols]
    #-------------------------
    return return_df

In [None]:
# Ad-hoc call of the group-level consolidation function defined in this notebook, with the
#   'technology_tx' and 'state_cd' columns removed from the group first.
# NOTE(review): df_i, ovrlp_intrvl_0_col and ovrlp_intrvl_1_col are not assigned at notebook level in any
#   cell visible here, so this cell presumably relies on leftover kernel state -- confirm or define them.
consolidate_df_group_according_to_fuzzy_overlap_intervals(
    df_i.drop(columns=['technology_tx', 'state_cd']), 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    gpd_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'], 
    fuzziness=pd.Timedelta('1hour'), 
    assert_single_overlap=False, 
    maintain_original_cols=False, 
    enforce_list_cols_for_1d_df=True, 
    allow_duplicates_in_lists=False, 
    allow_NaNs_in_lists=False
)

In [None]:

def consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df, 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    fuzziness, 
    groupby_cols, 
    assert_single_overlap=True, 
    cols_to_collect_in_lists=None, 
    recover_uniqueness_violators=False, 
    gpby_dropna=True, 
    allow_duplicates_in_lists=False, 
    allow_NaNs_in_lists=False
):
    r"""
    Consolidate the intervals in df, group by group, into their fuzzy overlap intervals.

    df is grouped by groupby_cols and every group is handed to
      consolidate_df_group_according_to_fuzzy_overlap_intervals (called with maintain_original_cols=True and
      enforce_list_cols_for_1d_df=True).  The per-group results are combined by groupby().apply.

    df:
        Input pd.DataFrame.  A copy is made, so the caller's object is not altered.
    ovrlp_intrvl_0_col, ovrlp_intrvl_1_col:
        Columns holding the beginning and the end of each interval.  Neither may be contained in groupby_cols.
        Rows with a NaN beginning, or with beginning >= end, are dropped after printing a warning.
        A NaN end is allowed.
    fuzziness:
        Passed through to the group-level function.  Must support addition with the interval values.
    groupby_cols:
        Columns to group by.  If None, all rows are treated as a single group; a temporary constant column is
          used for the grouping and is removed again before returning.
    assert_single_overlap, allow_duplicates_in_lists, allow_NaNs_in_lists:
        Passed through to the group-level function.
    cols_to_collect_in_lists, recover_uniqueness_violators:
        Currently not used by this implementation.
    gpby_dropna:
        Passed to df.groupby as dropna.

    Returns the object produced by groupby().apply.
    If no valid rows remain (including an empty input), the empty, filtered DataFrame is returned.
    """
    #-------------------------
    # Don't want to alter df itself
    df = df.copy()
    #-------------------------
    if groupby_cols is None:
        # No grouping requested: group on a temporary constant column (dropped again before returning)
        tmp_col = Utilities.generate_random_string()
        df[tmp_col] = 1
        groupby_cols = [tmp_col]
    else:
        tmp_col = None
    #-------------------------
    assert(len(set(groupby_cols).intersection(set([ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])))==0)
    #-------------------------
    # Make sure fuzziness is compatible with ovrlp_intrvl_0(1)_col
    # The probe needs at least one row; an empty df is handled further below
    try:
        if df.shape[0]>0:
            df.iloc[0][ovrlp_intrvl_0_col]+fuzziness
            df.iloc[0][ovrlp_intrvl_1_col]+fuzziness
    except Exception:
        print(f'''
        In consolidate_df_according_to_fuzzy_overlap_intervals: Incompatible fuzziness type
            type(fuzziness) = {type(fuzziness)}
            df[ovrlp_intrvl_0_col].dtype = {df[ovrlp_intrvl_0_col].dtype}
            df[ovrlp_intrvl_1_col].dtype = {df[ovrlp_intrvl_1_col].dtype}
        CRASH IMMINENT!
        ''')
    #-------------------------
    # The end of each interval must be greater than its beginning.
    # NOTE: The end can be NaN (NaT for times), which means an open-ended interval.  Any comparison against NaN
    #         evaluates to False, so a plain all(beg<end) check would wrongly reject open-ended intervals.
    #       Therefore two separate conditions are used: beg must not be NaN, and either end is NaN or beg<end.
    # NOTE: At least one case with end<beg (by one second) has been observed in the data, so instead of
    #         raising, a warning is printed and the offending rows are removed.
    if any(df[ovrlp_intrvl_0_col].isna()):
        print(f'''
            !!!!! WARNING !!!!! In consolidate_df_according_to_fuzzy_overlap_intervals:
            df[ovrlp_intrvl_0_col] has NaN values!
            df[ovrlp_intrvl_0_col].isna().sum() = {df[ovrlp_intrvl_0_col].isna().sum()}
            df.shape[0]                         = {df.shape[0]}
            Row containing these NaNs will be omitted!
        ''')
        df = df[df[ovrlp_intrvl_0_col].notna()]
    #-----
    # NOTE: In printing the output below, the parentheses in (~scnd_gt_frst_srs).sum() are important!
    #       (~scnd_gt_frst_srs).sum() != ~scnd_gt_frst_srs.sum() (the latter essentially equals -1*scnd_gt_frst_srs.sum())
    scnd_gt_frst_srs = ((df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]) | (df[ovrlp_intrvl_1_col].isna()))
    if any(~scnd_gt_frst_srs):
        print(f'''
            !!!!! WARNING !!!!! In consolidate_df_according_to_fuzzy_overlap_intervals:
            df has values for which df[ovrlp_intrvl_0_col]>=df[ovrlp_intrvl_1_col]!
            Number of violators = {(~scnd_gt_frst_srs).sum()}
            df.shape[0]         = {df.shape[0]}
            Rows containing these violators will be omitted!
        ''')
        df = df[scnd_gt_frst_srs]
    # Now, at this stage assertions should both pass    
    assert(all(df[ovrlp_intrvl_0_col].notna()))
    assert(all((df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]) | (df[ovrlp_intrvl_1_col].isna())))
    # Also, sort ranges, as will be necessary for this procedure
    df = df.sort_values(by=[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])
    #-------------------------
    if df.shape[0]==0:
        # Nothing (left) to consolidate
        if tmp_col is not None:
            df = df.drop(columns=[tmp_col])
        return df
    #-------------------------
    return_df = df.groupby(groupby_cols, dropna=gpby_dropna, as_index=False, group_keys=False).apply(
        lambda x: consolidate_df_group_according_to_fuzzy_overlap_intervals(
            df_i=x, 
            ovrlp_intrvl_0_col=ovrlp_intrvl_0_col, 
            ovrlp_intrvl_1_col=ovrlp_intrvl_1_col, 
            gpd_cols=groupby_cols, 
            fuzziness=fuzziness, 
            assert_single_overlap=assert_single_overlap, 
            maintain_original_cols=True, 
            enforce_list_cols_for_1d_df=True, 
            allow_duplicates_in_lists=allow_duplicates_in_lists,
            allow_NaNs_in_lists=allow_NaNs_in_lists
        )
    )
    #-------------------------
    # The temporary grouping column is an implementation detail and must not leak into the output
    if tmp_col is not None:
        return_df = return_df.drop(columns=[tmp_col], errors='ignore')
    return return_df

In [None]:

# Utilities_df implementation, applied to df_i with technology_tx and state_cd removed.
Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals(
    df=df_i.drop(columns=['technology_tx', 'state_cd']),
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
    drop_idx_cols=True,
)

In [None]:

# Notebook-local _NEW implementation, applied to df_i with technology_tx and state_cd removed.
consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df=df_i.drop(columns=['technology_tx', 'state_cd']),
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
)

In [None]:
# df_mp_hist_exp

In [None]:

# Utilities_df implementation, applied to the full df_i (all columns kept).
Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals(
    df=df_i,
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
    drop_idx_cols=True,
)

In [None]:

# Notebook-local _NEW implementation, applied to the full df_i (all columns kept).
consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df=df_i,
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
)

In [None]:
# Display df_i (the frame passed to the consolidation calls in the surrounding cells).
df_i

In [None]:
# Names of the columns holding the beginning and the end of each interval.
ovrlp_intrvl_0_col, ovrlp_intrvl_1_col = 'inst_ts', 'rmvl_ts'

In [None]:
# For every column other than the two interval columns, flag whether it holds at most one distinct value.
df_i[
    [col for col in df_i.columns.tolist() if col not in (ovrlp_intrvl_0_col, ovrlp_intrvl_1_col)]
].nunique().le(1)

In [None]:

# Utilities_df implementation on the full df_i, additionally grouping by technology_tx.
Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals(
    df=df_i,
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'technology_tx'],
    assert_single_overlap=False,
)

In [None]:

# Notebook-local _NEW implementation on the full df_i, additionally grouping by technology_tx.
consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df=df_i,
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'technology_tx'],
    assert_single_overlap=False,
)

In [None]:

# Notebook-local _NEW implementation on the full df_i, grouping by technology_tx and state_cd as well.
consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df=df_i,
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'technology_tx', 'state_cd'],
    assert_single_overlap=False,
)

In [None]:
# Display df_mp_hist_exp (input to the consolidation calls in the following cells).
df_mp_hist_exp

In [None]:

# Utilities_df implementation, applied to df_mp_hist_exp with technology_tx and state_cd removed.
Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals(
    df=df_mp_hist_exp.drop(columns=['technology_tx', 'state_cd']),
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
    drop_idx_cols=True,
)

In [None]:

# Notebook-local _NEW implementation, applied to df_mp_hist_exp with technology_tx and state_cd removed.
consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df=df_mp_hist_exp.drop(columns=['technology_tx', 'state_cd']),
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
)

In [None]:
# Display df_mp_hist_exp again, ahead of the timed run on the full frame in the next cell.
df_mp_hist_exp

In [None]:
# Time the notebook-local _NEW implementation on the full df_mp_hist_exp;
#   the elapsed wall-clock seconds are printed and the result is kept in consol2.
start = time.time()
consol2 = consolidate_df_according_to_fuzzy_overlap_intervals_NEW(
    df=df_mp_hist_exp,
    ovrlp_intrvl_0_col='inst_ts',
    ovrlp_intrvl_1_col='rmvl_ts',
    fuzziness=pd.Timedelta('1 hour'),
    groupby_cols=['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb'],
    assert_single_overlap=False,
)
print(time.time()-start)
consol2

In [None]:
        # Consolidate return_df_w_dups with the Utilities_df implementation and overwrite it with the result.
        # NOTE(review): this looks like a snippet lifted from inside a function body (hence the indentation);
        #   none of the variables it reads are defined in the visible part of this notebook -- TODO confirm
        #   whether the cell is still needed.
        return_df_w_dups = Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals(
            df=return_df_w_dups, 
            ovrlp_intrvl_0_col=df_mp_install_time_col, 
            ovrlp_intrvl_1_col=df_mp_removal_time_col, 
            fuzziness=fuzziness, 
            groupby_cols=groupby_cols, 
            assert_single_overlap=assert_single_overlap, 
            cols_to_collect_in_lists=cols_to_collect_in_lists, 
            recover_uniqueness_violators=False, 
            gpby_dropna=gpby_dropna, 
            allow_duplicates_in_lists=False, 
            allow_NaNs_in_lists=False
        )

In [None]:
def find_overlap_intervals_in_df(
    df, 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    fuzziness, 
    int_idxs=True
):
    r"""
    Given a pd.DataFrame df with intervals defined by starting values and ending values in columns ovrlp_intrvl_0_col
      and ovrlp_intrvl_1_col, respectively, find the reduced set of overlap intervals.
    The fuzziness argument sets how close two intervals must be to be considered overlapping: a new overlap is
      started only when an interval begins after (running end of the current overlap + fuzziness).
    A NaN/NaT end value is treated as an open-ended interval, so once the running end is NaN every later
      interval is absorbed into that overlap.

    Returns a list of dict objects, one for each overlap interval, in order of increasing starting value.
        The keys for each dict object are 'min_val', 'max_val', and 'idxs'.
        By default (when int_idxs==True) 'idxs' are the integer positions of the member rows in df AFTER it has
          been sorted by [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col] (i.e., not positions in the unsorted input).
        If int_idxs==False, the index labels of the member rows are used instead.
        An empty df returns an empty list.

    NOTE: The function/operation calling this method should ensure all values are appropriate, meaning ensure that for each row
            row_i[ovrlp_intrvl_1_col] > row_i[ovrlp_intrvl_0_col]   (or row_i[ovrlp_intrvl_1_col] is NaN)
          and that row_i[ovrlp_intrvl_0_col] is never NaN.
          This method will not attempt to remedy any incorrect values, but will simply assert this is true
    """
    #-------------------------
    # No intervals means no overlaps (and nothing below can be probed via df.iloc[0])
    if df.shape[0]==0:
        return []
    #-------------------------
    # Make sure fuzziness is compatible with ovrlp_intrvl_0(1)_col
    # This is only a diagnostic: the message is printed and execution continues
    try:
        df.iloc[0][ovrlp_intrvl_0_col]+fuzziness
        df.iloc[0][ovrlp_intrvl_1_col]+fuzziness
    except Exception:
        print(f'''
        In find_overlap_intervals_in_df: Incompatible fuzziness type
            type(fuzziness) = {type(fuzziness)}
            df[ovrlp_intrvl_0_col].dtype = {df[ovrlp_intrvl_0_col].dtype}
            df[ovrlp_intrvl_1_col].dtype = {df[ovrlp_intrvl_1_col].dtype}
        CRASH IMMINENT!
        ''')
    #-------------------------
    # The end of each interval must be greater than its beginning.
    # NOTE: The end can be NaN (NaT for times).  Any comparison against NaN evaluates to False, so a plain
    #         assert(all(df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]))
    #       would wrongly fail for open-ended intervals.  Therefore two assertions are used: the beginning
    #       must never be NaN, and either the end is NaN or beginning<end.
    assert(all(df[ovrlp_intrvl_0_col].notna()))
    assert(all((df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]) | (df[ovrlp_intrvl_1_col].isna())))
    #-------------------------
    # Sort ranges, as will be necessary for this procedure (NaN ends sort last within equal beginnings)
    df = df.sort_values(by=[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])
    #-------------------------
    # Identifier recorded for each row: position in the sorted df, or its index label
    row_ids = range(df.shape[0]) if int_idxs else df.index
    rows = zip(row_ids, df[ovrlp_intrvl_0_col], df[ovrlp_intrvl_1_col])
    #-------------------------
    # The first overlap starts out simply as the first (sorted) interval
    first_id, current_beg, current_end = next(rows)
    overlaps = [dict(min_val=current_beg, max_val=current_end, idxs=[first_id])]
    #-------------------------
    # Iterate through the remaining intervals, either extending the last overlap or opening a new one
    for idx, beg, end in rows:
        if beg > current_end+fuzziness:
            # beg after current end (with fuzziness buffer), so new interval needed
            # NOTE: beg > current_end+fuzziness will evaluate to False whenever current_end
            #       is NaN, which is the desired functionality.
            overlaps.append(dict(min_val=beg, max_val=end, idxs=[idx]))
            current_end = end
        else:
            # beg <= current_end+fuzziness, so overlap
            # The beg of overlaps[-1] remains the same, but the end of overlaps[-1] should be changed to
            #   the max of current_end and end, where a NaN end counts as +Inf (no end).
            # NOTE: Cannot simply use max(current_end, end), because comparisons with NaN are always False
            #       and the NaN would be lost.  The NaN object already present in the data is kept, so this
            #       works for any data type (NaT, float NaN, etc.).
            if not pd.isna(current_end):
                current_end = end if pd.isna(end) else max(current_end, end)
            overlaps[-1]['max_val'] = current_end
            overlaps[-1]['idxs'].append(idx)
    #-------------------------
    return overlaps

def consolidate_df_group_according_to_fuzzy_overlap_intervals(
    df_i, 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    gpd_cols, 
    fuzziness, 
    assert_single_overlap=False, 
    maintain_original_cols=False, 
    enforce_list_cols_for_1d_df=True, 
    allow_duplicates_in_lists=False, 
    allow_NaNs_in_lists=False
):
    r"""
    Consolidate the intervals of a single df group into its fuzzy overlap intervals.
    The result contains one row per overlap interval found by find_overlap_intervals_in_df, with
      ovrlp_intrvl_0_col/ovrlp_intrvl_1_col set to the overlap's min_val/max_val.
    Every column outside of gpd_cols+[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col] is collected into a list
      (one list for the whole group), which is attached to every returned row.

    gpd_cols:
        For the typical case where this function is used inside of a groupby().apply(lambda x:) function, the
          gpd_cols should match those input into groupby (for a typical use case, see 
          Utilities_df.consolidate_df_according_to_fuzzy_overlap_intervals)
        Each of these columns must contain a single unique value (this is asserted).
        This input is needed so the function knows which columns to collect in lists (those outside of gpd_cols+
          [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])
    fuzziness:
        Passed to find_overlap_intervals_in_df.
    assert_single_overlap:
        If True, anything other than exactly one overlap interval triggers an AssertionError
          (after printing the overlaps found).
    maintain_original_cols:
        If True, the returned columns are those of df_i, in their original order.
    enforce_list_cols_for_1d_df:
        Only relevant for the single-row shortcut (taken when df_i has at most one row, maintain_original_cols
          is True and there are columns to collect).  If True, the values of the collected columns are wrapped
          in one-element lists so the row has the same form as the rows produced for larger groups.
    allow_duplicates_in_lists, allow_NaNs_in_lists:
        Select which of agg_func_list, agg_func_unq_list, agg_func_list_dropna, agg_func_unq_list_dropna
          is used to build the lists.

    df_i itself is not modified.
    """
    #-------------------------
    if not isinstance(gpd_cols, list):
        gpd_cols = [gpd_cols]
    #-------------------------
    assert(len(set(gpd_cols).intersection(set([ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])))==0)
    assert((df_i[gpd_cols].nunique()<=1).all())
    cols_to_collect_in_lists = [x for x in df_i.columns.tolist() if x not in gpd_cols+[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col]]
    #-------------------------
    og_cols = df_i.columns
    #-------------------------
    if df_i.shape[0]<=1 and maintain_original_cols and len(cols_to_collect_in_lists)>0: 
        # Shortcut: with at most one row there is nothing to consolidate.
        # When used in .groupby, all groups must come back with the same shape/labelling, which is why the
        #   shortcut is only taken when the original columns are being maintained.
        if enforce_list_cols_for_1d_df and df_i.shape[0]==1:
            # Work on a copy so the caller's (group) frame is left untouched
            df_i = df_i.copy()
            for col in cols_to_collect_in_lists:
                # Build the one-element object array explicitly: assigning an array (rather than a
                #   Series/DataFrame) involves no index alignment, so the list ends up in the row regardless
                #   of the row's index label.
                wrapped = np.empty(1, dtype=object)
                wrapped[0] = df_i[col].tolist()
                df_i[col] = wrapped
        return df_i
    #-------------------------
    if len(cols_to_collect_in_lists)>0:
        # Collect the values from cols_to_collect_in_lists, and remove from df_i
        if allow_NaNs_in_lists:
            if allow_duplicates_in_lists:
                agg_func = agg_func_list
            else:
                agg_func = agg_func_unq_list
        else:
            if allow_duplicates_in_lists:
                agg_func = agg_func_list_dropna
            else:
                agg_func = agg_func_unq_list_dropna
        list_cols_df = df_i.groupby(gpd_cols, as_index=False, group_keys=False)[cols_to_collect_in_lists].agg(lambda x: agg_func(x))
        assert(list_cols_df.shape[0]==1)
        #-----
        df_i = df_i.drop(columns=cols_to_collect_in_lists)
    #-------------------------
    overlaps = find_overlap_intervals_in_df(
        df=df_i, 
        ovrlp_intrvl_0_col=ovrlp_intrvl_0_col, 
        ovrlp_intrvl_1_col=ovrlp_intrvl_1_col, 
        fuzziness=fuzziness, 
        int_idxs=True
    )
    #-------------------------
    if assert_single_overlap and len(overlaps)!=1:
        print(f'assert_single_overlap and len(overlaps)={len(overlaps)}')
        print(overlaps)
        print(f'df_i.head():\n{df_i.head()}')
        assert(0)
    #-------------------------
    # The number of overlaps must be less than or equal to the shape of df_i
    assert(len(overlaps) <= df_i.shape[0])

    # At this point df_i only holds gpd_cols (uniform, see assertion at top of function) and
    #   [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col].
    # Therefore, for return df, just grab as many rows as needed from df_i, and replace the [ovrlp_intrvl_0_col, ovrlp_intrvl_1_col]
    #   values with those from overlaps
    return_df = df_i.iloc[0:len(overlaps)].copy()
    assert(return_df.shape[0]==len(overlaps))
    #-------------------------
    # Determine the index positions of ovrlp_intrvl_0_col and ovrlp_intrvl_1_col, as these will be needed
    #   below to set to values in the consolidated DF using iloc
    ovrlp_intrvl_0_col_idx = find_idxs_in_highest_order_of_columns(return_df, ovrlp_intrvl_0_col)
    assert(len(ovrlp_intrvl_0_col_idx)==1)
    ovrlp_intrvl_0_col_idx=ovrlp_intrvl_0_col_idx[0]
    #-----
    ovrlp_intrvl_1_col_idx = find_idxs_in_highest_order_of_columns(return_df, ovrlp_intrvl_1_col)
    assert(len(ovrlp_intrvl_1_col_idx)==1)
    ovrlp_intrvl_1_col_idx=ovrlp_intrvl_1_col_idx[0]    
    #----------
    for i_row in range(return_df.shape[0]):
        return_df.iloc[i_row, ovrlp_intrvl_0_col_idx] = overlaps[i_row]['min_val']
        return_df.iloc[i_row, ovrlp_intrvl_1_col_idx] = overlaps[i_row]['max_val']
    #-------------------------
    if len(cols_to_collect_in_lists)>0:
        # Add back on the list_cols_df
        # list_cols_df has exactly one row (asserted above) and every row of return_df carries the same
        #   gpd_cols values, so a left merge against that single row attaches the lists to each overlap row.
        # NOTE: list_cols_df must NOT be replicated before merging; identical keys on both sides would give
        #       a many-to-many merge and return n_overlaps**2 rows.
        return_df = return_df.merge(
            list_cols_df, 
            left_on=gpd_cols, right_on=gpd_cols, how='left'
        )
    #-------------------------
    if maintain_original_cols:
        assert(len(set(og_cols).difference(set(return_df.columns)))==0)
        return_df = return_df[og_cols]
    #-------------------------
    return return_df


def consolidate_df_according_to_fuzzy_overlap_intervals(
    df, 
    ovrlp_intrvl_0_col, 
    ovrlp_intrvl_1_col, 
    fuzziness, 
    groupby_cols, 
    assert_single_overlap=True, 
    cols_to_collect_in_lists=None, 
    recover_uniqueness_violators=False, 
    gpby_dropna=True, 
    allow_duplicates_in_lists=False, 
    allow_NaNs_in_lists=False
):
    r"""
    Consolidate the intervals in df, group by group, into their fuzzy overlap intervals.

    df is grouped by groupby_cols and every group is handed to
      consolidate_df_group_according_to_fuzzy_overlap_intervals (called with maintain_original_cols=True and
      enforce_list_cols_for_1d_df=True).  The per-group results are combined by groupby().apply.

    df:
        Input pd.DataFrame.  A copy is made, so the caller's object is not altered.
    ovrlp_intrvl_0_col, ovrlp_intrvl_1_col:
        Columns holding the beginning and the end of each interval.  Neither may be contained in groupby_cols.
        Rows with a NaN beginning, or with beginning >= end, are dropped after printing a warning.
        A NaN end is allowed.
    fuzziness:
        Passed through to the group-level function.  Must support addition with the interval values.
    groupby_cols:
        Columns to group by.  If None, all rows are treated as a single group; a temporary constant column is
          used for the grouping and is removed again before returning.
    assert_single_overlap, allow_duplicates_in_lists, allow_NaNs_in_lists:
        Passed through to the group-level function.
    cols_to_collect_in_lists, recover_uniqueness_violators:
        Currently not used by this implementation.
    gpby_dropna:
        Passed to df.groupby as dropna.

    Returns the object produced by groupby().apply.
    If no valid rows remain (including an empty input), the empty, filtered DataFrame is returned.
    """
    #-------------------------
    # Don't want to alter df itself
    df = df.copy()
    #-------------------------
    if groupby_cols is None:
        # No grouping requested: group on a temporary constant column (dropped again before returning)
        tmp_col = Utilities.generate_random_string()
        df[tmp_col] = 1
        groupby_cols = [tmp_col]
    else:
        tmp_col = None
    #-------------------------
    assert(len(set(groupby_cols).intersection(set([ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])))==0)
    #-------------------------
    # Make sure fuzziness is compatible with ovrlp_intrvl_0(1)_col
    # The probe needs at least one row; an empty df is handled further below
    try:
        if df.shape[0]>0:
            df.iloc[0][ovrlp_intrvl_0_col]+fuzziness
            df.iloc[0][ovrlp_intrvl_1_col]+fuzziness
    except Exception:
        print(f'''
        In consolidate_df_according_to_fuzzy_overlap_intervals: Incompatible fuzziness type
            type(fuzziness) = {type(fuzziness)}
            df[ovrlp_intrvl_0_col].dtype = {df[ovrlp_intrvl_0_col].dtype}
            df[ovrlp_intrvl_1_col].dtype = {df[ovrlp_intrvl_1_col].dtype}
        CRASH IMMINENT!
        ''')
    #-------------------------
    # The end of each interval must be greater than its beginning.
    # NOTE: The end can be NaN (NaT for times), which means an open-ended interval.  Any comparison against NaN
    #         evaluates to False, so a plain all(beg<end) check would wrongly reject open-ended intervals.
    #       Therefore two separate conditions are used: beg must not be NaN, and either end is NaN or beg<end.
    # NOTE: At least one case with end<beg (by one second) has been observed in the data, so instead of
    #         raising, a warning is printed and the offending rows are removed.
    if any(df[ovrlp_intrvl_0_col].isna()):
        print(f'''
            !!!!! WARNING !!!!! In consolidate_df_according_to_fuzzy_overlap_intervals:
            df[ovrlp_intrvl_0_col] has NaN values!
            df[ovrlp_intrvl_0_col].isna().sum() = {df[ovrlp_intrvl_0_col].isna().sum()}
            df.shape[0]                         = {df.shape[0]}
            Row containing these NaNs will be omitted!
        ''')
        df = df[df[ovrlp_intrvl_0_col].notna()]
    #-----
    # NOTE: In printing the output below, the parentheses in (~scnd_gt_frst_srs).sum() are important!
    #       (~scnd_gt_frst_srs).sum() != ~scnd_gt_frst_srs.sum() (the latter essentially equals -1*scnd_gt_frst_srs.sum())
    scnd_gt_frst_srs = ((df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]) | (df[ovrlp_intrvl_1_col].isna()))
    if any(~scnd_gt_frst_srs):
        print(f'''
            !!!!! WARNING !!!!! In consolidate_df_according_to_fuzzy_overlap_intervals:
            df has values for which df[ovrlp_intrvl_0_col]>=df[ovrlp_intrvl_1_col]!
            Number of violators = {(~scnd_gt_frst_srs).sum()}
            df.shape[0]         = {df.shape[0]}
            Rows containing these violators will be omitted!
        ''')
        df = df[scnd_gt_frst_srs]
    # Now, at this stage assertions should both pass    
    assert(all(df[ovrlp_intrvl_0_col].notna()))
    assert(all((df[ovrlp_intrvl_0_col]<df[ovrlp_intrvl_1_col]) | (df[ovrlp_intrvl_1_col].isna())))
    # Also, sort ranges, as will be necessary for this procedure
    df = df.sort_values(by=[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])
    #-------------------------
    if df.shape[0]==0:
        # Nothing (left) to consolidate
        if tmp_col is not None:
            df = df.drop(columns=[tmp_col])
        return df
    #-------------------------
    return_df = df.groupby(groupby_cols, dropna=gpby_dropna, as_index=False, group_keys=False).apply(
        lambda x: consolidate_df_group_according_to_fuzzy_overlap_intervals(
            df_i=x, 
            ovrlp_intrvl_0_col=ovrlp_intrvl_0_col, 
            ovrlp_intrvl_1_col=ovrlp_intrvl_1_col, 
            gpd_cols=groupby_cols, 
            fuzziness=fuzziness, 
            assert_single_overlap=assert_single_overlap, 
            maintain_original_cols=True, 
            enforce_list_cols_for_1d_df=True, 
            allow_duplicates_in_lists=allow_duplicates_in_lists,
            allow_NaNs_in_lists=allow_NaNs_in_lists
        )
    )
    #-------------------------
    # The temporary grouping column is an implementation detail and must not leak into the output
    if tmp_col is not None:
        return_df = return_df.drop(columns=[tmp_col], errors='ignore')
    return return_df

In [None]:
# Display mico_df_raw.
mico_df_raw

In [None]:
# Display mico_df.
mico_df

In [None]:
# Number of distinct OUTAGE_NB values in mico_df.
mico_df['OUTAGE_NB'].nunique()