# IDEA HERE:
Save time in data acquisition by not grouping by anything (in reality, the data are already grouped by search times in AMI_SQL).
After the data acquisition, use the final df_no_outage to put the desired information back in (e.g., if we want no_outg_rec_nb, is_first_after_outg, etc.).

In [None]:
from importlib import reload
#reload(Utilities)
#reload(clm)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

import sys, os
import re
import string
import copy

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version

import itertools
import copy
import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
import CommonLearningMethods as clm
#-----
from MeterPremise import MeterPremise
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_Box_sns
import GrubbsTest

In [None]:
def build_active_MP_for_outages_df(
    df_outage, 
    prem_nb_col, 
    df_mp_curr=None, 
    df_mp_hist=None, 
    assert_all_PNs_found=True, 
    drop_inst_rmvl_cols=False, 
    outg_rec_nb_col='OUTG_REC_NB',  #TODO!!!!!!!!!!!!!!!!!!!!!!! what if index?!
    is_slim=False, 
    dt_on_ts_col='DT_ON_TS', 
    df_off_ts_full_col='DT_OFF_TS_FULL', 
    consolidate_PNs_batch_size=1000, 
    df_mp_serial_number_col='mfr_devc_ser_nbr', 
    df_mp_prem_nb_col='prem_nb', 
    df_mp_install_time_col='inst_ts', 
    df_mp_removal_time_col='rmvl_ts', 
    df_mp_trsf_pole_nb_col='trsf_pole_nb'
):
    r"""
    For every outage in df_outage, collect the result of 
      MeterPremise.get_active_SNs_for_PNs_at_datetime_interval (evaluated for the outage's premise numbers
      between its power-off and power-on times) and stack the per-outage results into a single DataFrame.
    Similar to build_active_MP_for_outages.

    df_outage:
        Must contain the columns prem_nb_col, dt_on_ts_col and df_off_ts_full_col.
        is_slim=False: 
            Rows are grouped by the column outg_rec_nb_col.
            Groups whose outage number is negative are skipped (e.g., -2147483648).
            Each remaining group must have exactly one unique value in dt_on_ts_col and in df_off_ts_full_col, 
              otherwise diagnostics are printed and an AssertionError is raised.
            Groups having no non-NaN premise numbers are skipped.
        is_slim=True:
            One row per outage is expected, with the index value taken as the outage number and 
              prem_nb_col holding the collection of premise numbers for that outage.

    df_mp_curr/df_mp_hist:
        Forwarded to MeterPremise.build_mp_df_curr_hist_for_PNs as mp_df_curr/mp_df_hist.
        The 'mp_df_curr' and 'mp_df_hist' entries of the returned dict are used for all per-outage calls.

    assert_all_PNs_found:
        Forwarded to MeterPremise.build_mp_df_curr_hist_for_PNs only (the per-outage calls always 
          use assert_all_PNs_found=False).

    drop_inst_rmvl_cols:
        If True, the columns df_mp_install_time_col and df_mp_removal_time_col are dropped from the output.

    consolidate_PNs_batch_size:
        batch_size handed to Utilities_df.consolidate_column_of_lists (only used when is_slim=True).

    df_mp_serial_number_col, df_mp_prem_nb_col, df_mp_trsf_pole_nb_col:
        Currently unused in this function.

    Returns the concatenation of all per-outage DataFrames, each carrying its outage number in outg_rec_nb_col.
    """
    #-------------------------
    for needed_col in [prem_nb_col, dt_on_ts_col, df_off_ts_full_col]:
        assert(needed_col in df_outage.columns)
    #-------------------------
    # Collect all premise numbers found in df_outage
    if is_slim:
        all_PNs = Utilities_df.consolidate_column_of_lists(
            df=df_outage, 
            col=prem_nb_col, 
            sort=True,
            include_None=False,
            batch_size=consolidate_PNs_batch_size, 
            verbose=False
        )
    else:
        all_PNs = df_outage[prem_nb_col].unique().tolist()
    #-----
    all_PNs = [pn for pn in all_PNs if pd.notna(pn)]
    #-------------------------
    mp_dfs = MeterPremise.build_mp_df_curr_hist_for_PNs(
        PNs=all_PNs, 
        mp_df_curr=df_mp_curr,
        mp_df_hist=df_mp_hist, 
        join_curr_hist=False, 
        addtnl_mp_df_curr_cols=None, 
        addtnl_mp_df_hist_cols=None, 
        assert_all_PNs_found=assert_all_PNs_found, 
        assume_one_xfmr_per_PN=True, 
        drop_approx_duplicates=True
    )
    mp_curr = mp_dfs['mp_df_curr']
    mp_hist = mp_dfs['mp_df_hist']
    #-------------------------
    # Results are keyed by outage number; the only reason for using a dict is to 
    #   ensure no outage number is processed twice
    active_SNs_by_outg = {}

    def _add_outage(outg_rec_nb_i, PNs_i, dt_off_i, dt_on_i):
        # NOTE: assume_one_xfmr_per_PN=True was already used above in MeterPremise.build_mp_df_curr_hist_for_PNs,
        #       so it does not need to be set again (i.e., assume_one_xfmr_per_PN=False here)
        active_SNs_df_i = MeterPremise.get_active_SNs_for_PNs_at_datetime_interval(
            PNs=PNs_i,
            df_mp_curr=mp_curr, 
            df_mp_hist=mp_hist, 
            dt_0=dt_off_i,
            dt_1=dt_on_i,
            assume_one_xfmr_per_PN=False, 
            output_index=None,
            output_groupby=None, 
            assert_all_PNs_found=False
        )
        active_SNs_df_i[outg_rec_nb_col] = outg_rec_nb_i
        assert(outg_rec_nb_i not in active_SNs_by_outg)
        active_SNs_by_outg[outg_rec_nb_i] = active_SNs_df_i

    #-------------------------
    if is_slim:
        # One row per outage, premise numbers already collected in prem_nb_col
        for outg_rec_nb_i, row_i in df_outage.iterrows():
            _add_outage(
                outg_rec_nb_i, 
                row_i[prem_nb_col], 
                row_i[df_off_ts_full_col], 
                row_i[dt_on_ts_col]
            )
    else:
        for outg_rec_nb_i, df_i in df_outage.groupby(outg_rec_nb_col):
            # Don't want to include outg_rec_nb_i=-2147483648
            if int(outg_rec_nb_i) < 0:
                continue
            #-----
            # Each outage should have a single unique power-on and a single unique power-off time
            n_dt_on_i  = df_i[dt_on_ts_col].nunique()
            n_dt_off_i = df_i[df_off_ts_full_col].nunique()
            if n_dt_on_i!=1 or n_dt_off_i!=1:
                print(f'outg_rec_nb_i = {outg_rec_nb_i}')
                print(f'df_i[dt_on_ts_col].nunique()       = {n_dt_on_i}')
                print(f'df_i[df_off_ts_full_col].nunique() = {n_dt_off_i}')
                print('CRASH IMMINENT!')
                assert(0)
            #-----
            # NaN premise numbers must be removed (mainly to catch the case PNs_i = [nan]).
            # If nothing is left, there is nothing to do for this outage
            PNs_i = [pn for pn in df_i[prem_nb_col].unique().tolist() if pd.notna(pn)]
            if len(PNs_i)==0:
                continue
            #-----
            _add_outage(
                outg_rec_nb_i, 
                PNs_i, 
                df_i[df_off_ts_full_col].unique()[0], 
                df_i[dt_on_ts_col].unique()[0]
            )
    #-------------------------
    active_SNs_df = pd.concat(list(active_SNs_by_outg.values()))
    #-------------------------
    if drop_inst_rmvl_cols:
        active_SNs_df = active_SNs_df.drop(columns=[df_mp_install_time_col, df_mp_removal_time_col])
    #-------------------------
    return active_SNs_df

In [None]:
def find_all_outages_for_pns(
    PNs, 
    date_0, 
    date_1, 
    cols_of_interest=None, 
    mjr_mnr_cause=None, 
    method='decide_at_runtime', 
    addtnl_build_sql_std_outage_kwargs=None, 
    verbose=True, 
    n_update=10, 
    batch_size=1000
):
    r"""
    Query the outages database for the outages between date_0 and date_1 affecting the premise numbers in PNs.

    By default, the returned columns are [DT_ON_TS, DT_OFF_TS_FULL, PREMISE_NB].
        The first two are explicitly set below when cols_of_interest is None.
        The last is added via the select_cols_DOVS_PREMISE_DIM=['PREMISE_NB'] argument 
          handed to DOVSOutages_SQL.build_sql_std_outage.

    PNs:
        Collection of premise numbers.  Duplicates are removed before use.

    method:
        Must be one of 'query_pns_only', 'query_all', 'decide_at_runtime' (AssertionError otherwise).

        'query_pns_only':    The premise numbers are included in the SQL queries.
                             The query is run through GenAn.build_df_general with field_to_split='premise_nbs' and 
                               batch_size, i.e., the premise numbers will likely be split over multiple queries.
                             Pro: Less memory, as only the PNs of interest are grabbed by the SQL queries.
                             Con: Takes significantly more time to run when the number of PNs is large.

        'query_all':         A single SQL query is built using only date_0 and date_1 (together with mjr_mnr_cause, etc.), 
                               i.e., data for ALL premise numbers are grabbed.
                             After the query returns, the data are slimmed down to the rows whose PREMISE_NB is in PNs.
                             Pro: Takes significantly less time to run when the number of PNs is large.
                             Con: Consumes more memory, as everything is grabbed and slimmed down afterwards.

        'decide_at_runtime': If len(PNs) > 350000 use 'query_all', else use 'query_pns_only'.
                             Take the number 350,000 with a grain of salt; it was found for just one particular
                               collection of PNs and one specific date_0 and date_1, so other cases may differ.

    addtnl_build_sql_std_outage_kwargs:
        Optional dict combined with the keyword arguments built here through 
          Utilities.supplement_dict_with_default_values (extend_any_lists=True).

    verbose, n_update, batch_size:
        Only used by the 'query_pns_only' method.

    NOTE: This uses DOVSOutages_SQL.build_sql_std_outage, so (per the original notes for this function) the 
          standard DOVS cuts listed below are included:
            DOV.MJR_CAUSE_CD <> 'NI'
            DOV.DEVICE_CD <> 85
            DOV2.INTRPTN_TYP_CD = 'S'
            DOV2.CURR_REC_STAT_CD = 'A'
    """
    #-------------------------
    conn_outages = Utilities.get_utldb01p_oracle_connection()
    #-------------------------
    if cols_of_interest is None:
        dt_off_ts_full_el = {
            'field_desc': 'DOV.DT_ON_TS - DOV.STEP_DRTN_NB/(60*24)',
            'alias': 'DT_OFF_TS_FULL',
            'table_alias_prefix': None
        }
        cols_of_interest = ['DT_ON_TS', dt_off_ts_full_el]
    #-------------------------
    # Only unique premise numbers are wanted
    PNs = list(set(PNs))
    #-------------------------
    assert(method in ['query_pns_only', 'query_all', 'decide_at_runtime'])
    if method=='decide_at_runtime':
        method = 'query_all' if len(PNs) > 350000 else 'query_pns_only'
    #-------------------------
    # Keyword arguments shared by both methods.
    # These are kept in two pieces so that, for 'query_pns_only', premise_nbs can be placed between them
    kwargs_head = {
        'mjr_mnr_cause': mjr_mnr_cause, 
        'include_premise': True, 
        'cols_of_interest': cols_of_interest, 
        'select_cols_DOVS_PREMISE_DIM': ['PREMISE_NB'], 
        'alias_DOVS_PREMISE_DIM': 'PRIM', 
        'date_range': [date_0, date_1]
    }
    kwargs_tail = {
        'include_DOVS_MASTER_GEO_DIM': False, 
        'include_DOVS_OUTAGE_ATTRIBUTES_DIM': False, 
        'include_DOVS_CLEARING_DEVICE_DIM': False, 
        'include_DOVS_EQUIPMENT_TYPES_DIM': False, 
        'include_DOVS_OUTAGE_CAUSE_TYPES_DIM': False
    }
    if method=='query_pns_only':
        sql_kwargs = {
            **kwargs_head, 
            'premise_nbs': PNs, 
            **kwargs_tail, 
            'field_to_split': 'premise_nbs', 
            'batch_size': batch_size, 
            'n_update': n_update, 
            'verbose': verbose
        }
    elif method=='query_all':
        sql_kwargs = {**kwargs_head, **kwargs_tail}
    else:
        assert(0)
    #-------------------------
    if addtnl_build_sql_std_outage_kwargs is not None:
        sql_kwargs = Utilities.supplement_dict_with_default_values(
            to_supplmnt_dict=sql_kwargs, 
            default_values_dict=addtnl_build_sql_std_outage_kwargs, 
            extend_any_lists=True, 
            inplace=True
        )
    #-------------------------
    if method=='query_pns_only':
        return_df = GenAn.build_df_general(
            conn_db=conn_outages, 
            build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
            build_sql_function_kwargs=sql_kwargs
        )
    else:
        sql_stmnt = DOVSOutages_SQL.build_sql_std_outage(**sql_kwargs).get_sql_statement()
        all_outgs_df = pd.read_sql_query(sql_stmnt, conn_outages)
        # Slim down to only the premise numbers of interest
        return_df = all_outgs_df[all_outgs_df['PREMISE_NB'].isin(PNs)]
    #-------------------------
    return return_df

In [None]:
def find_clean_subwindows_for_group(
    final_df_i, 
    min_window_width, 
    include_is_first_after_outg_col=True, 
    t_clean_min_col='t_clean_min', 
    t_clean_max_col='t_clean_max', 
    return_t_search_min_col='t_search_min', 
    return_t_search_max_col='t_search_max'
):
    r"""
    Designed for use in find_clean_window_for_group when search_window_strategy=='all_subwindows'.
    For the clean windows in final_df_i, this will find all acceptable subwindows, i.e., all back-to-back
      intervals of length min_window_width, starting at t_clean_min, which fit between t_clean_min and t_clean_max.
      So, e.g., if a clean window is of length 171 days and min_window_width=30 days, this will find 5 acceptable subwindows.
      
    It is expected that final_df_i contains data for a single group (typically, a single transformer).
    The DF will have as many rows as clean periods found (see find_clean_window_for_group for more information).
    It is expected that the buffer times have already been taken care of in final_df_i when defining t_clean_min and max (as
      is the case when this function is used within find_clean_window_for_group)

    final_df_i:
        DataFrame with a single-level index and the columns t_clean_min_col and t_clean_max_col.

    min_window_width:
        Width of each subwindow (timedelta-like).

    include_is_first_after_outg_col:
        If True, a column 'is_first_after_outg' is added which is 1 for the first subwindow of each 
          clean period and 0 otherwise.

    Returns:
        DataFrame with one row per subwindow.  Each row is a copy of the row of final_df_i it came from, 
          together with the columns return_t_search_min_col and return_t_search_max_col.
        The index values are the strings '{original index}_{subwindow number}' (subwindow number beginning at 0), 
          and the index carries the same name as the index of final_df_i.
        Clean periods which cannot accommodate a single subwindow (or for which the bounds are missing) are skipped.
        If no subwindows are found at all, an empty DataFrame having the expected columns is returned.
    """
    #-------------------------
    # Generate random string to be safe when doing all the index re-naming below
    idx_rndm = Utilities.generate_random_string()
    # Names of the temporary columns holding the original index, the subwindow number, and their combination
    idx_nm_og    = 'index_og'+idx_rndm
    idx_nm_new   = 'index_new'+idx_rndm
    idx_nm_final = 'index_final'+idx_rndm

    # Grab final_df_i index name for later
    final_df_i_idx_nm = final_df_i.index.name

    # Iterate over each row in final_df_i, find acceptable subwindows, and add to return_dfs collection.
    # As noted in the above documentation, final_df_i should contain one row for each clean period
    return_dfs = []
    for _, row_i in final_df_i.iterrows():
        t_clean_min_i = row_i[t_clean_min_col]
        t_clean_max_i = row_i[t_clean_max_col]
        #-------------------------
        # Number of subwindows (possibly fractional) fitting in the clean period.
        # If not even one fits, or if the bounds are missing (NaT), there is nothing to build for this row
        n_fit_i = (t_clean_max_i-t_clean_min_i)/min_window_width
        if pd.isna(n_fit_i) or n_fit_i < 1:
            continue
        n_subwindows_i = int(np.floor(n_fit_i))
        #-------------------------
        windows_i = [
            [t_clean_min_i + i_window*min_window_width, t_clean_min_i + (i_window+1)*min_window_width] 
            for i_window in range(n_subwindows_i)
        ]
        #-------------------------
        # Sanity check
        assert(windows_i[-1][1] <= t_clean_max_i)
        #-------------------------
        # Create len(windows_i) copies of row_i and merge (concat with axis=1) with windows_i
        #-----
        # Need to call reset_index on rows_i for merge to work, but want to use original index later,
        #   so rename original to more easily grab later
        # NOTE: In newer versions of pandas (>=1.5) one can use the names argument of .reset_index,
        #       allowing the merge to happen in a single line
        rows_i = pd.concat([pd.DataFrame(row_i).T]*len(windows_i))
        assert(rows_i.index.nlevels==1)
        rows_i.index.name = idx_nm_og
        rows_i = rows_i.reset_index()
        #-----
        return_df_i = pd.concat([
            rows_i, 
            pd.DataFrame(windows_i, columns=[return_t_search_min_col, return_t_search_max_col])
        ], axis=1)
        #-----
        if include_is_first_after_outg_col:
            # After reset_index, label 0 is the first subwindow of this clean period
            return_df_i['is_first_after_outg']=0
            return_df_i.loc[0, 'is_first_after_outg']=1
        #-----
        return_dfs.append(return_df_i)
    #-------------------------
    # No subwindows found (empty input, or no clean period wide enough).
    # pd.concat cannot be called with an empty list, so build the empty result by hand
    if len(return_dfs)==0:
        empty_cols = list(final_df_i.columns) + [return_t_search_min_col, return_t_search_max_col]
        if include_is_first_after_outg_col:
            empty_cols.append('is_first_after_outg')
        return_df = pd.DataFrame(columns=empty_cols)
        return_df.index.name = final_df_i_idx_nm
        return return_df
    #-------------------------
    # Combine all return_dfs into return_df
    return_df = pd.concat(return_dfs)
    #-------------------------
    # Join together the original index and the new index (new index should be 0-len(subwindows_i)-1)
    # This will allow one to track where subwindows came from, in case one needs to debug or whatever
    # As noted above, in newer versions of pandas (>=1.5) one can use the names argument of .reset_index
    assert(return_df.index.nlevels==1)
    return_df.index.name=idx_nm_new
    return_df=return_df.reset_index()
    #-----
    return_df[idx_nm_final] = return_df[[idx_nm_og, idx_nm_new]].astype(str).agg('_'.join, axis=1)
    # Set index to combination of og and new, rename the index to match that of final_df_i, 
    #   and drop idx_nm_og and idx_nm_new columns as they are no longer needed
    return_df = return_df.set_index(idx_nm_final)
    return_df.index.name = final_df_i_idx_nm
    return_df = return_df.drop(columns=[idx_nm_og, idx_nm_new])
    #-------------------------
    return return_df


def find_clean_window_for_group(
    df_i, 
    min_window_width, 
    buffer_time_left, 
    buffer_time_rght, 
    set_search_window=True, 
    pd_selection_stategy = 'max', 
    search_window_strategy = 'centered', 
    needs_sorted=True, 
    outg_beg_col='DT_OFF_TS_FULL', 
    outg_end_col='DT_ON_TS', 
    record_clean_window_usable=True
):
    r"""
    INTENDED FOR USE IN .groupby().apply(lambda x) function.
    This can still be used on its own, but the user should be aware of the functionality and intent
    
    FASTEST RUN TIME SUGGESTIONS:
        - Sort the DF prior, and set needs_sorted=False
        - Use search_window_strategy = 'centered'
        
    NOTE: If search_window_strategy is a timedelta object, the search period will begin search_window_strategy
            after the buffer_time_left (not after the end of the previous outage)
    
    
    needs_sorted:
        IF YOU ARE NOT SURE, KEEP needs_sorted=True, as the proper sorting of the DF is vital for the functionality.
        When running this within a .groupby().apply(lambda x) function, a little bit of time can be saved by 
          sorting the overall DataFrame first before the groupby call.
        Regardless of needs_sorted, sorting first will save time.
        If sorting already done, no need to re-sort here, so a little more time can be saved by setting needs_sorted=False
    """
    #-------------------------
    assert(pd_selection_stategy in ['max', 'min', 'rand', 'all'])
    assert(search_window_strategy in ['centered', 'rand', 'all_subwindows'] or isinstance(search_window_strategy, datetime.timedelta))
    #-------------------------
    # For this to function properly, df_i must be sorted according to time
    if needs_sorted:
        df_i = df_i.sort_values(by=[outg_beg_col, outg_end_col], ascending=True).copy()

    #-------------------------
    # Find the clean periods of time following each outage by subtracting the beginning time
    # of the next outage from the end of the current outage.
    clean_windows_after = df_i[outg_beg_col].shift(-1)-df_i[outg_end_col]

    # To find the amount of clean time after the last outage, use date_1 as an endpoint
    #   i.e., subtract the end time of the current outage from the end of the overall interval, date_1
    clean_windows_after.iloc[-1] = pd.to_datetime(date_1) - df_i.iloc[-1][outg_end_col]

    #-------------------------
    # Find the acceptable periods for which the clean time is greater than the desired length
    # NOTE: The buffer_time_left/_rght arguments allow one to ensure the period of time is not 
    #       immediately proceeding or preceding an outage event
    # NOTE: good_clean_windows_after must have a name in order to merge with df_i
    good_clean_windows_after = clean_windows_after[clean_windows_after > min_window_width+buffer_time_left+buffer_time_rght]
    good_clean_windows_after.name='clean_window_full'
    if len(good_clean_windows_after)==0:
        return pd.DataFrame()

    #-------------------------
    # Construct good_df_i using the entries from good_clean_windows_after
    # Merge this with good_clean_windows_after to include clean_window information
    good_df_i = df_i.loc[good_clean_windows_after.index]
    good_df_i = pd.merge(good_df_i, good_clean_windows_after, left_index=True, right_index=True, how='inner')
    if record_clean_window_usable:
        good_df_i['clean_window_usable'] = good_df_i['clean_window_full'] - (buffer_time_left+buffer_time_rght)
    #-------------------------
    # Select subset of good_df_i according to pd_selection_stategy
    if pd_selection_stategy=='max':
        final_df_i = good_df_i.iloc[[good_df_i['clean_window_full'].argmax()]].copy()
    elif pd_selection_stategy=='min':
        final_df_i = good_df_i.iloc[[good_df_i['clean_window_full'].argmin()]].copy()
    elif pd_selection_stategy=='rand':
        final_df_i = good_df_i.sample().copy()
    elif pd_selection_stategy=='all':
        final_df_i = good_df_i.copy()
    else:
        assert(0)

    #-------------------------
    # Create columns to hold the min and max clean times
    #   The clean time begins (min) buffer_time_left after the outage ends
    #   The clean time ends (max) buffer_time_rght before the next outage
    #     (which is equal to the time the current outage ends, plus the clean window, 
    #      minus the buffer_time_rght)
    final_df_i['t_clean_min'] = final_df_i[outg_end_col] + buffer_time_left
    final_df_i['t_clean_max'] = final_df_i[outg_end_col] + final_df_i['clean_window_full'] - buffer_time_rght

    #-------------------------
    if set_search_window:
        if search_window_strategy=='centered':
            # Mid point of clean time interval = final_df_i[['t_clean_min', 't_clean_max']].mean(numeric_only=False, axis=1)
            # ==> Left  point = (final_df_i[['t_clean_min', 't_clean_max']].mean(numeric_only=False, axis=1)) - min_window_width/2
            # ==> Right point = (final_df_i[['t_clean_min', 't_clean_max']].mean(numeric_only=False, axis=1)) + min_window_width/2
            final_df_i['t_search_min'] = (final_df_i[['t_clean_min', 't_clean_max']].mean(numeric_only=False, axis=1)) - min_window_width/2
            final_df_i['t_search_max'] = (final_df_i[['t_clean_min', 't_clean_max']].mean(numeric_only=False, axis=1)) + min_window_width/2
        elif search_window_strategy=='rand':
            final_df_i['t_search_min'] = pd.NaT
            final_df_i['t_search_max'] = pd.NaT
            #-----
            for idx, row_i in final_df_i.iterrows():
                rnd_intrvl_i = Utilities_dt.get_random_datetime_interval_between(
                    date_0=row_i['t_clean_min'], 
                    date_1=row_i['t_clean_max'], 
                    window_width=min_window_width, 
                    rand_seed=None        
                )
                final_df_i.loc[idx, ['t_search_min', 't_search_max']] = rnd_intrvl_i
        elif search_window_strategy=='all_subwindows':
            final_df_i = find_clean_subwindows_for_group(
                final_df_i=final_df_i, 
                min_window_width=min_window_width, 
                include_is_first_after_outg_col=True, 
                t_clean_min_col='t_clean_min', 
                t_clean_max_col='t_clean_max', 
                return_t_search_min_col='t_search_min', 
                return_t_search_max_col='t_search_max'
            )
        elif isinstance(search_window_strategy, datetime.timedelta):
            final_df_i['t_search_min'] = final_df_i['t_clean_min'] + search_window_strategy
            final_df_i['t_search_max'] = final_df_i['t_clean_min'] + search_window_strategy + min_window_width
        else:
            assert(0)
    #-------------------------
    # Don't need outg_beg_col or outg_end_col anymore.
    # These columns contain information about the outage(s) after which the clean period(s) was(were) selected.
    # If pd_selection_stategy=='all', I suppose this information would make sense.  But, in any other case, 
    #   the information isn't really useful, as only one clean period is returned following 
    #   one of the (possibly randomly) selected outages
    final_df_i = final_df_i.drop(columns=[outg_beg_col, outg_end_col])
    #-------------------------
    return final_df_i

def find_clean_windows(
    df, 
    groupby_col, 
    min_window_width, 
    buffer_time_left, 
    buffer_time_rght,  
    set_search_window=True, 
    pd_selection_stategy = 'max', 
    search_window_strategy = 'centered', 
    outg_beg_col='DT_OFF_TS_FULL', 
    outg_end_col='DT_ON_TS', 
    record_clean_window_usable=True
):
    r"""
    Run find_clean_window_for_group on every group of df (grouped by groupby_col) and return
    the combined result of the groupby/apply.

    Only the columns [groupby_col, outg_beg_col, outg_end_col] of df are used.  Before grouping,
    duplicate rows are dropped, outg_beg_col/outg_end_col are converted to datetime (if not already), 
    and the rows are sorted by [groupby_col, outg_beg_col, outg_end_col].  Because the sorting is done 
    here, find_clean_window_for_group is called with needs_sorted=False.
    The input df is not altered (a copy of the needed columns is taken).

    All remaining arguments (min_window_width, buffer_time_left, buffer_time_rght, set_search_window, 
    pd_selection_stategy, search_window_strategy, record_clean_window_usable) are passed unchanged to 
    find_clean_window_for_group; see that function for their meaning.

    FASTEST RUN TIME SUGGESTIONS:
        - Use search_window_strategy = 'centered' 
    """
    #-------------------------
    # The three needed columns double as the sort order used below
    needed_cols = [groupby_col, outg_beg_col, outg_end_col]
    slim_df = df[needed_cols].copy()
    slim_df = slim_df.drop_duplicates()
    #-------------------------
    # Outage beginning/ending columns must be datetime
    for dt_col in (outg_beg_col, outg_end_col):
        if is_datetime64_dtype(slim_df[dt_col]):
            continue
        slim_df = Utilities_df.convert_col_type(df=slim_df, column=dt_col, to_type=datetime.datetime)
    #-------------------------
    # Sorting once here (instead of within each group) speeds things up
    slim_df = slim_df.sort_values(by=needed_cols).copy()
    #-------------------------
    # Arguments shared by every per-group call
    per_group_kwargs = dict(
        min_window_width=min_window_width, 
        buffer_time_left=buffer_time_left, 
        buffer_time_rght=buffer_time_rght, 
        set_search_window=set_search_window, 
        pd_selection_stategy=pd_selection_stategy, 
        search_window_strategy=search_window_strategy, 
        outg_beg_col=outg_beg_col, 
        outg_end_col=outg_end_col, 
        record_clean_window_usable=record_clean_window_usable, 
        needs_sorted=False
    )
    def _clean_window_for_single_group(grp_df):
        return find_clean_window_for_group(df_i=grp_df, **per_group_kwargs)
    #-------------------------
    grouped = slim_df.groupby(groupby_col, as_index=False, group_keys=False)
    return_df = grouped.apply(_clean_window_for_single_group)
    return return_df

# -----------------------------------------------------------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# -----------------------------------------------------------------------------------------------

# There will be two methods for building:
#   1. Using a supplied df_outage, with (or, I suppose, without) an accompanying meter premise
#   2. Building from the ground up, given a date range

In [None]:
#----------------------------------------------------------------------------------------------------
# VARIABLES TO BE SET BY USER!
#----------------------------------------------------------------------------------------------------
# I/O switches
#   save_dfs_to_file:   save the intermediate/final DataFrames to save_dir_base
#   read_dfs_from_file: read previously saved DataFrames instead of rebuilding them
#   save_end_events:    save the collected end events to file
# NOTE: save_dfs_to_file and read_dfs_from_file should never both be True (this is asserted further below)
# Unless absolutely certain the df_mp saved to file has all necessary data, use read_dfs_from_file=False
save_dfs_to_file   = False
read_dfs_from_file = True
save_end_events    = True

#-------------------------
# run_date is used to collect all results from a given acquisition run together.
# As such, run_date should be set to the first date of the acquisition run, and
#   SHOULD NOT be changed for each individual date in a run (which typically lasts
#   over the course of days/weeks)
# run_date = '20221014'
# run_date = '20221216'
# run_date = '20230512'
run_date = '20231003' # Date of data acquisition

#-------------------------
# date_0 = '2022-01-01'
# date_1 = '2022-12-31'

date_0 = '2023-04-01' # Lower limit for end events
date_1 = '2023-09-30' # Upper limit for end events

#-------------------------
# Width of the clean window and the buffers passed to find_clean_windows.
# NOTE: The span from date_0 to date_1 must exceed min_window_width+buffer_time_left+buffer_time_rght
#       (this is asserted further below)
min_window_width = pd.Timedelta('31 days')
buffer_time_left = pd.Timedelta('1 days')
buffer_time_rght = pd.Timedelta('31 days')

#-------------------------
# If True, df_no_outage is converted to its consolidated (slim) version before collecting end events
run_using_slim = False


#--------------------------------------------------
# If df_mp is read from file, it will typically contain an outg_rec_nb column
#   and entries which are duplicates except for outg_rec_nb
# For this process to work correctly, these duplicates must be removed.
df_mp_outg_rec_nb_col = 'OUTG_REC_NB'

#-------------------------
# Column by which the outages are grouped when searching for clean windows
groupby_col = 'trsf_pole_nb'
# groupby_col = 'PREMISE_NB'
assert(groupby_col in ['trsf_pole_nb', 'PREMISE_NB'])

#-------------------------
# Strategies passed to find_clean_windows
pd_selection_stategy = 'all'
# search_window_strategy = 'centered'
# search_window_strategy = pd.Timedelta('1 day')
search_window_strategy = 'all_subwindows'

#--------------------------------------------------
# NOTE: below, states and opcos should be consistent!
#       i.e., e.g., if states='OH', then opcos should be 'oh' (or None, I suppose)
#-------------------------
# states are used to 
#   (1) find transformers which suffered at least one outage from DOVS
#   (2) find all transformers from MeterPremise
# states can be:
#   - a single string, e.g. 'OH'
#   - a list of strings, e.g., ['OH', 'WV']
#   - None
# NOTE: states tend to be upper-case!
states=['OH']

#-------------------------
# opcos are used with AMIEndEvents for
#  (1) finding the premise numbers which recorded an event between date_0 and date_1.
#  (2) selection/acquisition of end_device_events
# opcos can be:
#   - a single string, e.g. 'oh'
#   - a list of strings, e.g., ['oh', 'tx']
#   - None
# NOTE: opcos tend to be lower-case!
# NOTE: Acceptable opcos appear to be: ['ap', 'im', 'oh', 'pso', 'swp', 'tx']
opcos='oh'

#--------------------------------------------------
# trsf_pole_nb values to be ignored
#   presumably these do not identify an actual transformer pole -- TODO confirm
trsf_pole_nbs_to_ignore = [' ', 'TRANSMISSION', 'PRIMARY', 'NETWORK']


In [None]:
#----------------------------------------------------------------------------------------------------
# OUTPUT LOCATIONS
#----------------------------------------------------------------------------------------------------
# DFs will be saved in save_dir_base, which is built as
#   <local data dir>/dovs_and_end_events_data/<run_date>/<date_0>_<date_1>/NoOutgs
#   (with the dashes removed from date_0 and date_1)
# Collection of end events files will be saved in os.path.join(save_dir_base, 'EndEvents_NEW2')
save_dir_base = os.path.join(
    Utilities.get_local_data_dir(), 
    r'dovs_and_end_events_data', 
    run_date, 
    f"{date_0.replace('-','')}_{date_1.replace('-','')}", 
    'NoOutgs'
)
#-------------------------
# Arguments controlling whether/where the end events are saved
end_events_save_args = dict(
    save_to_file=save_end_events, 
    save_dir = os.path.join(save_dir_base, 'EndEvents_NEW2'), 
    save_name=r'end_events.csv', 
    index=True
)
#-------------------------
print(f"save_dir_base = {save_dir_base}")
print('end_events_save_args')
for k,v in end_events_save_args.items():
    print(f"\t{k} : {v}")
#-------------------------
# Only create the directories if something will actually be written.
# exist_ok=True makes the calls no-ops when the directories already exist, and avoids the 
#   check-then-create race of a separate os.path.exists test.
if save_dfs_to_file or save_end_events:
    os.makedirs(save_dir_base, exist_ok=True)
    #-----
    if save_end_events:
        os.makedirs(end_events_save_args['save_dir'], exist_ok=True)

In [None]:
#----------------------------------------------------------------------------------------------------
# Sanity checks on the user-set variables, followed by opening the database connections
#----------------------------------------------------------------------------------------------------
# Should never both read and write!
assert save_dfs_to_file+read_dfs_from_file <= 1, (
    "save_dfs_to_file and read_dfs_from_file cannot both be True"
)
# The date range must be wide enough to hold a clean window plus both buffers
assert pd.to_datetime(date_1)-pd.to_datetime(date_0) > min_window_width+buffer_time_left+buffer_time_rght, (
    f"The span from date_0 ({date_0}) to date_1 ({date_1}) must exceed "
    f"min_window_width+buffer_time_left+buffer_time_rght "
    f"({min_window_width+buffer_time_left+buffer_time_rght})"
)
#--------------------------------------------------
# Connections are only opened here when the DFs are to be built instead of read from file
if not read_dfs_from_file:
    conn_outages = Utilities.get_utldb01p_oracle_connection()
    conn_aws = Utilities.get_athena_prod_aws_connection()

# ---------------------------------------------------------------
# OUTAGES
# ---------------------------------------------------------------

In [None]:
# Build the standard outage SQL statement for the date range and states set by the user, 
#   including premise information and with no restriction on the major/minor cause
sql_outage_builder = DOVSOutages_SQL.build_sql_std_outage(
    mjr_mnr_cause=None, 
    include_premise=True, 
    date_range=[date_0, date_1], 
    states=states
)
sql_outage_full = sql_outage_builder.get_sql_statement()
# Print the statement so the query being run can be inspected
print(sql_outage_full)

In [None]:
# Build df_outage_OG, either by running the outage query or by reading a previously saved CSV
if not read_dfs_from_file:
    # Run the query, setting the dtypes of the numeric columns at read time
    df_outage_OG = pd.read_sql_query(
        sql_outage_full, 
        conn_outages, 
        dtype={'CI_NB':np.int32, 'CMI_NB':np.float64, 'OUTG_REC_NB':np.int32}
    )
    #-----
    # Saving only makes sense for a freshly queried DF
    if save_dfs_to_file:
        df_outage_OG.to_csv(os.path.join(save_dir_base, 'df_outage_OG.csv'), index=False)
else:
    # Everything is read in as a string, after which the numeric columns are converted
    df_outage_OG = pd.read_csv(os.path.join(save_dir_base, 'df_outage_OG.csv'), dtype=str)
    csv_cols_and_types_to_convert_dict = {'CI_NB':np.int32, 'CMI_NB':np.float64, 'OUTG_REC_NB':[np.float64, np.int32]}
    df_outage_OG = Utilities_df.convert_col_types(df_outage_OG, csv_cols_and_types_to_convert_dict)
#-------------------------
print(f'df_outage_OG.shape = {df_outage_OG.shape}')
# df_outage = df_outage_OG.copy()

# DELETE CELL BELOW!!!!!!!!!!!!!!!

In [None]:
# transmission_pns = [
#     '071511200', '073518871', '101470381', '075619000', '075972000',
#     '073209400', '107147577', '079177100', '073936100', '079587200',
#     '106865860', '074942500', '077269200'
# ]
# all_PNs = df_outage_OG['PREMISE_NB'].dropna().unique().tolist()

# all_PNs_0 = (
#     all_PNs[:1000]+
#     transmission_pns+
#     ['103280203', '105780203', '106380203']
# )

# df_outage_OG = df_outage_OG[df_outage_OG['PREMISE_NB'].isin(all_PNs_0)]

In [None]:
# Work on a copy so the originally acquired df_outage_OG is left untouched
df_outage = df_outage_OG.copy()

# Find all outages between date_0 and date_1 for PNs in df_outage

# Need to find all PNs, which consist not only of those directly from df_outage, but also of those not in df_outage which were connected to transformers having entries in df_outage

In [None]:
# Connection used by the MeterPremise query which finds the transformers of these premises
conn_aws = Utilities.get_athena_prod_aws_connection()
# Premise numbers taken directly from df_outage.
# Missing premise numbers (NaN) are dropped, as they cannot be matched to anything in a query.
PNs = df_outage['PREMISE_NB'].dropna().unique().tolist()

In [None]:
# Find the distinct transformer pole numbers for the premise numbers in PNs (queried in batches)
# NOTE: An earlier version built a MeterPremise object with
#         build_sql_function=MeterPremise.build_sql_meter_premise and 
#         cols_of_interest=['DISTINCT trsf_pole_nb'], premise_nbs=PNs
#       and then took a copy of its df.
start=time.time()

get_trsf_pole_nbs_kwargs = dict(
    PNs=PNs, 
    batch_size=10000, 
    conn_aws=conn_aws
)
mp_df_PNs = MeterPremise.get_distinct_trsf_pole_nbs_for_PNs(**get_trsf_pole_nbs_kwargs)

# Elapsed time (in seconds)
print(time.time()-start)

In [None]:
# For every transformer found above (except 'TRANSMISSION'), grab all of its meter premise entries
start=time.time()
trsf_pole_nbs_to_query = [
    pole_nb for pole_nb in mp_df_PNs['trsf_pole_nb'].unique() 
    if pole_nb!='TRANSMISSION'
]
mp_xfmrs_sql_kwargs = dict(
    cols_of_interest=['trsf_pole_nb', 'prem_nb', 'mfr_devc_ser_nbr', 'inst_ts', 'rmvl_ts'], 
    trsf_pole_nb=trsf_pole_nbs_to_query, 
    field_to_split='trsf_pole_nb'
)
mp_xfmrs = GenAn(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=dict(conn_db=Utilities.get_athena_prod_aws_connection()), 
    init_df_in_constructor=True, 
    build_sql_function=MeterPremise.build_sql_meter_premise, 
    build_sql_function_kwargs=mp_xfmrs_sql_kwargs
)
# Elapsed time (in seconds)
print(time.time()-start)
# Keep a copy of the resulting DataFrame for use in the cells below
mp_df_xfmrs = mp_xfmrs.df.copy()

In [None]:
# Save the two meter premise DataFrames (only if the user asked for DFs to be saved)
if save_dfs_to_file:
    for df_to_save, pkl_name in [
        (mp_df_PNs,   'mp_df_PNs_no_outg.pkl'), 
        (mp_df_xfmrs, 'mp_df_xfmrs_no_outg.pkl')
    ]:
        df_to_save.to_pickle(os.path.join(save_dir_base, pkl_name))

In [None]:
# mp_df_PNs = pd.read_pickle(os.path.join(save_dir_base, 'mp_df_PNs_no_outg.pkl'))
# mp_df_xfmrs = pd.read_pickle(os.path.join(save_dir_base, 'mp_df_xfmrs_no_outg.pkl'))

In [None]:
# Now, for all premise numbers of interest, we want to find all outages suffered between date_0 and date_1
# There are two methods for achieving this, using either method='query_pns_only' or method='query_all' in find_all_outages_for_pns
# NOTE: It is possible that all needed info already contained in df_outage! 
#       But that is reliant upon the user having everything set up perfectly....so safest just to build...
verbose=True
n_update=10
batch_size=1000

# The premise numbers of interest are those directly from df_outage together with those connected 
#   to the transformers found in mp_df_xfmrs.
# Missing premise numbers (NaN) are dropped from both sources, as they cannot be matched to anything 
#   in a query.
# PNs = df_outage['PREMISE_NB'].unique().tolist()
PNs = list(set(
    df_outage['PREMISE_NB'].dropna().unique().tolist() + 
    mp_df_xfmrs['prem_nb'].dropna().unique().tolist()
))

mjr_mnr_cause=None
method='decide_at_runtime'
# Additional arguments for the standard outage SQL builder; states keeps the query consistent 
#   with the user's selection
# addtnl_build_sql_std_outage_kwargs=None
addtnl_build_sql_std_outage_kwargs=dict(
    states=states
)

In [None]:
# Find all outages between date_0 and date_1 for the premise numbers in PNs
start=time.time()
find_all_outages_kwargs = dict(
    PNs=PNs, 
    date_0=date_0, 
    date_1=date_1, 
    cols_of_interest=None, 
    mjr_mnr_cause=mjr_mnr_cause, 
    method=method, 
    addtnl_build_sql_std_outage_kwargs=addtnl_build_sql_std_outage_kwargs, 
    verbose=verbose, 
    n_update=n_update, 
    batch_size=batch_size
)
all_outages_df = find_all_outages_for_pns(**find_all_outages_kwargs)
#-------------------------
# Report the run time and compare the number of premises before/after
print(f"Time to run find_all_outages_for_pns: {time.time()-start}")
n_PNs_in_df_outage      = df_outage['PREMISE_NB'].nunique()
n_PNs_in_all_outages_df = all_outages_df['PREMISE_NB'].nunique()
print(f"# Unique PNs in df_outage:      {n_PNs_in_df_outage}")
print(f"# Unique PNs in all_outages_df: {n_PNs_in_all_outages_df}")
print(time.time()-start)

In [None]:
# Placeholder cell (intentionally contains no code)

In [None]:
#-------------------------*************************-------------------------*************************
# If grouping by transformer, the trsf_pole_nb from MeterPremise must be merged with all_outages_df
# Also, the active meters at the time of outage must be selected by comparing inst_ts,rmvl_ts to 
#   DT_OFF_TS_FULL,DT_ON_TS.
# The steps are:
#   1. Build df_mp (read from file, or built for PNs and optionally saved)
#   2. Left merge df_mp onto all_outages_df
#   3. Keep only the meters which were active at the time of the outage
#   4. Count, and then remove, the entries without any df_mp data
#   5. Remove the transformers listed in trsf_pole_nbs_to_ignore
#-------------------------
if groupby_col=='trsf_pole_nb':
    if read_dfs_from_file:
#         df_mp = pd.read_csv(os.path.join(save_dir_base, 'df_mp_dupls_dropped.csv'), dtype=str)
        df_mp = pd.read_pickle(os.path.join(save_dir_base, 'df_mp_no_outg.pkl'))
        #-----
        # Check for df_mp_outg_rec_nb_col in df_mp, regardless of case
        #   If contained, must be dropped and duplicates removed
        #   *See comment above df_mp_outg_rec_nb_col initial assignment
        # NOTE: df_mp_outg_rec_nb_col is overwritten with the column name as actually found in df_mp
        #TODO: Use Utilities_df.drop_col_case_insensitive instead of if block below!
        if df_mp_outg_rec_nb_col.lower() in [x.lower() for x in df_mp.columns]:
            tmp_idx = [x.lower() for x in df_mp.columns].index(df_mp_outg_rec_nb_col.lower())
            df_mp_outg_rec_nb_col = df_mp.columns.tolist()[tmp_idx]
            df_mp = df_mp.drop(columns=[df_mp_outg_rec_nb_col]).drop_duplicates()
    else:
        df_mp = MeterPremise.build_mp_df_curr_hist_for_PNs(
            PNs=PNs, 
            mp_df_curr=None,
            mp_df_hist=None, 
            join_curr_hist=True, 
            addtnl_mp_df_curr_cols=None, 
            addtnl_mp_df_hist_cols=None, 
            assert_all_PNs_found=False, 
            assume_one_xfmr_per_PN=True, 
            drop_approx_duplicates=True
        )
        if save_dfs_to_file:
            df_mp.to_pickle(os.path.join(save_dir_base, 'df_mp_no_outg.pkl'))
    #--------------------------------------------------
    # Some premise numbers from DOVS are missing from df_mp.
    # This is not an issue with the code, I checked. 
    # This means DOVS says a premise was affected by an outage, but at the time of the outage there were 
    #   no active meters on the premise.
    # My question is: How did DOVS therefore know the premise was affected?
    # How are premise numbers in DOVS determined?
    #-------------------------
    # I want to at least get a count to quantify the situation described above, i.e., how many premises from DOVS
    #   did not have any active meters at the time of the outage.
    # Note, for this, I cannot simply do, e.g., 
    #     set(all_outages_df['PREMISE_NB'].unique()).difference(set(df_mp['prem_nb'].unique()))
    #   as this might reflect a smaller number of missing PNs than in reality, as df_mp has not yet been chopped down
    #   to only those present at time of outage (which is done below comparing 'inst_ts' to 'DT_OFF_TS_FULL' and 
    #   'rmvl_ts' to 'DT_ON_TS')
    #-------------------------
    # The meters present at the time of the outages can only be selected after all_outages_df and df_mp are merged.
    #-------------------------
    # Note: A left merge is used below instead of an inner to protect against the case of a df_mp (being read in from 
    #       file) which contains more entries than all_outages_df
    #-------------------------
    all_outages_df = DOVSOutages.merge_df_outage_with_mp(
        df_outage=all_outages_df, 
        df_mp=df_mp,  
        merge_on_outg=['PREMISE_NB'], 
        merge_on_mp=['prem_nb'], 
        cols_to_include_mp=None, 
        drop_cols = None, 
        rename_cols=None, 
        how='left', 
        inplace=True
    )

    #-------------------------
    # Only include serial numbers which were present at the time of the outage.
    #-----
    # NOTE the use of .fillna(pd.Timestamp.min) (YES, MIN) below, as this is different from MeterPremise.get_active_SNs_for_PNs_at_datetime_interval
    #   and MeterPremise.merge_df_with_active_mp.
    # This is needed so the premises missing from df_mp are not removed at this stage (yes, they will ultimately be removed, 
    #   but I don't want them removed yet because I want to track them!)
    # Without this, any entry with 'inst_ts'=NaT would be removed, as a comparison of NaT to anything returns False
    #-----
    all_outages_df = Utilities_df.convert_col_type_w_pd_to_datetime(all_outages_df, 'inst_ts')
    all_outages_df = Utilities_df.convert_col_type_w_pd_to_datetime(all_outages_df, 'rmvl_ts')
    #-----
    all_outages_df=all_outages_df[(all_outages_df['inst_ts'].fillna(pd.Timestamp.min)<=all_outages_df['DT_OFF_TS_FULL']) & 
                                  (all_outages_df['rmvl_ts'].fillna(pd.Timestamp.max)>all_outages_df['DT_ON_TS'])]

    #-------------------------
    # Find the entries with missing df_mp data, i.e., find the entries where DOVS says a premise was affected by an outage, 
    #   but at the time of the outage there were no active meters on the premise.
    all_outages_df_for_non_active_pns = all_outages_df[all_outages_df[df_mp.columns].isna().all(axis=1)].copy()
    non_active_pns_from_DOVS = all_outages_df_for_non_active_pns['PREMISE_NB'].unique().tolist()

    # And remove the entries with missing df_mp data from all_outages_df
    all_outages_df = all_outages_df.dropna(subset=df_mp.columns, how='all')
    #-------------------------
    print("""
    Some premise numbers from DOVS are missing from df_mp
    This is not an issue with the code, I checked.  
    This means DOVS says a premise was affected by an outage, but at the time of the outage there were no active meters on the premise.
    My question is: How did DOVS therefore know the premise was affected?
    How are premise numbers in DOVS determined?
    """)
    print(f"Number of premise numbers from DOVS without an active meter at outage time: {len(non_active_pns_from_DOVS)}") 
    #-------------------------
    # At this point, any trsf_pole_nbs to be excluded can be removed.
    # The user-set trsf_pole_nbs_to_ignore is used (instead of a separate hard-coded list) so the exclusion 
    #   always agrees with the user's settings; by default this removes the 'TRANSMISSION', 'PRIMARY', 
    #   and 'NETWORK' transformers, together with blank (' ') transformer pole numbers.
    all_outages_df=all_outages_df[~all_outages_df['trsf_pole_nb'].isin(trsf_pole_nbs_to_ignore)] 
    #--------------------------------------------------

In [None]:
#-------------------------*************************-------------------------*************************
# Find the clean windows for each group (the first step in building df_no_outage)
#-------------------------
start=time.time()
find_clean_windows_kwargs = dict(
    df=all_outages_df, 
    groupby_col=groupby_col, 
    min_window_width=min_window_width, 
    buffer_time_left=buffer_time_left, 
    buffer_time_rght=buffer_time_rght, 
    set_search_window=True, 
    pd_selection_stategy=pd_selection_stategy, 
    search_window_strategy=search_window_strategy, 
    outg_beg_col='DT_OFF_TS_FULL', 
    outg_end_col='DT_ON_TS'
)
clean_windows_by_grp = find_clean_windows(**find_clean_windows_kwargs)
print(time.time()-start)
#-------------------------
# The sets of groups (trsf_pole_nbs, PREMISE_NBs, etc.) before and after the search for clean windows
grps_in_all_outages  = set(all_outages_df[groupby_col].unique())
grps_w_clean_windows = set(clean_windows_by_grp[groupby_col].unique())

# All groups in clean_windows_by_grp should also be found in all_outages_df, 
#   but the reverse is not true
assert(grps_w_clean_windows.issubset(grps_in_all_outages))

# Groups where no clean time period was found
grps_with_no_clean = all_outages_df[~all_outages_df[groupby_col].isin(clean_windows_by_grp[groupby_col].unique())]

n_grps          = all_outages_df[groupby_col].nunique()
n_grps_w_clean  = clean_windows_by_grp[groupby_col].nunique()
n_grps_no_clean = len(grps_in_all_outages.difference(grps_w_clean_windows))
print(f"groupby_col = {groupby_col}")
print(f"a. # Groups:                      {n_grps}")
print(f"b. # Groups with clean period:    {n_grps_w_clean}")
print(f"c. # Groups without clean period: {n_grps_no_clean}")
print("NOTE: There may be a difference of 1 between a and b+c due to fact that nunique() does not including NaNs but unique does")
#--------------------------------------------------
# Save the outages and the clean windows (only if the user asked for DFs to be saved)
if save_dfs_to_file:
    for df_to_save, pkl_name in [
        (all_outages_df,       'all_outages_df.pkl'), 
        (clean_windows_by_grp, 'clean_windows_by_grp.pkl')
    ]:
        df_to_save.to_pickle(os.path.join(save_dir_base, pkl_name))

In [None]:
# Display the base save directory as the cell output
save_dir_base 

# MERGING clean_windows_by_grp with df_mp is the winner!

In [None]:
# Attach the meter premise entries of each transformer to its clean window(s)
clean_windows_by_grp_mrg_mp = clean_windows_by_grp.merge(
    df_mp, 
    how='left', 
    left_on='trsf_pole_nb', 
    right_on='trsf_pole_nb'
)
# The install/removal timestamps must be datetime for the comparisons below
for ts_col in ['inst_ts', 'rmvl_ts']:
    clean_windows_by_grp_mrg_mp = Utilities_df.convert_col_type_w_pd_to_datetime(
        df=clean_windows_by_grp_mrg_mp, 
        column=ts_col, 
        inplace=True
    )

#-------------------------
# Only keep the meters which were installed for the entire search window, i.e., 
#   installed at or before t_search_min and removed after t_search_max.
# A missing inst_ts is treated as the earliest possible time, and a missing rmvl_ts as the latest 
#   possible time, so neither causes an entry to be removed.
installed_by_search_min = (
    clean_windows_by_grp_mrg_mp['inst_ts'].fillna(pd.Timestamp.min) <= clean_windows_by_grp_mrg_mp['t_search_min']
)
removed_after_search_max = (
    clean_windows_by_grp_mrg_mp['rmvl_ts'].fillna(pd.Timestamp.max) > clean_windows_by_grp_mrg_mp['t_search_max']
)
clean_windows_by_grp_mrg_mp = clean_windows_by_grp_mrg_mp[installed_by_search_min & removed_after_search_max]
#-------------------------
if save_dfs_to_file:
    clean_windows_by_grp_mrg_mp_path = os.path.join(save_dir_base, 'clean_windows_by_grp_mrg_mp.pkl')
    clean_windows_by_grp_mrg_mp.to_pickle(clean_windows_by_grp_mrg_mp_path)

In [None]:
# df_no_outage is a sorted copy of the merged clean windows (with a fresh 0..n-1 index)
df_no_outage = (
    clean_windows_by_grp_mrg_mp
    .copy()
    .sort_values(by=[groupby_col, 'prem_nb', 't_search_min'], ignore_index=True)
)

In [None]:
# Display df_no_outage as the cell output
df_no_outage

# Add no_outg_rec_nb column to allow easier grouping when building rcpo_dfs

In [None]:
# Every unique (trsf_pole_nb, t_search_min, t_search_max) combination receives its own no_outg_rec_nb, 
#   built from a random 5-character (letters and digits) prefix followed by the group number
rand_pfx = Utilities.generate_random_string(str_len=5, letters=string.ascii_letters + string.digits)
no_outg_grp_numbers = df_no_outage.groupby(['trsf_pole_nb', 't_search_min', 't_search_max']).ngroup()
df_no_outage['no_outg_rec_nb'] = rand_pfx + no_outg_grp_numbers.astype(str)

In [None]:
# Save the final df_no_outage (only if the user asked for DFs to be saved)
if save_dfs_to_file:
    df_no_outage_pkl_path = os.path.join(save_dir_base, 'df_no_outage_FINAL.pkl')
    df_no_outage.to_pickle(df_no_outage_pkl_path)

In [None]:
# Display df_no_outage (now including the no_outg_rec_nb column) as the cell output
df_no_outage

In [None]:
# Deliberately halt execution here (e.g., when running all cells).
# The cells above build (and optionally save) the DataFrames, whereas the next cell reloads 
#   previously saved pickle files from save_dir_base.
# An explicit exception is raised (instead of relying on an undefined name) so the reason for 
#   the stop is clear.
raise RuntimeError(
    "Intentional stop: the cells above build the DataFrames; "
    "run the cells below manually to reload the saved DataFrames and collect the end events."
)

In [None]:
# Reload the previously saved DataFrames from save_dir_base
def _read_saved_pkl(pkl_name):
    # Read the pickle file named pkl_name located in save_dir_base
    return pd.read_pickle(os.path.join(save_dir_base, pkl_name))

all_outages_df       = _read_saved_pkl('all_outages_df.pkl')
clean_windows_by_grp = _read_saved_pkl('clean_windows_by_grp.pkl')
df_no_outage         = _read_saved_pkl('df_no_outage_FINAL.pkl')
df_mp                = _read_saved_pkl('df_mp_no_outg.pkl')

In [None]:
# Sort naturally by no_outg_rec_nb, so that numbers embedded in the strings are ordered numerically
no_outg_rec_nb_sort_key = natsort_keygen()
df_no_outage = df_no_outage.sort_values(by=['no_outg_rec_nb'], key=no_outg_rec_nb_sort_key)

In [None]:
# Sort naturally by no_outg_rec_nb
df_no_outage = df_no_outage.sort_values(by=['no_outg_rec_nb'], key=natsort_keygen())
#-------------------------*************************-------------------------*************************
# Convert df_no_outage to its consolidated (slim) version if run_using_slim
#-------------------------
if run_using_slim:
    #--------------------------------------------------
    # The columns collected in lists for each group, together with their names in the slim DF
    rename_cols = {
        'prem_nb'          : 'premise_nbs', 
        'mfr_devc_ser_nbr' : 'serial_numbers', 
        'trsf_pole_nb'     : 'trsf_pole_nbs'
    }
    cols_to_collect_in_lists = list(rename_cols)
    cols_shared_by_group = None
    # One row per search window in the slim DF
    consol_groupby_cols = ['t_search_min', 't_search_max']
    #-------------------------
    df_no_outage_slim = Utilities_df.consolidate_df(
        df=df_no_outage, 
        groupby_cols=consol_groupby_cols, 
        cols_shared_by_group=cols_shared_by_group, 
        cols_to_collect_in_lists=cols_to_collect_in_lists, 
        rename_cols=rename_cols, 
        verbose=True
    ).reset_index()
    #-------------------------
    # Sort the contents of each of the collected lists
    for list_col in rename_cols.values():
        df_no_outage_slim[list_col] = df_no_outage_slim[list_col].apply(sorted)
    #-------------------------
    if save_dfs_to_file:
        df_no_outage_slim_pkl_path = os.path.join(save_dir_base, 'df_no_outage_slim.pkl')
        df_no_outage_slim.to_pickle(df_no_outage_slim_pkl_path)

# Collect events

In [None]:
# df_no_outage = pd.read_pickle(os.path.join(save_dir_base, 'df_no_outage_FINAL.pkl'))

In [None]:
#----------
# Settings for the end events acquisition
#----------
# Passed as split_to_CTEs when building the end events SQL function kwargs
usg_split_to_CTEs=True
# The end events DataFrame is to be constructed by running a SQL query
df_construct_type=DFConstructType.kRunSqlQuery
# Passed as contstruct_df_args when building the AMIEndEvents object
contstruct_df_args_end_events=None



# Columns to grab from the end events table ('*' for all columns)
# cols_of_interest_end_dev_event = TableInfos.AMIEndEvents_TI.std_columns_of_interest
cols_of_interest_end_dev_event = ['*']
# verbose and n_update are forwarded in the end events SQL function kwargs
verbose=True
n_update=1

In [None]:
# Forwarded (inside df_args) to the end events SQL function kwargs; None means no additional groupby columns are passed
addtnl_groupby_cols = None

In [None]:
#--------------------------------------------------
# Build the keyword arguments for the end events SQL function in three steps:
#   1. The arguments common to all cases
#   2. The arguments identifying the table to be queried (plus the opco(s), if any)
#   3. The arguments which depend on whether the slim or the full df_no_outage is being used
#--------------------------------------------------
# 1. Common arguments
end_events_sql_function_kwargs = {
    'cols_of_interest'                  : cols_of_interest_end_dev_event, 
    'date_only'                         : True, 
    'split_to_CTEs'                     : usg_split_to_CTEs, 
    'join_mp_args'                      : False, 
    'df_args'                           : {
        'addtnl_groupby_cols' : addtnl_groupby_cols, 
        't_search_min_col'    : 't_search_min', 
        't_search_max_col'    : 't_search_max'
    }, 
    'field_to_split'                    : 'df_mp_no_outg', 
    'field_to_split_location_in_kwargs' : ['df_mp_no_outg'], 
    'sort_coll_to_split'                : False, 
    'verbose'                           : verbose, 
    'n_update'                          : n_update
}
#----------
# 2. Table to be queried
addtnl_end_events_sql_function_kwargs = {
    'build_sql_function_kwargs' : {
        'schema_name' : 'meter_events', 
        'table_name'  : 'events_summary_vw'
    }
}
if opcos is not None:
    addtnl_end_events_sql_function_kwargs['build_sql_function_kwargs']['opco'] = opcos
end_events_sql_function_kwargs = dict(
    end_events_sql_function_kwargs, 
    **addtnl_end_events_sql_function_kwargs
)
#--------------------------------------------------
# 3. Slim vs. full dependent arguments
if run_using_slim:
    batch_size = 10
    df_mp_no_outg_for_end_events = df_no_outage_slim
    df_args_defaults = dict(
        mapping_to_ami={'premise_nbs':'premise_nbs'}, 
        is_df_consolidated=True
    )
else:
    batch_size = 200
    #----------
    # df_no_outage is sorted (and re-indexed) before being handed off
    sort_cols_for_groupby_col = {
        'trsf_pole_nb' : ['no_outg_rec_nb', 'trsf_pole_nb', 'prem_nb', 't_search_min'], 
        'PREMISE_NB'   : ['no_outg_rec_nb', 'PREMISE_NB', 't_search_min']
    }
    if groupby_col in sort_cols_for_groupby_col:
        df_no_outage = df_no_outage.sort_values(by=sort_cols_for_groupby_col[groupby_col], ignore_index=True)
    #----------
    df_mp_no_outg_for_end_events = df_no_outage
    df_args_defaults = dict(
        mapping_to_ami={'prem_nb':'premise_nbs'}, 
        is_df_consolidated=False
    )
#-------------------------
end_events_sql_function_kwargs = Utilities.supplement_dict_with_default_values(
    to_supplmnt_dict=end_events_sql_function_kwargs, 
    default_values_dict=dict(
        df_mp_no_outg=df_mp_no_outg_for_end_events, 
        batch_size=batch_size, 
        df_args=df_args_defaults
    ), 
    extend_any_lists=True,
    inplace=True
)

In [None]:
# #--------------------------------------------------
# #--------------------------------------------------
# start=time.time()
# end_events = AMIEndEvents(
#     df_construct_type=df_construct_type, 
#     contstruct_df_args = contstruct_df_args_end_events, 
#     build_sql_function=AMIEndEvents_SQL.build_sql_end_events_for_no_outages, 
#     build_sql_function_kwargs=end_events_sql_function_kwargs, 
#     init_df_in_constructor=True, 
#     save_args=end_events_save_args
# )
# end_events_build_time = time.time()-start

In [None]:
# Build the end events, retrying when the build fails.
# The number of attempts is capped so an error which occurs on every attempt (e.g., a bad argument) 
#   cannot cause an endless loop.  Once the final attempt fails, the exception is re-raised.
# Set max_n_attempts=None to retry indefinitely.
start=time.time()

max_n_attempts = 25
n_attempts = 0
while True:
    n_attempts += 1
    try:
        end_events = AMIEndEvents(
            df_construct_type=df_construct_type, 
            contstruct_df_args = contstruct_df_args_end_events, 
            build_sql_function=AMIEndEvents_SQL.build_sql_end_events_for_no_outages, 
            build_sql_function_kwargs=end_events_sql_function_kwargs, 
            init_df_in_constructor=True, 
            save_args=end_events_save_args
        )
        break # stop the loop if the function completes successfully
    except Exception as e:
        print("Function errored out!", e)
        if max_n_attempts is not None and n_attempts >= max_n_attempts:
            print(f"Giving up after {n_attempts} attempts")
            raise
        print("Retrying ... ")
        
# Total time (in seconds), including any failed attempts
build_time = time.time()-start
print(build_time)

In [None]:
# Always fails: deliberately stops execution at this point
assert False