In [None]:
from importlib import reload
#reload(Utilities)
#reload(clm)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

import sys, os
import re
from pathlib import Path
import json
import pickle

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version
import copy

import itertools

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
import matplotlib.colors as mcolors
import matplotlib.cm as cm #e.g. for cmap=cm.jet
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
import CommonLearningMethods as clm
#-----
from MeterPremise import MeterPremise
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from AMIEDE_DEV import AMIEDE_DEV
from MECPODf import MECPODf
from MECPOAn import MECPOAn
from MECPOCollection import MECPOCollection
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_General
import Plot_Box_sns
import Plot_Hist
import Plot_Bar
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer

# DEVELOPMENT OF finding what still needs to be run to complete XFMR/OUTAGE groups

In [None]:
def build_SNs_w_EDEs_df_from_local_files(
    date_0, 
    date_1, 
    opco='oh', 
    files_dir_base=r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\SNs_with_end_events', 
    allow_duplicate_dates=False, 
    drop_duplicate_dates=True
):
    r"""
    Build a single, date-indexed pd.DataFrame from the yearly pickle files stored locally.
    
    One file named '<year>.pkl' is read for every calendar year from the year of date_0 through the 
      year of date_1 (inclusive), and the per-year DataFrames are concatenated.
    NOTE: Whole years are loaded; the returned DataFrame is NOT trimmed to [date_0, date_1].
    
    Currently, FILES EXPECTED TO BE IN DIRECTORY os.path.join(files_dir_base, 'OPCO', opco)
    
    date_0, date_1:
      Anything accepted by pd.to_datetime.  Only the year of each is used.
    allow_duplicate_dates:
      If False and more than one file is read, the concatenated index must be unique.
    drop_duplicate_dates:
      Only has effect if allow_duplicate_dates is True, in which case any duplicate index entries will be 
        collapsed via groupby(index).first()
      Note: Cannot call drop_duplicates on DF because elements are lists
      NOTE(review): if duplicates are allowed but not dropped, the daily-frequency check below is expected 
        to fail, since pd.infer_freq should not report 'D' for a non-unique index -- confirm intended usage.
    
    Returns:
      pd.DataFrame sorted by a DatetimeIndex with daily frequency.
    
    Raises:
      AssertionError if the directory or a yearly file is missing, if the yearly files do not share the 
        same columns, if duplicates are found but not allowed, or if the index is not daily.
    """
    #-------------------------
    files_dir = os.path.join(files_dir_base, 'OPCO', opco)
    assert os.path.isdir(files_dir), f'Directory not found: {files_dir}'
    #-------------------------
    year_0 = pd.to_datetime(date_0).year
    year_1 = pd.to_datetime(date_1).year
    years_needed = list(range(year_0,year_1+1))
    #-------------------------
    SNs_w_EDEs_dfs = []
    for year in years_needed:
        file_path = os.path.join(files_dir, f'{year}.pkl')
        assert os.path.exists(file_path), f'File not found: {file_path}'
        #-----
        # NOTE: Unpickling executes arbitrary code; only point this at trusted, locally generated files
        SNs_w_EDEs_dfs.append(pd.read_pickle(file_path))
    assert len(SNs_w_EDEs_dfs)>0, 'No years to load (is date_1 earlier than date_0?)'
    if len(SNs_w_EDEs_dfs)==1:
        SNs_w_EDEs_df = SNs_w_EDEs_dfs[0]
    else:
        # Make sure all have same columns
        # NOTE: Index.equals returns a single bool.  Using == returns an element-wise array, which cannot be
        #       used in an assert when there is more than one column (ambiguous truth value)
        cols = SNs_w_EDEs_dfs[0].columns
        for df in SNs_w_EDEs_dfs:
            assert df.columns.equals(cols), 'Yearly files do not all share the same columns'
        #-----
        SNs_w_EDEs_df = pd.concat(SNs_w_EDEs_dfs)
        #-----
        if not allow_duplicate_dates:
            assert(SNs_w_EDEs_df.index.nunique()==SNs_w_EDEs_df.shape[0])
        else:
            if drop_duplicate_dates and SNs_w_EDEs_df.index.nunique()!=SNs_w_EDEs_df.shape[0]:
                SNs_w_EDEs_df = SNs_w_EDEs_df.groupby(SNs_w_EDEs_df.index).first()
                assert(SNs_w_EDEs_df.index.nunique()==SNs_w_EDEs_df.shape[0])
            else:
                # Not really necessary to check this...
                assert(SNs_w_EDEs_df.shape[0]==sum([x.shape[0] for x in SNs_w_EDEs_dfs]))
    #-------------------------
    # Make sure index is datetime and sorted BEFORE checking the frequency, as the frequency
    # check is only meaningful on an ordered index
    SNs_w_EDEs_df.index = pd.to_datetime(SNs_w_EDEs_df.index)
    SNs_w_EDEs_df = SNs_w_EDEs_df.sort_index()
    #-------------------------
    # Make sure index has daily frequency
    assert(pd.infer_freq(SNs_w_EDEs_df.index)=='D')
    SNs_w_EDEs_df.index.freq = 'D'
    #-------------------------
    return SNs_w_EDEs_df


def get_SNs_with_end_events_from_SNs_w_EDEs_df(
    SNs_w_EDEs_df, 
    date_0, 
    date_1, 
    SNs_col=None, 
    batch_size=None, 
    verbose=False
):
    r"""
    Consolidate the lists stored in SNs_col for all rows of SNs_w_EDEs_df between date_0 and date_1 (inclusive).
    The consolidation itself is delegated to Utilities_df.consolidate_column_of_lists (called with sort=True 
      and include_None=False), and whatever it returns is returned here.
    
    SNs_w_EDEs_df:
      DataFrame whose index is (convertible to) a daily DatetimeIndex.
      The input is NOT modified; the function works on a shallow copy.
    date_0, date_1:
      Anything accepted by pd.to_datetime.  Only the date portion is used (time is thrown away), and 
        both must lie within the range of the index.
    SNs_col:
      typically 'serialnumbers'
      If left as None, SNs_w_EDEs_df must have only a single column (which it typically does), and SNs_col
        is taken to be that column
    batch_size, verbose:
      Passed through to Utilities_df.consolidate_column_of_lists
    
    Raises:
      AssertionError if SNs_col is None and there is more than one column, if the index is not daily, 
        or if date_0/date_1 fall outside the index.
    """
    #-------------------------
    if SNs_col is None:
        assert(SNs_w_EDEs_df.shape[1]==1)
        SNs_col = SNs_w_EDEs_df.columns[0]
    #-------------------------
    # Only date portion is considered here, since data taken daily
    # i.e., time is thrown away
    date_0 = pd.Timestamp(pd.to_datetime(date_0).date())
    date_1 = pd.Timestamp(pd.to_datetime(date_1).date())
    #-------------------------
    # Work on a shallow copy so the index of the caller's DataFrame is not overwritten as a side effect
    SNs_w_EDEs_df = SNs_w_EDEs_df.copy(deep=False)
    SNs_w_EDEs_df.index = pd.to_datetime(SNs_w_EDEs_df.index)

    # Make sure the index is sorted BEFORE checking the frequency, as the frequency 
    # check is only meaningful on an ordered index
    SNs_w_EDEs_df = SNs_w_EDEs_df.sort_index()

    # Make sure index has daily frequency
    assert(pd.infer_freq(SNs_w_EDEs_df.index)=='D')
    SNs_w_EDEs_df.index.freq = 'D'

    # Make sure date_0 and date_1 in bounds
    assert(date_0>=SNs_w_EDEs_df.index[0])
    assert(date_1<=SNs_w_EDEs_df.index[-1])

    # Label-based slice, inclusive of both end points
    sub_SNs_w_EDEs_df = SNs_w_EDEs_df.loc[date_0:date_1]
    SNs = Utilities_df.consolidate_column_of_lists(
        df=sub_SNs_w_EDEs_df, 
        col=SNs_col, 
        sort=True, 
        include_None=False, 
        batch_size=batch_size, 
        verbose=verbose
    )
    #-------------------------
    return SNs

def get_SNs_with_end_events_from_local_files(
    date_0, 
    date_1, 
    opco='oh', 
    files_dir_base=r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\SNs_with_end_events', 
    batch_size=None, 
    verbose=False
):
    r"""
    Convenience wrapper chaining the two steps:
      1. build_SNs_w_EDEs_df_from_local_files (duplicate dates not allowed)
      2. get_SNs_with_end_events_from_SNs_w_EDEs_df (SNs_col=None, i.e., the DataFrame built in step 1 
         must have a single column)
    
    date_0, date_1, opco, files_dir_base:
      Passed to build_SNs_w_EDEs_df_from_local_files
    date_0, date_1, batch_size, verbose:
      Passed to get_SNs_with_end_events_from_SNs_w_EDEs_df
    
    Returns whatever get_SNs_with_end_events_from_SNs_w_EDEs_df returns.
    """
    #-------------------------
    # Step 1: load the yearly files covering [date_0, date_1]
    df_from_files = build_SNs_w_EDEs_df_from_local_files(
        date_0=date_0, 
        date_1=date_1, 
        opco=opco, 
        files_dir_base=files_dir_base, 
        allow_duplicate_dates=False, 
        drop_duplicate_dates=True
    )
    #-------------------------
    # Step 2: consolidate the entries found between date_0 and date_1
    return get_SNs_with_end_events_from_SNs_w_EDEs_df(
        SNs_w_EDEs_df=df_from_files, 
        date_0=date_0, 
        date_1=date_1,
        SNs_col=None, 
        batch_size=batch_size, 
        verbose=verbose
    )

In [None]:
def get_search_time_interval_infos_from_summary_file(summary_path):
    r"""
    Specialized function.
    Parse the 'sql_statement' entry of a JSON summary file and extract, for each matching SELECT block, 
      the outage record number, the search time interval, and the premise numbers.
    
    TODO!!!!!!!!!!!!!!!!!!
    In the future, this stuff should probably be output at run-time somewhere
    
    summary_path:
      Path to a JSON file containing the key 'sql_statement'
    
    Returns:
      A list of dicts, one per match, each with keys:
        'outg_rec_nb': str
        'prem_nbs':    list of str (quotes and surrounding whitespace removed)
        't_min':       str, formatted as 'YYYY-MM-DD HH:MM:SS'
        't_max':       str, formatted as 'YYYY-MM-DD HH:MM:SS'
    
    Raises:
      AssertionError if the file does not exist, if 'sql_statement' is missing, or if nothing is matched.
    """
    #-------------------------
    assert(os.path.exists(summary_path))
    #-------------------------
    # Context manager guarantees the file is closed, even if json.load raises
    with open(summary_path) as f:
        summary_json_data = json.load(f)
    assert('sql_statement' in summary_json_data)
    sql_statement = summary_json_data['sql_statement']
    #-------------------------
    # Find the last instance of "SELECT * FROM USG_X" to extract how many sets of 
    # t_min,t_max,prem_nbs to expect.
    # If not found, expect only one
    # NOTE: \d+ (instead of \d*) so an empty capture can never reach int()
    # NOTE: n_groups_expected is currently informational only; it is not compared against
    #       the number of matches found below.
    pattern = r"SELECT \* FROM .*_(\d+)$"
    found_all = re.findall(pattern, sql_statement)
    if len(found_all)==0:
        n_groups_expected = 1
    else:
        assert(len(found_all)==1)
        n_groups_expected = int(found_all[0])+1
    #-------------------------
    # So obnoxious...using flags=re.MULTILINE|re.DOTALL with .* was causing the trailing ) and \n to match in premise numbers
    #   This also made it such that only the last occurrence of the match was returned.
    #   What I found to work was eliminating the re.DOTALL flag and [\s\S] to match a newline or any symbol.
    #     Typically, . matches everything BUT newline characters (unless using re.DOTALL).
    #     The main idea is that the opposite shorthand classes inside a character class match any symbol there is in the input string.
    # NOTE: The new pattern should find both, e.g.:
    #       (a) un_rin.aep_premise_nb IN ('102186833','102252463','106876833','108452463')
    #       (b) un_rin.aep_premise_nb = '072759453'
    #       However, now need the if prem_nbs[0]=='(' block below
    #       ALSO: (?: TIMESTAMP){0,1} needed to be included (twice) after switch to Athena
    #             See, e.g., is_timestamp in SQLWhere class    
    pattern = r"SELECT[\s\S]+?"\
              r"(?:\'(\d*)\' AS OUTG_REC_NB[\s\S]+?)"\
              r"CAST.* BETWEEN(?: TIMESTAMP){0,1} '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})' AND(?: TIMESTAMP){0,1} '(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'[\s\S]+?"\
              r"un_rin.aep_premise_nb\s*(?:IN|=)?\s*(\((?:.*)\)|(?:\'.*\'))[\s\S]+?"
    
    found_all = re.findall(pattern, sql_statement, flags=re.MULTILINE)
    assert(len(found_all)>0)
    #-------------------------
    return_coll=[]
    for found in found_all:
        assert(len(found)==4)
        outg_rec_nb,t_min,t_max,prem_nbs = found
        # Case (a) above: strip the enclosing parentheses
        if prem_nbs[0]=='(':
            assert(prem_nbs[-1]==')')
            prem_nbs=prem_nbs[1:-1]
        prem_nbs = prem_nbs.replace('\'', '')
        # Strip whitespace so both 'a','b' and 'a', 'b' yield the same premise numbers
        prem_nbs = [x.strip() for x in prem_nbs.split(',')]
        return_coll.append(
            {
                'outg_rec_nb':outg_rec_nb, 
                'prem_nbs':prem_nbs, 
                't_min':t_min, 
                't_max':t_max
            }
        )
    #-------------------------
    return return_coll


def get_search_time_interval_infos_df_from_summary_file(
    summary_path, 
    output_index_name='outg_rec_nb', 
    output_prem_nbs_col='prem_nbs', 
    output_t_min_col='t_min', 
    output_t_max_col='t_max', 
    include_summary_path=False
):
    r"""
    Returns a pd.DataFrame version of get_search_time_interval_infos_from_summary_file
    
    The returned DataFrame has one row per entry found in the summary file, with:
      index:   the 'outg_rec_nb' values (index named output_index_name)
      columns: output_t_min_col and output_t_max_col (converted with pd.to_datetime), 
               output_prem_nbs_col (each element a list of premise numbers), 
               and, if include_summary_path is True, 'summary_path' (equal to summary_path for every row)
    
    NOTE: The index is not necessarily unique (the same outg_rec_nb may be found more than once in a file).
    """
    #-------------------------
    infos = get_search_time_interval_infos_from_summary_file(summary_path)
    #-------------------------
    # Build the DataFrame in a single shot, instead of repeatedly concatenating 
    # one-row DataFrames onto an (initially empty) DataFrame
    return_df = pd.DataFrame(
        data={
            output_t_min_col: [info_i['t_min'] for info_i in infos], 
            output_t_max_col: [info_i['t_max'] for info_i in infos]
        }, 
        index=pd.Index([info_i['outg_rec_nb'] for info_i in infos], name=output_index_name)
    )
    # One list of premise numbers per row
    return_df[output_prem_nbs_col] = [info_i['prem_nbs'] for info_i in infos]
    #-------------------------
    return_df[output_t_min_col] = pd.to_datetime(return_df[output_t_min_col])
    return_df[output_t_max_col] = pd.to_datetime(return_df[output_t_max_col])
    if include_summary_path:
        return_df['summary_path'] = summary_path
    return return_df


def combine_search_time_interval_infos_dfs_w_like_indices(
    search_time_interval_infos_df_i, 
    index_name='outg_rec_nb', 
    prem_nbs_col='prem_nbs', 
    t_min_col='t_min', 
    t_max_col='t_max', 
    summary_path_col='summary_path'
):
    r"""
    Small helper function for use in get_search_time_interval_infos_df_from_summary_files
    
    Collapses a DataFrame whose rows all share one index value (and one t_min/t_max) into a single row.
    
    - A one-row input is simply returned squeezed (nothing to combine).
    - Otherwise, the first row is copied and its prem_nbs_col entry (and summary_path_col entry, if that column 
      is present) is replaced by the result of Utilities_df.consolidate_column_of_lists over all rows.
    
    index_name is accepted for signature symmetry but is not used.
    
    Raises AssertionError if a multi-row input has more than one unique index, t_min, or t_max value.
    """
    #-------------------------
    grp_df = search_time_interval_infos_df_i
    #-------------------------
    # Nothing to combine for a single row
    if grp_df.shape[0]==1:
        return grp_df.squeeze()
    #-------------------------
    # All rows must describe the same outage and search window
    assert(grp_df.index.nunique()==1)
    for uniq_col in (t_min_col, t_max_col):
        assert(grp_df[uniq_col].nunique()==1)
    #-------------------------
    combined = grp_df.iloc[0].copy()
    #-------------------------
    # Columns whose entries are merged across rows; summary paths only if they were included
    cols_to_consolidate = [prem_nbs_col]
    if summary_path_col in grp_df.columns:
        cols_to_consolidate.append(summary_path_col)
    for col_i in cols_to_consolidate:
        combined[col_i] = Utilities_df.consolidate_column_of_lists(
            df=grp_df, 
            col=col_i, 
            sort=True, 
            include_None=True, 
            batch_size=None, 
            verbose=False
        )
    #-------------------------
    return combined

def get_search_time_interval_infos_df_from_summary_files(
    summary_paths, 
    output_index_name='outg_rec_nb', 
    output_prem_nbs_col='prem_nbs', 
    output_t_min_col='t_min', 
    output_t_max_col='t_max', 
    include_summary_paths=False
):
    r"""
    Handles multiple summary files
    
    Calls get_search_time_interval_infos_df_from_summary_file for each path in summary_paths, stacks the 
      results, and then combines any rows sharing the same index value into a single row using 
      combine_search_time_interval_infos_dfs_w_like_indices.
    
    Note: The reason for combining rows is the case where a collection is split over multiple
          files/runs (i.e., the asynchronous case), or over multiple CTEs in a single file.
    
    summary_paths:
      Iterable of paths to summary files.  If empty, an empty pd.DataFrame is returned.
    include_summary_paths:
      If True, the path(s) of the summary file(s) are included in a 'summary_path' column.
    
    Returns:
      pd.DataFrame with one row per unique index value.
    """
    #-------------------------
    # Collect all DataFrames first and concatenate once, instead of repeatedly 
    # concatenating onto an (initially empty) DataFrame
    dfs = [
        get_search_time_interval_infos_df_from_summary_file(
            summary_path=summary_path, 
            output_index_name=output_index_name, 
            output_prem_nbs_col=output_prem_nbs_col, 
            output_t_min_col=output_t_min_col, 
            output_t_max_col=output_t_max_col, 
            include_summary_path=include_summary_paths
        ) 
        for summary_path in summary_paths
    ]
    if len(dfs)==0:
        return pd.DataFrame()
    return_df = pd.concat(dfs, ignore_index=False)
    #-------------------------
    # Now, take care of grouping together any repeated index values.
    # This would happen if, e.g., an outage were split across multiple files, or even if an 
    #   outage were split across multiple CTEs in a single file.
    return_df = return_df.groupby(return_df.index).apply(
        lambda x: combine_search_time_interval_infos_dfs_w_like_indices(
            search_time_interval_infos_df_i=x, 
            index_name=output_index_name, 
            prem_nbs_col=output_prem_nbs_col, 
            t_min_col=output_t_min_col, 
            t_max_col=output_t_max_col, 
            summary_path_col='summary_path'
        )
    )    
    #-------------------------
    return return_df   

In [None]:
# Build the MECPO dict from the locally stored pickles, using a 1-30 day window.
# The 'full' and 'no_outg' entries of the result are the ones used further below.
mecpo_dict_01_30_1 = AMIEDE_DEV.build_mecpo_dict_from_pkls(
    r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\rcpo_dfs\NEW_w_prems', 
    days_min_max_outg_td_window=[1,30], normalize_by_time_interval=True
)

In [None]:
# Directory containing the end events files for the outage sample
files_dir_outg             = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\EndEvents'

is_no_outg = False
# NOTE(review): file_path_glob is not defined in any of the cells shown here; 
#               presumably it is set elsewhere in the notebook -- confirm before running.
paths = Utilities.find_all_paths(base_dir=files_dir_outg, glob_pattern=file_path_glob)
# i_beg = 1000
# i_end = 2000
i_beg = 0
i_end = 1000
cols_and_types_to_convert_dict=None
to_numeric_errors='coerce'
assert_all_cols_equal=True
outg_rec_nb_col='outg_rec_nb'
addtnl_dropna_subset_cols=None

# Bounds (in days) of the time window, and the equivalent timedelta objects
days_min_outg_td_window=1
days_max_outg_td_window=30

min_outg_td_window=datetime.timedelta(days=days_min_outg_td_window)
max_outg_td_window=datetime.timedelta(days=days_max_outg_td_window)

In [None]:
# Work on copies of the raw rcpo DataFrames so the objects held in mecpo_dict_01_30_1 are not altered
dev_rcpo_df         = mecpo_dict_01_30_1['full'].cpo_dfs['rcpo_df_raw'].copy()
dev_rcpo_no_outg_df = mecpo_dict_01_30_1['no_outg'].cpo_dfs['rcpo_df_raw'].copy()

In [None]:
# THIS IS ONLY USED TO DOUBLE CHECK TIMES
# Query DOVS for the outages indexing dev_rcpo_df, retrieving the time on together with the 
# full time off (computed in SQL as DT_ON_TS minus STEP_DRTN_NB/(60*24))
dovs_outgs = DOVSOutages(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True, 
    build_sql_function=DOVSOutages_SQL.build_sql_outage, 
    build_sql_function_kwargs={
        'outg_rec_nbs': dev_rcpo_df.index.tolist(), 
        'from_table_alias': 'DOV', 
        'datetime_col': 'DT_OFF_TS_FULL', 
        'cols_of_interest': [
            'OUTG_REC_NB', 
            'DT_ON_TS', 
            {
                'field_desc': "DOV.DT_ON_TS - DOV.STEP_DRTN_NB/(60*24)", 
                'alias': 'DT_OFF_TS_FULL', 
                'table_alias_prefix': None
            }
        ], 
        'field_to_split': 'outg_rec_nbs'
    },
)
# Cast OUTG_REC_NB to str and use it as the index
outg_dt_off_df = Utilities_df.convert_col_type(
    df=dovs_outgs.df, 
    column='OUTG_REC_NB', 
    to_type=str
).set_index('OUTG_REC_NB')

In [None]:
start=time.time()
# prem_nbs_with_end_events:
#   Simply a list of premise numbers which have at least one end event between date_0 and date_1
#   It is quicker to build this to encompass all needed dates first, as opposed to building it on-the-fly
#     for each individual outage.
# NOTE: The function is named for serial numbers (SNs), but here files_dir_base points to the 
#       prem_nbs_with_end_events directory, so premise numbers are what is collected.
prem_nbs_with_end_events = get_SNs_with_end_events_from_local_files(
    date_0='2017-01-01', 
    date_1='2022-06-30', 
    files_dir_base=r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\prem_nbs_with_end_events', 
    batch_size=100,
    verbose=True
)
# Elapsed wall-clock time, in seconds
print(time.time()-start)

In [None]:
files_dir_outg             = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\EndEvents'
# NOTE(review): file_path_glob is not defined in any of the cells shown here; 
#               presumably it is set elsewhere in the notebook -- confirm before running.
paths_outg = Utilities.find_all_paths(base_dir=files_dir_outg, glob_pattern=file_path_glob)

# run_infos_df:
#   This is where t_min and t_max come from for each outage.  For the case of outages, run_infos_df will have indices
#     equal to outg_rec_nbs, and will have columns [t_min, t_max, prem_nbs, (and possibly) summary path]
#   NOTE: Unfortunately, since the original code was run with INNER join with MeterPremise, instead of LEFT join,
#         although run_infos_df has a prem_nbs column, it shows the full set of premise numbers in the outage, not 
#         the set for which data were actually acquired.  
#         If each premise in run_infos_df['prem_nbs'] has a match in MeterPremise, then these would be
#         the same, but in general they are not.
# The summary file for each data file is located via AMIEDE_DEV.find_summary_file_from_csv
run_infos_df = get_search_time_interval_infos_df_from_summary_files(
    summary_paths=[AMIEDE_DEV.find_summary_file_from_csv(x) for x in paths_outg], 
    output_index_name='outg_rec_nb', 
    output_prem_nbs_col='prem_nbs', 
    output_t_min_col='t_min', 
    output_t_max_col='t_max', 
    include_summary_paths=True
)

In [None]:
# prem_nbs_for_outages:
#   A series object with indices equal to OUTG_REC_NBs and values equal to the premise numbers
#     contained in the outage.
#   If one wants to use serial numbers instead of premise numbers, one should use 
#     DOVSOutages.get_serial_numbers_for_outages
#   NOTE: Outages in dev_rcpo_df which are not found in this series are skipped (with a printed message) 
#         when building outgs_w_prem_nbs_and_search_times below.
prem_nbs_for_outages = DOVSOutages.get_premise_nbs_for_outages(
    outg_rec_nbs=dev_rcpo_df.index.tolist(), 
    return_type=pd.Series, 
    col_type_outg_rec_nb=str, 
    col_type_premise_nb=None, 
    to_numeric_errors='coerce', 
    verbose=False
)

In [None]:
# Column of dev_rcpo_df read below to get the premise numbers already acquired for each outage
prem_nbs_col_rcpo = '_prem_nbs'

In [None]:
# rcpo_i = dev_rcpo_df.iloc[2]  #pd.Series object
# outg_rec_nb_i = rcpo_i.name
# prem_nbs_acquired_i = rcpo_i[prem_nbs_col_rcpo]
# #-----
# if outg_rec_nb_i not in prem_nbs_for_outages.index:
#     print(f'outg_rec_nb_i={outg_rec_nb_i} not in prem_nbs_for_outages!!!!!')
#     #TODO UNCOMMENT CONTINUE
#     #continue
    
# #UNCOMMENT ELSE
# #else
# prem_nbs_in_outg_i = prem_nbs_for_outages.loc[outg_rec_nb_i]
# #-----
# run_info_i = run_infos_df.loc[outg_rec_nb_i]
# t_min_i = run_info_i['t_min']
# t_max_i = run_info_i['t_max']
# #-----
# # prem_nbs_with_end_events = get_SNs_with_end_events_from_local_files(
# #     date_0=t_min_i, 
# #     date_1=t_max_i, 
# #     files_dir_base=r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\prem_nbs_with_end_events'
# # )
# #-----
# # Premise numbers needed are the difference between prem_nbs_in_outg_i and prem_nbs_acquired_i, but only if the premise numbers
# # have an end event
# # NOTE: Using set operations MUCH faster than list comprehension
# prem_nbs_w_events_needed_i = list((set(prem_nbs_in_outg_i).difference(set(prem_nbs_acquired_i)))
#                                   .intersection(prem_nbs_with_end_events))
# print(len(prem_nbs_w_events_needed_i))


# outgs_w_prem_nbs_and_search_times = []
# outgs_w_prem_nbs_and_search_times.append(
#     dict(
#         outg_rec_nb=outg_rec_nb_i, 
#         prem_nbs=prem_nbs_w_events_needed_i, 
#         t_min=t_min_i, 
#         t_max=t_max_i
#     )
# )

In [None]:
# For each outage in dev_rcpo_df, determine which premise numbers still need to be acquired, i.e., those
# which are in the outage, have at least one end event, and are not already present in dev_rcpo_df.
# The result, outgs_w_prem_nbs_and_search_times, is a list of dicts with keys 
#   outg_rec_nb, prem_nbs, t_min, t_max
print(dev_rcpo_df.shape[0])
# Build the lookup set ONCE, outside of the loop.  Handing the full collection to set.intersection 
# inside the loop would force it to be traversed in its entirety for every single outage.
prem_nbs_with_end_events_set = set(prem_nbs_with_end_events)
outgs_w_prem_nbs_and_search_times = []
for idx, (outg_rec_nb_i, rcpo_i) in enumerate(dev_rcpo_df.iterrows()):
    # Progress update
    if idx%50==0:
        print(idx)
    # rcpo_i is a pd.Series object, whose name is the outg_rec_nb
    prem_nbs_acquired_i = rcpo_i[prem_nbs_col_rcpo]
    #-----
    if outg_rec_nb_i not in prem_nbs_for_outages.index:
        print(f'outg_rec_nb_i={outg_rec_nb_i} not in prem_nbs_for_outages!!!!!')
        continue
    #-----
    prem_nbs_in_outg_i = prem_nbs_for_outages.loc[outg_rec_nb_i]
    #-----
    # Search window used when the data for this outage were originally acquired
    run_info_i = run_infos_df.loc[outg_rec_nb_i]
    t_min_i = run_info_i['t_min']
    t_max_i = run_info_i['t_max']
    #-----
    # Premise numbers needed are the difference between prem_nbs_in_outg_i and prem_nbs_acquired_i, but only if the premise numbers
    # have an end event
    # NOTE: Using set operations MUCH faster than list comprehension
    prem_nbs_w_events_needed_i = list(
        (set(prem_nbs_in_outg_i).difference(prem_nbs_acquired_i))
        .intersection(prem_nbs_with_end_events_set)
    )
    #-----
    outgs_w_prem_nbs_and_search_times.append(
        dict(
            outg_rec_nb=outg_rec_nb_i, 
            prem_nbs=prem_nbs_w_events_needed_i, 
            t_min=t_min_i, 
            t_max=t_max_i
        )
    )

In [None]:
# Number of outages for which an entry was built (outages not found in prem_nbs_for_outages were skipped)
print(len(outgs_w_prem_nbs_and_search_times))

In [None]:
# Tabular view of the entries (one row per outage), for visual inspection only; the result is not stored
pd.DataFrame(outgs_w_prem_nbs_and_search_times)

In [None]:
# Columns to be selected from the end device events table
cols_of_interest_end_dev_event = TableInfos.AMIEndEvents_TI.std_columns_of_interest

# Columns to be selected from MeterPremise: the standard columns plus curr_acct_cls_cd
# NOTE: A copy is taken so that the list held by TableInfos.MeterPremise_TI is not altered.  Appending 
#       directly to std_columns_of_interest would modify it for all other users, and would 
#       add another duplicate entry each time this cell is re-run.
cols_of_interest_met_prem = list(TableInfos.MeterPremise_TI.std_columns_of_interest)
if 'curr_acct_cls_cd' not in cols_of_interest_met_prem:
    cols_of_interest_met_prem.append('curr_acct_cls_cd')

# The DataFrame is to be built by running a SQL query
df_construct_type=DFConstructType.kRunSqlQuery
contstruct_df_args_end_events=None

build_sql_function = AMIEndEvents_SQL.build_sql_end_events

In [None]:
# For every outage with premise numbers still missing, build and run the end events query, saving 
# the results to a new, uniquely tagged, end_events file.
print(len(outgs_w_prem_nbs_and_search_times))
for i, entry_i in enumerate(outgs_w_prem_nbs_and_search_times):
    print(i)
    outg_rec_nb_i = entry_i['outg_rec_nb']
    prem_nbs_i = entry_i['prem_nbs']
    t_min_i = entry_i['t_min']
    t_max_i = entry_i['t_max']
    # Nothing to acquire for this outage: all already recorded!
    if not prem_nbs_i:
        continue
    
    #--------------------------------------------------
    # Arguments for the SQL builder.
    # Options left disabled from development: 
    #     field_to_split='df_mp_no_outg', 
    #     field_to_split_location_in_kwargs=['df_mp_no_outg'], 
    #     sort_coll_to_split=False,
    #     batch_size=10, verbose=True, n_update=1
    build_sql_function_kwargs = {
        'cols_of_interest': cols_of_interest_end_dev_event, 
        'premise_nbs': prem_nbs_i, 
        'date_range': [str(t_min_i.date()), str(t_max_i.date())], 
        'datetime_range': [str(t_min_i), str(t_max_i)], 
        'serialnumber_col': 'serialnumber', 
        'from_table_alias': 'un_rin', 
        'schema_name': 'meter_events', 
        'table_name': 'end_device_event', 
        'datetime_col': 'valuesinterval', 
        'datetime_pattern': r"([0-9]{4}-[0-9]{2}-[0-9]{2})T([0-9]{2}:[0-9]{2}:[0-9]{2}).*", 
        'date_col': 'aep_event_dt', 
        # LEFT join with MeterPremise (as a CTE)
        'join_mp_args': {
            'join_with_CTE': True, 
            'build_mp_kwargs': {'cols_of_interest': cols_of_interest_met_prem}, 
            'join_type': 'LEFT'
        }, 
        # Tag each returned row with the outage it belongs to
        'addtnl_select_elements': [
            {'field_desc': f"'{outg_rec_nb_i}'", 'alias': 'OUTG_REC_NB_GPD_FOR_SQL'}
        ]
    }
    
    #--------------------------------------------------
    # Output location.  The next available tag int is appended to the file name.
    save_args = {
        'save_to_file': True, 
        'save_dir': r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\EndEvents', 
        'save_name': r'end_events.csv', 
        'index': True
    }
    save_args['offset_int'] = GenAn.get_next_summary_file_tag_int(save_args)
    save_args = GenAn.prepare_save_args(save_args, make_save_dir_if_dne=False)
    save_args['save_name'] = Utilities.append_to_path(
        save_args['save_name'], 
        appendix=f'_{save_args["offset_int"]}', 
        ext_to_find=save_args['save_ext'], 
        append_to_end_if_ext_no_found=True
    )
    # Prepared again after save_name is changed, as in the original sequence of calls
    save_args = GenAn.prepare_save_args(save_args, make_save_dir_if_dne=False)

    #--------------------------------------------------
    # Constructing the object runs the query (init_df_in_constructor=True)
    end_events = AMIEndEvents(
        df_construct_type=df_construct_type, 
        contstruct_df_args=contstruct_df_args_end_events, 
        build_sql_function=build_sql_function, 
        build_sql_function_kwargs=build_sql_function_kwargs, 
        init_df_in_constructor=True, 
        save_args=save_args
    )