In [1]:
from importlib import reload
#reload(Utilities)
#reload(clm)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

import sys, os
import re

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns
from packaging import version

import itertools

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
import CommonLearningMethods as clm
#-----
from MeterPremise import MeterPremise
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_Box_sns
import GrubbsTest

# -----------------------------------------------------------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# -----------------------------------------------------------------------------------------------

In [None]:
# Run-mode switches for the cached CSV snapshots of the queried data.
#   save_dfs_to_csv:   write the freshly queried DataFrames out to CSV
#   read_dfs_from_csv: load the DataFrames from previously saved CSVs instead of querying
save_dfs_to_csv = False
read_dfs_from_csv = True
# Base directory under which the CSV snapshots are stored
save_dir_base_csvs = os.path.join(Utilities.get_local_data_dir(), r'dovs_and_end_events_data')

# The two modes are mutually exclusive: should never both read and write!
assert not (save_dfs_to_csv and read_dfs_from_csv), (
    'save_dfs_to_csv and read_dfs_from_csv cannot both be True (should never both read and write)'
)

In [None]:
fig_num = 0

In [None]:
# Database connections are only opened when the data are queried live; when the
# DataFrames are read back from CSV no connection is needed.
#   conn_outages is the connection used for the outage query
#   conn_aws     is the connection used for the meter premise query
if not read_dfs_from_csv:
    conn_outages = Utilities.get_utldb01p_oracle_connection()
    conn_aws = Utilities.get_athena_prod_aws_connection()

In [None]:
date_0 = '2021-01-01'
date_1 = '2021-12-31'

# date_0 = '2022-01-01'
# date_1 = '2022-05-31'

In [None]:
# CSVs for a given date range live in their own sub-directory named <date_0>_<date_1>
# with the dashes removed (e.g., 20210101_20211231)
save_subdir_csvs = f"{date_0.replace('-','')}_{date_1.replace('-','')}"
save_dir_csvs = os.path.join(save_dir_base_csvs, save_subdir_csvs)
# exist_ok=True makes this idempotent and removes the check-then-create race of
# testing os.path.exists first
os.makedirs(save_dir_csvs, exist_ok=True)

# -----------------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------------

# Build no outage collection and df_mp_all
Here, `df_mp_all` is a DataFrame containing all meter premise data (from OH).

In [None]:
# Columns requested from the meter premise table (order is preserved in the query)
cols_of_interest_met_prem = [
    'mfr_devc_ser_nbr', 'state_cd', 'prem_nb', 'trsf_pole_nb',
    'annual_kwh', 'annual_max_dmnd',
    'mtr_stat_cd', 'mtr_stat_cd_desc',
    'devc_stat_cd', 'devc_stat_cd_desc',
]

### When df_outage_location_ids is large
i.e., when the date range used to find outages is large,
<br> it is actually faster to grab ALL meters from default.meter_premise and then filter using pandas,
<br> as opposed to filtering in SQL with NOT IN.

In [None]:
sql_mp_all = MeterPremise.build_sql_meter_premise(cols_of_interest_met_prem, states=['OH']).get_sql_statement()
print(sql_mp_all)

In [None]:
# Load the full meter premise table: from the saved CSV (everything as str) when
# read_dfs_from_csv is set, otherwise by running the query.
df_mp_all_OG = (
    pd.read_csv(os.path.join(save_dir_csvs, 'df_mp_all_OG.csv'), dtype=str)
    if read_dfs_from_csv
    else pd.read_sql(sql_mp_all, conn_aws)
)
df_mp_all_OG = clm.remove_prepend_from_columns_in_df(df_mp_all_OG)
print(df_mp_all_OG.shape)
# Keep the as-loaded frame untouched; all further work is done on a copy
df_mp_all = df_mp_all_OG.copy()

# ---------------------------------------------------------------
# OUTAGES
# ---------------------------------------------------------------

In [None]:
sql_outage_full = DOVSOutages_SQL.build_sql_std_outage(
    mjr_mnr_cause=None, 
    include_premise=True, 
    date_range=[date_0, date_1], 
    states='OH'
).get_sql_statement()
print(sql_outage_full)

In [None]:
if not read_dfs_from_csv:
    # Query the outages directly, fixing the numeric column types at read time
    df_outage_OG = pd.read_sql_query(
        sql_outage_full, 
        conn_outages, 
        dtype={'CI_NB':np.int32, 'CMI_NB':np.float64, 'OUTG_REC_NB':np.int32}
    )
else:
    # The CSV is read with every column as str, after which the numeric columns are converted.
    # NOTE(review): a list of types presumably means the conversions are applied in sequence;
    #   confirm against Utilities_df.convert_col_types
    csv_cols_and_types_to_convert_dict = {
        'CI_NB':np.int32, 
        'CMI_NB':np.float64, 
        'OUTG_REC_NB':[np.float64, np.int32]
    }
    df_outage_OG = Utilities_df.convert_col_types(
        pd.read_csv(os.path.join(save_dir_csvs, 'df_outage_OG.csv'), dtype=str), 
        csv_cols_and_types_to_convert_dict
    )

# Save CSVs if save_dfs_to_csv = True

In [None]:
# Write both as-loaded DataFrames to the CSV directory (no index column)
if save_dfs_to_csv:
    for _csv_name, _df_to_save in (
        ('df_mp_all_OG.csv', df_mp_all_OG), 
        ('df_outage_OG.csv', df_outage_OG), 
    ):
        _df_to_save.to_csv(os.path.join(save_dir_csvs, _csv_name), index=False)

In [None]:
df_outage_OG.shape

# Merge df_outage with df_mp_all and build subsets (e.g., df_outage_prim, df_outage_dl_eqf, etc.) 

In [None]:
print(df_mp_all.shape)

In [None]:
# TODO: why are prem_nb in df_mp_all all integers, whereas PREMISE_NB in df_outage
#   are strings which can be purely numerical, alphanumeric, or all letters?
# The cast below makes prem_nb a str column so it can be compared with PREMISE_NB.
# NOTE(review): the CSV is read with dtype=str, so this cast looks redundant for the
#   read-from-CSV path (and astype(str) turns missing values into the text 'nan') -- confirm.
if read_dfs_from_csv:
    df_mp_all['prem_nb'] = df_mp_all['prem_nb'].astype(str)

In [None]:
print(df_outage_OG.shape)
print(df_mp_all.shape)

# BEGIN NEW DEV

In [None]:
"""
In MeterPremise, can I create a function which will build mp_df_hist given mp_df_curr?
In DOVSOutages I need to build new build_mp_for_outg (and update its use throughout)
    APPARENTLY I already have this, build_active_MP_for_outages or one of other similar functions
    
I need to basically replace everything in DOVSOutages which uses build_mp_for_outg
"""

In [None]:
df_outage_slim = DOVSOutages.read_df_outage_slim_from_csv(os.path.join(save_dir_csvs, 'df_outage_slim.csv'))

In [None]:
df_outage_slim.head()

In [None]:
df_outage = df_outage_OG.copy()

In [None]:
df_outage

In [None]:
reload(Utilities_df)

import MeterPremise
reload(MeterPremise)
from MeterPremise import MeterPremise

In [None]:
def build_active_MP_for_outages_df(
    df_outage, 
    prem_nb_col, 
    df_mp_curr=None, 
    df_mp_hist=None, 
    assert_all_PNs_found=True, 
    drop_inst_rmvl_cols=False, 
    outg_rec_nb_col='OUTG_REC_NB',  #TODO: what if the outage record number is the index (non-slim case)?
    is_slim=False, 
    dt_on_ts_col='DT_ON_TS', 
    df_off_ts_full_col='DT_OFF_TS_FULL', 
    consolidate_PNs_batch_size=1000, 
    df_mp_serial_number_col='mfr_devc_ser_nbr', 
    df_mp_prem_nb_col='prem_nb', 
    df_mp_install_time_col='inst_ts', 
    df_mp_removal_time_col='rmvl_ts', 
    df_mp_trsf_pole_nb_col='trsf_pole_nb', 
    verbose=False
):
    r"""
    Build a DataFrame of the meters found active during each outage in df_outage.
    Similar to build_active_MP_for_outages.

    For every outage, MeterPremise.get_active_SNs_for_PNs_at_datetime_interval is called with the
    outage's premise numbers and its power-off/power-on times; the per-outage results are tagged
    with the outage record number (column outg_rec_nb_col) and concatenated.

    Parameters
    ----------
    df_outage:
        Outage DataFrame.  Must contain prem_nb_col, dt_on_ts_col and df_off_ts_full_col.
          is_slim=False: one premise number per row; rows are grouped by outg_rec_nb_col.
          is_slim=True:  one outage per row; the INDEX is used as the outage record number and
                         prem_nb_col holds the collection of premise numbers for that outage.
    prem_nb_col:
        Column of df_outage holding the premise number(s).
    df_mp_curr, df_mp_hist:
        Passed to MeterPremise.build_mp_df_curr_hist_for_PNs as mp_df_curr/mp_df_hist.
        NOTE(review): presumably fetched by that method when None -- confirm.
    assert_all_PNs_found:
        Forwarded to MeterPremise.build_mp_df_curr_hist_for_PNs.
    drop_inst_rmvl_cols:
        If True, df_mp_install_time_col and df_mp_removal_time_col are dropped from the result.
    outg_rec_nb_col:
        Name of the outage record number column: grouped on when is_slim=False, and always the
        name of the column added to the output.
    is_slim:
        Selects the layout of df_outage (see above).
    dt_on_ts_col, df_off_ts_full_col:
        Columns of df_outage holding the power-on and power-off times.
    consolidate_PNs_batch_size:
        batch_size forwarded to Utilities_df.consolidate_column_of_lists (is_slim=True only).
    df_mp_*_col:
        Column names used in the meter premise DataFrames.
    verbose:
        If True, print each outage record number as it is processed.

    Returns
    -------
    pd.DataFrame: concatenation of the per-outage active meter DataFrames, with outg_rec_nb_col added.

    Raises
    ------
    AssertionError: a required column is missing, an outage does not have exactly one distinct
        power-on and power-off time (is_slim=False), or an outage record number repeats.
    ValueError: no outage produced any active meter data (nothing to concatenate).
    """
    #-------------------------
    required_cols = [prem_nb_col, dt_on_ts_col, df_off_ts_full_col]
    missing_cols = [col for col in required_cols if col not in df_outage.columns]
    assert not missing_cols, f'df_outage is missing required column(s): {missing_cols}'
    #-------------------------
    # Collect every premise number appearing in df_outage
    if not is_slim:
        PNs = df_outage[prem_nb_col].unique().tolist()
    else:
        PNs = Utilities_df.consolidate_column_of_lists(
            df=df_outage, 
            col=prem_nb_col, 
            sort=True,
            include_None=False,
            batch_size=consolidate_PNs_batch_size, 
            verbose=False
        )
    #-----
    PNs = [x for x in PNs if pd.notna(x)]
    #-------------------------
    # For now, drop approx duplicates with assert_single_overlap=False
    # Towards end of function, a warning will be output if multiple overlap occurs
    mp_df_curr_hist_dict = MeterPremise.build_mp_df_curr_hist_for_PNs(
        PNs=PNs, 
        mp_df_curr=df_mp_curr,
        mp_df_hist=df_mp_hist, 
        join_curr_hist=False, 
        addtnl_mp_df_curr_cols=None, 
        addtnl_mp_df_hist_cols=None, 
        assert_all_PNs_found=assert_all_PNs_found, 
        assume_one_xfmr_per_PN=True, 
        drop_approx_duplicates=True, 
        drop_approx_duplicates_args=dict(
            fuzziness=pd.Timedelta('1 hour'), 
            assert_single_overlap=False, 
            addtnl_groupby_cols=None, 
            gpby_dropna=False
        ), 
        df_mp_serial_number_col=df_mp_serial_number_col, 
        df_mp_prem_nb_col=df_mp_prem_nb_col, 
        df_mp_install_time_col=df_mp_install_time_col, 
        df_mp_removal_time_col=df_mp_removal_time_col, 
        df_mp_trsf_pole_nb_col=df_mp_trsf_pole_nb_col       
    )
    df_mp_curr = mp_df_curr_hist_dict['mp_df_curr']
    df_mp_hist = mp_df_curr_hist_dict['mp_df_hist']
    #-------------------------
    # Only reason for making dict is to ensure outg_rec_nbs are not repeated 
    active_SNs_in_outgs_dfs_dict = {}

    def _add_active_SNs_for_outage(outg_rec_nb_i, PNs_i, dt_off_i, dt_on_i):
        # Find the meters active for PNs_i between dt_off_i and dt_on_i, tag them with the
        #   outage record number and store the result in active_SNs_in_outgs_dfs_dict.
        assert outg_rec_nb_i not in active_SNs_in_outgs_dfs_dict, (
            f'Outage record number {outg_rec_nb_i} appears more than once'
        )
        # NOTE: assume_one_xfmr_per_PN=True above in MeterPremise.build_mp_df_curr_hist_for_PNs,
        #       so does not need to be set again (i.e., assume_one_xfmr_per_PN=False below)
        active_SNs_df_i = MeterPremise.get_active_SNs_for_PNs_at_datetime_interval(
            PNs=PNs_i,
            df_mp_curr=df_mp_curr, 
            df_mp_hist=df_mp_hist, 
            dt_0=dt_off_i,
            dt_1=dt_on_i,
            assume_one_xfmr_per_PN=False, 
            output_index=None,
            output_groupby=None, 
            assert_all_PNs_found=False
        )
        active_SNs_df_i[outg_rec_nb_col] = outg_rec_nb_i
        active_SNs_in_outgs_dfs_dict[outg_rec_nb_i] = active_SNs_df_i

    if not is_slim:
        for outg_rec_nb_i, df_i in df_outage.groupby(outg_rec_nb_col):
            if verbose:
                print(outg_rec_nb_i)
            # Don't want to include outg_rec_nb_i=-2147483648
            if int(outg_rec_nb_i) < 0:
                continue
            # There should only be a single unique dt_on_ts and dt_off_ts_full for each outage
            n_dt_on  = df_i[dt_on_ts_col].nunique()
            n_dt_off = df_i[df_off_ts_full_col].nunique()
            if n_dt_on!=1 or n_dt_off!=1:
                raise AssertionError(
                    f'outg_rec_nb_i = {outg_rec_nb_i}\n'
                    f'df_i[dt_on_ts_col].nunique()       = {n_dt_on}\n'
                    f'df_i[df_off_ts_full_col].nunique() = {n_dt_off}\n'
                    'Expected exactly one unique power-on and power-off time per outage'
                )
            # Grab power out/on time and PNs from df_i
            # NOTE: nunique ignores NaN whereas unique does not, so NaNs are dropped first to
            #       guarantee that element 0 is the actual time and not a missing value
            dt_on_ts_i       = df_i[dt_on_ts_col].dropna().unique()[0]
            df_off_ts_full_i = df_i[df_off_ts_full_col].dropna().unique()[0]
            PNs_i            = df_i[prem_nb_col].unique().tolist()

            # Just as was done above for PNs, NaN values must be removed from PNs_i
            #   The main purpose here is to remove instances where PNs_i = [nan]
            #   NOTE: For case of slim df, the NaNs should already be removed
            # After removal, if len(PNs_i)==0, continue
            PNs_i = [x for x in PNs_i if pd.notna(x)]
            if len(PNs_i)==0:
                continue

            # Build active_SNs_df_i and add it to active_SNs_in_outgs_dfs_dict
            _add_active_SNs_for_outage(
                outg_rec_nb_i=outg_rec_nb_i, 
                PNs_i=PNs_i, 
                dt_off_i=df_off_ts_full_i, 
                dt_on_i=dt_on_ts_i
            )
    else:
        # Slim case: one row per outage, with the index serving as the outage record number
        for outg_rec_nb_i, row_i in df_outage.iterrows():
            if verbose:
                print(outg_rec_nb_i)
            _add_active_SNs_for_outage(
                outg_rec_nb_i=outg_rec_nb_i, 
                PNs_i=row_i[prem_nb_col], 
                dt_off_i=row_i[df_off_ts_full_col], 
                dt_on_i=row_i[dt_on_ts_col]
            )
    #-------------------------
    # pd.concat raises an opaque ValueError on an empty list, so fail with a clear message instead
    if len(active_SNs_in_outgs_dfs_dict)==0:
        raise ValueError(
            'In build_active_MP_for_outages_df: no outages with valid premise numbers were found, '
            'nothing to concatenate'
        )
    active_SNs_df = pd.concat(list(active_SNs_in_outgs_dfs_dict.values()))
    #-------------------------
    if drop_inst_rmvl_cols:
        active_SNs_df = active_SNs_df.drop(columns=[df_mp_install_time_col, df_mp_removal_time_col])
    #-------------------------
    # At this point, for each outage there should be exactly one entry per meter (meter being a unique combination of premise number, 
    #   serial number, and transformer number)
    meter_cols = [df_mp_serial_number_col, df_mp_prem_nb_col]
    if active_SNs_df.groupby(meter_cols + [df_mp_trsf_pole_nb_col, outg_rec_nb_col]).size().gt(1).any():
        print('!!!!!WARNING!!!!!\nIn build_active_MP_for_outages_df: appear to be multiple entries for a meter in an outage!')
    #-----
    # Actually, a given meter (unique premise and serial number) should not have multiple transformer poles.  Therefore, the any check
    #   can be made stricter by excluding df_mp_trsf_pole_nb_col
    if active_SNs_df.groupby(meter_cols + [outg_rec_nb_col]).size().gt(1).any():
        print('!!!!!WARNING!!!!!\nIn build_active_MP_for_outages_df: appear to be multiple transformers tied to a single meter!')
    #-------------------------
    return active_SNs_df

In [None]:
# MP_sub = build_active_MP_for_outages_df(
#     df_outage=df_outage[df_outage['OUTG_REC_NB']==12335129].copy(), 
#     prem_nb_col='PREMISE_NB', 
#     is_slim=False, 
#     assert_all_PNs_found=False
# )

MP_sub = build_active_MP_for_outages_df(
    df_outage=df_outage[df_outage['OUTG_REC_NB'].isin(df_outage['OUTG_REC_NB'].unique().tolist()[:100])].copy(), 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    assert_all_PNs_found=False
)

In [None]:
MP_sub.head()

In [None]:
MP_sub.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).size()

In [None]:
MP_sub.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'OUTG_REC_NB']).size()

In [None]:
df_mp_serial_number_col='mfr_devc_ser_nbr'
df_mp_prem_nb_col='prem_nb'
df_mp_install_time_col='inst_ts'
df_mp_removal_time_col='rmvl_ts'
df_mp_trsf_pole_nb_col='trsf_pole_nb'
outg_rec_nb_col='OUTG_REC_NB'

In [None]:
# prem_nb_col='PREMISE_NB'
# df_mp_curr=None 
# df_mp_hist=None
# drop_inst_rmvl_cols=False
# outg_rec_nb_col='OUTG_REC_NB' #TODO!!!!!!!!!!!!!!!!!!!!!!! what if index?!
# is_slim=False
# dt_on_ts_col='DT_ON_TS'
# df_off_ts_full_col='DT_OFF_TS_FULL'
# consolidate_PNs_batch_size=1000
# df_mp_serial_number_col='mfr_devc_ser_nbr'
# df_mp_prem_nb_col='prem_nb'
# df_mp_install_time_col='inst_ts'
# df_mp_removal_time_col='rmvl_ts'
# df_mp_trsf_pole_nb_col='trsf_pole_nb'

In [None]:
# NOTE(review): this cell previously held a bare undefined name, which halted "Run All" with
#   a NameError.  Presumably the halt is deliberate (the cells below re-run pieces of
#   build_active_MP_for_outages_df by hand), so it is kept, but made explicit.
raise RuntimeError('Intentional stop: the cells below are manual debugging steps; run them individually.')

In [None]:
prem_nb_col='PREMISE_NB'
is_slim=False
assert_all_PNs_found=False

df_mp_curr=None
df_mp_hist=None

drop_inst_rmvl_cols=False 
outg_rec_nb_col='OUTG_REC_NB'
dt_on_ts_col='DT_ON_TS'
df_off_ts_full_col='DT_OFF_TS_FULL'
consolidate_PNs_batch_size=1000
df_mp_serial_number_col='mfr_devc_ser_nbr'
df_mp_prem_nb_col='prem_nb'
df_mp_install_time_col='inst_ts'
df_mp_removal_time_col='rmvl_ts'
df_mp_trsf_pole_nb_col='trsf_pole_nb'

In [None]:
# Manual, cell-level replay of the first half of build_active_MP_for_outages_df, using the
# notebook-level variables set in the preceding cell in place of the function arguments.
#-------------------------
assert(prem_nb_col in df_outage.columns and 
       dt_on_ts_col in df_outage.columns and 
       df_off_ts_full_col in df_outage.columns)
#-------------------------
# Collect every premise number appearing in df_outage
if not is_slim:
    PNs = df_outage[prem_nb_col].unique().tolist()
else:
    PNs = Utilities_df.consolidate_column_of_lists(
        df=df_outage, 
        col=prem_nb_col, 
        sort=True,
        include_None=False,
        batch_size=consolidate_PNs_batch_size, 
        verbose=False
    )
#-----
PNs = [x for x in PNs if pd.notna(x)]
#-------------------------
# Unlike in build_active_MP_for_outages_df, both assume_one_xfmr_per_PN and
#   drop_approx_duplicates are set to False here, so that those steps can be
#   run by hand in the cells that follow.
# NOTE(review): drop_approx_duplicates_args is still passed; presumably it is ignored
#   when drop_approx_duplicates=False -- confirm against MeterPremise.
mp_df_curr_hist_dict = MeterPremise.build_mp_df_curr_hist_for_PNs(
    PNs=PNs, 
    mp_df_curr=df_mp_curr,
    mp_df_hist=df_mp_hist, 
    join_curr_hist=False, 
    addtnl_mp_df_curr_cols=None, 
    addtnl_mp_df_hist_cols=None, 
    assert_all_PNs_found=assert_all_PNs_found, 
    assume_one_xfmr_per_PN=False, 
    drop_approx_duplicates=False, 
    drop_approx_duplicates_args=dict(
        fuzziness=pd.Timedelta('1 hour'), 
        assert_single_overlap=False, 
        addtnl_groupby_cols=None, 
        gpby_dropna=False
    ), 
    df_mp_serial_number_col=df_mp_serial_number_col, 
    df_mp_prem_nb_col=df_mp_prem_nb_col, 
    df_mp_install_time_col=df_mp_install_time_col, 
    df_mp_removal_time_col=df_mp_removal_time_col, 
    df_mp_trsf_pole_nb_col=df_mp_trsf_pole_nb_col       
)
# Keep pristine copies so the following cells can be re-run from the same starting point
df_mp_curr_OG = mp_df_curr_hist_dict['mp_df_curr'].copy()
df_mp_hist_OG = mp_df_curr_hist_dict['mp_df_hist'].copy()

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! TODO
Note: above, assume_one_xfmr_per_PN and drop_approx_duplicates are both set to False.
I suspect one of these is causing the issue, so the end of build_mp_df_curr_hist_for_PNs is run by hand below.

In [None]:
mp_df_curr = df_mp_curr_OG.copy()
mp_df_hist = df_mp_hist_OG.copy()

In [None]:
mp_df_hist_trsf_filled = MeterPremise.fill_trsf_pole_nbs_in_mp_df_hist_from_curr(
    mp_df_hist=mp_df_hist, 
    mp_df_curr=mp_df_curr, 
    prem_nb_col=df_mp_prem_nb_col, 
    trsf_pole_nb_col=df_mp_trsf_pole_nb_col, 
    how_to_handle_ambiguities='first'
)

In [None]:
print(mp_df_hist_trsf_filled['trsf_pole_nb'].isna().sum())
print(mp_df_hist_trsf_filled.shape)

In [None]:
dflt_args_drop_approx_mp_duplicates = MeterPremise.get_dflt_args_drop_approx_mp_duplicates(
    df_mp_serial_number_col=df_mp_serial_number_col, 
    df_mp_prem_nb_col=df_mp_prem_nb_col, 
    df_mp_install_time_col=df_mp_install_time_col, 
    df_mp_removal_time_col=df_mp_removal_time_col, 
    df_mp_trsf_pole_nb_col=df_mp_trsf_pole_nb_col
)
drop_approx_duplicates_args = Utilities.supplement_dict_with_default_values(
    to_supplmnt_dict=None, 
    default_values_dict=dflt_args_drop_approx_mp_duplicates, 
    extend_any_lists=False, 
    inplace=True
)

In [None]:
# Time the approximate-duplicate removal on the historical meter premise data.
# time.perf_counter is used instead of time.time because it is monotonic, so the
# measured duration cannot be skewed by system clock adjustments.
start = time.perf_counter()
mp_df_hist_FINAL = MeterPremise.drop_approx_mp_duplicates(
    mp_df=mp_df_hist_trsf_filled, 
    **drop_approx_duplicates_args
)
print(time.perf_counter()-start)

In [None]:
# Time the approximate-duplicate removal on the current meter premise data.
# time.perf_counter is used instead of time.time because it is monotonic, so the
# measured duration cannot be skewed by system clock adjustments.
start = time.perf_counter()
mp_df_curr_FINAL = MeterPremise.drop_approx_mp_duplicates(
    mp_df=mp_df_curr, 
    **drop_approx_duplicates_args
)
print(time.perf_counter()-start)

In [None]:
df_outage[df_outage['OUTG_REC_NB']==12046950]

In [None]:
mp_df_curr_FINAL[mp_df_curr_FINAL['prem_nb'].isin(df_outage[df_outage['OUTG_REC_NB']==12046950]['PREMISE_NB'].unique().tolist())]

In [None]:
mp_df_hist_FINAL[mp_df_hist_FINAL['prem_nb'].isin(df_outage[df_outage['OUTG_REC_NB']==12046950]['PREMISE_NB'].unique().tolist())]

In [None]:
mp_df_hist_FINAL[mp_df_hist_FINAL['prem_nb'].isin(df_outage[df_outage['OUTG_REC_NB']==12046950]['PREMISE_NB'].unique().tolist())].sort_values(
    by=['inst_ts', 'rmvl_ts']
)

In [None]:
# outg_rec_nb_i=12046950
outg_rec_nb_i=12061321
df_i = df_outage[df_outage['OUTG_REC_NB']==outg_rec_nb_i]

# There should only be a single unique dt_on_ts and dt_off_ts_full for each outage
if(df_i[dt_on_ts_col].nunique()!=1 or 
   df_i[df_off_ts_full_col].nunique()!=1):
    print(f'outg_rec_nb_i = {outg_rec_nb_i}')
    print(f'df_i[dt_on_ts_col].nunique()       = {df_i[dt_on_ts_col].nunique()}')
    print(f'df_i[df_off_ts_full_col].nunique() = {df_i[df_off_ts_full_col].nunique()}')
    print('CRASH IMMINENT!')
    assert(0)
# Grab power out/on time and PNs from df_i
dt_on_ts_i       = df_i[dt_on_ts_col].unique()[0]
df_off_ts_full_i = df_i[df_off_ts_full_col].unique()[0]
PNs_i            = df_i[prem_nb_col].unique().tolist()

# Just as was done above for PNs, NaN values must be removed from PNs_i
#   The main purpose here is to remove instances where PNs_i = [nan]
#   NOTE: For case of slim df, the NaNs should already be removed
# After removal, if len(PNs_i)==0, continue
PNs_i = [x for x in PNs_i if pd.notna(x)]

In [None]:
import MeterPremise
reload(MeterPremise)
from MeterPremise import MeterPremise

In [None]:
hmm = MeterPremise.get_historic_SNs_for_PNs(
    PNs=PNs_i,
    df_mp_curr=None, 
    df_mp_hist=None, 
    output_index=None,
    output_groupby=None, 
    assert_all_PNs_found=False
)

In [None]:
hmm

In [None]:
# Build active_SNs_df_i and add it to active_SNs_in_outgs_dfs_dict
# NOTE: assume_one_xfmr_per_PN=True above in MeterPremise.build_mp_df_curr_hist_for_PNs,
#       so does not need to be set again (i.e., assume_one_xfmr_per_PN=False below)
active_SNs_df_i = MeterPremise.get_active_SNs_for_PNs_at_datetime_interval(
    PNs=PNs_i,
    df_mp_curr=mp_df_curr_FINAL, 
    df_mp_hist=mp_df_hist_FINAL, 
    dt_0=df_off_ts_full_i,
    dt_1=dt_on_ts_i,
    assume_one_xfmr_per_PN=False, 
    output_index=None,
    output_groupby=None, 
    assert_all_PNs_found=False
)

In [None]:
df_outage

In [None]:
reload(Utilities_df)

import MeterPremise
reload(MeterPremise)
from MeterPremise import MeterPremise

In [None]:
df_outage = df_outage_OG.copy()

In [None]:
MP_1_OG = build_active_MP_for_outages_df(
    df_outage=df_outage, 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    assert_all_PNs_found=False
)

In [None]:
MP_1_OG = build_active_MP_for_outages_df(
    df_outage=df_outage, 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    assert_all_PNs_found=False
)

In [None]:
MP_1_OG = build_active_MP_for_outages_df(
    df_outage=df_outage, 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    assert_all_PNs_found=False
)

In [None]:
reload(Utilities_df)

import MeterPremise
reload(MeterPremise)
from MeterPremise import MeterPremise

In [None]:
MP_1_TMP = build_active_MP_for_outages_df(
    df_outage=df_outage[df_outage['OUTG_REC_NB'].isin(df_outage['OUTG_REC_NB'].unique().tolist()[:100])], 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    assert_all_PNs_found=False
)

In [None]:
MP_1_TMP.nunique()

In [None]:
MP_1_TMP.shape

In [None]:
def fuckfunc(x, flag_col):
    r"""
    Return a copy of x in which column flag_col is set to False for every row.

    Written to be handed to DataFrame.groupby(...).apply.  The group is copied before it is
    modified because a function passed to groupby.apply should not mutate the object it receives.

    Parameters
    ----------
    x:        DataFrame (e.g., a single group).
    flag_col: Name of the column to set to False (created if not already present).

    NOTE(review): the name is not descriptive; rename it together with its call site.
    """
    flagged = x.copy()
    flagged[flag_col] = False
    return flagged

In [None]:
tmp_col='tmp_col'
MP_1_TMP[tmp_col]=np.nan
MP_1_TMP=MP_1_TMP.groupby('trsf_pole_nb').apply(lambda x: fuckfunc(x, flag_col=tmp_col))

In [None]:
return_df_w_dups  = MP_1_TMP[MP_1_TMP[tmp_col]].copy()

In [None]:
return_df_w_dups

In [None]:
MP_1_OG

In [None]:
MP_2_OG = build_active_MP_for_outages_df(
    df_outage=df_outage_slim, 
    prem_nb_col='PREMISE_NBS', 
    is_slim=True, 
    assert_all_PNs_found=False
)

In [None]:
MP_1_OG.head()

In [None]:
MP_2_OG.head()

In [None]:
MP_1 = MP_1_OG.copy()
MP_2 = MP_2_OG.copy()

In [None]:
MP_1 = MP_1.sort_values(by=['OUTG_REC_NB', 'prem_nb', 'mfr_devc_ser_nbr'], ignore_index=True)
MP_2 = MP_2.sort_values(by=['OUTG_REC_NB', 'prem_nb', 'mfr_devc_ser_nbr'], ignore_index=True)

In [None]:
print(MP_1.shape)
print(MP_2.shape)

In [None]:
MP_1.equals(MP_2)

In [None]:
MP_1.shape

In [None]:
df_mp_serial_number_col='mfr_devc_ser_nbr'
df_mp_prem_nb_col='prem_nb'
df_mp_install_time_col='inst_ts'
df_mp_removal_time_col='rmvl_ts'
df_mp_trsf_pole_nb_col='trsf_pole_nb'

necessary_mp_cols = [df_mp_serial_number_col, df_mp_prem_nb_col, df_mp_install_time_col, df_mp_removal_time_col]

In [None]:
MP_1.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'OUTG_REC_NB']).size().sort_values()

In [None]:
MP_1

In [None]:
reload(Utilities_df)

import MeterPremise
reload(MeterPremise)
from MeterPremise import MeterPremise

In [None]:
MP_1.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).size().sort_values()

In [None]:
MP_1 = MeterPremise.drop_approx_mp_duplicates(
    mp_df = MP_1, 
    fuzziness=pd.Timedelta('1 hour'), 
    assert_single_overlap=True, 
    addtnl_groupby_cols=['OUTG_REC_NB'], 
    gpby_dropna=False
)

In [None]:
MP_1.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).size().sort_values()

In [None]:
MP_1.shape

In [None]:
any(MP_1.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).size().sort_values()>1)

In [None]:
df_mp_all

In [None]:
MP_1

In [None]:
MP_2

In [None]:
df_outage_OG.head()

In [None]:
df_outage_1 = df_outage_OG.copy()
df_outage_1 = DOVSOutages.merge_df_outage_with_mp(
    df_outage=df_outage_1, 
    df_mp=df_mp_all, 
    merge_on_outg='PREMISE_NB', 
    merge_on_mp='prem_nb', 
    cols_to_include_mp=cols_of_interest_met_prem, 
    drop_cols = None, 
    rename_cols=None, 
    inplace=True
)

In [None]:
import DOVSOutages
reload(DOVSOutages)
from DOVSOutages import DOVSOutages

In [None]:
df_outage_2 = df_outage_OG.copy()
df_outage_2 = DOVSOutages.merge_df_outage_with_mp(
    df_outage=df_outage_2, 
    df_mp=MP_1, 
#     merge_on_outg='PREMISE_NB', 
#     merge_on_mp='prem_nb', 
    merge_on_outg=['PREMISE_NB', 'OUTG_REC_NB'], 
    merge_on_mp=['prem_nb', 'OUTG_REC_NB'], 
#     merge_on_outg='OUTG_REC_NB', 
#     merge_on_mp='OUTG_REC_NB', 
    cols_to_include_mp=None, 
    drop_cols = None, 
    rename_cols=None, 
    inplace=True
)

In [None]:
print(df_outage_1.shape)
print(df_outage_2.shape)

In [None]:
df_outage_1.head()

In [None]:
df_outage_2.head()

In [None]:
overlap_cols = set(df_outage_1.columns).intersection(set(df_outage_2.columns))
overlap_cols

In [None]:
# Restrict both frames to the columns they share.
# A list must be used as the indexer: pandas no longer accepts a set (deprecated in 1.5,
# an error from 2.0).  Sorting additionally gives a reproducible column order.
shared_cols = sorted(overlap_cols, key=str)
df_outage_1 = df_outage_1[shared_cols]
df_outage_2 = df_outage_2[shared_cols]

In [None]:
df_outage_1 = df_outage_1.sort_values(by=['OUTG_REC_NB', 'PREMISE_NB', 'mfr_devc_ser_nbr', 'prem_nb'], ignore_index=True)
df_outage_2 = df_outage_2.sort_values(by=['OUTG_REC_NB', 'PREMISE_NB', 'mfr_devc_ser_nbr', 'prem_nb'], ignore_index=True)

In [None]:
df_outage_1.equals(df_outage_2)

In [None]:
df_outage_1=df_outage_1.set_index(['OUTG_REC_NB', 'PREMISE_NB', 'mfr_devc_ser_nbr', 'prem_nb'])
df_outage_2=df_outage_2.set_index(['OUTG_REC_NB', 'PREMISE_NB', 'mfr_devc_ser_nbr', 'prem_nb'])

In [None]:
print(df_outage_1.shape)
print(df_outage_2.shape)

In [None]:
df_outage_1.index.nunique()

In [None]:
df_outage_1.index.value_counts()

In [None]:
overlap_idxs = set(df_outage_1.index).intersection(set(df_outage_2.index))

In [None]:
len(overlap_idxs)

In [None]:
# Keep only the index entries present in both frames.
# A list must be used with .loc: pandas no longer accepts a set as an indexer
# (deprecated in 1.5, an error from 2.0).  The same list is used for both frames so that
# their rows come out in the same order.
overlap_idxs_list = list(overlap_idxs)
df_outage_1=df_outage_1.loc[overlap_idxs_list]
df_outage_2=df_outage_2.loc[overlap_idxs_list]

In [None]:
df_outage_1.equals(df_outage_2)

In [None]:
print(df_outage_1.shape)
print(df_outage_2.shape)

In [None]:
df_outage_1=df_outage_1.drop((-2147483648, np.nan, np.nan, np.nan))
df_outage_2=df_outage_2.drop((-2147483648, np.nan, np.nan, np.nan))

In [None]:
print(df_outage_1.shape)
print(df_outage_2.shape)

In [None]:
df_outage_1.index.value_counts()

In [None]:
df_outage_2.index.value_counts()

In [None]:
# fucker = df_outage_2.loc[(12333707, '076914321', '776156666', '076914321')]
fucker = df_outage_2.loc[(12335129, '079279512', '882993723', '079279512')]

In [None]:
fucker

In [None]:
MP_1[(MP_1['mfr_devc_ser_nbr']=='882993723') & (MP_1['prem_nb']=='079279512') & (MP_1['OUTG_REC_NB']==12335129)]

In [None]:
fucker.iloc[0].equals(fucker.iloc[1])

In [None]:
fucker.iloc[0]==fucker.iloc[1]

In [None]:
MP_sub = build_active_MP_for_outages_df(
    df_outage=df_outage[df_outage['OUTG_REC_NB']==12335129].copy(), 
    prem_nb_col='PREMISE_NB', 
    is_slim=False, 
    assert_all_PNs_found=False
)

In [None]:
MP_sub[(MP_sub['mfr_devc_ser_nbr']=='882993723') & (MP_sub['prem_nb']=='079279512') & (MP_sub['OUTG_REC_NB']==12335129)]

In [None]:
MP_sub[(MP_sub['mfr_devc_ser_nbr']=='882993723') & (MP_sub['prem_nb']=='079279512') & (MP_sub['OUTG_REC_NB']==12335129)]

In [None]:
df_outage_OG[(df_outage_OG['OUTG_REC_NB']==12333707) & (df_outage_OG['PREMISE_NB']=='076914321')]

In [None]:
fucker2 = MP_2[(MP_2['mfr_devc_ser_nbr']=='776156666') & (MP_2['prem_nb']=='076914321') & (MP_2['OUTG_REC_NB']==12333707)]

In [None]:
fucker2

In [None]:
fucker2.iloc[0].equals(fucker2.iloc[1])

In [None]:
fucker2.iloc[0]==fucker2.iloc[1]

In [None]:
MP_2.groupby(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).groups

In [None]:
MP_2.set_index(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).index

In [None]:
MP_2.set_index(['mfr_devc_ser_nbr', 'prem_nb', 'trsf_pole_nb', 'OUTG_REC_NB']).index.value_counts()

In [None]:
diffs = Utilities_df.get_dfs_diff(df_outage_1, df_outage_2)

In [None]:
# Pick up edits to the MeterPremise module without restarting the kernel, following the
# three-step recipe noted in the first cell: import the module, reload it, re-import the class.
# After the last line the name MeterPremise refers to the class again, not the module.
import MeterPremise
reload(MeterPremise)
from MeterPremise import MeterPremise

In [None]:
# Rebuild the active meter-premise frame for the single outage 12333707, 
# this time from the slim outage frame (is_slim=True, premise numbers in 'PREMISE_NBS')
MP_2b = build_active_MP_for_outages_df(
    df_outage=df_outage_slim.loc[[12333707]], 
    prem_nb_col='PREMISE_NBS', 
    is_slim=True, 
    assert_all_PNs_found=False
)

In [None]:
MP_2b

In [None]:
# Rows of MP_2b for serial number 776156666 at premise 076914321 in outage 12333707
MP_2b[(MP_2b['mfr_devc_ser_nbr']=='776156666') & (MP_2b['prem_nb']=='076914321') & (MP_2b['OUTG_REC_NB']==12333707)]

In [None]:
# Stand-alone values for stepping through the build logic by hand in the cells below.
# NOTE(review): these look like the arguments/defaults of build_active_MP_for_outages_df -- confirm.
#-------------------------
# Single-outage slice of the slim outage frame, copied so the dev cells cannot alter df_outage_slim
dev_df_outage = df_outage_slim.loc[[12333707]].copy()
prem_nb_col = 'PREMISE_NBS'

df_mp_curr=None
df_mp_hist=None
assert_all_PNs_found=True
drop_inst_rmvl_cols=False
outg_rec_nb_col='OUTG_REC_NB'
# NOTE(review): the MP_2b cell above passes is_slim=True for this same slice of df_outage_slim -- confirm which is intended
is_slim=False
dt_on_ts_col='DT_ON_TS'
# NOTE(review): name begins with df_ although it holds the 'DT_OFF_TS_FULL' column name (likely meant dt_off_ts_full_col).
#   It is read under this exact name in the loop cell below, so any rename must change both places together.
df_off_ts_full_col='DT_OFF_TS_FULL'
consolidate_PNs_batch_size=1000
df_mp_serial_number_col='mfr_devc_ser_nbr'
df_mp_prem_nb_col='prem_nb'
df_mp_install_time_col='inst_ts'
df_mp_removal_time_col='rmvl_ts'
df_mp_trsf_pole_nb_col='trsf_pole_nb'

In [None]:
# Flatten the per-outage premise-number lists into one collection, discarding null entries
PNs = list(filter(
    pd.notna, 
    Utilities_df.consolidate_column_of_lists(
        df=dev_df_outage, 
        col=prem_nb_col, 
        sort=True,
        include_None=False,
        batch_size=consolidate_PNs_batch_size, 
        verbose=False
    )
))
#-------------------------
# Build the current and historical meter-premise frames for those premises, kept separate
# (join_curr_hist=False) and with combine_rmvl_ts_nat_entries=True
mp_df_curr_hist_dict = MeterPremise.build_mp_df_curr_hist_for_PNs(
    PNs=PNs, 
    mp_df_curr=None,
    mp_df_hist=None, 
    join_curr_hist=False, 
    addtnl_mp_df_curr_cols=None, 
    addtnl_mp_df_hist_cols=None, 
    assert_all_PNs_found=assert_all_PNs_found, 
    assume_one_xfmr_per_PN=True, 
    combine_rmvl_ts_nat_entries=True
)
df_mp_curr, df_mp_hist = mp_df_curr_hist_dict['mp_df_curr'], mp_df_curr_hist_dict['mp_df_hist']

In [None]:
mp_df_curr_hist_dict

In [None]:
# Call the builder again, this time handing it the curr/hist frames from mp_df_curr_hist_dict
# and requesting the joined result (join_curr_hist=True)
mp_df_curr_hist = MeterPremise.build_mp_df_curr_hist_for_PNs(
    PNs=PNs, 
    mp_df_curr=mp_df_curr_hist_dict['mp_df_curr'],
    mp_df_hist=mp_df_curr_hist_dict['mp_df_hist'], 
    join_curr_hist=True
)

In [None]:
# Joined curr+hist rows for serial number 776156666 at premise 076914321
mp_df_curr_hist[(mp_df_curr_hist['mfr_devc_ser_nbr']=='776156666') & (mp_df_curr_hist['prem_nb']=='076914321')]

In [None]:
# Same serial number / premise, looked up in the current-only frame
df_mp_curr[(df_mp_curr['mfr_devc_ser_nbr']=='776156666') & (df_mp_curr['prem_nb']=='076914321')]

In [None]:
# Same serial number / premise, looked up in the historical-only frame
df_mp_hist[(df_mp_hist['mfr_devc_ser_nbr']=='776156666') & (df_mp_hist['prem_nb']=='076914321')]

In [None]:
# Joined rows for the same serial number / premise, ordered by install then removal timestamp
mp_df_curr_hist[(mp_df_curr_hist['mfr_devc_ser_nbr']=='776156666') & (mp_df_curr_hist['prem_nb']=='076914321')].sort_values(by=['inst_ts', 'rmvl_ts'])

In [None]:
# Joined rows for serial number 776156666 at premise 076914321, ordered by install then removal timestamp
sn_pn_rows_sorted = mp_df_curr_hist[(mp_df_curr_hist['mfr_devc_ser_nbr']=='776156666') & (mp_df_curr_hist['prem_nb']=='076914321')].sort_values(by=['inst_ts', 'rmvl_ts'])
# Element-wise comparison of the third and fourth rows (positions 2 and 3); needs at least four rows
sn_pn_rows_sorted.iloc[2]==sn_pn_rows_sorted.iloc[3]

In [None]:
# For every outage row, collect the meters active during [DT_OFF_TS_FULL, DT_ON_TS] at that outage's premises,
# tag the result with the outage record number, and store it keyed by that number.
# NOTE: the loop variables leak into the notebook namespace; later cells display active_SNs_df_i,
#       i.e. the frame from the LAST iteration only.
# Only reason for making dict is to ensure outg_rec_nbs are not repeated 
active_SNs_in_outgs_dfs_dict = {}
for outg_rec_nb_i, row_i in dev_df_outage.iterrows():
    # NOTE: assume_one_xfmr_per_PN=True above in MeterPremise.build_mp_df_curr_hist_for_PNs,
    #       so does not need to be set again (i.e., assume_one_xfmr_per_PN=False below)
    active_SNs_df_i = MeterPremise.get_active_SNs_for_PNs_at_datetime_interval(
        PNs=row_i[prem_nb_col],
        df_mp_curr=df_mp_curr, 
        df_mp_hist=df_mp_hist, 
        # df_off_ts_full_col holds 'DT_OFF_TS_FULL' (set in the dev parameter cell above), despite the df_ prefix
        dt_0=row_i[df_off_ts_full_col],
        dt_1=row_i[dt_on_ts_col],
        assume_one_xfmr_per_PN=False, 
        output_index=None,
        output_groupby=None, 
        assert_all_PNs_found=False
    )
    active_SNs_df_i[outg_rec_nb_col] = outg_rec_nb_i
    # The index of dev_df_outage supplies the dict key, so a repeated index value is an error
    assert(outg_rec_nb_i not in active_SNs_in_outgs_dfs_dict)
    active_SNs_in_outgs_dfs_dict[outg_rec_nb_i] = active_SNs_df_i

In [None]:
active_SNs_df_i

In [None]:
# Rows of the last-built active-SN frame for serial number 776156666 at premise 076914321 in outage 12333707
active_SNs_df_i[(active_SNs_df_i['mfr_devc_ser_nbr']=='776156666') & (active_SNs_df_i['prem_nb']=='076914321') & (active_SNs_df_i['OUTG_REC_NB']==12333707)]

# DEV MeterPremise.drop_approx_mp_duplicates

In [None]:
# Flatten the per-outage premise-number lists into one collection, discarding null entries
PNs = list(filter(
    pd.notna, 
    Utilities_df.consolidate_column_of_lists(
        df=dev_df_outage, 
        col=prem_nb_col, 
        sort=True,
        include_None=False,
        batch_size=consolidate_PNs_batch_size, 
        verbose=False
    )
))
#-------------------------
# Same build as earlier in the notebook, except combine_rmvl_ts_nat_entries=False so that
# step can be applied (and inspected) separately below
mp_df_curr_hist_dict = MeterPremise.build_mp_df_curr_hist_for_PNs(
    PNs=PNs, 
    mp_df_curr=None,
    mp_df_hist=None, 
    join_curr_hist=False, 
    addtnl_mp_df_curr_cols=None, 
    addtnl_mp_df_hist_cols=None, 
    assert_all_PNs_found=assert_all_PNs_found, 
    assume_one_xfmr_per_PN=True, 
    combine_rmvl_ts_nat_entries=False
)
df_mp_curr, df_mp_hist = mp_df_curr_hist_dict['mp_df_curr'], mp_df_curr_hist_dict['mp_df_hist']

In [None]:
# Join the curr/hist frames from mp_df_curr_hist_dict, still leaving combine_rmvl_ts_nat_entries off
mp_df_curr_hist = MeterPremise.build_mp_df_curr_hist_for_PNs(
    PNs=PNs, 
    mp_df_curr=mp_df_curr_hist_dict['mp_df_curr'],
    mp_df_hist=mp_df_curr_hist_dict['mp_df_hist'], 
    join_curr_hist=True, 
    combine_rmvl_ts_nat_entries=False
)

In [None]:
mp_df_curr_hist.shape

In [None]:
# Apply the combine step on its own (it was disabled in the builds above); the surrounding
# .shape cells show the row count before and after.
# NOTE(review): the input name is overwritten, so re-running this cell applies the step again
mp_df_curr_hist = MeterPremise.combine_rmvl_ts_nat_entries(mp_df_curr_hist)

In [None]:
mp_df_curr_hist.shape

In [None]:
# Rows per (serial number, premise) pair, largest groups first
mp_df_curr_hist.groupby(['mfr_devc_ser_nbr', 'prem_nb']).size().sort_values(ascending=False)

In [None]:
mp_df_curr_hist[(mp_df_curr_hist['mfr_devc_ser_nbr']=='776156666') & (mp_df_curr_hist['prem_nb']=='076914321')]

In [None]:
mp_df_curr_hist.shape

In [None]:
# Drop approximately-duplicated meter-premise rows, using a one-hour fuzziness.
# NOTE(review): the input name is overwritten, so re-running this cell applies the step again
mp_df_curr_hist = MeterPremise.drop_approx_mp_duplicates(
    mp_df=mp_df_curr_hist, 
    fuzziness=pd.Timedelta('1 hour'), 
    gpby_dropna=False    
)

In [None]:
mp_df_curr_hist.shape

In [None]:
mp_df_curr_hist.shape

In [None]:
mp_df_curr_hist[(mp_df_curr_hist['mfr_devc_ser_nbr']=='776156666') & (mp_df_curr_hist['prem_nb']=='076914321')]

In [None]:
# NOTE(review): `v1` is not assigned in any of the surrounding cells; unless it is defined
#   earlier in the notebook this raises NameError on a fresh kernel -- confirm or remove
v1.shape

# END NEW DEV

In [None]:
dev_mp_df

In [None]:
# Every interval must start strictly before it ends; the merging loop below relies on this ordering
assert(all(dev_mp_df[ovrlp_intrvl_0_col]<dev_mp_df[ovrlp_intrvl_1_col]))

In [None]:
# Order rows by interval start, then interval end, so overlapping intervals become adjacent
dev_mp_df = dev_mp_df.sort_values(by=[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col])

In [None]:
# Seed the running interval with the first (earliest-starting) row of the sorted frame
current_beg, current_end = dev_mp_df.iloc[0][[ovrlp_intrvl_0_col, ovrlp_intrvl_1_col]]
# Each entry describes one merged interval: its bounds and the index labels of the rows it absorbed
overlaps = [
    {'min_val': current_beg, 'max_val': current_end, 'idxs': [dev_mp_df.index[0]]}
]

In [None]:
# Sweep the rows (already sorted by interval start) and fold each one into the running interval
# when it starts no later than current_end+fuzziness; otherwise open a new interval.
for i, (idx, row) in enumerate(dev_mp_df.iterrows()):
    if i > 0:
        # Row 0 was used to seed overlaps/current_beg/current_end, so it is skipped here
        beg, end = row[ovrlp_intrvl_0_col], row[ovrlp_intrvl_1_col]
        if beg > current_end+fuzziness:
            # Starts after the running interval (plus buffer) has ended ==> begin a new interval
            current_beg, current_end = beg, end
            overlaps.append({'min_val': beg, 'max_val': end, 'idxs': [idx]})
        else:
            # Overlaps the running interval: its beginning is unchanged, its end is stretched
            #   to the later of the two ends, and this row's index label is recorded
            current_end = max(current_end, end)
            overlaps[-1].update(max_val=current_end)
            overlaps[-1]['idxs'].append(idx)

In [None]:
# The cells below overwrite the interval bounds of row 0 only, so exactly one row is expected
assert len(dev_mp_df_i) == 1

In [None]:
dev_mp_df_i

In [None]:
# Positional column index of the interval-start column, needed for the .iloc assignment below.
# The helper returns a collection; exactly one match is required before unwrapping it.
ovrlp_intrvl_0_col_idx = Utilities_df.find_idxs_in_highest_order_of_columns(dev_mp_df_i, ovrlp_intrvl_0_col)
assert(len(ovrlp_intrvl_0_col_idx)==1)
ovrlp_intrvl_0_col_idx=ovrlp_intrvl_0_col_idx[0]

In [None]:
# Positional column index of the interval-end column, found the same way as for the interval-start column
ovrlp_intrvl_1_col_idx = Utilities_df.find_idxs_in_highest_order_of_columns(dev_mp_df_i, ovrlp_intrvl_1_col)
assert(len(ovrlp_intrvl_1_col_idx)==1)
ovrlp_intrvl_1_col_idx=ovrlp_intrvl_1_col_idx[0]

In [None]:
# Overwrite the single row's interval bounds with the merged min/max from overlap_dict_i
# NOTE(review): overlap_dict_i is not assigned in the surrounding cells; presumably one element of `overlaps` -- confirm
dev_mp_df_i.iloc[0, ovrlp_intrvl_0_col_idx] = overlap_dict_i['min_val']
dev_mp_df_i.iloc[0, ovrlp_intrvl_1_col_idx] = overlap_dict_i['max_val']

In [None]:
dev_mp_df_i=dev_mp_df_i.reset_index()

In [None]:
dev_mp_df_i

In [None]:
# Merge meter-premise attributes onto a fresh copy of the original outage records
# (premise number is the join key on both sides)
df_outage = DOVSOutages.merge_df_outage_with_mp(
    df_outage=df_outage_OG.copy(), 
    df_mp=df_mp_all, 
    merge_on_outg='PREMISE_NB', 
    merge_on_mp='prem_nb', 
    cols_to_include_mp=cols_of_interest_met_prem, 
    drop_cols = None, 
    rename_cols=None, 
    inplace=True
)
# Usage columns are cast to float; to_numeric_errors='coerce' is passed through to the helper
df_outage = Utilities_df.convert_col_types(
    df=df_outage, 
    cols_and_types_dict={'annual_kwh':float, 'annual_max_dmnd':float}, 
    to_numeric_errors='coerce', 
    inplace=True
)


# Exact EQUIP_TYP_NMs counted as transformers for the 'prim_strict' selection
xfmr_equip_typ_nms_of_interest = ['TRANSFORMER, OH', 'TRANSFORMER, UG']
# Meter's transformer pole is the outage location
is_prim = df_outage['LOCATION_ID']==df_outage['trsf_pole_nb']
# Equipment type causing the outage is one of the transformer types above
is_xfmr_equip = df_outage['EQUIP_TYP_NM'].isin(xfmr_equip_typ_nms_of_interest)

# 'prim' stands for primary: meters connected directly to a transformer pole causing an outage
df_outage_prim = df_outage[is_prim].copy()

# 'prim_strict' stands for primary strict: as 'prim', AND the equipment type causing the
# outage is a transformer
df_outage_prim_strict = df_outage[is_prim & is_xfmr_equip].copy()

In [None]:
necessary_mp_cols = ['mfr_devc_ser_nbr', 'prem_nb', 'inst_ts', 'rmvl_ts']

In [None]:
# Every dev_mp_df column other than the four necessary meter-premise columns, in original column order
cols_to_list = [x for x in dev_mp_df.columns if x not in necessary_mp_cols]

In [None]:
df_outage.head()

In [None]:
ovrlp_intrvl_1_col_idx

In [None]:
dev_mp_df

In [None]:
dev_mp_df_i

In [None]:
df_outage_prim_strict['mfr_devc_ser_nbr'].nunique()

In [None]:
# # THIS GIVES SAME RESULT AS MERGE ABOVE, IF I DONT WANT TO EXPLICITLY BUILD MP DFs
# df_outage_2 = df_outage_OG.copy()
# start=time.time()
# df_outage_2 = DOVSOutages.build_mp_df_and_merge_with_df_outage(
#     df_outage=df_outage_2, 
#     cols_of_interest_met_prem=cols_of_interest_met_prem, 
#     build_mp_df_args = dict(
#         premise_nb_col='PREMISE_NB', 
#         df_construct_type=DFConstructType.kRunSqlQuery, 
#         build_sql_function=MeterPremise.build_sql_meter_premise, 
#         addtnl_build_sql_function_kwargs=dict(
#             state_cds=['OH']
#         )
#     ), 
#     cols_to_include_mp=cols_of_interest_met_prem, 
#     drop_cols=None
# )
# print(f'time = {time.time()-start}')

# -----------------------------------------------------------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# -----------------------------------------------------------------------------------------------

In [None]:
# Build the consolidated ("slim") version of each outage frame: all, prim, and prim_strict
df_outage_slim             = DOVSOutages.consolidate_df_outage(df_outage)
df_outage_prim_slim        = DOVSOutages.consolidate_df_outage(df_outage_prim)
df_outage_prim_strict_slim = DOVSOutages.consolidate_df_outage(df_outage_prim_strict)

In [None]:
# Quick look at the consolidated frame: shape is printed, the column index is the cell's displayed value
print(df_outage_slim.shape)
df_outage_slim.columns

# Save CSVs if save_dfs_to_csv = True

In [None]:
# Write the three full frames, then the three slim frames, to save_dir_csvs (index is not written)
if save_dfs_to_csv:
    for csv_name, df_to_save in (
        ('df_outage.csv',                  df_outage), 
        ('df_outage_prim.csv',             df_outage_prim), 
        ('df_outage_prim_strict.csv',      df_outage_prim_strict), 
        #-----
        ('df_outage_slim.csv',             df_outage_slim), 
        ('df_outage_prim_slim.csv',        df_outage_prim_slim), 
        ('df_outage_prim_strict_slim.csv', df_outage_prim_strict_slim), 
    ):
        df_to_save.to_csv(os.path.join(save_dir_csvs, csv_name), index=False)

In [None]:
# TODO(review): placeholder cell with no code -- remove it or describe the intended step

In [None]:
# Reload the consolidated outage frame from the CSV written to save_dir_csvs.
# NOTE(review): this overwrites the in-memory df_outage_slim built above, and the file only
#   exists if save_dfs_to_csv was True (now or in a previous run).
df_outage_slim = DOVSOutages.read_df_outage_slim_from_csv(os.path.join(save_dir_csvs, 'df_outage_slim.csv'))
# Alternative input: the prim_strict subset
#df_outage_slim = DOVSOutages.read_df_outage_slim_from_csv(os.path.join(save_dir_csvs, 'df_outage_prim_strict_slim.csv'))

In [None]:
# print(df_outage_slim[(df_outage_slim['MJR_CAUSE_CD']=='DL') & (df_outage_slim['MNR_CAUSE_CD']=='OL')].shape)
# print(df_outage_slim[(df_outage_slim['MJR_CAUSE_CD']=='DL') & (df_outage_slim['MNR_CAUSE_CD']=='EQF') ].shape)
# print(df_outage_slim[
#     (df_outage_slim['MJR_CAUSE_CD']=='DL') & 
#     (df_outage_slim['MNR_CAUSE_CD']=='EQF') & 
#     (df_outage_slim['EQUIP_TYP_NM'].isin(['TRANSFORMER, OH', 'TRANSFORMER, UG']))
# ].shape)
# print(df_outage_slim[
#     ((df_outage_slim['MJR_CAUSE_CD']=='DL') & (df_outage_slim['MNR_CAUSE_CD']=='OL')) | 
#     ((df_outage_slim['MJR_CAUSE_CD']=='DL') & 
#      (df_outage_slim['MNR_CAUSE_CD']=='EQF') & 
#      (df_outage_slim['EQUIP_TYP_NM'].isin(['TRANSFORMER, OH', 'TRANSFORMER, UG'])))
# ].shape)

# df_outage_slim = df_outage_slim[
#     ((df_outage_slim['MJR_CAUSE_CD']=='DL') & (df_outage_slim['MNR_CAUSE_CD']=='OL')) | 
#     ((df_outage_slim['MJR_CAUSE_CD']=='DL') & 
#      (df_outage_slim['MNR_CAUSE_CD']=='EQF') & 
#      (df_outage_slim['EQUIP_TYP_NM'].isin(['TRANSFORMER, OH', 'TRANSFORMER, UG'])))
# ].copy()

In [None]:
print(f'df_outage_slim.shape = {df_outage_slim.shape}')

# -----------------------------------------------------------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# -----------------------------------------------------------------------------------------------

In [None]:
# Half-window handed to DOVSOutages.set_search_time_in_outage_df in the next cell
search_time_half_window = datetime.timedelta(days=31)
#-------------------------
# Standard column selections for the end-device-event and meter-premise tables
cols_of_interest_end_dev_event = TableInfos.AMIEndEvents_TI.std_columns_of_interest
cols_of_interest_met_prem      = TableInfos.MeterPremise_TI.std_columns_of_interest
#-------------------------
usg_split_to_CTEs = True
combine_kwh_delivered_and_received = True
#-------------------------
# Matching events to outages in the DataFrame is switched on exactly when the query
# is NOT split into CTEs
match_events_in_df_to_outages = not usg_split_to_CTEs
#-------------------------

In [None]:
# Apply the search window (search_time_half_window) to the consolidated outage frame
# NOTE(review): re-assigns df_outage_slim; whether re-running is harmless depends on the helper -- confirm
df_outage_slim = DOVSOutages.set_search_time_in_outage_df(
    df_outage=df_outage_slim, 
    search_time_half_window=search_time_half_window
)

In [None]:
df_outage_slim.columns

In [None]:
# Number of distinct outage record numbers, then the number of premise numbers returned by the helper
print(df_outage_slim['OUTG_REC_NB'].nunique())
print(len(DOVSOutages.get_prem_nbs_from_consolidated_df_outage(df_outage_slim)))

In [None]:
# outg_rec_nbs = df_outage['OUTG_REC_NB'].unique().tolist()
# df_outage_i = df_outage[df_outage['OUTG_REC_NB'].isin(outg_rec_nbs[:3])]

In [None]:
# df_construct_type=DFConstructType.kRunSqlQuery
# contstruct_df_args_end_events=None
# end_events_sql_function_kwargs = dict(
#     cols_of_interest=cols_of_interest_end_dev_event, 
#     df_outage=df_outage, 
#     split_to_CTEs=usg_split_to_CTEs, 
#     join_mp_args=dict(
#         join_with_CTE=True, 
#         build_mp_kwargs=dict(cols_of_interest=cols_of_interest_met_prem), 
#         join_type='LEFT'
#     ), 
# )
# addtnl_end_events_sql_function_kwargs = dict(
#     build_sql_function_kwargs=dict(opco='oh')
# )
# end_events_sql_function_kwargs = {**end_events_sql_function_kwargs, 
#                                   **addtnl_end_events_sql_function_kwargs}
# start=time.time()
# end_events = AMIEndEvents(
#     df_construct_type=df_construct_type, 
#     contstruct_df_args = contstruct_df_args_end_events, 
#     build_sql_function=AMIEndEvents_SQL.build_sql_end_events_for_outages, 
#     build_sql_function_kwargs=end_events_sql_function_kwargs, 
#     init_df_in_constructor=True
# )
# end_events_build_time = time.time()-start

In [None]:
# Where/how the end-events results are written (passed as save_args to AMIEndEvents below).
# NOTE(review): save_dir is an absolute, user-specific path with an unprofessional folder name;
#   consider deriving it from a configurable base directory and renaming the folder on disk.
save_args = {
    'save_to_file': True, 
    'save_dir': r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\EndEvents_FUCK', 
    'save_name': r'end_events.csv', 
    'index': True
}

# save_args = dict(save_to_file=True, 
#                  save_dir = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\EndEvents_prim_strict', 
#                  save_name=r'end_events.csv', 
#                  index=True)

# Batching/progress settings forwarded through end_events_sql_function_kwargs
batch_size, verbose, n_update = 10, True, 1

In [None]:
# Release the large non-slim frames before the end-events build; only df_outage_slim is used from here on.
# NOTE(review): re-running this cell raises NameError because the names no longer exist
del df_outage_OG, df_outage, df_outage_prim, df_outage_prim_strict

In [None]:
# The end-events DataFrame is constructed by running a SQL query
df_construct_type = DFConstructType.kRunSqlQuery
contstruct_df_args_end_events = None
#-------------------------
# Extra entry appended after the main arguments below
addtnl_end_events_sql_function_kwargs = {
    'build_sql_function_kwargs': {'opco': 'oh'}
}
#-------------------------
# Arguments for AMIEndEvents_SQL.build_sql_end_events_for_outages: what to select, the
# consolidated outage frame to query for, how to join meter-premise, and how to batch over df_outage
end_events_sql_function_kwargs = {
    'cols_of_interest': cols_of_interest_end_dev_event, 
    'df_outage': df_outage_slim, 
    'split_to_CTEs': usg_split_to_CTEs, 
    'join_mp_args': {
        'join_with_CTE': True, 
        'build_mp_kwargs': {'cols_of_interest': cols_of_interest_met_prem}, 
        'join_type': 'LEFT'
    }, 
    'df_args': {
        'mapping_to_ami': {'PREMISE_NBS':'premise_nbs'}, 
        'is_df_consolidated': True
    }, 
    'field_to_split': 'df_outage', 
    'field_to_split_location_in_kwargs': ['df_outage'], 
    'sort_coll_to_split': True,
    'batch_size': batch_size, 
    'verbose': verbose, 
    'n_update': n_update, 
    **addtnl_end_events_sql_function_kwargs
}

In [None]:
# Build the end-events object (the SQL is run from the constructor, init_df_in_constructor=True),
# saving results per save_args, and record the wall-clock build time in seconds
start=time.time()
end_events = AMIEndEvents(
    df_construct_type=df_construct_type, 
    contstruct_df_args = contstruct_df_args_end_events, 
    build_sql_function=AMIEndEvents_SQL.build_sql_end_events_for_outages, 
    build_sql_function_kwargs=end_events_sql_function_kwargs, 
    init_df_in_constructor=True, 
    save_args=save_args
)
end_events_build_time = time.time()-start

In [None]:
df_outage_slim

In [2]:
# Load previously pickled full and slim outage frames from the Outgs_Full folder.
# NOTE(review): absolute, user-specific paths -- consider a configurable base directory.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df1 = pd.read_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230301\20230101_20231231\Outgs_Full\df_outage.pkl')
df1_slim = pd.read_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230301\20230101_20231231\Outgs_Full\df_outage_slim.pkl')

In [10]:
# Re-consolidate only the first 10 distinct outage record numbers (in order of appearance),
# additionally grouping by transformer pole and keeping OUTG_REC_NB as a regular column
first_outg_rec_nbs = df1['OUTG_REC_NB'].unique().tolist()[:10]
df_slim2 = DOVSOutages.consolidate_df_outage(
    df1[df1['OUTG_REC_NB'].isin(first_outg_rec_nbs)], 
    addtnl_grpby_cols=['trsf_pole_nb'], 
    set_outg_rec_nb_as_index=False
)

# NOTE(review): initialized here but not populated in any visible cell
groups_violating_uniqueness = []


In [8]:
df1_slim

Unnamed: 0_level_0,OUTG_REC_NB,CI_NB,CMI_NB,OUTAGE_NB,DT_ON_TS,DT_OFF_TS,DT_OFF_TS_FULL,STEP_DRTN_NB,START_YEAR,OPERATING_UNIT_ID,...,EQUIP_TYP_NM,SHORT_NM_EQP_TYP,MJR_CAUSE_NM,MNR_CAUSE_NM,mfr_devc_ser_nbr,prem_nb,inst_ts,rmvl_ts,trsf_pole_nb,PREMISE_NBS
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13149401,13149401,6,462.0,3345801,2023-01-01 02:10:00,2023-01-01,2023-01-01 00:53:00,77.0,2023.0,3,...,NONE,NONE,DISTRIBUTION LINE,UNKNOWN (NON WEATHER),659433671,109479311,2014-01-13 12:00:20,NaT,1671753343672,"[109479311, 102679311, 103279311, 107969311]"
13149414,13149414,161,27531.0,3345831,2023-01-01 04:13:00,2023-01-01,2023-01-01 01:22:00,171.0,2023.0,3,...,CONDUCTOR OVERHEAD,OH COND,DISTRIBUTION LINE,VANDALISM,645780102,100171140,2010-12-06 12:00:20,2023-02-15 12:00:00,1879927725778,"[100171140, 100012140, 100059040, 100080140, 1..."
13149439,13149439,1,116.0,3345941,2023-01-01 06:30:00,2023-01-01,2023-01-01 04:34:00,116.0,2023.0,3,...,CONDUCTOR OVERHEAD,OH COND,DISTRIBUTION LINE,EQUIPMENT FAILURE,655793273,077483840,2012-10-08 12:00:20,NaT,41840653000079,[077483840]
13149452,13149452,1,121.0,3345971,2023-01-01 07:50:00,2023-01-01,2023-01-01 05:49:00,121.0,2023.0,3,...,CUTOUT,CUTOUT,DISTRIBUTION LINE,ANIMAL - NON BIRD,659229323,103790890,2013-12-05 12:00:20,NaT,1652961379999,[103790890]
13149458,13149458,19,836.0,3346021,2023-01-01 08:39:00,2023-01-01,2023-01-01 07:55:00,44.0,2023.0,3,...,NONE,NONE,DISTRIBUTION LINE,ANIMAL - NON BIRD,883461236,073113201,2019-09-17 12:00:20,NaT,41810698B10078,"[073113201, 070013201, 070317201, 072143201, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13285733,13285733,1,1137.0,3897371,2023-04-04 09:37:00,2023-04-03,2023-04-03 14:40:00,1137.0,2023.0,3,...,NONE,NONE,DISTRIBUTION LINE,TREE OUT OF ROW,692404312,101740132,2022-02-15 12:00:20,NaT,1877670749950,[101740132]
13285948,13285948,10,11190.0,3900221,2023-04-04 11:03:00,2023-04-03,2023-04-03 16:24:00,1119.0,2023.0,3,...,NONE,NONE,DISTRIBUTION LINE,TREE INSIDE ROW,880176801,102378573,2018-10-08 12:00:20,NaT,1909221708340,"[102378573, 100491833, 100868573, 101778573, 1..."
13286663,13286663,7,7938.0,3899991,2023-04-04 14:20:00,2023-04-03,2023-04-03 19:26:00,1134.0,2023.0,3,...,CONDUCTOR OVERHEAD,OH COND,DISTRIBUTION LINE,TREE OUT OF ROW,884699384,074931545,2019-06-17 12:00:20,NaT,41810868A30030,"[074931545, 070241545, 072431545, 073341545, 0..."
22907760,22907760,92,45540.0,3677701,2023-03-25 21:00:00,2023-03-25,2023-03-25 12:45:00,495.0,2023.0,3,...,CONDUCTOR OVERHEAD,OH COND,DISTRIBUTION LINE,TREE OUT OF ROW,190703791,100074225,2014-02-03 12:00:20,NaT,1857601475997,"[100074225, 100090491, 100114428, 100150491, 1..."


In [9]:
df_slim2

Unnamed: 0_level_0,OUTG_REC_NB,trsf_pole_nb,GIS_CRCT_NB,MJR_CAUSE_CD,OPERATING_UNIT_ID,DT_OFF_TS_FULL,CMI_NB,OPRTG_UNT_NM,MJR_CAUSE_NM,OUTAGE_NB,...,SHORT_NM_EQP_TYP,START_YEAR,SHORT_NM_CLR_DEV,STATE_ABBR_TX,CI_NB,PREMISE_NBS,mfr_devc_ser_nbr,prem_nb,inst_ts,rmvl_ts
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13171421,13171421,2100358533316,0011202,DL,3,2023-01-20 09:32:00,7946.0,AEP-Ohio,DISTRIBUTION LINE,3419191,...,INSULATOR,2023.0,RECLOSER,OH,137,"[104445576, 105198720]","[538440290, 659525774]","[104445576, 105198720]","[2018-09-19 12:00:20, 2014-04-10 12:00:20]",[]
13171421,13171421,2100626533669,0011202,DL,3,2023-01-20 09:32:00,7946.0,AEP-Ohio,DISTRIBUTION LINE,3419191,...,INSULATOR,2023.0,RECLOSER,OH,137,[104581230],[659692280],[104581230],[2014-04-10 12:00:20],[]
13171421,13171421,2100894534278,0011202,DL,3,2023-01-20 09:32:00,7946.0,AEP-Ohio,DISTRIBUTION LINE,3419191,...,INSULATOR,2023.0,RECLOSER,OH,137,[109575059],[659685882],[109575059],[2014-04-14 12:00:20],[]
13171421,13171421,2101179534635,0011202,DL,3,2023-01-20 09:32:00,7946.0,AEP-Ohio,DISTRIBUTION LINE,3419191,...,INSULATOR,2023.0,RECLOSER,OH,137,[108809753],[190692806],[108809753],[2019-09-09 12:00:20],[]
13171421,13171421,2101560534463,0011202,DL,3,2023-01-20 09:32:00,7946.0,AEP-Ohio,DISTRIBUTION LINE,3419191,...,INSULATOR,2023.0,RECLOSER,OH,137,[070514950],[662025306],[070514950],[2021-02-12 12:00:20],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13245016,13245016,1849446775962,0009801,DL,3,2023-03-16 16:05:00,3938.0,AEP-Ohio,DISTRIBUTION LINE,3640911,...,OH COND,2023.0,LINE FUSE,OH,22,[100186562],[879867725],[100186562],[2018-05-16 12:00:20],[]
13245016,13245016,1849683775954,0009801,DL,3,2023-03-16 16:05:00,3938.0,AEP-Ohio,DISTRIBUTION LINE,3640911,...,OH COND,2023.0,LINE FUSE,OH,22,"[107976562, 108190662]","[676535287, 879867724]","[108190662, 107976562]","[2018-05-16 12:00:20, 2018-06-02 12:00:20]",[]
13245017,13245017,1869979737253,0002914,DL,3,2023-03-16 17:30:00,300.0,AEP-Ohio,DISTRIBUTION LINE,3640651,...,POLE,2023.0,XFMR FUSE,OH,2,[104487138],[436405058],[104487138],[2016-05-13 12:00:20],[]
13245017,13245017,1870450737309,0002914,DL,3,2023-03-16 17:30:00,300.0,AEP-Ohio,DISTRIBUTION LINE,3640651,...,POLE,2023.0,XFMR FUSE,OH,2,"[100240092, 101830092, 103030092, 103340092, 1...","[640302265, 640292085, 761933589, 683580542, 6...","[103340092, 101830092, 104930092, 103030092, 1...","[2022-06-02 12:00:20, 2010-03-04 12:00:20, 201...",[]
