# Originally adapted from IT_DEMO_MODEL_DEV

In [1]:
from IPython.display import display, HTML

# Inject CSS so the notebook container and the cell output use the full page width
for style_tag in (
    "<style>.container { width:100% !important; }</style>",
    "<style>.output_result { max-width:100% !important; }</style>",
):
    display(HTML(style_tag))

In [2]:
%run ./model_end_events_for_outages_METHODS.ipynb




In [3]:
from importlib import reload
#reload(Utilities)
#reload(clm)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

#---------------------------------------------------------------------
import sys, os
import re
from pathlib import Path
import json
import pickle
import joblib

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version

import copy
import itertools
import adjustText

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
import matplotlib.colors as mcolors
import matplotlib.cm as cm #e.g. for cmap=cm.jet
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
from MeterPremise import MeterPremise
from EEMSP import EEMSP
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from AMIEDE_DEV import AMIEDE_DEV
from MECPODf import MECPODf
from MECPOAn import MECPOAn
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
from OutagePredictor import OutagePredictor
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_General
import Plot_Box_sns
import Plot_Hist
import Plot_Bar
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer
#---------------------------------------------------------------------

In [4]:
def plot_pred_probs_for_trsf_i(
    fig_num, 
    trsf_pole_nb_i, 
    y_prob1_by_date_df, 
    dovs_df, 
    trsf_pole_nb_col='trsf_pole_nb', 
    outg_rec_nb_col='OUTG_REC_NB', 
    dt_off_ts_full_col='DT_OFF_TS_FULL', 
    plot_maxima=False, 
    maxima_colors=('green', 'lawngreen')
):
    r"""
    Plot the predicted probabilities of a single transformer as a line and mark the
    start times of the outages recorded for that transformer with vertical red lines.

    fig_num:
        Figure number forwarded to Plot_General.default_subplots.
    trsf_pole_nb_i:
        Label of the transformer to plot.  Used to select the row of y_prob1_by_date_df and
        to filter dovs_df on trsf_pole_nb_col.
    y_prob1_by_date_df:
        pd.DataFrame indexed by transformer pole number.  The columns are expected to be
        datetime-like (they are compared against, and sliced by, the outage times).
    dovs_df:
        Must be merged with MeterPremise so that it contains trsf_pole_nb_col
    trsf_pole_nb_col, outg_rec_nb_col, dt_off_ts_full_col:
        Names of the columns in dovs_df holding the transformer pole number, the outage
        record number and the outage start time, respectively.
    plot_maxima:
        If True, draw a vertical line at the position of the maximum probability
        (color maxima_colors[0]).  If that maximum occurs after the last outage, and the
        probabilities begin before the last outage, a second line (color maxima_colors[1])
        is drawn at the maximum found up to the last outage.
    maxima_colors:
        Sequence of (at least) two colors used for the lines described in plot_maxima.

    Returns fig,ax
    """
    #-------------------------
    fig,ax = Plot_General.default_subplots(fig_num=fig_num)
    #-------------------------
    # Look up the transformer's probabilities once and reuse them below
    probs_i = y_prob1_by_date_df.loc[trsf_pole_nb_i]
    probs_i.T.plot.line(ax=ax)
    ax.axhline(0.5, color='cyan')
    #-------------------------
    dovs_df_i = dovs_df[dovs_df[trsf_pole_nb_col]==trsf_pole_nb_i][[outg_rec_nb_col, dt_off_ts_full_col]].drop_duplicates()
    # Outages without a start time cannot be drawn or compared against, so ignore them
    outg_times_i = dovs_df_i[dt_off_ts_full_col].dropna()
    for dt_off_ts_full_ij in outg_times_i:
        ax.axvline(dt_off_ts_full_ij, color='red', lw=5, alpha=0.5)
    ax.set_title(f'trsf_pole_nb = {trsf_pole_nb_i}')
    #-------------------------
    ax.set_ylim(0,1)
    #-------------------------
    # Without any outage time the min/max below are NaT (or nan), so every comparison against
    #   them must be skipped explicitly rather than relying on NaT comparing as False
    has_outages = not outg_times_i.empty
    if has_outages:
        first_outg_dt = outg_times_i.min()
        last_outg_dt  = outg_times_i.max()
        first_pred_dt = probs_i.index[0]
    #-------------------------
    if plot_maxima:
        idx_max_i = probs_i.idxmax()
        ax.axvline(idx_max_i, color=maxima_colors[0])
        if(
            has_outages and 
            idx_max_i > last_outg_dt and
            first_pred_dt < last_outg_dt #Make sure there are actually data to grab
        ):
            # Overall maximum falls after the last outage, so also mark the maximum up to that outage
            idx_max_i = probs_i.loc[:last_outg_dt].idxmax()
            ax.axvline(idx_max_i, color=maxima_colors[1])
    #-------------------------
    # Widen the x-axis to the left when an outage began before the first prediction
    if has_outages and first_outg_dt < first_pred_dt:
        ax.set_xlim(left=first_outg_dt-pd.Timedelta('1D'))
    #-------------------------
    return fig,ax

In [5]:
# Pairs of (outage record number, transformer pole number), both stored as strings.
# Element 0 of each tuple is used to query DOVS outages and element 1 is used as the
#   transformer pole number in the cells below.
outg_rec_nbs_and_trsf_pole_nbs = [
    ('13275190', '40820482D20097'),
    ('13276382', '41810769C10019'),
    ('13516272', '1887822760551'),
    ('13281137', '41810677B20017'),
    ('13276382', '41810769C30027'),
    ('13284541', '41810796A10038'),
    ('13415438', '1916206716391'),
    ('13431370', '1884744712610'),
    ('13295601', '1852023757802'),
    ('13276753', '41810724B30053'),
    ('13417871', '1893731751927'),
    ('13417871', '1895111751920'),
    ('13274063', '41810723A20123'),
    ('13275201', '40820707C40124'),
    ('13275190', '40820483B10262'),
    ('13415340', '1912458713764'),
    ('13499728', '1852582750272'),
    ('13275205', '40820731A30084'),
    ('13393559', '1834040741037'),
    ('13417871', '1893939751371'),
    ('13276382', '41810770A30092'),
    ('13276382', '41810745D20026'),
    ('13283050', '41810892000151'),
    ('13275190', '40820482D30197'),
    ('13275190', '40820482C40019'),
    ('13393559', '1838888738583'),
    ('13415438', '1914777719791'),
    ('13281046', '41810771A40026'),
    ('13415340', '1912185712890'),
    ('13526085', '1860302692180'),
    ('13275895', '40820507C30177'),
    ('13276382', '41810769C30018'),
    ('13275201', '40820707D20093'),
    ('13514788', '1911697717715'),
    ('13460850', '1848168697905'),
    ('13403703', '1834467736881'),
    ('13514788', '1914335720034'),
    ('13412376', '1893308707792'),
    ('13467591', '40830120D20010'),
    ('13389129', '1867427734381'),
    ('13403703', '1837283735919'),
    ('13276382', '41810769000350'),
    ('13314058', '41830806B20055'),
    ('13275190', '40820482D20111'),
    ('13393559', '1838734738009'),
    ('13276753', '41810724D40119'),
    ('13485854', '1868344715875'),
    ('13277285', '40820508A30379'),
    ('13415340', '1913460716874'),
    ('13514788', '1914362717373'),
    ('13278651', '40810001C30061'),
    ('13276996', '40820506B40143'),
    ('13406470', '1871656735225'),
    ('13415438', '1916295718725'),
    ('13276996', '40820506B40088'),
    ('13275190', '40820482D40089'),
    ('13431370', '1886191712330'),
    ('13276382', '41810769000088'),
    ('13275895', '40820507C30126'),
    ('13438478', '1819164739684'),
    ('13544115', '1833725729244'),
    ('13415340', '1912003713975'),
    ('13514722', '41840982B20074'),
    ('13282527', '41810700B40030'),
    ('13275201', '40820707C40185'),
    ('13275190', '40820482D10088'),
    ('13276382', '41810769C40053'),
    ('13417871', '1892806750265'),
    ('13393559', '1838220737735'),
    ('13408697', '1870422694331'),
    ('13275190', '40820506C30146'),
    ('13514788', '1913256717164'),
    ('13275190', '40820482D20119'),
    ('13400225', '1902452700640'),
    ('13499690', '1827080796078'),
    ('13499690', '1827588789150'),
    ('13402650', '1871972735001'),
    ('13279018', '39830880A20086'),
    ('13276382', '41810745D30057'),
    ('13275853', '40820685000017'),
    ('13276382', '41810746A10066'),
    ('13403703', '1830532740214'),
    ('13275201', '40820707C30132'),
    ('13275201', '40820707D10133'),
    ('13417871', '1894735750035'),
    ('13275205', '40820706D40013'),
    ('13384109', '1872593713120'),
    ('13514788', '1916894718759'),
    ('13516272', '1887338760527'),
    ('13415340', '1911710713780'),
    ('13465713', '1876383744479'),
    ('13282147', '1894307719021'),
    ('13276382', '41810769000233'),
    ('13403703', '1835580734953'),
    ('13514788', '1913682718651'),
    ('13516879', '1874886723070'),
    ('13276382', '41810770A40029'),
    ('13276753', '41810724D40113'),
    ('13275201', '40820707D10101'),
    ('13431370', '1884944712035')
]

In [6]:
# Outage record numbers are the first member of each (outage, transformer) pair
outg_rec_nbs = [outg_rec_nb for outg_rec_nb, _ in outg_rec_nbs_and_trsf_pole_nbs]

# Run the standard outage SQL query for exactly those outages (premise information excluded)
dovs_dev = DOVSOutages(
    df_construct_type         = DFConstructType.kRunSqlQuery,
    contstruct_df_args        = None,
    init_df_in_constructor    = True,
    build_sql_function        = DOVSOutages_SQL.build_sql_std_outage,
    build_sql_function_kwargs = {
        'outg_rec_nbs'    : outg_rec_nbs,
        'field_to_split'  : 'outg_rec_nbs',
        'include_premise' : False,
    },
    build_consolidated        = False,
)
# Work on a copy so the DataFrame held by dovs_dev is left untouched
dovs_df_dev = dovs_dev.df.copy()

n_coll = 100
batch_size = 1000
n_batches = 1


  df = pd.read_sql_query(sql, conn_db, **read_sql_args)


In [7]:
# dovs_df_dev

In [8]:
# The new meter_events.events_summary_vw table became live on 2023-04-01.
# Therefore, we can only really assess outages occurring on/after 2023-05-02
#   (due to the 31-day look-back needed before each outage).
outg_rec_nbs = dovs_df_dev[dovs_df_dev['DT_OFF_TS'] >= pd.to_datetime('2023-05-02')]['OUTG_REC_NB'].unique().tolist()
# Test membership against a set so the filter below does not rescan the list for every pair
outg_rec_nbs_set = set(outg_rec_nbs)
outg_rec_nbs_and_trsf_pole_nbs = [x for x in outg_rec_nbs_and_trsf_pole_nbs if x[0] in outg_rec_nbs_set]

# BEG DEL

In [9]:
# EEMSP DataFrame from the current implementation; compared against build_eemsp_df_OLD (df_2) below
df_1 = OutagePredictor.build_eemsp_df(
    trsf_pole_nbs          = [trsf_pole_nb for _, trsf_pole_nb in outg_rec_nbs_and_trsf_pole_nbs],
    date_range             = [pd.to_datetime('2023-07-01'), pd.to_datetime('2023-09-01')],
    addtnl_kwargs          = None,
    mult_strategy          = 'agg',
    include_n_eemsp        = True,
    cols_of_interest_eemsp = None,
    numeric_cols           = ['kva_size'],
    dt_cols                = ['install_dt', 'removal_dt'],
    ignore_cols            = ['serial_nb'],
    batch_size             = 10000,
    verbose                = True,
    n_update               = 10,
)

n_coll = 51
batch_size = 10000
n_batches = 1


  df = pd.read_sql_query(sql, conn_db, **read_sql_args)


0


  srs_i = df_eemsp_i.agg(agg_dict)
  srs_i = df_eemsp_i.agg(agg_dict)
  srs_i = df_eemsp_i.agg(agg_dict)
  srs_i = df_eemsp_i.agg(agg_dict)


In [10]:
# EEMSP DataFrame from the previous implementation, built with the same arguments as df_1
#   (apart from conn_aws replacing addtnl_kwargs) so the two results can be compared
df_2 = OutagePredictor.build_eemsp_df_OLD(
    trsf_pole_nbs          = [trsf_pole_nb for _, trsf_pole_nb in outg_rec_nbs_and_trsf_pole_nbs],
    date_range             = [pd.to_datetime('2023-07-01'), pd.to_datetime('2023-09-01')],
    conn_aws               = None,
    mult_strategy          = 'agg',
    include_n_eemsp        = True,
    cols_of_interest_eemsp = None,
    numeric_cols           = ['kva_size'],
    dt_cols                = ['install_dt', 'removal_dt'],
    ignore_cols            = ['serial_nb'],
    batch_size             = 10000,
    verbose                = True,
    n_update               = 10,
)

n_coll = 51
batch_size = 10000
n_batches = 1


  eemsp_df_i = pd.read_sql_query(sql_EEMSP_i, conn_aws)


0


  srs_i = df_eemsp_i.agg(agg_dict)
  srs_i = df_eemsp_i.agg(agg_dict)
  srs_i = df_eemsp_i.agg(agg_dict)
  srs_i = df_eemsp_i.agg(agg_dict)


In [11]:
df_1.shape

(50, 19)

In [12]:
df_2.shape

(50, 19)

In [13]:
df_1.equals(df_2)

True

In [14]:
set(df_1['LOCATION_NB'].unique()).symmetric_difference(set(df_2['LOCATION_NB'].unique()))

set()

In [15]:
df_1[df_1['LOCATION_NB']=='1916894718759']

Unnamed: 0,LOCATION_NB,KVA_SIZE,INSTALL_DT,REMOVAL_DT,COOLANT,EQTYPE_ID,INFO,LAST_TRANS_DESC,LATEST_STATUS,MFGR_NM,PHASE_CNT,PRIM_VOLTAGE,PROTECTION,PRU_NUMBER,SEC_VOLTAGE,SPECIAL_CHAR,TAPS,XFTYPE,n_eemsp


In [16]:
[x[1] for x in outg_rec_nbs_and_trsf_pole_nbs]

['1887822760551',
 '1916206716391',
 '1884744712610',
 '1893731751927',
 '1895111751920',
 '1912458713764',
 '1852582750272',
 '1834040741037',
 '1893939751371',
 '1838888738583',
 '1914777719791',
 '1912185712890',
 '1860302692180',
 '1911697717715',
 '1848168697905',
 '1834467736881',
 '1914335720034',
 '1893308707792',
 '40830120D20010',
 '1867427734381',
 '1837283735919',
 '1838734738009',
 '1868344715875',
 '1913460716874',
 '1914362717373',
 '1871656735225',
 '1916295718725',
 '1886191712330',
 '1819164739684',
 '1833725729244',
 '1912003713975',
 '41840982B20074',
 '1892806750265',
 '1838220737735',
 '1870422694331',
 '1913256717164',
 '1902452700640',
 '1827080796078',
 '1827588789150',
 '1871972735001',
 '1830532740214',
 '1894735750035',
 '1872593713120',
 '1916894718759',
 '1887338760527',
 '1911710713780',
 '1876383744479',
 '1835580734953',
 '1913682718651',
 '1874886723070',
 '1884944712035']

# END DEL

In [None]:
# Date handed to OutagePredictor as prediction_date
prediction_date = pd.Timestamp('2023-07-27')
# Alternative date used previously:
# prediction_date = pd.Timestamp('2023-09-01')

In [None]:
# NOTE: the predictor is bound to the name `self` so that method bodies can be pasted into
#       cells and run unchanged; later cells rely on this name.
self = OutagePredictor(prediction_date=prediction_date)
#-----
# Point the predictor at the directory of the trained model
self.set_model_dir(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231201\Models\All_EEMSP_agg_Top10_v2_NoMonth')
#-----
# Transformer pole numbers are the second member of each (outage, transformer) pair
trsf_pole_nbs = [trsf_pole_nb for _, trsf_pole_nb in outg_rec_nbs_and_trsf_pole_nbs]
self.set_trsf_pole_nbs(trsf_pole_nbs)
# Alternative: select the transformers via SQL instead of the explicit list
# self.set_trsf_pole_nbs_from_sql(
#     n_trsf_pole_nbs=10000, 
#     states='OH'
# )
# print(f"n_trsf_pole_nbs = {len(self.trsf_pole_nbs)}")
# print(f"\nSQL statement:\n{self.trsf_pole_nbs_sql}")
#-----
self.initialize_data(
    evsSum_sql_fcn    = AMIEndEvents_SQL.build_sql_end_events,
    evsSum_sql_kwargs = {'opco': 'oh'},
)

In [None]:
# Make predictions
y_pred = self.model_clf.predict(self.X_test)

In [None]:
y_scores = self.model_clf.predict_proba(self.X_test)

In [None]:
# NOTE(review): precision_recall_curve is not imported in this notebook's import cell; presumably it
#   is brought in by the %run of the METHODS notebook — confirm.
# NOTE(review): the first argument is presumably meant to be the true labels, but the model's own
#   predictions (y_pred) are passed here, so the curve compares the model against itself — confirm
#   this is intended.
precisions, recalls, thresholds = precision_recall_curve(y_pred, y_scores[:,1])

In [None]:
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

In [None]:
# Sanity check: thresholding the class-1 probabilities at 0.5 should reproduce the output of predict
threshold=0.5
y_pred_from_proba = [int(x>threshold) for x in y_scores[:, 1]]
# np.array_equal compares element-wise for any array-like y_pred and simply returns False on a
#   length mismatch, whereas all(list==y_pred) only works when NumPy broadcasts the comparison
assert np.array_equal(y_pred_from_proba, y_pred)

In [None]:
[x for x in y_scores[:, 1]]==[x[1] for x in y_scores]

In [None]:
self.rcpx_df

In [None]:
assert(0)

In [None]:
# Daily prediction dates from 2023-05-02 through 2023-09-30 (both ends included).
# A shorter range used previously was 2023-07-01 through 2023-08-01.
prediction_dates = pd.date_range(
    start=pd.Timestamp('2023-05-02'),
    end=pd.Timestamp('2023-09-30'),
    freq=pd.Timedelta('1D'),
)
prediction_dates

In [None]:
%run ./model_end_events_for_outages_METHODS.ipynb

In [None]:
# Start the predictor at the first prediction date; the date is changed per iteration later on
outg_prdctr = OutagePredictor(prediction_date=prediction_dates[0])
#-----
outg_prdctr.set_model_dir(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231201\Models\All_EEMSP_agg_Top10_v2_NoMonth')
#-----
# Transformer pole numbers are the second member of each (outage, transformer) pair
trsf_pole_nbs = [trsf_pole_nb for _, trsf_pole_nb in outg_rec_nbs_and_trsf_pole_nbs]
outg_prdctr.set_trsf_pole_nbs(trsf_pole_nbs)
# Alternative: select the transformers via SQL instead of the explicit list
# outg_prdctr.set_trsf_pole_nbs_from_sql(
#     n_trsf_pole_nbs=10000, 
#     states='OH'
# )
# print(f"n_trsf_pole_nbs = {len(outg_prdctr.trsf_pole_nbs)}")
# print(f"\nSQL statement:\n{outg_prdctr.trsf_pole_nbs_sql}")
#-----
outg_prdctr.prep_multiple_prediction_dates(
    prediction_dates  = prediction_dates,
    predictions_range = None,
    evsSum_sql_fcn    = AMIEndEvents_SQL.build_sql_end_events,
    evsSum_sql_kwargs = {'opco': 'oh'},
    save_args         = False,
)

In [None]:
# Collect, for every prediction date, a DataFrame holding the predicted class and both class
#   probabilities, indexed like outg_prdctr.rcpx_df
results_dict = {}
for prediction_date_i in prediction_dates:
    print(prediction_date_i)
    # Each date must be handled exactly once
    assert prediction_date_i not in results_dict
    outg_prdctr.change_prediction_date(prediction_date=prediction_date_i)
    y_pred_i   = outg_prdctr.model_clf.predict(outg_prdctr.X_test)
    y_scores_i = outg_prdctr.model_clf.predict_proba(outg_prdctr.X_test)
    #-----
    # One prediction and one row of probabilities are expected per row of rcpx_df
    assert outg_prdctr.rcpx_df.shape[0] == len(y_pred_i)
    assert outg_prdctr.rcpx_df.shape[0] == len(y_scores_i)
    #-----
    results_dict[prediction_date_i] = res_i = pd.DataFrame(
        index=outg_prdctr.rcpx_df.index,
        data={
            'y_pred'   : y_pred_i,
            'y_prob_0' : [probs_ij[0] for probs_ij in y_scores_i],
            'y_prob_1' : [probs_ij[1] for probs_ij in y_scores_i],
        },
    )

In [None]:
results_dict.keys()

In [None]:
df_i = results_dict[pd.to_datetime('2023-07-01 00:00:00')].copy()
df_i

In [None]:
df_i = Utilities_df.prepend_level_to_MultiIndex(
    df=df_i, 
    level_val=pd.to_datetime('2023-07-01 00:00:00'), 
    axis=1
)

In [None]:
return_df = pd.DataFrame()

In [None]:
# Combine the per-date results into one wide DataFrame whose top column level is the prediction date
for i, (prediction_date_i, df_i) in enumerate(results_dict.items()):
    # Tag the columns of this date's results with the date itself
    df_i = Utilities_df.prepend_level_to_MultiIndex(df=df_i, level_val=prediction_date_i, axis=1)
    if i > 0:
        # Outer merge on the index keeps rows that are missing for some of the dates
        return_df = pd.merge(return_df, df_i, left_index=True, right_index=True, how='outer')
        continue
    # The first date seeds the result
    return_df = df_i.copy()

In [None]:
return_df

In [None]:
y_pred_by_date_df  = return_df.xs('y_pred', level=1, axis=1)
y_prob1_by_date_df = return_df.xs('y_prob_1', level=1, axis=1)

In [None]:
fig,ax = Plot_General.default_subplots()
y_prob1_by_date_df.T.plot.line(ax=ax)

In [None]:
fig,ax = Plot_General.default_subplots()
y_prob1_by_date_df.iloc[:10].T.plot.line(ax=ax)

In [None]:
df_mp_install_time_col = 'inst_ts'
df_mp_removal_time_col = 'rmvl_ts'
dt_0 = prediction_dates[0]
dt_1 = prediction_dates[-1]

In [None]:
# Meter premise information for the transformers in self.rcpx_df
mp_df = MeterPremise.build_mp_df_curr_hist_for_xfmrs(
    trsf_pole_nbs               = self.rcpx_df.index.tolist(),
    join_curr_hist              = True,
    addtnl_mp_df_curr_cols      = None,
    addtnl_mp_df_hist_cols      = None,
    assume_one_xfmr_per_PN      = True,
    drop_approx_duplicates      = True,
    drop_approx_duplicates_args = None,
    df_mp_serial_number_col     = 'mfr_devc_ser_nbr',
    df_mp_prem_nb_col           = 'prem_nb',
    df_mp_install_time_col      = 'inst_ts',
    df_mp_removal_time_col      = 'rmvl_ts',
    df_mp_trsf_pole_nb_col      = 'trsf_pole_nb',
)

# Only want meters active at the relevant time period:
#   installed on/before dt_0, and removed after dt_1 (a missing removal time counts as never removed)
installed_by_start = mp_df[df_mp_install_time_col] <= pd.to_datetime(dt_0)
removed_after_end  = mp_df[df_mp_removal_time_col].fillna(pd.Timestamp.max) > pd.to_datetime(dt_1)
mp_df = mp_df[installed_by_start & removed_after_end]

In [None]:
mp_df[mp_df['trsf_pole_nb']=='1817472740442']

In [None]:
# Build dovs_df: outages within [dt_0, dt_1] for the premises found in mp_df
dovs = DOVSOutages(
    df_construct_type         = DFConstructType.kRunSqlQuery,
    contstruct_df_args        = None,
    init_df_in_constructor    = True,
    build_sql_function        = DOVSOutages_SQL.build_sql_std_outage,
    build_sql_function_kwargs = {
        'premise_nbs'     : mp_df['prem_nb'].unique().tolist(),
        'date_range'      : [dt_0, dt_1],
        'field_to_split'  : 'premise_nbs',
        'include_premise' : True,
    },
    build_consolidated        = False,
)
# Work on a copy so the DataFrame held by dovs is left untouched
dovs_df = dovs.df.copy()

In [None]:
# Make sure every outage of interest is present in dovs_df, querying the missing ones directly
outg_rec_nbs = [outg_rec_nb for outg_rec_nb, _ in outg_rec_nbs_and_trsf_pole_nbs]
missing_outg_rec_nbs = list(set(outg_rec_nbs) - set(dovs_df['OUTG_REC_NB'].unique().tolist()))
if missing_outg_rec_nbs:
    addtnl_dovs = DOVSOutages(
        df_construct_type         = DFConstructType.kRunSqlQuery,
        contstruct_df_args        = None,
        init_df_in_constructor    = True,
        build_sql_function        = DOVSOutages_SQL.build_sql_std_outage,
        build_sql_function_kwargs = {
            'outg_rec_nbs'    : missing_outg_rec_nbs,
            'field_to_split'  : 'outg_rec_nbs',
            'include_premise' : False,
        },
        build_consolidated        = False,
    )
    addtnl_dovs_df = addtnl_dovs.df.copy()
    #-----
    dovs_df = pd.concat([dovs_df, addtnl_dovs_df])
# After the top-up no outage of interest may be missing
assert set(outg_rec_nbs) <= set(dovs_df['OUTG_REC_NB'].unique().tolist())

In [None]:
# Attach the transformer pole number to each outage row via the premise number
prem_to_trsf_df = mp_df[['prem_nb', 'trsf_pole_nb']].drop_duplicates()
dovs_df = dovs_df.merge(
    prem_to_trsf_df,
    how='left',
    left_on='PREMISE_NB',
    right_on='prem_nb',
)
dovs_df

In [None]:
# Set predictions column in self.rcpx_df
assert(self.rcpx_df.shape[0]==y_pred.shape[0])
self.rcpx_df['y_pred'] = y_pred

In [None]:
# Split the meter premise and outage data by predicted class (y_pred==1 and y_pred==0)
is_pred1 = self.rcpx_df['y_pred']==1
mp_df_pred1   = mp_df[mp_df['trsf_pole_nb'].isin(self.rcpx_df[is_pred1].index.tolist())].copy()
dovs_df_pred1 = dovs_df[dovs_df['PREMISE_NB'].isin(mp_df_pred1['prem_nb'].unique().tolist())]
#-----
is_pred0 = self.rcpx_df['y_pred']==0
mp_df_pred0   = mp_df[mp_df['trsf_pole_nb'].isin(self.rcpx_df[is_pred0].index.tolist())].copy()
dovs_df_pred0 = dovs_df[dovs_df['PREMISE_NB'].isin(mp_df_pred0['prem_nb'].unique().tolist())]

In [None]:
# Plot the probabilities for a single transformer (by default the first one in the index)
fig_num = 0

trsf_pole_nb_i = y_prob1_by_date_df.index[0]
# trsf_pole_nb_i = '1916206716391'

fig,ax = plot_pred_probs_for_trsf_i(
    fig_num=fig_num,
    trsf_pole_nb_i=trsf_pole_nb_i,
    y_prob1_by_date_df=y_prob1_by_date_df,
    dovs_df=dovs_df,
    trsf_pole_nb_col='trsf_pole_nb',
    outg_rec_nb_col='OUTG_REC_NB',
    dt_off_ts_full_col='DT_OFF_TS_FULL',
    plot_maxima=True,
)

fig_num = fig_num + 1

In [None]:
# Draw one figure per transformer found in the index of y_prob1_by_date_df.
# fig_num is advanced after each figure so every transformer gets its own figure number.
fig_num=0
for trsf_pole_nb_i in y_prob1_by_date_df.index.unique().tolist():
    fig,ax = plot_pred_probs_for_trsf_i(
        fig_num            = fig_num, 
        trsf_pole_nb_i     = trsf_pole_nb_i, 
        y_prob1_by_date_df = y_prob1_by_date_df, 
        dovs_df            = dovs_df, 
        trsf_pole_nb_col   = 'trsf_pole_nb', 
        outg_rec_nb_col    = 'OUTG_REC_NB', 
        dt_off_ts_full_col = 'DT_OFF_TS_FULL', 
        plot_maxima=True
    )
    #-----
    fig_num += 1

In [None]:
outg_prdctr.model_dir

In [None]:
# Model artifacts live under: C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231201\Models\

In [None]:
# All three pickled DataFrames are stored in the same Models directory
time_infos_dir = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231201\Models'
time_infos_df_outg          = pd.read_pickle(os.path.join(time_infos_dir, 'time_infos_df_outg.pkl'))
no_outg_time_infos_df       = pd.read_pickle(os.path.join(time_infos_dir, 'no_outg_time_infos_df.pkl'))
no_outg_time_infos_prstn_df = pd.read_pickle(os.path.join(time_infos_dir, 'no_outg_time_infos_prstn_df.pkl'))

In [None]:
time_infos_df_outg

In [None]:
no_outg_time_infos_df

In [None]:
no_outg_time_infos_prstn_df

In [None]:
#-------------------------
# Preconditions checked before the events summary is built.
# 1A. Make sure the transformer pole numbers have been set (whether done explicitly with
#      set_trsf_pole_nbs or via sql with set_trsf_pole_nbs_from_sql)
#      i.e., self.trsf_pole_nbs must be neither None nor empty
assert(self.trsf_pole_nbs is not None and len(self.trsf_pole_nbs)>0)

# 1B. Make sure the model_dir has been set (and, through set_model_dir, all needed components
#       have been extracted)
#      i.e., self.model_dir must be set and must exist on disk
assert(self.model_dir is not None and os.path.exists(self.model_dir))

In [None]:
#-------------------------
# 2. Build events summary (self.evsSum)
#    The arguments are not saved (save_args=False)
self.build_events_summary(
    evsSum_sql_fcn         = AMIEndEvents_SQL.build_sql_end_events,
    evsSum_sql_kwargs      = {'opco': 'oh'},
    init_df_in_constructor = True,
    save_args              = False,
)

In [None]:
# Copy the predictor's attributes into plain variables so that the body of a method can be
#   run cell-by-cell with the same names as its parameters
evsSum_df = self.evsSum_df
data_structure_df = self.data_structure_df
# NOTE(review): idk_name_1/idk_name_2 look like placeholder attribute names; td_min is taken
#   from idk_name_2 and td_max from idk_name_1 — confirm this ordering is intended
td_min=self.idk_name_2
td_max=self.idk_name_1
cr_trans_dict = self.cr_trans_dict
trsf_pole_nbs = self.trsf_pole_nbs
eemsp_enc = self.eemsp_enc
scaler = self.scaler
eemsp_mult_strategy = self.eemsp_mult_strategy
merge_eemsp = self.merge_eemsp
include_month = self.include_month
date_range = self.date_range

# Values matching the keyword defaults of build_rcpx_from_evsSum_df defined further below
freq='5D'
group_cols=['trsf_pole_nb']
date_col = 'aep_event_dt'
normalize_by_SNs=True
normalize_by_time=True

include_power_down_minus_up=False
regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*']
combine_cpo_df_reasons=True
include_n_eemsp=True

In [None]:
conn_aws = Utilities.get_athena_prod_aws_connection()

# BEG DEL

In [None]:
# Use the prediction date currently held by the predictor
prediction_date = self.prediction_date

# Column names, matching the keyword defaults of build_rcpx_from_evsSum_df defined below
xf_meter_cnt_col = 'xf_meter_cnt'
events_tot_col = 'events_tot'
trsf_pole_nb_col = 'trsf_pole_nb'
other_reasons_col = 'Other Reasons'
total_counts_col = 'total_counts'
nSNs_col         = 'nSNs'

In [None]:
    def build_rcpx_from_evsSum_df(
        evsSum_df, 
        data_structure_df, 
        prediction_date, 
        td_min, 
        td_max, 
        cr_trans_dict, 
        freq='5D', 
        group_cols=['trsf_pole_nb'], 
        date_col='aep_event_dt', 
        normalize_by_SNs=True, 
        normalize_by_time=True, 
        include_power_down_minus_up=False, 
        regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*'], 
        combine_cpo_df_reasons=True, 
        xf_meter_cnt_col = 'xf_meter_cnt', 
        events_tot_col = 'events_tot', 
        trsf_pole_nb_col = 'trsf_pole_nb', 
        other_reasons_col = 'Other Reasons',  # From data_structure_df
        total_counts_col = 'total_counts', 
        nSNs_col         = 'nSNs', 
    ):
        r"""
        This function only permits uniform time periods; i.e., all periods will have length equal to freq.
        If one wants to use variable spacing (e.g., maybe the first group is one day in width, the second is
          three days, all others equal to five days), a new function will need to be built.
          
        NOTE: td_min, td_max, and freq must all be in DAYS
        """
        #--------------------------------------------------
        # 0. Need data_structure_df
        #     In general, not all curated reasons will be included in the model.
        #     Typically, 10 common curated reasons will be included, and all others will be grouped together in "Other Reasons".
        #     Furthermore, some reasons may be combined together, others may be completely removed.
        #     For these reasons, it is beneficial to have some sample data (taken from when the model was created) to utilize 
        #       in structuring the new data in the same fashion.
        #     Additionally, the data will be used to ensure the ordering of columns is correct before the data are fed into 
        #       the model.
        assert(isinstance(data_structure_df, pd.DataFrame) and data_structure_df.shape[0]>0)

        #--------------------------------------------------
        #--------------------------------------------------
        # 1. Build rcpx_0
        #     Construct rcpx_0 by aggregating evsSum_df by group_cols and by freq on date_col
        #--------------------------------------------------
        evsSum_df = evsSum_df.copy()
        #-------------------------
        if not isinstance(group_cols, list):
            group_cols = [group_cols]
        assert(len(set(group_cols).difference(set(evsSum_df.columns.tolist())))==0)
        #-------------------------
        # Need to set origin in pd.Grouper to ensure proper grouping
        freq = pd.Timedelta(freq)
        assert((td_max-td_min) % freq==pd.Timedelta(0))
        #-----
        time_grps = pd.date_range(
            start = prediction_date-td_max, 
            end   = prediction_date-td_min, 
            freq  = freq
        )
        time_grps = [(time_grps[i], time_grps[i+1]) for i in range(len(time_grps)-1)]
        assert(len(time_grps) == (td_max-td_min)/pd.Timedelta(freq))
        #-------------------------
        group_freq=pd.Grouper(freq=freq, key=date_col, origin=time_grps[0][0])
        #-------------------------
        cr_cols = Utilities.find_in_list_with_regex(
            lst=evsSum_df.columns.tolist(), 
            regex_pattern=r'cr\d*', 
            ignore_case=False
        )
        #-----
        cols_to_drop = set(evsSum_df.columns.tolist()).difference(
            set(cr_cols+group_cols+[date_col, xf_meter_cnt_col, events_tot_col])
        )
        cols_to_drop = list(cols_to_drop)
        #-------------------------
        # Make sure date_col is datetime object
        evsSum_df[date_col] = pd.to_datetime(evsSum_df[date_col])
        
        #-------------------------
        # No need in wasting time grouping data we won't use
        # So, reduce evsSum_df to only the dates we're interested in 
        evsSum_df = evsSum_df[
            (evsSum_df[date_col] >= prediction_date-td_max) & 
            (evsSum_df[date_col] <= prediction_date-td_min)
        ]


        # All of the cr# columns will be aggregated with np.sum, as will events_tot_col
        # xf_meter_cnt_col will be aggregated using np.max
        agg_dict = {col:np.sum for col in cr_cols+[events_tot_col, xf_meter_cnt_col]}
        agg_dict[xf_meter_cnt_col] = np.max
        #-------------------------
        rcpx_0 = evsSum_df.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

        #--------------------------------------------------
        # 2. Grab meter_cnt_per_gp_srs and all_groups
        #--------------------------------------------------
        # Project out the meter count per group, as it will be used later
        #   This information will be stored in the pd.Series object meter_cnt_per_gp_srs, where the index will
        #   contain the group_cols
        meter_cnt_per_gp_srs = rcpx_0.reset_index()[group_cols+[xf_meter_cnt_col]].drop_duplicates().set_index(group_cols).squeeze()
        assert(meter_cnt_per_gp_srs.shape[0]==meter_cnt_per_gp_srs.index.nunique())
        meter_cnt_per_gp_srs.name = nSNs_col

        # Will also need the unique groups in rcpx_0
        #   This will be used later (see no_events_pd_i below)
        #   These can be grabbed from the index of rcpx_0 (excluding the date_col level)
        all_groups = rcpx_0.droplevel(date_col, axis=0).index.unique().tolist()

        #--------------------------------------------------
        # 3. Transform rcpx_0 to the form expected by the model
        #     i.e., similar to data_structure_df.
        #     This is essentially just changing rcpx_0 from long form to wide form
        #--------------------------------------------------
        #-------------------------
        # 3a. Build time_pds_rename
        #      Need to convert the time periods, which are currently housed in the date_col index of 
        #        rcpx_0 from their specific dates to the names expected by the model.
        #      In rcpx_0, after grouping by the freq intervals, the values of date_col are equal to the beginning
        #        dates of the given interval.
        #      These will be converted to the titles contained in final_time_pds below
        #      NOTE: This is probably not 100% necessary, but is useful nonetheless
        #-------------------------
        curr_time_pds = [x[0] for x in time_grps]
        time_pds_rename = OutagePredictor.get_time_pds_rename(
            curr_time_pds=curr_time_pds, 
            td_min=td_min, 
            td_max=td_max, 
            freq=freq
        )
        final_time_pds = list(time_pds_rename.values())
        # final_time_pds should all be found in data_structure_df to help
        #   ensure the alignment between the current data and data used when modelling
        assert(set(final_time_pds).difference(data_structure_df.columns.get_level_values(0).unique())==set())
        #-------------------------
        # Overkill here (since all time windows are of length freq), but something similar will 
        #   be needed if I want to move to non-uniform period lengths
        time_grps_dict = dict()
        assert(len(curr_time_pds) == len(time_grps))
        # Each element in curr_time_pds should match exactly one of the 0th elements 
        #   in time_grps (which is a list of length-2 tuples)
        # Make sure this is so while building time_grps_dict
        for curr_time_pd_i in curr_time_pds:
            time_grp_i = [x for x in time_grps if x[0]==curr_time_pd_i]
            assert(len(time_grp_i)==1)
            assert(curr_time_pd_i not in time_grps_dict.keys())
            time_grps_dict[curr_time_pd_i] = time_grp_i[0]

        #-------------------------
        # 3b. Transform rcpx_0 to the form expected by the model
        #      As stated above, this is essentially just changing rcpx_0 from long form to wide form
        #      This will probably be formalized further in the future (i.e., function(s) developed to handle)
        rename_cols = {
            events_tot_col:total_counts_col, 
            xf_meter_cnt_col:nSNs_col
        }
        rcpx_0=rcpx_0.rename(columns=rename_cols)
        #-------------------------
        total_counts_col = total_counts_col
        nSNs_col         = nSNs_col
        non_reason_cols = [nSNs_col, total_counts_col]
        #------------------------- 
        pd_dfs = []
        for date_pd_i in curr_time_pds:
            # Grab the proper time period name from final_time_pd_i
            final_time_pd_i = time_pds_rename[date_pd_i]
            #-------------------------
            rcpx_0_pd_i = OutagePredictor.project_time_pd_from_rcpx_0_and_prepare(
                rcpx_0                      = rcpx_0, 
                date_pd_i                   = date_pd_i, 
                final_time_pd_i             = final_time_pd_i, 
                data_structure_df           = data_structure_df, 
                meter_cnt_per_gp_srs        = meter_cnt_per_gp_srs, 
                all_groups                  = all_groups, 
                cr_trans_dict               = cr_trans_dict, 
                non_reason_cols             = non_reason_cols, 
                other_reasons_col           = other_reasons_col, 
                group_cols                  = group_cols, 
                date_col                    = date_col, 
                regex_patterns_to_remove    = regex_patterns_to_remove, 
                combine_cpo_df_reasons      = combine_cpo_df_reasons, 
                include_power_down_minus_up = include_power_down_minus_up, 
                total_counts_col            = total_counts_col, 
                nSNs_col                    = nSNs_col
            )
            #-------------------------
            # Overkill here (since all time windows are of length freq), but something similar will 
            #   be needed if I want to move to non-uniform period lengths
            # One could, e.g., simply divide by length of freq in days
            if normalize_by_time:
                time_grp_i = time_grps_dict[date_pd_i]
                #-----
                days_min_outg_td_window_i = prediction_date - time_grp_i[1]
                days_max_outg_td_window_i = prediction_date - time_grp_i[0]
                #-----
                OutagePredictor.assert_timedelta_is_days(days_min_outg_td_window_i)
                OutagePredictor.assert_timedelta_is_days(days_max_outg_td_window_i)
                #-----
                days_min_outg_td_window_i = days_min_outg_td_window_i.days
                days_max_outg_td_window_i = days_max_outg_td_window_i.days
                #-------------------------
                rcpx_0_pd_i = MECPODf.normalize_rcpo_df_by_time_interval(
                    rcpo_df                 = rcpx_0_pd_i, 
                    days_min_outg_td_window = days_min_outg_td_window_i, 
                    days_max_outg_td_window = days_max_outg_td_window_i, 
                    cols_to_adjust          = None, 
                    SNs_tags                = None, 
                    inplace                 = True
                )
            #-------------------------
            pd_dfs.append(rcpx_0_pd_i)

        # Make sure all dfs in pd_dfs look correct
        shape_0 = pd_dfs[0].shape
        index_0 = pd_dfs[0].index
        for i in range(len(pd_dfs)):
            if i==0:
                continue
            assert(pd_dfs[i].shape==shape_0)
            assert(len(set(index_0).symmetric_difference(set(pd_dfs[i].index)))==0)
            #-----
            # Aligning the indices is not strictly necessary, as pd.concat should handle that
            # But, it's best to be safe
            pd_dfs[i] = pd_dfs[i].loc[index_0]

        # Build rcpx_final by combining all dfs in pd_dfs
        rcpx_final = pd.concat(pd_dfs, axis=1)

        # Include back in the number of SNs per group (from meter_cnt_per_gp_srs)
        rcpx_final=rcpx_final.merge(
            meter_cnt_per_gp_srs.to_frame(name=(nSNs_col, nSNs_col)), 
            left_index=True, 
            right_index=True, 
            how='left'
        )
        # Sanity check on the merge
        assert(rcpx_final[nSNs_col].notna().all().all())

        #--------------------------------------------------
        # 4. Normalize by nSNs
        #--------------------------------------------------
        if normalize_by_SNs:
            # Kind of silly, but below I cannot simply use 'rcpx_final[final_time_pds] = ...'
            #   This will result in: "ValueError: Columns must be same length as key", because final_time_pds
            #   has only, e.g., 6 elements but rcpx_final[final_time_pds] contains, e.g., 72 columns
            # Instead, must use 'rcpx_final[rcpx_final[final_time_pds].columns] = ..'
            rcpx_final[rcpx_final[final_time_pds].columns] = rcpx_final[final_time_pds].divide(rcpx_final[(nSNs_col, nSNs_col)], axis=0)

        #--------------------------------------------------
        return rcpx_final

In [None]:
# Notebook-scope replay of step 1 of the rcpx build: aggregate evsSum_df per group and per freq window.
# Relies on names already present in the notebook namespace: evsSum_df, data_structure_df, group_cols,
#   date_col, freq, td_min, td_max, prediction_date, xf_meter_cnt_col and events_tot_col.
# NOTE: evsSum_df is rebound here (copied, then filtered), so later cells see the reduced frame.
assert(isinstance(data_structure_df, pd.DataFrame) and data_structure_df.shape[0]>0)

#--------------------------------------------------
#--------------------------------------------------
# 1. Build rcpx_0
#     Construct rcpx_0 by aggregating evsSum_df by group_cols and by freq on date_col
#--------------------------------------------------
evsSum_df = evsSum_df.copy()
#-------------------------
# Accept a single column name as well as a list of names
if not isinstance(group_cols, list):
    group_cols = [group_cols]
assert(len(set(group_cols).difference(set(evsSum_df.columns.tolist())))==0)
#-------------------------
# Need to set origin in pd.Grouper to ensure proper grouping
freq = pd.Timedelta(freq)
# The full window must split into a whole number of freq-length periods
assert((td_max-td_min) % freq==pd.Timedelta(0))
#-----
# Period edges, then consecutive (start, end) pairs
time_grps = pd.date_range(
    start = prediction_date-td_max, 
    end   = prediction_date-td_min, 
    freq  = freq
)
time_grps = [(time_grps[i], time_grps[i+1]) for i in range(len(time_grps)-1)]
assert(len(time_grps) == (td_max-td_min)/pd.Timedelta(freq))
#-----
group_freq=pd.Grouper(freq=freq, key=date_col, origin=time_grps[0][0])
#-------------------------
cr_cols = Utilities.find_in_list_with_regex(
    lst=evsSum_df.columns.tolist(), 
    regex_pattern=r'cr\d*', 
    ignore_case=False
)
#-----
# Everything that is not a cr# column, a grouping column, the date or one of the two count columns is dropped
cols_to_drop = set(evsSum_df.columns.tolist()).difference(
    set(cr_cols+group_cols+[date_col, xf_meter_cnt_col, events_tot_col])
)
cols_to_drop = list(cols_to_drop)
#-------------------------
# Make sure date_col is datetime object
evsSum_df[date_col] = pd.to_datetime(evsSum_df[date_col])

#-------------------------
# No need in wasting time grouping data we won't use
# So, reduce evsSum_df to only the dates we're interested in 
# NOTE(review): the upper bound is inclusive, so a row dated exactly prediction_date-td_min is kept and
#   presumably lands in an extra bin starting at that date, which is not one of time_grps -- confirm intended
evsSum_df = evsSum_df[
    (evsSum_df[date_col] >= prediction_date-td_max) & 
    (evsSum_df[date_col] <= prediction_date-td_min)
]


# All of the cr# columns will be summed, as will events_tot_col
# xf_meter_cnt_col will be aggregated using the maximum
# Aggregations are given by name ('sum'/'max') rather than as np.sum/np.max: pandas deprecated passing
#   those numpy callables to groupby.agg, and the string names keep the same behavior
agg_dict = {col:'sum' for col in cr_cols+[events_tot_col, xf_meter_cnt_col]}
agg_dict[xf_meter_cnt_col] = 'max'
#-------------------------
rcpx_0 = evsSum_df.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

#--------------------------------------------------
# 2. Grab meter_cnt_per_gp_srs and all_groups
#--------------------------------------------------
# Project out the meter count per group, as it will be used later
#   This information will be stored in the pd.Series object meter_cnt_per_gp_srs, where the index will
#   contain the group_cols
# The count column is selected explicitly instead of calling .squeeze(): with a single group, squeeze
#   collapses the one-row/one-column frame to a scalar and the uniqueness check below would break
meter_cnt_per_gp_srs = (
    rcpx_0.reset_index()[group_cols+[xf_meter_cnt_col]]
    .drop_duplicates()
    .set_index(group_cols)[xf_meter_cnt_col]
)
# Each group must map to exactly one meter count (fails if the count differs between time periods)
assert(meter_cnt_per_gp_srs.shape[0]==meter_cnt_per_gp_srs.index.nunique())
meter_cnt_per_gp_srs.name = nSNs_col

# Will also need the unique groups in rcpx_0
#   This will be used later (see no_events_pd_i below)
#   These can be grabbed from the index of rcpx_0 (excluding the date_col level)
all_groups = rcpx_0.droplevel(date_col, axis=0).index.unique().tolist()

#--------------------------------------------------
# 3. Reshape rcpx_0 into the layout the model expects (i.e., like data_structure_df),
#     which amounts to pivoting it from long form to wide form
#--------------------------------------------------
#-------------------------
# 3a. Build time_pds_rename
#      The date_col index level of rcpx_0 holds the start date of each freq-length window.
#      The model instead expects named time periods, so build a mapping from each window
#        start date to its name; the names are collected in final_time_pds.
#      NOTE: Probably not strictly required, but convenient
#-------------------------
curr_time_pds = [time_grp[0] for time_grp in time_grps]
time_pds_rename = OutagePredictor.get_time_pds_rename(
    curr_time_pds = curr_time_pds, 
    td_min        = td_min, 
    td_max        = td_max, 
    freq          = freq
)
final_time_pds = [*time_pds_rename.values()]
# Every period name must already exist among the level-0 columns of data_structure_df,
#   which guards the alignment between the current data and the data used when modelling
expected_time_pds = data_structure_df.columns.get_level_values(0).unique()
assert set(final_time_pds).issubset(expected_time_pds)

In [None]:
# Map each period start date to its (start, end) tuple from time_grps
assert len(time_grps) == len(curr_time_pds)
time_grps_dict = {}
for curr_time_pd_i in curr_time_pds:
    # Gather every window that begins at this start date; exactly one is expected
    time_grp_i = list(filter(lambda grp: grp[0]==curr_time_pd_i, time_grps))
    assert len(time_grp_i)==1
    # A repeated start date would silently overwrite an entry, so forbid it
    assert curr_time_pd_i not in time_grps_dict
    time_grps_dict[curr_time_pd_i] = time_grp_i[0]

In [None]:
time_grps_dict

In [None]:
date_pd_i =  curr_time_pds[0]

In [None]:
# Express the window for date_pd_i as whole-day offsets back from prediction_date
time_grp_i = time_grps_dict[date_pd_i]
#-----
# The window end is closer to prediction_date (min offset); the window start is farther (max offset)
td_to_pd_end_i = prediction_date - time_grp_i[1]
td_to_pd_beg_i = prediction_date - time_grp_i[0]
#-----
OutagePredictor.assert_timedelta_is_days(td_to_pd_end_i)
OutagePredictor.assert_timedelta_is_days(td_to_pd_beg_i)
#-----
days_min_outg_td_window_i = td_to_pd_end_i.days
days_max_outg_td_window_i = td_to_pd_beg_i.days

In [None]:
#-------------------------
# 3. Build reason counts per x df (self.rcpx_df)
# Column-name arguments are collected separately from the run configuration
rcpx_col_names = dict(
    xf_meter_cnt_col  = 'xf_meter_cnt', 
    events_tot_col    = 'events_tot', 
    trsf_pole_nb_col  = 'trsf_pole_nb', 
    other_reasons_col = 'Other Reasons',  # From data_structure_df
    total_counts_col  = 'total_counts', 
    nSNs_col          = 'nSNs'
)
rcpx_df_OG = OutagePredictor.build_rcpx_from_evsSum_df(
    evsSum_df                   = evsSum_df, 
    data_structure_df           = data_structure_df, 
    prediction_date             = prediction_date, 
    td_min                      = self.idk_name_2, 
    td_max                      = self.idk_name_1, 
    cr_trans_dict               = cr_trans_dict, 
    freq                        = freq, 
    group_cols                  = group_cols, 
    date_col                    = date_col, 
    normalize_by_SNs            = normalize_by_SNs, 
    normalize_by_time           = normalize_by_time, 
    include_power_down_minus_up = include_power_down_minus_up, 
    regex_patterns_to_remove    = regex_patterns_to_remove, 
    combine_cpo_df_reasons      = combine_cpo_df_reasons, 
    **rcpx_col_names
)

In [None]:
self.idk_name_2

In [None]:
self.idk_name_1

In [None]:
prediction_date

In [None]:
rcpx_df_OG.loc['40820650C10054']

In [None]:
evsSum_df[evsSum_df['trsf_pole_nb']=='40820650C10054'].sort_values(by=['aep_event_dt'])

In [None]:
        # Indented copy of step 1 of the rcpx build: aggregate evsSum_df per group and per freq window.
        # Relies on names already present in the notebook namespace: evsSum_df, data_structure_df, group_cols,
        #   date_col, freq, td_min, td_max, prediction_date, xf_meter_cnt_col and events_tot_col.
        assert(isinstance(data_structure_df, pd.DataFrame) and data_structure_df.shape[0]>0)

        #--------------------------------------------------
        #--------------------------------------------------
        # 1. Build rcpx_0
        #     Construct rcpx_0 by aggregating evsSum_df by group_cols and by freq on date_col
        #--------------------------------------------------
        evsSum_df = evsSum_df.copy()
        #-------------------------
        # Accept a single column name as well as a list of names
        if not isinstance(group_cols, list):
            group_cols = [group_cols]
        assert(len(set(group_cols).difference(set(evsSum_df.columns.tolist())))==0)
        #-------------------------
        # Need to set origin in pd.Grouper to ensure proper grouping
        freq = pd.Timedelta(freq)
        # The full window must split into a whole number of freq-length periods
        assert((td_max-td_min) % freq==pd.Timedelta(0))
        #-----
        # Period edges, then consecutive (start, end) pairs
        time_grps = pd.date_range(
            start = prediction_date-td_max, 
            end   = prediction_date-td_min, 
            freq  = freq
        )
        time_grps = [(time_grps[i], time_grps[i+1]) for i in range(len(time_grps)-1)]
        assert(len(time_grps) == (td_max-td_min)/pd.Timedelta(freq))
        #-----
        group_freq=pd.Grouper(freq=freq, key=date_col, origin=time_grps[0][0])
        #-------------------------
        cr_cols = Utilities.find_in_list_with_regex(
            lst=evsSum_df.columns.tolist(), 
            regex_pattern=r'cr\d*', 
            ignore_case=False
        )
        #-----
        # Everything that is not a cr# column, a grouping column, the date or one of the two count columns is dropped
        cols_to_drop = set(evsSum_df.columns.tolist()).difference(
            set(cr_cols+group_cols+[date_col, xf_meter_cnt_col, events_tot_col])
        )
        cols_to_drop = list(cols_to_drop)
        #-------------------------
        # Make sure date_col is datetime object
        evsSum_df[date_col] = pd.to_datetime(evsSum_df[date_col])
        
        #-------------------------
        # No need in wasting time grouping data we won't use
        # So, reduce evsSum_df to only the dates we're interested in 
        evsSum_df = evsSum_df[
            (evsSum_df[date_col] >= prediction_date-td_max) & 
            (evsSum_df[date_col] <= prediction_date-td_min)
        ]


        # All of the cr# columns will be summed, as will events_tot_col
        # xf_meter_cnt_col will be aggregated using the maximum
        # Aggregations are given by name ('sum'/'max') rather than as np.sum/np.max: pandas deprecated passing
        #   those numpy callables to groupby.agg, and the string names keep the same behavior
        agg_dict = {col:'sum' for col in cr_cols+[events_tot_col, xf_meter_cnt_col]}
        agg_dict[xf_meter_cnt_col] = 'max'
        #-------------------------
        rcpx_0 = evsSum_df.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

In [None]:
rcpx_0.loc['40820650C10054']

In [None]:
evsSum_df[evsSum_df['trsf_pole_nb']=='40820650C10054'].sort_values(by=['aep_event_dt'])

# END DEL

In [None]:
#-------------------------
# 3. Build reason counts per x df (self.rcpx_df)
# prediction_date is passed explicitly: the method body computes every time window relative to it,
#   and the equivalent call earlier in this notebook supplies it as well
rcpx_df_OG = OutagePredictor.build_rcpx_from_evsSum_df(
    evsSum_df=evsSum_df, 
    data_structure_df=data_structure_df, 
    prediction_date=prediction_date, 
    td_min=self.idk_name_2, 
    td_max=self.idk_name_1, 
    cr_trans_dict=cr_trans_dict, 
    freq=freq, 
    group_cols=group_cols, 
    date_col=date_col, 
    normalize_by_SNs=normalize_by_SNs, 
    normalize_by_time=normalize_by_time, 
    include_power_down_minus_up=include_power_down_minus_up, 
    regex_patterns_to_remove=regex_patterns_to_remove, 
    combine_cpo_df_reasons=combine_cpo_df_reasons, 
    xf_meter_cnt_col = 'xf_meter_cnt', 
    events_tot_col = 'events_tot', 
    trsf_pole_nb_col = 'trsf_pole_nb', 
    other_reasons_col = 'Other Reasons',  # From data_structure_df
    total_counts_col = 'total_counts', 
    nSNs_col         = 'nSNs'
)

In [None]:
rcpx_df = rcpx_df_OG.copy()

In [None]:
#-------------------------
# 4. Optionally build the EEMSP frame and merge it onto rcpx_df
if merge_eemsp:
    # Arguments for the EEMSP build/merge, gathered up front for readability
    eemsp_build_kwargs = dict(
        rcpx_df                = rcpx_df, 
        trsf_pole_nbs          = trsf_pole_nbs, 
        date_range             = date_range, 
        merge_on_rcpx          = ['index_0'], 
        merge_on_eems          = ['LOCATION_NB'], 
        conn_aws               = None, 
        mult_strategy          = eemsp_mult_strategy, 
        include_n_eemsp        = include_n_eemsp, 
        cols_of_interest_eemsp = None, 
        numeric_cols           = ['kva_size'], 
        dt_cols                = ['install_dt', 'removal_dt'], 
        ignore_cols            = ['serial_nb'], 
        batch_size             = 10000, 
        verbose                = True, 
        n_update               = 10, 
    )
    rcpx_df, eemsp_df = OutagePredictor.build_eemsp_df_and_merge_rcpx(**eemsp_build_kwargs)
    #-----
    # Convert the EEMSP install date column relative to prediction_date
    #   (assert_col_found=False, so a missing column is not asserted on)
    rcpx_df = OutagePredictor.convert_install_dt_to_years(
        rcpx_df          = rcpx_df, 
        prediction_date  = prediction_date, 
        install_dt_col   = ('EEMSP_0', 'INSTALL_DT'), 
        assert_col_found = False
    )

In [None]:
#-------------------------
# 5. Include month?
if include_month:
    rcpx_df = OutagePredictor.add_predict_month_to_rcpx_df(
        rcpx_df=rcpx_df, 
        prediction_date=prediction_date, 
        month_col=('dummy_lvl_0', 'outg_month'), 
        dummy_col_levels_prefix='dummy'
    )

In [None]:
#-------------------------
# 6. Make sure final form of rcpx_df agrees with data_structure_df
rcpx_df = OutagePredictor.assert_rcpx_has_correct_form(
    rcpx_df=rcpx_df, 
    data_structure_df=data_structure_df
)

In [None]:
#-------------------------
# 7. Build X_test
X_test_no1 = OutagePredictor.build_X_test(
    rcpx_df=rcpx_df, 
    data_structure_df=data_structure_df, 
    eemsp_args=dict(eemsp_enc=eemsp_enc), 
    scaler=scaler
)

In [None]:
rcpx_df_no1 = rcpx_df.copy()

In [None]:
(X_test_no1==self.X_test).all()

In [None]:
rcpx_df_no1.equals(self.rcpx_df)

In [None]:
X_test = X_test_no1.copy()

In [None]:
# Make predictions
y_pred = self.model_clf.predict(X_test)

In [None]:
print(f"# Outages Predicted: {y_pred.sum()}")
print(f"# Predictions:       {y_pred.shape[0]}")
print(f"%:                   {100*y_pred.sum()/y_pred.shape[0]}")

In [None]:
rcpx_final = rcpx_df.copy()

In [None]:
# Set predictions column in rcpx_final
assert(rcpx_final.shape[0]==y_pred.shape[0])
rcpx_final['y_pred'] = y_pred

In [None]:
rcpx_final[rcpx_final['y_pred']==1]

In [None]:
df_mp_install_time_col = 'inst_ts'
df_mp_removal_time_col = 'rmvl_ts'
dt_0 = prediction_date
dt_1 = prediction_date

In [None]:
# Build the meter-premise frame for every transformer in rcpx_final.
# The install/removal column names are passed via df_mp_install_time_col/df_mp_removal_time_col
#   (defined in the preceding cell) so the build call and the filter below cannot drift apart.
mp_df = MeterPremise.build_mp_df_curr_hist_for_xfmrs(
    trsf_pole_nbs=rcpx_final.index.tolist(), 
    join_curr_hist=True, 
    addtnl_mp_df_curr_cols=None, 
    addtnl_mp_df_hist_cols=None, 
    assume_one_xfmr_per_PN=True, 
    drop_approx_duplicates=True, 
    drop_approx_duplicates_args=None, 
    df_mp_serial_number_col='mfr_devc_ser_nbr', 
    df_mp_prem_nb_col='prem_nb', 
    df_mp_install_time_col=df_mp_install_time_col, 
    df_mp_removal_time_col=df_mp_removal_time_col, 
    df_mp_trsf_pole_nb_col='trsf_pole_nb'
)

# Only want meters active at the relevant time period
#   i.e., installed on or before dt_0 and removed after dt_1
#   A missing removal time is filled with pd.Timestamp.max, so such meters always pass the removal test
mp_df = mp_df[(mp_df[df_mp_install_time_col]<=pd.to_datetime(dt_0)) & 
              (mp_df[df_mp_removal_time_col].fillna(pd.Timestamp.max)>pd.to_datetime(dt_1))]

In [None]:
# Build dovs_df
# Build dovs_df
# Constructs a DOVSOutages object for the premises found in mp_df, over a window of
#   31 days on either side of prediction_date, then takes a copy of its df attribute.
# NOTE(review): 'contstruct_df_args' is spelled as the constructor keyword is written here;
#   presumably that is the parameter's actual name -- confirm before "fixing" the spelling
dovs = DOVSOutages(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True,
    build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
    build_sql_function_kwargs=dict(
        premise_nbs=mp_df['prem_nb'].unique().tolist(), 
        date_range=[
            prediction_date-pd.Timedelta('31D'), 
            prediction_date+pd.Timedelta('31D')
        ], 
        field_to_split='premise_nbs', 
        include_premise=True
    ), 
    build_consolidated=False
)
# Work on a copy so later edits do not touch the frame held by the dovs object
dovs_df = dovs.df.copy()

In [None]:
dovs_df = pd.merge(
    dovs_df, 
    mp_df[['prem_nb', 'trsf_pole_nb']].drop_duplicates(), 
    left_on='PREMISE_NB', 
    right_on='prem_nb', 
    how='left'
)
dovs_df

In [None]:
# Split the meter-premise and DOVS data by predicted class (y_pred==1 vs y_pred==0)
# rcpx_final is indexed by transformer, so its index supplies the trsf_pole_nbs of each class
xfmrs_pred1 = rcpx_final[rcpx_final['y_pred']==1].index.tolist()
xfmrs_pred0 = rcpx_final[rcpx_final['y_pred']==0].index.tolist()
#-----
mp_df_pred1    = mp_df[mp_df['trsf_pole_nb'].isin(xfmrs_pred1)].copy()
prem_nbs_pred1 = mp_df_pred1['prem_nb'].unique().tolist()
dovs_df_pred1  = dovs_df[dovs_df['PREMISE_NB'].isin(prem_nbs_pred1)]
#-----
mp_df_pred0    = mp_df[mp_df['trsf_pole_nb'].isin(xfmrs_pred0)].copy()
prem_nbs_pred0 = mp_df_pred0['prem_nb'].unique().tolist()
dovs_df_pred0  = dovs_df[dovs_df['PREMISE_NB'].isin(prem_nbs_pred0)]

In [None]:
dovs_df_pred1['DT_OFF_TS_FULL'].nunique()

In [None]:
dovs_df_pred0['DT_OFF_TS_FULL'].nunique()

In [None]:
dovs_df_pred1['DT_OFF_TS_FULL']

In [None]:
natsorted(dovs_df_pred1['DT_OFF_TS_FULL'].unique())

In [None]:
prediction_date

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.plot.scatter(ax=ax, x='DT_OFF_TS_FULL', y='CI_NB')

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.plot.scatter(ax=ax, x='DT_OFF_TS_FULL', y='CMI_NB')

In [None]:
dovs_df_pred1

In [None]:
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL')).count()

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CMI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CMI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
dovs_df['OUTG_REC_NB'].nunique()

In [None]:
dovs_df['trsf_pole_nb'].nunique()

In [None]:
rcpx_final[rcpx_final['y_pred']==1]

In [None]:
set(dovs_df['trsf_pole_nb'].unique()).intersection(set(rcpx_final[rcpx_final['y_pred']==1].index))

In [None]:
natsorted(rcpx_final[rcpx_final['y_pred']==1].index)

In [None]:
natsorted(dovs_df['trsf_pole_nb'].unique())

In [None]:
set(dovs_df['trsf_pole_nb'].unique()).intersection(set(rcpx_final.index))

In [None]:
dovs_df[dovs_df['DT_OFF_TS']=='2023-06-01']

In [None]:
full_data_df_test_i = pd.read_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231201\Models\All_EEMSP_agg_Top10_v2\full_data_df_test_i.pkl')

In [None]:
full_data_df_test_i.loc[
    (full_data_df_test_i.index.get_level_values(1)=='1870612751127') & 
    (full_data_df_test_i.index.get_level_values(0).isin(['13382076', '13382693']))
]

In [None]:
rcpx_df[rcpx_df.index=='1870612751127']

In [None]:
rcpx_df[rcpx_df.index=='1870612751127']

In [None]:
wtf = full_data_df_test_i.loc[
    (full_data_df_test_i.index.get_level_values(1)=='1870612751127') & 
    (full_data_df_test_i.index.get_level_values(0).isin(['13382076', '13382693']))
].copy()

In [None]:
wtf.iloc[0]!=wtf.iloc[1]

In [None]:
wtf.diff()

In [None]:
hmm = wtf.iloc[0]!=wtf.iloc[1]

In [None]:
hmm[hmm].index.tolist()

In [None]:
wtf[('EEMSP_0', 'INSTALL_DT')]

In [None]:
wtf[hmm[hmm].index.tolist()]

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ----------------------------------------------------------------------------------------------------

# SET model_dir TO YOUR LOCAL VALUE!!!!!
This directory should house the following files:
- forest_clf.joblib
- scaler.joblib
- eemsp_encoder.joblib
- data_structure_df.pkl

In [None]:
model_dir = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230615\Models\All_EEMSP_agg_Top10_v2'

# Randomly chosen trsf_pole_nbs
I randomly chose the trsf_pole_nbs below from a dataset I was working with.
</br>The purpose is simply to create a smaller, more manageable, dataset to work with for this demo (as opposed to, e.g., taking all Ohio data)

In [None]:
conn_aws = Utilities.get_athena_prod_aws_connection()

In [None]:
# trsf_pole_nbs_df = MeterPremise.get_distinct_trsf_pole_nbs(
#     conn_aws=conn_aws, 
#     states='OH'
# )
# trsf_pole_nbs = trsf_pole_nbs_df.sample(n=10000)['trsf_pole_nb'].tolist()

In [None]:
trsf_pole_nbs

# Prediction Date
This corresponds essentially to the day on which the model will be run/data evaluated.
</br>Data will be collected for a period spanning 31 days before the prediction date up to 1 day before.
</br>Eventually, the data will be grouped into the 5-day periods: '01-06 Days', '06-11 Days', '11-16 Days', '16-21 Days', '21-26 Days', '26-31 Days'

In [None]:
prediction_date = pd.to_datetime('2023-06-01')
date_range = [
    prediction_date-pd.Timedelta('31D'), 
    prediction_date-pd.Timedelta('1D')
]

In [None]:
len(trsf_pole_nbs)

# Grab the data from meter_events.events_summary_vw

In [None]:
# Pull end events from meter_events.events_summary_vw for the chosen transformers and date_range
conn_aws = Utilities.get_athena_prod_aws_connection()
#-----
# Arguments supplied as build_sql_function_kwargs below
#   (presumably forwarded to AMIEndEvents_SQL.build_sql_end_events -- TODO confirm)
end_events_sql_function_kwargs=dict(
    schema_name='meter_events', 
    table_name='events_summary_vw', 
    cols_of_interest=['*'], 
    date_range=date_range, 
    trsf_pole_nbs=trsf_pole_nbs, 
    opco='oh'
)
#-----
# NOTE(review): 'contstruct_df_args' is spelled as the constructor keyword is written here;
#   presumably that is the parameter's actual name -- confirm before "fixing" the spelling
end_events = AMIEndEvents(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args = dict(conn_db=conn_aws), 
    build_sql_function=AMIEndEvents_SQL.build_sql_end_events, 
    build_sql_function_kwargs=end_events_sql_function_kwargs, 
    init_df_in_constructor=True, 
    save_args=False
)
# Work on a copy so later edits do not touch the frame held by end_events
ede_df = end_events.df.copy()

In [None]:
ede_df.head()

In [None]:
# ede_df = self.evsSum_df.copy()

# Also need meter_events.event_summ_regex_setup
to convert the column names in rcpx from cr# to curated reason

In [None]:
# cr_trans_dict = curated reasons translation dictionary
#   keys:   pivot_id values
#   values: the matching regex_report_title
sql = """
SELECT * FROM meter_events.event_summ_regex_setup
"""
# Everything is read as str (dtype=str)
regex_setup_df = pd.read_sql(sql, conn_aws, dtype=str)
cr_trans_dict = dict(zip(regex_setup_df['pivot_id'], regex_setup_df['regex_report_title']))

In [None]:
cr_trans_dict

# Build rcpx_0
Construct rcpx_0 by aggregating ede_df by trsf_pole_nb and by 5-day frequency

In [None]:
freq='5D'
group_cols=['trsf_pole_nb']
# Anchor the 5-day bins at the start of the requested window (date_range[0]).
# Without an explicit origin the bins start at the first day observed in the data, so the
#   bin edges would shift whenever no event happens to fall on the first day of the window.
group_freq=pd.Grouper(freq=freq, key='aep_event_dt', origin=date_range[0])
#-------------------------
# Convert aep_event_dt to datetime object
ede_df['aep_event_dt'] = pd.to_datetime(ede_df['aep_event_dt'])

# Will no longer need the following columns
cols_to_drop = ['serialnumber', 'aep_premise_nb', 'aep_opco']

# Every remaining column is summed, except xf_meter_cnt, for which the maximum is taken.
# Aggregations are given by name ('sum'/'max') rather than as np.sum/np.max: pandas deprecated passing
#   those numpy callables to groupby.agg, and the string names keep the same behavior
agg_dict = {col:'sum' for col in ede_df.drop(columns=cols_to_drop+['trsf_pole_nb', 'aep_event_dt']).columns.tolist()}
agg_dict['xf_meter_cnt'] = 'max'
#-------------------------
# Result is indexed by (trsf_pole_nb, start date of the 5-day bin)
rcpx_0 = ede_df.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

In [None]:
rcpx_0.head(10)

# Project out xf_meter_cnt, as it will be used later

In [None]:
# One meter count per transformer, as a Series indexed by trsf_pole_nb.
# The column is selected explicitly instead of calling .squeeze(): with a single transformer, squeeze
#   collapses the one-row/one-column frame to a scalar and the check below would break
xf_meter_cnt_srs = (
    rcpx_0.droplevel(1, axis=0)['xf_meter_cnt']
    .reset_index()
    .drop_duplicates()
    .set_index('trsf_pole_nb')['xf_meter_cnt']
)
# Each transformer must map to exactly one meter count (fails if the count differs between periods)
assert(xf_meter_cnt_srs.shape[0]==xf_meter_cnt_srs.index.nunique())
all_trsf_pole_nbs = rcpx_0.index.get_level_values(0).unique().tolist()
xf_meter_cnt_srs.name='nSNs'

In [None]:
xf_meter_cnt_srs.head()

In [None]:
# Load data_structure_df
# Not every curated reason is used by the model:
#   - typically the 10 most common curated reasons are kept and the rest are pooled into "Other Reasons"
#   - some reasons may be combined with one another, and others removed entirely
# Sample data saved when the model was created are therefore used as a template for
#   structuring new data in the same fashion.
# The same template is also used to verify column ordering before the data are fed into the model.
data_structure_path = Path(model_dir) / 'data_structure_df.pkl'
data_structure_df = pd.read_pickle(data_structure_path)
data_structure_df.head()

# Transform rcpx_0 to the form expected by the model
i.e., similar to data_structure_df.
</br>This is essentially just changing rcpx_0 from long form to wide form

In [None]:
# Build time_pds_rename
#-----
# We will need to convert the time periods, which are currently housed in the 'aep_event_dt' index of 
#   rcpx_0 from their specific dates to the names expected by the model.
# In rcpx_0, after grouping by the 5-day intervals, the values of 'aep_event_dt' are equal to the beginning
#   dates of the given interval.
# These will be converted to the titles contained in final_time_pds below
# NOTE: This is probably not 100% necessary, but is useful nonetheless
#-------------------------
curr_time_pds = natsorted(rcpx_0.index.get_level_values(1).unique())
# There should be 6 time periods, each of width 5 days
pd_width = pd.Timedelta('5D')
for pd_prev_i, pd_next_i in zip(curr_time_pds, curr_time_pds[1:]):
    assert(pd_next_i-pd_prev_i==pd_width)
#-----
final_time_pds = [
    '01-06 Days',
    '06-11 Days',
    '11-16 Days',
    '16-21 Days',
    '21-26 Days',
    '26-31 Days',
]
#-----
# Each name is derived from how far its window lies before prediction_date, so the window closest
#   to prediction_date becomes '01-06 Days' and the earliest becomes '26-31 Days'.
# Zipping the ascending dates with the ascending names would attach '01-06 Days' to the EARLIEST window,
#   i.e., the reverse of the day offsets the names describe.
time_pds_rename = dict()
for pd_start_i in curr_time_pds:
    td_to_start_i = prediction_date - pd_start_i
    # Windows must sit a whole number of days before prediction_date
    assert(td_to_start_i % pd.Timedelta('1D') == pd.Timedelta(0))
    days_max_i = td_to_start_i.days
    days_min_i = (td_to_start_i - pd_width).days
    final_time_pd_i = f'{days_min_i:02d}-{days_max_i:02d} Days'
    # Any window that does not correspond to one of the expected names signals misaligned bins
    assert(final_time_pd_i in final_time_pds)
    time_pds_rename[pd_start_i] = final_time_pd_i
#-------------------------

In [None]:
# As stated above, this is essentially just changing rcpo_0 from long form to wide form
# This will probably be formalized further in the future (i.e., function(s) developed to handle)
rename_cols = {
    'events_tot':'total_counts', 
    'xf_meter_cnt':'nSNs'
}

total_counts_col = 'total_counts'
nSNs_col         = 'nSNs'
non_reason_cols = [nSNs_col, total_counts_col]

include_power_down_minus_up=False
#-------------------------
rcpx_0=rcpx_0.rename(columns=rename_cols)
#-------------------------
pd_dfs = []
for date_pd_i in curr_time_pds:
    # Grab the proper time period name from final_time_pd_i
    final_time_pd_i = time_pds_rename[date_pd_i]
    #-----
    # Get the expected columns for this time period from data_structure_df
    final_reason_cols_i = data_structure_df[final_time_pd_i].columns.tolist()
    final_reason_cols_i = [x for x in final_reason_cols_i if x not in non_reason_cols+['Other Reasons']]
    #-------------------------
    # Project out the current time period (date_pd_i) from rcpx_0 by selecting the appropriate
    #   values from the 'aep_event_dt' index (i.e., index level 1)
    rcpx_0_pd_i = rcpx_0[rcpx_0.index.get_level_values(1)==date_pd_i].copy()
    rcpx_0_pd_i = rcpx_0_pd_i.droplevel(1, axis=0)
    #-------------------------
    # Make sure all trsf_pole_nbs have an entry in rcpx_0_pd_i:
    #   If a trsf_pole_nb didn't register any events in a given time period, it will not be included in the projection.
    #   However, the final format requires each transformer have entries for each time period
    #   Therefore, we identify the trsf_pole_nbs missing from rcpx_0_pd_i (no_events_pd_i) and add approriate rows
    #     containing all 0 values for the counts
    no_events_pd_i = list(set(all_trsf_pole_nbs).difference(set(rcpx_0_pd_i.index.get_level_values(0).unique())))
    no_events_pd_i_df = pd.DataFrame(
        columns=rcpx_0.columns, 
        index=no_events_pd_i, 
        data=np.zeros((len(no_events_pd_i), rcpx_0.shape[1]))
    )
    #-----
    # Use xf_meter_cnt_srs to fill the 'nSNs' column in no_events_pd_i_df
    # NOTE: This is probably not strictly necessary, as the 'nSNs' column won't be used here,
    #         since the data are not normalized.
    no_events_pd_i_df = no_events_pd_i_df.drop(columns=['nSNs']).merge(
        xf_meter_cnt_srs, 
        left_index=True, 
        right_index=True, 
        how='left'
    )
    # Sanity check on the merge
    assert(no_events_pd_i_df['nSNs'].notna().all())
    #-----
    # Combine rcpx_0_pd_i and no_events_pd_i_df
    assert(len(set(rcpx_0_pd_i.columns).symmetric_difference(set(no_events_pd_i_df.columns)))==0)
    no_events_pd_i_df = no_events_pd_i_df[rcpx_0_pd_i.columns]
    rcpx_0_pd_i = pd.concat([rcpx_0_pd_i, no_events_pd_i_df])
    #-------------------------
    # Rename the cr# columns to their full curated reasons
    rcpx_0_pd_i=rcpx_0_pd_i.rename(columns=cr_trans_dict)
    #--------------------------------------------------
    #--------------------------------------------------
    # Any columns without a curated reason (i.e., those with column name = ''), have not been observed
    #   yet in the data, and therefore the sume of the counts should be 0.
    # These empty columns are not needed, so drop
    assert(rcpx_0_pd_i[''].sum().sum()==0)
    rcpx_0_pd_i=rcpx_0_pd_i.drop(columns=[''])
    #-------------------------
    # Any curated reasons containing 'cleared' or 'Test Mode' or not included in the analysis, so remove
    rcpx_0_pd_i = MECPODf.remove_reasons_from_rcpo_df(
        rcpo_df=rcpx_0_pd_i, 
        regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*'], 
        ignore_case=True
    )
    #-----
    # After irrelevant cleared and test columns removed, need to recalculate events_tot to accurately
    #   reflect the total number of relevant events
    assert(total_counts_col in non_reason_cols)
    rcpx_0_pd_i[total_counts_col] = rcpx_0_pd_i.drop(columns=non_reason_cols).sum(axis=1)
    #-------------------------
    # Combine similar reasons (e.g., all 'Tamper' type reasons are combined into 1)
    # See MECPODf.combine_cpo_df_reasons for more information
    rcpx_0_pd_i = MECPODf.combine_cpo_df_reasons(rcpo_df=rcpx_0_pd_i)
    #-------------------------
    # Include the difference in power-up and power-down, if desired (typically turned off) 
    if include_power_down_minus_up:
        rcpx_0_pd_i = MECPODf.delta_cpo_df_reasons(
            rcpo_df=rcpx_0_pd_i, 
            reasons_1='Primary Power Down',
            reasons_2='Primary Power Up',
            delta_reason_name='Power Down Minus Up'
        )
    #-------------------------
    # Make sure rcpx_0_pd_i contains the expected final reason columns.
    # Once this is assured, project out these reasons and combine all other reasons into
    #   the 'Other Reasons' columns
    # See MECPODf.get_reasons_subset_from_cpo_df for more info
    assert(len(set(final_reason_cols_i).difference(set(rcpx_0_pd_i.columns.tolist())))==0)
    rcpx_0_pd_i = MECPODf.get_reasons_subset_from_cpo_df(
        cpo_df=rcpx_0_pd_i, 
        reasons_to_include=final_reason_cols_i, 
        combine_others=True, 
        output_combine_others_col='Other Reasons', 
        SNs_tags=None, 
        is_norm=False, 
        counts_col='nSNs', 
        normalize_by_nSNs_included=False, 
        level_0_raw_col = 'counts', 
        level_0_nrm_col = 'counts_norm', 
        cols_to_ignore = ['total_counts'], 
        include_counts_col_in_output=True
    )    
    #--------------------------------------------------
    #--------------------------------------------------
    # Don't want nSNs in each pd individually
    rcpx_0_pd_i = rcpx_0_pd_i.drop(columns=[nSNs_col])
    #-------------------------
    # Add the correct time period name as level 0 of the columns
    rcpx_0_pd_i = Utilities_df.prepend_level_to_MultiIndex(
        df=rcpx_0_pd_i, 
        level_val=final_time_pd_i, 
        level_name=None, 
        axis=1
    )
    #-------------------------
    pd_dfs.append(rcpx_0_pd_i)
    
# Sanity-check the per-period frames against the first one before combining them
shape_0 = pd_dfs[0].shape
index_0 = pd_dfs[0].index
for i, pd_df_i in enumerate(pd_dfs):
    if i>0:
        # Every frame must have the same dimensions and cover the same set of index values
        assert(pd_df_i.shape==shape_0)
        assert(set(index_0)==set(pd_df_i.index))
        #-----
        # pd.concat should align on the index by itself, but explicitly ordering each
        #   frame like the first one removes any doubt
        pd_dfs[i] = pd_df_i.loc[index_0]

# Place the per-period frames side by side to form rcpx_final
rcpx_final = pd.concat(pd_dfs, axis=1)

# Re-attach the number of SNs per transformer (from xf_meter_cnt_srs) as column ('nSNs', 'nSNs')
nSNs_df = xf_meter_cnt_srs.to_frame(name=('nSNs', 'nSNs'))
rcpx_final = pd.merge(
    rcpx_final, 
    nSNs_df, 
    left_index=True, 
    right_index=True, 
    how='left'
)
# The left merge must have found a value for every row
assert(rcpx_final['nSNs'].notna().all().all())

In [None]:
rcpx_final.head()

# Normalize by nSNs

In [None]:
# Kind of silly, but below I cannot simply use 'rcpx_final[final_time_pds] = ...'
#   This will result in: "ValueError: Columns must be same length as key", because final_time_pds
#   has only, e.g., 6 elements but rcpx_final[final_time_pds] contains, e.g., 72 columns
# Instead, must use 'rcpx_final[rcpx_final[final_time_pds].columns] = ..'
rcpx_final[rcpx_final[final_time_pds].columns] = rcpx_final[final_time_pds].divide(rcpx_final[('nSNs', 'nSNs')], axis=0)

In [None]:
rcpx_final.head()

# Build EEMSP Data

In [None]:
conn_aws = Utilities.get_athena_prod_aws_connection()

In [None]:
merge_eemsp = True
mult_strategy='agg'
#-------------------------
# Nameplate attributes wanted from the EEMSP table
cols_of_interest_eemsp = [
    'location_nb', 'mfgr_nm', 'install_dt', 'last_trans_desc', 
    'eqtype_id', 'coolant', 'info', 'kva_size',
    'phase_cnt', 'prim_voltage', 'protection', 'pru_number', 
    'sec_voltage', 'special_char', 'taps', 'xftype'
]
# The query additionally pulls these three columns on top of cols_of_interest_eemsp
cols_of_interest_eemsp_full = [*cols_of_interest_eemsp, 'latest_status', 'removal_dt', 'serial_nb']
#-------------------------
# Pieces to be inserted into the query template
select_cols_str  = Utilities_sql.join_list(cols_of_interest_eemsp_full, quotes_needed=False)
location_nbs_str = Utilities_sql.join_list(trsf_pole_nbs, quotes_needed=True)
#-----
# Restrict to the transformers of interest which were installed on or before date_range[0]
#   and either never removed or removed after date_range[1]
sql_EEMSP = """
SELECT {} 
FROM meter_events.eems_transformer_nameplate
WHERE location_nb IN ({})
AND install_dt <= '{}'
AND (removal_dt IS NULL OR removal_dt > '{}')
""".format(select_cols_str, location_nbs_str, date_range[0], date_range[1])
print(sql_EEMSP)
#-------------------------
df_eemsp = pd.read_sql_query(sql_EEMSP, conn_aws)

In [None]:
df_eemsp.head()

# Reduce down df_eemsp so there is a single entry for each transformer
reduce1_eemsp_for_outg_trsf reduces df_eemsp down to contain only entries for transformers which were active during the date(s) in question.
<br>No need to run reduce1_eemsp_for_outg_trsf for this case, as all share the same date restrictions which were already imposed in sql_EEMSP.
<br>(For model development/training, this step would be necessary, as the data utilized there have many different date restrictions, and df_eemsp cannot simply be built with the date restrictions)

reduce2_eemsp_for_outg_trsf further reduces df_eemsp down so there is a single entry for each transformer.
<br>How exactly this is achieved is dictated mainly by the "mult_strategy" parameter

In [None]:
# reduce2_eemsp_for_outg_trsf was designed to be used with outg_rec_nb/no_outg_rec_nb.
# outg_rec_nb is not necessary here, but we need a temporary column anyway to make the function happy.
# I'll update the code in the future so this unnecessary step won't be needed
df_eemsp['outg_rec_nb'] = df_eemsp['location_nb']
#-----
# Use the mult_strategy configured alongside the EEMSP query settings (rather than a second,
#   hard-coded copy of the value), so the strategy only needs to be changed in one place
df_eemsp_reduce2 = reduce2_eemsp_for_outg_trsf_OLD(
    df_eemsp=df_eemsp, 
    mult_strategy=mult_strategy, 
    include_n_eemsp=True, 
    outg_rec_nb_col='outg_rec_nb', 
    location_nb_col='location_nb', 
    numeric_cols = ['kva_size'], 
    dt_cols = ['install_dt', 'removal_dt'], 
    ignore_cols = ['serial_nb'], 
    cat_cols_as_strings=True
)
#-------------------------
# Regardless of the mult_strategy used, at this point df_eemsp_reduce2 should only have a single
#   entry for each outg_rec_nb, location_nb pair
assert(all(df_eemsp_reduce2[['outg_rec_nb', 'location_nb']].value_counts()==1))

#----------------------------------------------------------------------------------------------------
# Clean up df_eemsp_reduce2 and merge with rcpx_final
#--------------------------------------------------
# Can't simply take df_eemsp_reduce2[cols_of_interest_eemsp] because we also need the new column
#   OUTG_REC_NB_TO_MERGE (and any others which may be added in the future)
# NOTE(review): no OUTG_REC_NB_TO_MERGE column is created in the code visible here -- presumably it
#   is added inside reduce2_eemsp_for_outg_trsf_OLD; confirm.
# Instead, drop only the columns which were queried in addition to cols_of_interest_eemsp
#   and which are still present in df_eemsp_reduce2
cols_to_drop = list(set(cols_of_interest_eemsp_full).difference(set(cols_of_interest_eemsp)))
cols_to_drop = [x for x in cols_to_drop if x in df_eemsp_reduce2.columns]
if len(cols_to_drop)>0:
    df_eemsp_reduce2 = df_eemsp_reduce2.drop(columns=cols_to_drop)
#-------------------------
# There must be exactly one row per (outg_rec_nb, location_nb) pair
assert(df_eemsp_reduce2.shape[0]==df_eemsp_reduce2.groupby(['outg_rec_nb', 'location_nb']).ngroups)
# Report how many of the requested transformers have no entry in df_eemsp_reduce2
print(f"df_eemsp_reduce2['location_nb'].nunique() = {df_eemsp_reduce2['location_nb'].nunique()}")
print(f"len(trsf_pole_nbs)                        = {len(trsf_pole_nbs)}")
print(f"Diff                                      = {len(trsf_pole_nbs)-df_eemsp_reduce2['location_nb'].nunique()}")
print()
#-------------------------
# Make all EEMSP columns (except n_eemsp) uppercase to match what was done in model development
#   (where EEMSP data were grabbed from the Oracle database, and columns were all uppercase)
df_eemsp_reduce2 = Utilities_df.make_all_column_names_uppercase(df_eemsp_reduce2, cols_to_exclude=['n_eemsp'])

# Similar to the case with 'outg_rec_nb' column in df_eemsp above, merge_rcpx_with_eemsp was designed to be 
#   used with outg_rec_nb/no_outg_rec_nb.
# As such, rcpx_final needs an additional column (in this case, it is easier to add another level to the index)
# I'll update the code in the future so this unnecessary step won't be needed
# The current index is duplicated, giving a two-level index with the same values in both levels
rcpx_final = rcpx_final.set_index([rcpx_final.index, rcpx_final.index])
#-------------------------
print("\nShapes BEFORE merging")
print(f"rcpx_final.shape = {rcpx_final.shape}")
#-------------------------
# NOTE(review): 'index_0'/'index_1' presumably tell the helper to use index levels 0/1 of df_rcpx
#   as the outg_rec_nb/trsf_pole_nb identifiers -- confirm against merge_rcpx_with_eemsp_OLD
rcpx_final = merge_rcpx_with_eemsp_OLD(
    df_rcpx=rcpx_final, 
    df_eemsp=df_eemsp_reduce2, 
    outg_rec_nb_idfr_rcpx ='index_0', 
    trsf_pole_nb_idfr_rcpx='index_1', 
    outg_rec_nb_idfr_eemsp='OUTG_REC_NB', 
    location_nb_idfr_eemsp='LOCATION_NB', 
    set_index=True
)
#-------------------------
print("\nShapes AFTER merging")
print(f"rcpx_final.shape = {rcpx_final.shape}")
#-------------------------
# Drop the unnecessary index level that was added above and is no longer needed
rcpx_final=rcpx_final.droplevel(0, axis=0)

# Convert INSTALL_DT to age in years
# (elapsed time between INSTALL_DT and prediction_date, in seconds, divided by the seconds in a 365-day year)
rcpx_final[('EEMSP_0', 'INSTALL_DT')] = (prediction_date-rcpx_final[('EEMSP_0', 'INSTALL_DT')]).dt.total_seconds()/(60*60*24*365)

# Add month
# (every row receives the month of prediction_date)
rcpx_final[('dummy_lvl_0', 'outg_month')] = prediction_date.month
#-------------------------
# Make sure rcpx_final has the correct columns in the correct order
# (i.e., exactly the columns of data_structure_df, ordered as they are there)
assert(len(set(data_structure_df.columns).symmetric_difference(set(rcpx_final.columns)))==0)
rcpx_final=rcpx_final[data_structure_df.columns]
# Work on a copy so that rcpx_final itself is not altered by later changes to X_test
X_test = rcpx_final.copy()

In [None]:
rcpx_final.equals(rcpx_df_no1)

# Load Model and Make Predictions

In [None]:
forest_clf = joblib.load(os.path.join(model_dir, 'forest_clf.joblib'))
scaler     = joblib.load(os.path.join(model_dir, 'scaler.joblib'))
eemsp_enc  = joblib.load(os.path.join(model_dir, 'eemsp_encoder.joblib'))

# Transformations/scaling

In [None]:
#-------------------------
# EEMSP columns split into numeric ones (left as they are) and the remainder (to be encoded)
numeric_cols = ['KVA_SIZE', 'INSTALL_DT']
cols_to_encode = [x for x in data_structure_df['EEMSP_0'].columns if x not in numeric_cols]
# The encoder must have been fit on exactly the columns to be encoded...
assert(set(eemsp_enc.feature_names_in_)==set(cols_to_encode))
# ...and the only EEMSP columns in X_test unknown to the encoder must be the numeric ones
assert(set(X_test['EEMSP_0'].columns)-set(eemsp_enc.feature_names_in_)==set(numeric_cols))
#-----
# Switch to full (level 0, level 1) column keys for selecting from X_test
cols_to_encode = [('EEMSP_0', x) for x in cols_to_encode]
# Cast to str, then encode; level 0 is dropped for the transform so the column
#   names handed to the encoder are the level 1 names
X_test[cols_to_encode] = X_test[cols_to_encode].astype(str)
X_test[cols_to_encode] = eemsp_enc.transform(X_test[cols_to_encode].droplevel(0, axis=1))
#----------
# From here on X_test is whatever scaler.transform returns
X_test = scaler.transform(X_test)
#-------------------------

In [None]:
# Make predictions
y_pred = forest_clf.predict(X_test)

In [None]:
print(f"# Outages Predicted: {y_pred.sum()}")
print(f"# Predictions:       {y_pred.shape[0]}")
print(f"%:                   {100*y_pred.sum()/y_pred.shape[0]}")

In [None]:
# Set predictions column in rcpx_final
assert(rcpx_final.shape[0]==y_pred.shape[0])
rcpx_final['y_pred'] = y_pred

In [None]:
rcpx_final[rcpx_final['y_pred']==1]

In [None]:
assert(0)

In [None]:
df_mp_install_time_col = 'inst_ts'
df_mp_removal_time_col = 'rmvl_ts'
dt_0 = prediction_date
dt_1 = prediction_date

In [None]:
# Build the meter premise data for the transformers in rcpx_final.
# The install/removal time column names are taken from df_mp_install_time_col/df_mp_removal_time_col
#   (instead of repeating the literal names), so the builder and the filter below always agree.
mp_df = MeterPremise.build_mp_df_curr_hist_for_xfmrs(
    trsf_pole_nbs=rcpx_final.index.tolist(), 
    join_curr_hist=True, 
    addtnl_mp_df_curr_cols=None, 
    addtnl_mp_df_hist_cols=None, 
    assume_one_xfmr_per_PN=True, 
    drop_approx_duplicates=True, 
    drop_approx_duplicates_args=None, 
    df_mp_serial_number_col='mfr_devc_ser_nbr', 
    df_mp_prem_nb_col='prem_nb', 
    df_mp_install_time_col=df_mp_install_time_col, 
    df_mp_removal_time_col=df_mp_removal_time_col, 
    df_mp_trsf_pole_nb_col='trsf_pole_nb'
)

# Only want meters active at the relevant time period, i.e., installed on or before dt_0 and
#   removed after dt_1 (a missing removal time is treated as never removed)
mp_df = mp_df[(mp_df[df_mp_install_time_col]<=pd.to_datetime(dt_0)) & 
              (mp_df[df_mp_removal_time_col].fillna(pd.Timestamp.max)>pd.to_datetime(dt_1))]

In [None]:
# Build dovs_df
dovs = DOVSOutages(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True,
    build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
    build_sql_function_kwargs=dict(
        premise_nbs=mp_df['prem_nb'].unique().tolist(), 
        date_range=[
            prediction_date-pd.Timedelta('31D'), 
            prediction_date+pd.Timedelta('31D')
        ], 
        field_to_split='premise_nbs', 
        include_premise=True
    ), 
    build_consolidated=False
)
dovs_df = dovs.df.copy()

In [None]:
dovs_df = pd.merge(
    dovs_df, 
    mp_df[['prem_nb', 'trsf_pole_nb']].drop_duplicates(), 
    left_on='PREMISE_NB', 
    right_on='prem_nb', 
    how='left'
)
dovs_df

In [None]:
mp_df_pred1 = mp_df[mp_df['trsf_pole_nb'].isin(rcpx_final[rcpx_final['y_pred']==1].index.tolist())].copy()
dovs_df_pred1 = dovs_df[dovs_df['PREMISE_NB'].isin(mp_df_pred1['prem_nb'].unique().tolist())]
#-----
mp_df_pred0 = mp_df[mp_df['trsf_pole_nb'].isin(rcpx_final[rcpx_final['y_pred']==0].index.tolist())].copy()
dovs_df_pred0 = dovs_df[dovs_df['PREMISE_NB'].isin(mp_df_pred0['prem_nb'].unique().tolist())]

In [None]:
dovs_df_pred1['DT_OFF_TS_FULL'].nunique()

In [None]:
dovs_df_pred0['DT_OFF_TS_FULL'].nunique()

In [None]:
dovs_df_pred1['DT_OFF_TS_FULL']

In [None]:
natsorted(dovs_df_pred1['DT_OFF_TS_FULL'].unique())

In [None]:
prediction_date

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.plot.scatter(ax=ax, x='DT_OFF_TS_FULL', y='CI_NB')

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.plot.scatter(ax=ax, x='DT_OFF_TS_FULL', y='CMI_NB')

In [None]:
dovs_df_pred1

In [None]:
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL')).count()

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CMI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CMI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
dovs_df['OUTG_REC_NB'].nunique()

In [None]:
dovs_df['trsf_pole_nb'].nunique()

In [None]:
rcpx_final[rcpx_final['y_pred']==1]

In [None]:
set(dovs_df['trsf_pole_nb'].unique()).intersection(set(rcpx_final[rcpx_final['y_pred']==1].index))

In [None]:
natsorted(rcpx_final[rcpx_final['y_pred']==1].index)

In [None]:
natsorted(dovs_df['trsf_pole_nb'].unique())

In [None]:
set(dovs_df['trsf_pole_nb'].unique()).intersection(set(rcpx_final.index))

In [None]:
df_OG = GenAn.read_df_from_csv_dir_batches(
    files_dir=r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231005\20230401_20230930\Outgs_Full\EndEvents', 
    file_path_glob=r'end_events_[0-9]*.csv', 
    file_path_regex=None, 
    cols_and_types_to_convert_dict=None, 
    to_numeric_errors='coerce', 
    drop_unnamed0_col=True, 
    pd_read_csv_kwargs={}, 
    assert_all_cols_equal=True
)

In [None]:
df = df_OG.copy()

In [None]:
outg_rec_nbs = df['OUTG_REC_NB_GPD_FOR_SQL'].unique().tolist()
dovs = DOVSOutages(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True,
    build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
    build_sql_function_kwargs=dict(
        outg_rec_nbs=outg_rec_nbs, 
        field_to_split='outg_rec_nbs', 
        include_premise=True
    ), 
    build_consolidated=False
)
dovs_df = dovs.df.copy()

In [None]:
dovs_df = dovs.df.copy()

In [None]:
og_len = df.shape[0]
#-----
df = pd.merge(
    df, 
    dovs_df[['OUTG_REC_NB', 'PREMISE_NB', 'DT_OFF_TS_FULL', 'DT_ON_TS']], 
    left_on=['OUTG_REC_NB_GPD_FOR_SQL', 'aep_premise_nb'], 
    right_on=['OUTG_REC_NB', 'PREMISE_NB'], 
    how='left'
)
#-----
assert(df.shape[0]==og_len)

In [None]:
        self.rcpx_df = OutagePredictor.build_rcpx_from_evsSum_df(
            evsSum_df                   = self.evsSum_df, 
            data_structure_df           = self.data_structure_df, 
            td_min                      = self.idk_name_2, 
            td_max                      = self.idk_name_1, 
            cr_trans_dict               = self.cr_trans_dict, 
            freq                        = freq, 
            group_cols                  = group_cols, 
            date_col                    = date_col, 
            normalize_by_SNs            = normalize_by_SNs, 
            include_power_down_minus_up = include_power_down_minus_up, 
            regex_patterns_to_remove    = regex_patterns_to_remove, 
            combine_cpo_df_reasons      = combine_cpo_df_reasons, 
            xf_meter_cnt_col            = 'xf_meter_cnt', 
            events_tot_col              = 'events_tot', 
            trsf_pole_nb_col            = 'trsf_pole_nb', 
            other_reasons_col           = 'Other Reasons', 
            total_counts_col            = 'total_counts', 
            nSNs_col                    = 'nSNs'
        )

In [None]:
        freq                        = '5D', 
        group_cols                  = ['trsf_pole_nb'], 
        date_col                    = 'aep_event_dt', 
        normalize_by_SNs            = True, 
        include_power_down_minus_up = False, 
        regex_patterns_to_remove    = ['.*cleared.*', '.*Test Mode.*'], 
        combine_cpo_df_reasons      = True, 
        include_n_eemsp             = True

In [None]:
def build_events_summary_df_from_csvs(
    files_dir, 
    file_path_glob, 
    file_path_regex, 
    batch_kwargs=None, 
    cols_and_types_to_convert_dict=None, 
    to_numeric_errors='coerce', 
    assert_all_cols_equal=True, 
    verbose=True, 
    n_update=1,
):
    r"""
    Read the CSV files located via files_dir/file_path_glob/file_path_regex in batches, and
      concatenate all non-empty batches into a single pd.DataFrame.
    
    Returns:
        None if no paths are found.
        An empty pd.DataFrame if paths are found but no batch yields any rows.
        Otherwise, the concatenation (pd.concat) of the batch DataFrames, where the columns of every
          batch are ordered like those of the first non-empty batch.
    
    files_dir, file_path_glob, file_path_regex:
        Handed to Utilities.find_all_paths (as base_dir, glob_pattern, regex_pattern) to collect the files.
    
    batch_kwargs:
        Forwarded to Utilities.get_files_split_locations.
        Keys and default values:
            batch_size_MB        = 1024
            tolerance_pct        = 0.01
            absorb_last_pair_pct = None
            
    cols_and_types_to_convert_dict, to_numeric_errors, assert_all_cols_equal:
        Forwarded unchanged to GenAn.read_df_from_csv_batch.
        
    verbose:
        If True, print the number of paths/batches and the progress through the batches.
        
    n_update:
        When verbose is True, progress is printed every n_update batches.
    """
    #--------------------------------------------------
    paths = Utilities.find_all_paths(
        base_dir      = files_dir, 
        glob_pattern  = file_path_glob, 
        regex_pattern = file_path_regex
    )
    if len(paths)==0:
        print(f'No paths found in files_dir = {files_dir}')
        return None
    paths=natsorted(paths)    
    #--------------------------------------------------
    # Find the smallest file in paths and check to see if it is empty
    # If it is found to be empty, any files of that size can be skipped
    min_size, min_file = Utilities.get_smallest_file_size_MB(
        paths           = paths, 
        return_min_file = True
    )
    smallest_df = pd.read_csv(min_file)
    # If smallest_df is not empty, set min_size equal to None so that
    #   no files will be skipped
    if smallest_df.shape[0]!=0:
        min_size = None
    #--------------------------------------------------
    if batch_kwargs is None:
        batch_kwargs = {}
    assert(isinstance(batch_kwargs, dict))
    batch_size_MB = batch_kwargs.get('batch_size_MB', 1024)
    #-----
    batch_idxs = Utilities.get_files_split_locations(
        paths                = paths, 
        batch_size_MB        = batch_size_MB, 
        tolerance_pct        = batch_kwargs.get('tolerance_pct',        0.01), 
        absorb_last_pair_pct = batch_kwargs.get('absorb_last_pair_pct', None)
    )
    n_batches = len(batch_idxs)
    #-------------------------
    if verbose:
        print(f'n_paths       = {len(paths)}')
        print(f'batch_size_MB = {batch_size_MB}')
        print(f'n_batches     = {n_batches}')
    #-------------------------
    #-------------------------
    evsSum_dfs = []
    for i, batch_i in enumerate(batch_idxs):
        if verbose and (i+1)%n_update==0:
            print(f'{i+1}/{n_batches}')
        # The first two elements of batch_i are used as the slice bounds into paths
        i_beg = batch_i[0]
        i_end = batch_i[1]
        #-----
        evsSum_df_i = GenAn.read_df_from_csv_batch(
            paths                          = paths[i_beg:i_end], 
            cols_and_types_to_convert_dict = cols_and_types_to_convert_dict, 
            to_numeric_errors              = to_numeric_errors, 
            drop_na_rows_when_exception    = True, 
            drop_unnamed0_col              = True, 
            pd_read_csv_kwargs             = None, 
            make_all_columns_lowercase     = False, 
            assert_all_cols_equal          = assert_all_cols_equal, 
            min_fsize_MB                   = min_size
        )
        # Batches without any rows contribute nothing to the final result
        if evsSum_df_i.shape[0]>0:
            evsSum_dfs.append(evsSum_df_i)
    #-------------------------
    # If every batch was empty there is nothing to combine (and evsSum_dfs[0] below would raise an IndexError)
    if len(evsSum_dfs)==0:
        if verbose:
            print(f'No data found in files_dir = {files_dir}')
        return pd.DataFrame()
    #-------------------------
    evsSum_cols = evsSum_dfs[0].columns.tolist()
    for i_df in range(len(evsSum_dfs)):
        # Make sure columns are same
        assert(set(evsSum_dfs[i_df].columns.tolist()).symmetric_difference(set(evsSum_cols))==set())
        # Make sure order is same
        evsSum_dfs[i_df] = evsSum_dfs[i_df][evsSum_cols]
    #-------------------------
    evsSum_df = pd.concat(evsSum_dfs)
    return evsSum_df

In [None]:
files_dir       = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231004\20230401_20230930\Outgs_Full\EndEvents'
files_dir_2       = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231005\20230401_20230930\Outgs_Full\EndEvents'
file_path_glob  = r'end_events_[0-9]*.csv'
file_path_regex = None

# batch_kwargs = None
batch_kwargs = dict(
    batch_size_MB = 64
#     batch_size_MB = 5.623506
)

cols_and_types_to_convert_dict=None
to_numeric_errors='coerce'
assert_all_cols_equal=True
verbose=True
n_update=1

In [None]:
# Build the events summary from the first directory (files_dir)
evsSum_df = build_events_summary_df_from_csvs(
    files_dir=files_dir, 
    file_path_glob=file_path_glob, 
    file_path_regex=file_path_regex, 
    batch_kwargs=batch_kwargs, 
    cols_and_types_to_convert_dict=cols_and_types_to_convert_dict, 
    to_numeric_errors=to_numeric_errors, 
    assert_all_cols_equal=assert_all_cols_equal, 
    verbose=verbose, 
    n_update=n_update,
)
# The comparison cells which follow refer to this frame as evsSum_df_1 (the counterpart of evsSum_df_2),
#   so bind that name as well; evsSum_df is kept for anything already relying on it
evsSum_df_1 = evsSum_df
# evsSum_df.to_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231004\20230401_20230930\Outgs_Full\evsSum_df.pkl')

In [None]:
evsSum_df_2 = build_events_summary_df_from_csvs(
    files_dir=files_dir_2, 
    file_path_glob=file_path_glob, 
    file_path_regex=file_path_regex, 
    batch_kwargs=batch_kwargs, 
    cols_and_types_to_convert_dict=cols_and_types_to_convert_dict, 
    to_numeric_errors=to_numeric_errors, 
    assert_all_cols_equal=assert_all_cols_equal, 
    verbose=verbose, 
    n_update=n_update,
)
# evsSum_df_2.to_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231005\20230401_20230930\Outgs_Full\evsSum_df.pkl')

In [None]:
print(evsSum_df_1.shape[0])
print(evsSum_df_2.shape[0])

In [None]:
evsSum_df_2.shape[0]-evsSum_df_1.shape[0]

In [None]:
evsSum_df_1 = evsSum_df_1.sort_values(by=['OUTG_REC_NB_GPD_FOR_SQL', 'trsf_pole_nb', 'serialnumber', 'aep_event_dt'], ignore_index=True)
evsSum_df_2 = evsSum_df_2.sort_values(by=['OUTG_REC_NB_GPD_FOR_SQL', 'trsf_pole_nb', 'serialnumber', 'aep_event_dt'], ignore_index=True)

In [None]:
gps_1 = list(evsSum_df_1.groupby(['serialnumber', 'trsf_pole_nb', 'OUTG_REC_NB_GPD_FOR_SQL']).groups.keys())
gps_2 = list(evsSum_df_2.groupby(['serialnumber', 'trsf_pole_nb', 'OUTG_REC_NB_GPD_FOR_SQL']).groups.keys())

In [None]:
len(set(gps_1).symmetric_difference(set(gps_2)))

In [None]:
overlap_gps = list(set(gps_1).intersection(set(gps_2)))

In [None]:
# overlap_1 = evsSum_df_1.groupby(['serialnumber', 'trsf_polae_nb', 'OUTG_REC_NB_GPD_FOR_SQL']).apply(lambda x: x.name in overlap_gps)

In [None]:
idx=0
#-----
# The group to compare, as a (serialnumber, trsf_pole_nb, OUTG_REC_NB_GPD_FOR_SQL) key
gp_key = overlap_gps[idx]

# overlap_2 must be built first, because its columns dictate the column order of overlap_1
#   (building overlap_1 first raises a NameError when overlap_2 does not exist yet)
overlap_2 = evsSum_df_2[
    (evsSum_df_2['serialnumber']            == gp_key[0]) & 
    (evsSum_df_2['trsf_pole_nb']            == gp_key[1]) & 
    (evsSum_df_2['OUTG_REC_NB_GPD_FOR_SQL'] == gp_key[2])
]

overlap_1 = evsSum_df_1[
    (evsSum_df_1['serialnumber']            == gp_key[0]) & 
    (evsSum_df_1['trsf_pole_nb']            == gp_key[1]) & 
    (evsSum_df_1['OUTG_REC_NB_GPD_FOR_SQL'] == gp_key[2])
][overlap_2.columns]

In [None]:
overlap_1

In [None]:
overlap_2

In [None]:
overlap_1.equals(overlap_2)

In [None]:
overlap_1 = evsSum_df_1.set_index(['serialnumber', 'trsf_pole_nb', 'OUTG_REC_NB_GPD_FOR_SQL']).loc[overlap_gps]

In [None]:
overlap_2 = evsSum_df_2.set_index(['serialnumber', 'trsf_pole_nb', 'OUTG_REC_NB_GPD_FOR_SQL']).loc[overlap_gps]

In [None]:
overlap_1.equals(overlap_2)

In [None]:
overlap_1

In [None]:
overlap_2

In [None]:
df = pd.DataFrame( 
    { 
        "Date": [ 
            pd.Timestamp("2000-11-02"), 
            pd.Timestamp("2000-11-03"), 
            pd.Timestamp("2000-11-04"), 
            pd.Timestamp("2000-11-05"), 
            pd.Timestamp("2000-11-06"), 
            pd.Timestamp("2000-11-07") 
        ], 
        "ID": [1, 2, 3, 4, 5, 6], 
        "Price": [140, 120, 230, 40, 100, 450] 
    } 
) 

In [None]:
df

In [None]:
# Sum the numeric columns over consecutive 2-day bins of 'Date'.
# The `axis` keyword of pd.Grouper is not passed: 0 is its default, and the keyword is deprecated
#   in recent pandas versions.
df.groupby(pd.Grouper(key='Date', freq='2D', sort=True)).sum() 

In [None]:
# One figure per transformer: the probability curve, the outage start times (red), the date of the
#   overall maximum (green) and, where applicable, the maximum up to the last outage start (lawngreen)
fig_num=0
for trsf_pole_nb_i in y_prob1_by_date_df.index.unique().tolist():
    # Outages paired with this transformer in outg_rec_nbs_and_trsf_pole_nbs
    outg_rec_nbs_i = [x[0] for x in outg_rec_nbs_and_trsf_pole_nbs if x[1]==trsf_pole_nb_i]
    y_prob1_srs_i  = y_prob1_by_date_df.loc[trsf_pole_nb_i]
    #-----
    fig,ax = Plot_General.default_subplots(fig_num=fig_num)
    y_prob1_srs_i.T.plot.line(ax=ax)
    #-----
    # Mark the start of every outage
    dovs_df_dev_i = dovs_df_dev[dovs_df_dev['OUTG_REC_NB'].isin(outg_rec_nbs_i)].copy()
    outg_times_i  = dovs_df_dev_i[['DT_OFF_TS_FULL', 'DT_ON_TS']]
    for dt_off_ts_full_i, dt_on_ts_i in outg_times_i.itertuples(index=False, name=None):
        ax.axvline(dt_off_ts_full_i, color='red')
    last_dt_off_i  = dovs_df_dev_i['DT_OFF_TS_FULL'].max()
    first_dt_off_i = dovs_df_dev_i['DT_OFF_TS_FULL'].min()
    #-----
    # Mark the overall maximum
    idx_max_i = y_prob1_srs_i.idxmax()
    ax.axvline(idx_max_i, color='green')
    # If the overall maximum falls after the last outage start, additionally mark the maximum
    #   found up to that outage start (provided there are actually data to grab)
    max_is_after_last_outg = idx_max_i > last_dt_off_i
    if max_is_after_last_outg and y_prob1_srs_i.index[0] < last_dt_off_i:
        idx_max_i = y_prob1_srs_i[:last_dt_off_i].idxmax()
        ax.axvline(idx_max_i, color='lawngreen')
    #-----
    # Widen the x-axis when the first outage start precedes the available data
    if first_dt_off_i < y_prob1_srs_i.index[0]:
        ax.set_xlim(left=first_dt_off_i-pd.Timedelta('1D'))
    #-----
    fig_num += 1

In [None]:
trsf_pole_nb_i = y_prob1_by_date_df.index.unique().tolist()[0]
outg_rec_nbs_i = [x[0] for x in outg_rec_nbs_and_trsf_pole_nbs if x[1]==trsf_pole_nb_i]
y_prob1_i = y_prob1_by_date_df.loc[trsf_pole_nb_i].copy()
dovs_df_dev_i = dovs_df_dev[dovs_df_dev['OUTG_REC_NB'].isin(outg_rec_nbs_i)].copy()

In [None]:
y_prob1_i

In [None]:
dovs_df_dev_i

In [None]:
idx_max_i = y_prob1_by_date_df.loc[trsf_pole_nb_i].idxmax()

In [None]:
(
        idx_max_i > dovs_df_dev_i['DT_OFF_TS_FULL'].max() and
        y_prob1_by_date_df.loc[trsf_pole_nb_i].index[0] < dovs_df_dev_i['DT_OFF_TS_FULL'].max() #Make sure there are actually data to grab
    )

In [None]:
# When failures occur, what are number of days above threshold?  Median, mean, etc.

In [None]:
y_prob1_i>0.5

In [None]:
dovs_df_dev_i

In [None]:
dovs_df_dev_i.iloc[0]['DT_OFF_TS_FULL']

In [None]:
date = dovs_df_dev_i.iloc[0]['DT_OFF_TS_FULL']

In [None]:
date

In [None]:
y_prob1_i

In [None]:
y_prob1_i.loc[:date]

In [None]:
pd_len=pd.Timedelta('7D')
exclude_day_of=True
threshold=0.5

In [None]:
def find_longest_consec_len_in_bool_srs(
    bool_srs
):
    r"""
    Return the length of the longest run of consecutive True values in the boolean pd.Series bool_srs.
    A series without any True values (including an empty one) gives 0.
    
    bool_srs is processed in its current order, so any sorting must be done by the caller beforehand.
    """
    #-------------------------
    assert(bool_srs.dtype==bool)
    #-------------------------
    best_len = 0
    curr_len = 0
    for flag_i in bool_srs.tolist():
        # A True extends the current run, anything else resets it
        curr_len = curr_len+1 if flag_i else 0
        if curr_len > best_len:
            best_len = curr_len
    #-------------------------
    return best_len

def find_longest_consec_idxs_in_bool_srs(
    bool_srs, 
    return_ilocs=False, 
    return_len=False
):
    r"""
    Given a pd.Series, bool_srs, comprised of boolean values, determine the longest streak of consecutive True values 
      and return the indices of its first and last elements.
    Returns:
        A pair (list of length 2) of indices representing the beginning and ending of the True block
        The returned indices are INCLUSIVE, meaning bool_srs.loc[return_idxs[0]]==True and bool_srs.loc[return_idxs[1]]==True
        If multiple blocks share the longest length, the one occurring last in bool_srs is returned.
        If bool_srs contains no True values (or is empty), [np.nan, np.nan] is returned (with a length of 0).
        
    return_ilocs:
        If True, instead of returning the values in bool_srs.index, return integers between 0 and bool_srs.shape[0]-1
        
    return_len:
        If True, also return the length of the block, in integer form, i.e., return (return_idxs, length)
    
    Assumption is that bool_srs has been sorted to fit needs before being fed into this function.
    """
    #-------------------------
    assert(bool_srs.dtype==bool)
    #-------------------------
    longest = 0
    streak  = 0
    #-----
    ilocs = [np.nan, np.nan]
    curr_beg_iloc = 0
    #-------------------------
    for i, bool_i in enumerate(bool_srs.tolist()):
        if not bool_i:
            streak = 0
            continue
        #-----
        if streak==0:
            # A new block of True values starts here
            curr_beg_iloc = i
        streak += 1
        # The end of the block is always the current (True) element, never the False following it.
        # >= (not >) so that, for equally long blocks, the last one is kept
        if streak >= longest:
            longest = streak
            ilocs   = [curr_beg_iloc, i]
    #-------------------------
    # Translate positions to index labels (only possible if a True block was found)
    if longest > 0:
        locs = [bool_srs.index[ilocs[0]], bool_srs.index[ilocs[1]]]
    else:
        locs = [np.nan, np.nan]
    #-------------------------
    return_idxs = ilocs if return_ilocs else locs
    #-------------------------
    if return_len:
        return return_idxs, longest
    else:
        return return_idxs

In [None]:
def get_prob1_i_stats_preceding_date(
    y_prob1_i, 
    date, 
    pd_len=pd.Timedelta('7D'),
    exclude_day_of=True, 
    threshold=0.5, 
    return_series=True, 
    cols_to_drop=None
):
    r"""
    Return information regarding the probability prediction in the period of length pd_len preceding date.
    
    y_prob1_i:
        pd.Series of predictions.  After cols_to_drop are removed, the index is compared against and sliced with
          pd.Timestamp objects, so it is expected to hold datetime-like labels.
        The streak search is delegated to find_longest_consec_idxs_in_bool_srs, which assumes the series is already sorted.
        The input is copied; the caller's object is not modified.
    
    date:
        End of the evaluation window (pd.Timestamp or datetime.date).  Any time-of-day on a pd.Timestamp is discarded.
        If None, will be randomly selected via Utilities_dt.get_random_date_interval_between UNLESS pd_len is also 
          None (see NEITHER date/pd_len supplied)
        
    pd_len:
        Length of the evaluation window.
        If None, all available data preceding date will be used UNLESS date is also None (see NEITHER date/pd_len supplied)
        
    NEITHER date/pd_len supplied:
        If both are None, entire series y_prob1_i is evaluated
        
    exclude_day_of:
        If True, the window ends one day before date (1 day is subtracted from date before slicing).
        
    threshold:
        A prediction counts as above threshold when it is strictly greater than this value.
        
    return_series:
        If True, return a pd.Series; otherwise return a dict with the same keys.
        
    cols_to_drop:
        Index labels to drop from y_prob1_i before anything else is done (e.g., non-date entries of a DataFrame row).
        
    Returns:
        None if the requested window extends beyond the range of y_prob1_i.index.
        Otherwise, a pd.Series/dict with keys:
            n_nan:                    number of NaN values in the window
            n_abv_thrsh:              number of values > threshold in the window
            longest_streak_abv_thrsh: length returned by find_longest_consec_idxs_in_bool_srs
            longest_streak_beg/end:   reference date minus the beginning/ending index returned by 
                                        find_longest_consec_idxs_in_bool_srs, where the reference date is the (possibly
                                        shifted) date, or y_prob1_i.index.max() when no slicing was done
    """
    #--------------------------------------------------
    assert(isinstance(y_prob1_i, pd.Series))
    y_prob1_i = copy.deepcopy(y_prob1_i)
    if cols_to_drop is not None:
        y_prob1_i = y_prob1_i.drop(cols_to_drop)
    #--------------------------------------------------
    # If date is None and pd_len is None, no slicing done on y_prob1_i (i.e., y_prob1_i=y_prob1_i instead
    #   of y_prob1_i = y_prob1_i.loc[pd.Timestamp(date-pd_len) : pd.Timestamp(date)])
    # Put differently, slicing is needed if date is not None or pd_len is not None
    if(
        date   is not None or 
        pd_len is not None
    ):
        #-------------------------
        if date is None:
            date_0 = y_prob1_i.index.min()
            date_1 = y_prob1_i.index.max()
            if exclude_day_of:
                # To ensure date_0 remains within bounds of y_prob1_i
                date_0 = date_0+pd.Timedelta('1D')
            #-----
            date = Utilities_dt.get_random_date_interval_between(
                date_0       = date_0, 
                date_1       = date_1, 
                window_width = pd_len, 
                rand_seed    = None
            )
            # Second element of the returned interval is used as the window end
            date = date[1]
        #-------------------------
        if pd_len is None:
            # date may be a plain datetime.date here (it is only normalized below), and
            #   datetime.date - pd.Timestamp raises TypeError, so convert before subtracting.
            # For a pd.Timestamp input, pd.Timestamp(date) leaves the value unchanged.
            pd_len = pd.Timestamp(date)-y_prob1_i.index.min()
            if exclude_day_of:
                pd_len = pd_len-pd.Timedelta('1D')
        #-------------------------
        # Being extra safe, don't want date changed outside of function
        date = copy.deepcopy(date)
        #-------------------------
        # Ensure date is a date object, and not datetime
        # NOTE(review): only pd.Timestamp is converted below; whether a plain datetime.datetime can reach this point
        #   depends on Utilities.is_object_one_of_types -- confirm
        assert(Utilities.is_object_one_of_types(date, [pd.Timestamp, datetime.date]))
        if isinstance(date, pd.Timestamp):
            date = date.date()
        assert(isinstance(date, datetime.date))
        #-------------------------
        if exclude_day_of:
            date = date-pd.Timedelta('1D')
        #-------------------------
        # Requested window must be fully contained in the available data
        if(
            pd.Timestamp(date-pd_len) < y_prob1_i.index.min() or 
            pd.Timestamp(date) > y_prob1_i.index.max()
        ):
            return None
        #-------------------------
        # Label-based slice, both endpoints included
        y_prob1_i = y_prob1_i.loc[pd.Timestamp(date-pd_len) : pd.Timestamp(date)]
    #--------------------------------------------------
    n_nan                          = y_prob1_i.isna().sum()
    n_abv_thrsh                    = (y_prob1_i > threshold).sum()
    idxs, longest_streak_abv_thrsh = find_longest_consec_idxs_in_bool_srs(
        bool_srs     = y_prob1_i>threshold, 
        return_ilocs = False, 
        return_len   = True
    )
    #--------------------------------------------------
    # Express the streak boundaries relative to the end of the evaluated period
    # date is still None here only when neither date nor pd_len was supplied
    if date is None:
        longest_streak_beg = y_prob1_i.index.max()-idxs[0]
        longest_streak_end = y_prob1_i.index.max()-idxs[1]
    else:
        longest_streak_beg = pd.Timestamp(date)-idxs[0]
        longest_streak_end = pd.Timestamp(date)-idxs[1]
    #-----
    return_dict = dict(
        n_nan                    = n_nan, 
        n_abv_thrsh              = n_abv_thrsh, 
        longest_streak_abv_thrsh = longest_streak_abv_thrsh, 
        longest_streak_beg       = longest_streak_beg, 
        longest_streak_end       = longest_streak_end
    )
    #--------------------------------------------------
    if return_series:
        return pd.Series(return_dict)
    else:
        return return_dict

In [None]:
# return_df.xs('y_prob_1', level=1, axis=1).to_pickle(r'C:\Users\s346557\Downloads\tmp_df.pkl')

In [None]:
# Select the 'y_prob_1' entries from level 1 of the columns of return_df, and work on a copy
y_prob1_by_date_df = return_df.xs(key='y_prob_1', axis=1, level=1).copy()
# y_prob1_by_date_df = pd.read_pickle(r'C:\Users\s346557\Downloads\tmp_df.pkl')

In [None]:
#-------------------------
# Two-column frame built from outg_rec_nbs_and_trsf_pole_nbs
dovs_aid_df = pd.DataFrame(
    data    = outg_rec_nbs_and_trsf_pole_nbs, 
    columns = ['outg_rec_nb', 'trsf_pole_nb']
)
#-----
# OUTG_REC_NB must be unique in dovs_df_dev, otherwise the left-merge below could duplicate rows
assert dovs_df_dev['OUTG_REC_NB'].nunique() == len(dovs_df_dev)
#-----
# Attach the outage off/on timestamps to each (outg_rec_nb, trsf_pole_nb) row
dovs_aid_df = dovs_aid_df.merge(
    dovs_df_dev[['OUTG_REC_NB', 'DT_OFF_TS_FULL', 'DT_ON_TS']], 
    how      = 'left', 
    left_on  = 'outg_rec_nb', 
    right_on = 'OUTG_REC_NB'
)
#-------------------------
# Attach the outage information to the predictions, matching on trsf_pole_nb, and restore trsf_pole_nb as index
# NOTE(review): how='left' leaves the outage columns missing for any trsf_pole_nb absent from dovs_aid_df -- 
#   confirm downstream code copes with that
y_prob1_by_date_df = (
    y_prob1_by_date_df
    .reset_index()
    .merge(
        dovs_aid_df, 
        how      = 'left', 
        left_on  = 'trsf_pole_nb', 
        right_on = 'trsf_pole_nb'
    )
    .set_index('trsf_pole_nb')
)

In [None]:
# Row-wise: evaluate the 7-day window preceding each row's DT_OFF_TS_FULL (day of excluded).
# The non-prediction entries of the row are dropped inside the function via cols_to_drop.
y_prob1_stats = y_prob1_by_date_df.apply(
    lambda row: get_prob1_i_stats_preceding_date(
        row, 
        row['DT_OFF_TS_FULL'], 
        pd_len         = pd.Timedelta('7D'), 
        exclude_day_of = True, 
        threshold      = 0.5, 
        return_series  = True, 
        cols_to_drop   = ['outg_rec_nb', 'OUTG_REC_NB', 'DT_OFF_TS_FULL', 'DT_ON_TS']
    ), 
    axis='columns'
)

In [None]:
# Display the result of the row-wise apply (last expression of the cell is rendered by the notebook)
y_prob1_stats

In [None]:
# Display the summary produced by describe() for y_prob1_stats
y_prob1_stats.describe()

In [None]:
# dovs_df.to_pickle(r'C:\Users\s346557\Documents\OutgPredictions\NoMonth\dovs_df.pkl')
# return_df.to_pickle(r'C:\Users\s346557\Documents\OutgPredictions\NoMonth\return_df.pkl')