# SEE 'Directions for running' below!

In [None]:
%run ./model_end_events_for_outages_METHODS.ipynb

In [None]:
from importlib import reload
#reload(Utilities)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

import sys, os
import re
from pathlib import Path
import json
import pickle
import joblib

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version
import copy

import itertools

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
import matplotlib.colors as mcolors
import matplotlib.cm as cm #e.g. for cmap=cm.jet
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
from MeterPremise import MeterPremise
from EEMSP import EEMSP
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from MECPODf import MECPODf
from MECPOAn import MECPOAn
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_General
import Plot_Box_sns
import Plot_Hist
import Plot_Bar
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer

# Directions for running:

For the purposes of this demonstration, let's assume your AEP User ID is s123456, and your local Documents directory
is located at C:\Users\s123456\Documents

1. If not already done, clone the Analysis GitHub repo (https://github.aepsc.com/s346557/Analysis).
<br>- I will assume the repo was cloned into the Documents directory, i.e. I assume your local copy of the repo is located at C:\Users\s123456\Documents\Analysis (and therefore, this Jupyter notebook should be located at C:\Users\s123456\Documents\Analysis\JupyterNbs\IT_Demo.ipynb)
<br><br>

2. Create a simple text file containing your AEP passwords.
- I suggest you use the file pwd_file_template.txt in the Analysis directory (C:\Users\s123456\Documents\Analysis\pwd_file_template.txt) to create your own password file.
    - DO NOT ALTER the pwd_file_template.txt file, create a new pwd_file.txt file!
- I further suggest you name your password file pwd_file.txt and place it in the Analysis directory (C:\Users\s123456\Documents\Analysis\pwd_file.txt).
    - The Git repo is set up to ignore pwd_file.txt in the Analysis directory, so your information will not be pushed up to the repo if saved in this manner.
- NOTE: At one point, my Athena and Oracle passwords were different, which is why there is a 'Main' and 'Oracle' entry in the password file.  Likely you will put the same password for both entries.
<br><br>

3. IF NOT ALREADY DONE, run the method Utilities_config.generate_initial_config_file to initiate your config.yaml file
- I suggest you input arguments for all three parameters (aep_user_id, pwd_file_path, and local_data_dir)
    - If no aep_user_id is given, the code will attempt to determine your AEP User ID from the contents of your C:\Users directory
    - If no pwd_file_path is given, it is assumed to exist, be named pwd_file.txt, and be located in the Analysis directory (C:\Users\s123456\Documents\Analysis\pwd_file.txt)
    - If local_data_dir is not None, it should point to a directory where you plan to store any results (my personal local_data_dir is located at C:\Users\s346557\Documents\LocalData\).
        - If you are not planning to save or load any files locally, I believe this can be kept as None

# ----------------------------------------------------------------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ONLY NEED TO RUN ONCE!
So, if you have already run Utilities_config.generate_initial_config_file (and your configuration has not changed since), there is no need to run again

In [None]:
run_config=False

In [None]:
if run_config:
    # One-time setup: generates the initial config.yaml (see 'Directions for running', step 3).
    # REPLACE VALUES BELOW WITH YOUR OWN
    aep_user_id = 's123456'
    # The user ID embedded in the paths below must match aep_user_id
    pwd_file_path = r'C:\Users\s123456\Documents\Analysis\pwd_file.txt'
    local_data_dir = r'C:\Users\s123456\Documents\LocalData'

    Utilities_config.generate_initial_config_file(
        aep_user_id=aep_user_id, 
        pwd_file_path=pwd_file_path, 
        local_data_dir=local_data_dir
    )

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ----------------------------------------------------------------------------------------------------

# SET model_dir TO YOUR LOCAL VALUE!!!!!
This directory should house the following files:
- forest_clf.joblib
- scaler.joblib
- eemsp_encoder.joblib
- data_structure_df.pkl

In [None]:
# model_dir = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230615\Models\All_EEMSP_agg_Top10_v2'
model_dir = r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20231201\Models\All_EEMSP_agg_Top10_v5'

# Randomly chosen trsf_pole_nbs
I randomly chose the trsf_pole_nbs below from a dataset I was working with.
</br>The purpose is simply to create a smaller, more manageable, dataset to work with for this demo (as opposed to, e.g., taking all Ohio data)

In [None]:
# trsf_pole_nbs = [
#     '1881678764270',
#     '1783411710022',
#     '41830153D30129',
#     '1872247711298',
#     '1820228711653',
#     '1924989734074',
#     '1834492774877',
#     '1853148757927',
#     '1840493692059',
#     '41810769C10088',
#     '1882624786848',
#     '1867392684831',
#     '1951078726448',
#     '40820458D10181',
#     '41840501D30023',
#     '41810982D20079',
#     '1899769749646',
#     '41830806B20022',
#     '40820458A40051',
#     '1894765679964',
#     '1914519715083',
#     '2295283520733',
#     '1865162714841',
#     '1819362730022',
#     '39830853B40009',
#     '1842568672564',
#     '41840982B20038',
#     '1827991743027',
#     '1864172691901',
#     '1839189769213',
#     '1872465763559',
#     '1878400683201',
#     '41810796A20156',
#     '1905880759206',
#     '1839712741136',
#     '41810748B40188',
#     '41810958D40169',
#     '1829134693468',
#     '40820506D10218',
#     '1887542761309',
#     '1871334715206',
#     '1904835767206',
#     '1902838697810',
#     '40820507A10072',
#     '41810819C40139',
#     '1865431765527',
#     '1891170737034',
#     '40820707C20002',
#     '1836231721573',
#     '1912124700405',
#     '1842330805220',
#     '1866130766088',
#     '40820588D30084',
#     '41840898A10069',
#     '1871696719469',
#     '41810794A40161',
#     '41811008A20198',
#     '1875598765170',
#     '1861922686377',
#     '1948667782803',
#     '40820506D40238',
#     '41840898C20057',
#     '1833284756491',
#     '39830855D10027',
#     '40820507B20057',
#     '1856743480538',
#     '1914662698440',
#     '1832714711800',
#     '40820660D10199',
#     '40820507C10103',
#     '41830440D40074',
#     '41810796A40071',
#     '1859120778206',
#     '1866200767783',
#     '41830750A30052',
#     '1919909700535',
#     '1868442737650',
#     '1879608712393',
#     '40820507A40022',
#     '1823173736584',
#     '1885575704441',
#     '1853216748770',
#     '40810165C10068',
#     '1829736736745',
#     '41840874B30085',
#     '1867971718645',
#     '40820482B20025',
#     '41810748D20072',
#     '1877687713638',
#     '1835644777440',
#     '1870217709319',
#     '41840874B30044',
#     '40810190A10166',
#     '1796307716407',
#     '1875834764859',
#     '40810002A10009',
#     '1881341721010',
#     '40820506D10055',
#     '1898393738979',
#     '40820507C30093'
# ]

In [None]:
conn_aws = Utilities.get_athena_prod_aws_connection()

In [None]:
trsf_pole_nbs_df = MeterPremise.get_distinct_trsf_pole_nbs(
    conn_aws=conn_aws, 
    states='OH'
)

In [None]:
# Draw a random subset of transformers so the demo works with a small, manageable dataset.
# NOTE(review): no random_state is passed, so each run selects a different set of transformers;
#   DataFrame.sample also raises a ValueError if trsf_pole_nbs_df has fewer than n rows.
# trsf_pole_nbs = trsf_pole_nbs_df.sample(n=10000)['trsf_pole_nb'].tolist()
trsf_pole_nbs = trsf_pole_nbs_df.sample(n=100)['trsf_pole_nb'].tolist()

In [None]:
# trsf_pole_nbs

# Prediction Date
This corresponds essentially to the day on which the model will be run/data evaluated.
</br>Data will be collected for a period spanning 31 days before the prediction date up to 1 day before.
</br>Eventually, the data will be grouped into the 5-day periods:'01-06 Days', '06-11 Days', '11-16 Days', '16-21 Days', '21-26 Days','26-31 Days'

In [None]:
# Day on which the model is (notionally) run
prediction_date = pd.Timestamp('2023-06-01')
# Data window: from 31 days before the prediction date up to 1 day before it
date_range = [prediction_date - pd.Timedelta(days=n_days) for n_days in (31, 1)]

In [None]:
len(trsf_pole_nbs)

# Grab the data from meter_events.events_summary_vw

In [None]:
# Connection used to run the end-events query
conn_aws = Utilities.get_athena_prod_aws_connection()
#-----
# Keyword arguments handed to the SQL builder (build_sql_function below): all columns of
#   meter_events.events_summary_vw, restricted to date_range and the sampled trsf_pole_nbs
end_events_sql_function_kwargs=dict(
    schema_name='meter_events', 
    table_name='events_summary_vw', 
    cols_of_interest=['*'], 
    date_range=date_range, 
    trsf_pole_nbs=trsf_pole_nbs, 
    opco='oh'
)
#-----
# init_df_in_constructor=True, so end_events.df is available immediately after construction.
# NOTE(review): 'contstruct_df_args' is spelled this way in every call in this notebook; presumably
#   it matches the constructor's parameter name -- confirm before "correcting" it.
end_events = AMIEndEvents(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args = dict(conn_db=conn_aws), 
    build_sql_function=AMIEndEvents_SQL.build_sql_end_events, 
    build_sql_function_kwargs=end_events_sql_function_kwargs, 
    init_df_in_constructor=True, 
    save_args=False
)
# Work on a copy so the DataFrame held by end_events is left untouched
ede_df = end_events.df.copy()

In [None]:
ede_df.head()

# Also need meter_events.event_summ_regex_setup
to convert the column names in rcpx from cr# to curated reason

In [None]:
# cr_trans_dict = curated reasons translation dictionary
#   keys   = pivot_id values
#   values = regex_report_title values
#   (used later to rename the cr# columns to their curated reasons)
sql = """
SELECT * FROM meter_events.event_summ_regex_setup
"""
regex_setup_df = pd.read_sql(sql, conn_aws, dtype=str)
cr_trans_dict = dict(zip(regex_setup_df['pivot_id'], regex_setup_df['regex_report_title']))

In [None]:
cr_trans_dict

# Build rcpx_0
Construct rcpx_0 by aggregating ede_df by trsf_pole_nb and by 5-day frequency

In [None]:
# Aggregate the end events per transformer (trsf_pole_nb) and per 5-day interval of aep_event_dt
freq='5D'
group_cols=['trsf_pole_nb']
group_freq=pd.Grouper(freq=freq, key='aep_event_dt')
#-------------------------
# Convert aep_event_dt to datetime object (required by pd.Grouper with freq)
# NOTE: This modifies ede_df in place
ede_df['aep_event_dt'] = pd.to_datetime(ede_df['aep_event_dt'])

# Will no longer need the following columns
cols_to_drop = ['serialnumber', 'aep_premise_nb', 'aep_opco']

# Every remaining column is summed within each group, except xf_meter_cnt, for which the max is taken.
# The string aliases 'sum'/'max' are used instead of np.sum/np.max: pandas maps the numpy callables to
#   the same groupby operations, but emits a FutureWarning for them from version 2.1 onward.
agg_cols = ede_df.drop(columns=cols_to_drop+['trsf_pole_nb', 'aep_event_dt']).columns.tolist()
agg_dict = {col:'sum' for col in agg_cols}
agg_dict['xf_meter_cnt'] = 'max'
#-------------------------
rcpx_0 = ede_df.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

In [None]:
rcpx_0.head(10)

# Project out xf_meter_cnt, as it will be used later

In [None]:
# Build a Series of xf_meter_cnt indexed by trsf_pole_nb (one entry per transformer).
# squeeze(axis=1) collapses only the column axis, so a Series is returned even when there is a
#   single transformer (a bare squeeze() would return a scalar in that case and break the lines below).
xf_meter_cnt_srs = (
    rcpx_0.droplevel(1, axis=0)['xf_meter_cnt']
    .reset_index()
    .drop_duplicates()
    .set_index('trsf_pole_nb')
    .squeeze(axis=1)
)
# Each transformer must have exactly one xf_meter_cnt value across all of its time periods
assert(xf_meter_cnt_srs.shape[0]==xf_meter_cnt_srs.index.nunique())
# All transformers which registered at least one event in any time period
all_trsf_pole_nbs = rcpx_0.index.get_level_values(0).unique().tolist()
xf_meter_cnt_srs.name='nSNs'

In [None]:
xf_meter_cnt_srs.head()

# Need data_structure_df
In general, not all curated reasons will be included in the model.
</br>Typically, 10 common curated reasons will be included, and all others will be grouped together in "Other Reasons".
</br>Furthermore, some reasons may be combined together, others may be completely removed.
</br>For these reasons, it is beneficial to have some sample data (taken from when the model was created) to utilize in structuring the new data in the same fashion.
</br>Additionally, the data will be used to ensure the ordering of columns is correct before the data are fed into the model.

In [None]:
# Sample of the data used when the model was built; used below as the template for the expected
#   columns and their ordering.
# NOTE: Unpickling can execute arbitrary code, so model_dir should only contain trusted files.
data_structure_df = pd.read_pickle(os.path.join(model_dir, 'data_structure_df.pkl'))

In [None]:
data_structure_df.head()

# Transform rcpx_0 to the form expected by the model
i.e., similar to data_structure_df.
</br>This is essentially just changing rcpx_0 from long form to wide form

In [None]:
# Build time_pds_rename
#-----
# We will need to convert the time periods, which are currently housed in the 'aep_event_dt' index of 
#   rcpx_0 from their specific dates to the names expected by the model.
# In rcpx_0, after grouping by the 5-day intervals, the values of 'aep_event_dt' are equal to the beginning
#   dates of the given interval.
# These will be converted to the titles contained in final_time_pds below
# NOTE: This is probably not 100% necessary, but is useful nonetheless
#-------------------------
curr_time_pds = natsorted(rcpx_0.index.get_level_values(1).unique())
#-----
final_time_pds = [
    '01-06 Days',
    '06-11 Days',
    '11-16 Days',
    '16-21 Days',
    '21-26 Days',
    '26-31 Days',
]
#-----
# There should be 6 time periods (one for each name in final_time_pds), each of width 5 days.
# The count is checked explicitly because zip (used below) silently truncates to the shorter input,
#   which would otherwise leave time periods unnamed or names unused.
assert(len(curr_time_pds)==len(final_time_pds))
for time_pd_prev, time_pd_next in zip(curr_time_pds, curr_time_pds[1:]):
    assert(time_pd_next-time_pd_prev==pd.Timedelta('5D'))
#-----
# NOTE(review): curr_time_pds is sorted in ascending order, so the EARLIEST interval is paired with
#   '01-06 Days' and the most recent with '26-31 Days'.  Confirm this matches the convention used
#   when the model was trained.
time_pds_rename = dict(zip(curr_time_pds, final_time_pds))
#-------------------------

In [None]:
# As stated above, this is essentially just changing rcpx_0 from long form to wide form
# This will probably be formalized further in the future (i.e., function(s) developed to handle)
#-----
# For each 5-day time period, a DataFrame indexed by trsf_pole_nb is built whose columns are the
#   reasons expected by the model for that period; these are collected in pd_dfs
rename_cols = {
    'events_tot':'total_counts', 
    'xf_meter_cnt':'nSNs'
}

total_counts_col = 'total_counts'
nSNs_col         = 'nSNs'
# Columns which do not represent a curated reason
non_reason_cols = [nSNs_col, total_counts_col]

include_power_down_minus_up=False
#-------------------------
rcpx_0=rcpx_0.rename(columns=rename_cols)
#-------------------------
pd_dfs = []
for date_pd_i in curr_time_pds:
    # Grab the proper time period name (final_time_pd_i) from time_pds_rename
    final_time_pd_i = time_pds_rename[date_pd_i]
    #-----
    # Get the expected columns for this time period from data_structure_df
    final_reason_cols_i = data_structure_df[final_time_pd_i].columns.tolist()
    final_reason_cols_i = [x for x in final_reason_cols_i if x not in non_reason_cols+['Other Reasons']]
    #-------------------------
    # Project out the current time period (date_pd_i) from rcpx_0 by selecting the appropriate
    #   values from the 'aep_event_dt' index (i.e., index level 1)
    rcpx_0_pd_i = rcpx_0[rcpx_0.index.get_level_values(1)==date_pd_i].copy()
    rcpx_0_pd_i = rcpx_0_pd_i.droplevel(1, axis=0)
    #-------------------------
    # Make sure all trsf_pole_nbs have an entry in rcpx_0_pd_i:
    #   If a trsf_pole_nb didn't register any events in a given time period, it will not be included in the projection.
    #   However, the final format requires each transformer have entries for each time period
    #   Therefore, we identify the trsf_pole_nbs missing from rcpx_0_pd_i (no_events_pd_i) and add appropriate rows
    #     containing all 0 values for the counts
    no_events_pd_i = list(set(all_trsf_pole_nbs).difference(set(rcpx_0_pd_i.index.get_level_values(0).unique())))
    no_events_pd_i_df = pd.DataFrame(
        columns=rcpx_0.columns, 
        index=no_events_pd_i, 
        data=np.zeros((len(no_events_pd_i), rcpx_0.shape[1]))
    )
    #-----
    # Use xf_meter_cnt_srs to fill the 'nSNs' column in no_events_pd_i_df
    # NOTE: This is probably not strictly necessary, as the 'nSNs' column won't be used here,
    #         since the data are not normalized.
    no_events_pd_i_df = no_events_pd_i_df.drop(columns=['nSNs']).merge(
        xf_meter_cnt_srs, 
        left_index=True, 
        right_index=True, 
        how='left'
    )
    # Sanity check on the merge
    assert(no_events_pd_i_df['nSNs'].notna().all())
    #-----
    # Combine rcpx_0_pd_i and no_events_pd_i_df
    #   (the column sets must be identical; the column order is aligned before concatenating)
    assert(len(set(rcpx_0_pd_i.columns).symmetric_difference(set(no_events_pd_i_df.columns)))==0)
    no_events_pd_i_df = no_events_pd_i_df[rcpx_0_pd_i.columns]
    rcpx_0_pd_i = pd.concat([rcpx_0_pd_i, no_events_pd_i_df])
    #-------------------------
    # Rename the cr# columns to their full curated reasons
    rcpx_0_pd_i=rcpx_0_pd_i.rename(columns=cr_trans_dict)
    #--------------------------------------------------
    #--------------------------------------------------
    # Any columns without a curated reason (i.e., those with column name = ''), have not been observed
    #   yet in the data, and therefore the sum of the counts should be 0.
    # These empty columns are not needed, so drop
    # NOTE: A KeyError is raised here if no column is named '' after the rename
    assert(rcpx_0_pd_i[''].sum().sum()==0)
    rcpx_0_pd_i=rcpx_0_pd_i.drop(columns=[''])
    #-------------------------
    # Any curated reasons containing 'cleared' or 'Test Mode' are not included in the analysis, so remove
    rcpx_0_pd_i = MECPODf.remove_reasons_from_rcpo_df(
        rcpo_df=rcpx_0_pd_i, 
        regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*'], 
        ignore_case=True
    )
    #-----
    # After irrelevant cleared and test columns removed, need to recalculate total_counts (originally
    #   events_tot) to accurately reflect the total number of relevant events
    assert(total_counts_col in non_reason_cols)
    rcpx_0_pd_i[total_counts_col] = rcpx_0_pd_i.drop(columns=non_reason_cols).sum(axis=1)
    #-------------------------
    # Combine similar reasons (e.g., all 'Tamper' type reasons are combined into 1)
    # See MECPODf.combine_cpo_df_reasons for more information
    rcpx_0_pd_i = MECPODf.combine_cpo_df_reasons(rcpo_df=rcpx_0_pd_i)
    #-------------------------
    # Include the difference in power-up and power-down, if desired (typically turned off) 
    if include_power_down_minus_up:
        rcpx_0_pd_i = MECPODf.delta_cpo_df_reasons(
            rcpo_df=rcpx_0_pd_i, 
            reasons_1='Primary Power Down',
            reasons_2='Primary Power Up',
            delta_reason_name='Power Down Minus Up'
        )
    #-------------------------
    # Make sure rcpx_0_pd_i contains the expected final reason columns.
    # Once this is assured, project out these reasons and combine all other reasons into
    #   the 'Other Reasons' columns
    # See MECPODf.get_reasons_subset_from_cpo_df for more info
    assert(len(set(final_reason_cols_i).difference(set(rcpx_0_pd_i.columns.tolist())))==0)
    rcpx_0_pd_i = MECPODf.get_reasons_subset_from_cpo_df(
        cpo_df=rcpx_0_pd_i, 
        reasons_to_include=final_reason_cols_i, 
        combine_others=True, 
        output_combine_others_col='Other Reasons', 
        SNs_tags=None, 
        is_norm=False, 
        counts_col='nSNs', 
        normalize_by_nSNs_included=False, 
        level_0_raw_col = 'counts', 
        level_0_nrm_col = 'counts_norm', 
        cols_to_ignore = ['total_counts'], 
        include_counts_col_in_output=True
    )    
    #--------------------------------------------------
    #--------------------------------------------------
    # Don't want nSNs in each pd individually
    rcpx_0_pd_i = rcpx_0_pd_i.drop(columns=[nSNs_col])
    #-------------------------
    # Add the correct time period name as level 0 of the columns
    rcpx_0_pd_i = Utilities_df.prepend_level_to_MultiIndex(
        df=rcpx_0_pd_i, 
        level_val=final_time_pd_i, 
        level_name=None, 
        axis=1
    )
    #-------------------------
    pd_dfs.append(rcpx_0_pd_i)
    
# Validate the per-period DataFrames: all must share the shape and the set of index values
#   of the first one
shape_0 = pd_dfs[0].shape
index_0 = pd_dfs[0].index
for i in range(1, len(pd_dfs)):
    pd_df_i = pd_dfs[i]
    assert(pd_df_i.shape==shape_0)
    assert(len(set(index_0).symmetric_difference(set(pd_df_i.index)))==0)
    #-----
    # pd.concat should align the indices on its own, but explicitly put the rows
    #   in the same order as the first DataFrame to be safe
    pd_dfs[i] = pd_df_i.loc[index_0]

# Combine the time periods side-by-side to form rcpx_final
rcpx_final = pd.concat(pd_dfs, axis=1)

# Re-attach the number of SNs per transformer (from xf_meter_cnt_srs) as column ('nSNs', 'nSNs')
nSNs_df = xf_meter_cnt_srs.to_frame(name=('nSNs', 'nSNs'))
rcpx_final = rcpx_final.merge(
    nSNs_df, 
    left_index=True, 
    right_index=True, 
    how='left'
)
# Every transformer must have found its nSNs value in the merge
assert(rcpx_final['nSNs'].notna().all().all())

In [None]:
rcpx_final.head()

# Normalize by nSNs

In [None]:
# Kind of silly, but below I cannot simply use 'rcpx_final[final_time_pds] = ...'
#   This will result in: "ValueError: Columns must be same length as key", because final_time_pds
#   has only, e.g., 6 elements but rcpx_final[final_time_pds] contains, e.g., 72 columns
# Instead, must use 'rcpx_final[rcpx_final[final_time_pds].columns] = ..'
rcpx_final[rcpx_final[final_time_pds].columns] = rcpx_final[final_time_pds].divide(rcpx_final[('nSNs', 'nSNs')], axis=0)

In [None]:
rcpx_final.head()

# Build EEMSP Data

In [None]:
conn_aws = Utilities.get_athena_prod_aws_connection()

In [None]:
# Query the EEMSP transformer nameplate data for the selected transformers
# NOTE(review): merge_eemsp is not referenced anywhere else in the visible notebook
merge_eemsp = True
# Strategy for collapsing multiple EEMSP entries per transformer into one (see reduce2_eemsp_for_outg_trsf)
mult_strategy='agg'
#-------------------------
# Columns to carry forward for each transformer
cols_of_interest_eemsp = [
    'location_nb', 
    'mfgr_nm', 
    'install_dt', 
    'last_trans_desc', 
    'eqtype_id', 
    'coolant', 
    'info', 
    'kva_size',
    'phase_cnt', 
    'prim_voltage', 
    'protection', 
    'pru_number', 
    'sec_voltage', 
    'special_char', 
    'taps', 
    'xftype'
]
# Additional columns needed in the query/reduction, which are dropped again after the reduction
cols_of_interest_eemsp_full = cols_of_interest_eemsp + ['latest_status', 'removal_dt', 'serial_nb']
#-------------------------
# Only keep entries installed on or before the start of date_range and not removed until after
#   the end of date_range (i.e., in place for the entire data collection window).
# NOTE: The query is built by string formatting; the interpolated values (trsf_pole_nbs, date_range)
#   are produced earlier in this notebook, not taken from user input.
sql_EEMSP = """
SELECT {} 
FROM meter_events.eems_transformer_nameplate
WHERE location_nb IN ({})
AND install_dt <= '{}'
AND (removal_dt IS NULL OR removal_dt > '{}')
""".format(
    Utilities_sql.join_list(cols_of_interest_eemsp_full, quotes_needed=False), 
    Utilities_sql.join_list(trsf_pole_nbs, quotes_needed=True), 
    date_range[0], 
    date_range[1]
)
print(sql_EEMSP)
#-------------------------
df_eemsp = pd.read_sql_query(sql_EEMSP, conn_aws)

In [None]:
df_eemsp.head()

# Reduce down df_eemsp so there is a single entry for each transformer
reduce1_eemsp_for_outg_trsf reduces df_eemsp down to contain only entries for transformers which were active during the date(s) in question.
</br>No need to run reduce1_eemsp_for_outg_trsf for this case, as all share the same date restrictions which were already imposed in sql_EEMSP.
</br>(For model development/training, this step would be necessary, as the data utilized there have many different date restrictions, and df_eemsp cannot simply be built with the date restrictions)

reduce2_eemsp_for_outg_trsf further reduces df_eemsp down so there is a single entry for each transformer.
</br>How exactly this is achieved is dictated mainly by the "mult_strategy" parameter

In [None]:
# Reduce df_eemsp to a single entry per transformer, merge it with rcpx_final, and put the result
#   in the exact column layout expected by the model (X_test).
# NOTE(review): reduce2_eemsp_for_outg_trsf and merge_rcpx_with_eemsp are not defined in this notebook;
#   presumably they come from the METHODS notebook executed via %run at the top.
#-----
# reduce2_eemsp_for_outg_trsf was designed to be used with outg_rec_nb/no_outg_rec_nb.
# outg_rec_nb is not necessary here, but we need a temporary column anyway to make the function happy.
# I'll update the code in the future so this unnecessary step won't be needed
df_eemsp['outg_rec_nb'] = df_eemsp['location_nb']
#-----
# Use the mult_strategy variable set alongside the EEMSP query (instead of a hard-coded 'agg'),
#   so changing the strategy in one place takes effect here
df_eemsp_reduce2 = reduce2_eemsp_for_outg_trsf(
    df_eemsp=df_eemsp, 
    mult_strategy=mult_strategy, 
    include_n_eemsp=True, 
    outg_rec_nb_col='outg_rec_nb', 
    location_nb_col='location_nb', 
    numeric_cols = ['kva_size'], 
    dt_cols = ['install_dt', 'removal_dt'], 
    ignore_cols = ['serial_nb'], 
    cat_cols_as_strings=True
)
#-------------------------
# Regardless of the mult_strategy used, at this point df_eemsp_reduce2 should only have a single
#   entry for each outg_rec_nb, location_nb pair
assert(all(df_eemsp_reduce2[['outg_rec_nb', 'location_nb']].value_counts()==1))

#----------------------------------------------------------------------------------------------------
# Clean up df_eemsp_reduce2 and merge with rcpx_final
#--------------------------------------------------
# Can't simply take df_eemsp_reduce2[cols_of_interest_eemsp] because we need also the new column
#   OUTG_REC_NB_TO_MERGE (and any others which may be added in the future)
cols_to_drop = list(set(cols_of_interest_eemsp_full).difference(set(cols_of_interest_eemsp)))
cols_to_drop = [x for x in cols_to_drop if x in df_eemsp_reduce2.columns]
if len(cols_to_drop)>0:
    df_eemsp_reduce2 = df_eemsp_reduce2.drop(columns=cols_to_drop)
#-------------------------
assert(df_eemsp_reduce2.shape[0]==df_eemsp_reduce2.groupby(['outg_rec_nb', 'location_nb']).ngroups)
# Report how many of the requested transformers have EEMSP data
print(f"df_eemsp_reduce2['location_nb'].nunique() = {df_eemsp_reduce2['location_nb'].nunique()}")
print(f"len(trsf_pole_nbs)                        = {len(trsf_pole_nbs)}")
print(f"Diff                                      = {len(trsf_pole_nbs)-df_eemsp_reduce2['location_nb'].nunique()}")
print()
#-------------------------
# Make all EEMSP columns (except n_eemsp) uppercase to match what was done in model development
#   (where EEMSP data were grabbed from the Oracle database, and columns were all uppercase)
df_eemsp_reduce2 = Utilities_df.make_all_column_names_uppercase(df_eemsp_reduce2, cols_to_exclude=['n_eemsp'])

# Similar to the case with 'outg_rec_nb' column in df_eemsp above, merge_rcpx_with_eemsp was designed to be 
#   used with outg_rec_nb/no_outg_rec_nb.
# As such, rcpx_final needs an additional column (in this case, it is easier to add another level to the index)
# I'll update the code in the future so this unnecessary step won't be needed
# NOTE: This cell is not idempotent; re-running it without rebuilding rcpx_final adds the index level again
rcpx_final = rcpx_final.set_index([rcpx_final.index, rcpx_final.index])
#-------------------------
print("\nShapes BEFORE merging")
print(f"rcpx_final.shape = {rcpx_final.shape}")
#-------------------------
rcpx_final = merge_rcpx_with_eemsp(
    df_rcpx=rcpx_final, 
    df_eemsp=df_eemsp_reduce2, 
    outg_rec_nb_idfr_rcpx ='index_0', 
    trsf_pole_nb_idfr_rcpx='index_1', 
    outg_rec_nb_idfr_eemsp='OUTG_REC_NB', 
    location_nb_idfr_eemsp='LOCATION_NB', 
    set_index=True
)
#-------------------------
print("\nShapes AFTER merging")
print(f"rcpx_final.shape = {rcpx_final.shape}")
#-------------------------
# Drop the unnecessary index level that was added above and is no longer needed
rcpx_final=rcpx_final.droplevel(0, axis=0)

# Convert INSTALL_DT to age in years (relative to prediction_date, using 365-day years)
rcpx_final[('EEMSP_0', 'INSTALL_DT')] = (prediction_date-rcpx_final[('EEMSP_0', 'INSTALL_DT')]).dt.total_seconds()/(60*60*24*365)

# Add month
rcpx_final[('dummy_lvl_0', 'outg_month')] = prediction_date.month
#-------------------------
# Make sure rcpx_final has the correct columns in the correct order
assert(len(set(data_structure_df.columns).symmetric_difference(set(rcpx_final.columns)))==0)
rcpx_final=rcpx_final[data_structure_df.columns]
# X_test is transformed/scaled below; rcpx_final is kept in its readable form
X_test = rcpx_final.copy()

In [None]:
rcpx_final

# Load Model and Make Predictions

In [None]:
# Load the fitted model and the preprocessing objects saved in model_dir.
# NOTE: joblib.load unpickles the files, which can execute arbitrary code; only load trusted files.
forest_clf = joblib.load(os.path.join(model_dir, 'forest_clf.joblib'))
scaler     = joblib.load(os.path.join(model_dir, 'scaler.joblib'))
eemsp_enc  = joblib.load(os.path.join(model_dir, 'eemsp_encoder.joblib'))

# Transformations/scaling

In [None]:
#-------------------------
# Encode the non-numeric EEMSP columns with eemsp_enc, then scale the full feature set
numeric_cols = ['KVA_SIZE', 'INSTALL_DT']
cols_to_encode = [x for x in data_structure_df['EEMSP_0'].columns if x not in numeric_cols]
# The encoder must have been fit on exactly the non-numeric EEMSP columns
assert(len(set(eemsp_enc.feature_names_in_).symmetric_difference(cols_to_encode))==0)
assert(set(X_test['EEMSP_0'].columns).difference(eemsp_enc.feature_names_in_)==set(numeric_cols))
#-----
# Switch to the full (level 0, level 1) column identifiers used in X_test
cols_to_encode = [('EEMSP_0', x) for x in cols_to_encode]
X_test[cols_to_encode] = X_test[cols_to_encode].astype(str)
# The encoder expects the single-level column names, so drop level 0 before transforming
eemsp_encoder_input = X_test[cols_to_encode].droplevel(0, axis=1)
X_test[cols_to_encode] = eemsp_enc.transform(eemsp_encoder_input)
#----------
X_test = scaler.transform(X_test)
#-------------------------
#-------------------------

In [None]:
# Make predictions
y_pred = forest_clf.predict(X_test)

In [None]:
print(f"# Outages Predicted: {y_pred.sum()}")
print(f"# Predictions:       {y_pred.shape[0]}")
print(f"%:                   {100*y_pred.sum()/y_pred.shape[0]}")

In [None]:
# Set predictions column in rcpx_final
assert(rcpx_final.shape[0]==y_pred.shape[0])
rcpx_final['y_pred'] = y_pred

In [None]:
rcpx_final[rcpx_final['y_pred']==1]

In [None]:
# Always raises AssertionError.
# NOTE(review): presumably a deliberate stop so that "Run All" halts here, before the exploratory
#   cells below -- confirm.
assert(0)

In [None]:
df_mp_install_time_col = 'inst_ts'
df_mp_removal_time_col = 'rmvl_ts'
dt_0 = prediction_date
dt_1 = prediction_date

In [None]:
# Build the meter premise data for the transformers in rcpx_final.
# The install/removal time column names are passed via df_mp_install_time_col/df_mp_removal_time_col
#   (set in the cell above), so the call and the filter below cannot get out of sync.
mp_df = MeterPremise.build_mp_df_curr_hist_for_xfmrs(
    trsf_pole_nbs=rcpx_final.index.tolist(), 
    join_curr_hist=True, 
    addtnl_mp_df_curr_cols=None, 
    addtnl_mp_df_hist_cols=None, 
    assume_one_xfmr_per_PN=True, 
    drop_approx_duplicates=True, 
    drop_approx_duplicates_args=None, 
    df_mp_serial_number_col='mfr_devc_ser_nbr', 
    df_mp_prem_nb_col='prem_nb', 
    df_mp_install_time_col=df_mp_install_time_col, 
    df_mp_removal_time_col=df_mp_removal_time_col, 
    df_mp_trsf_pole_nb_col='trsf_pole_nb'
)

# Only want meters active at the relevant time period, i.e., installed on or before dt_0 and
#   removed after dt_1 (a missing removal time is treated as "never removed")
mp_df = mp_df[(mp_df[df_mp_install_time_col]<=pd.to_datetime(dt_0)) & 
              (mp_df[df_mp_removal_time_col].fillna(pd.Timestamp.max)>pd.to_datetime(dt_1))]

In [None]:
# Build dovs_df
dovs = DOVSOutages(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True,
    build_sql_function=DOVSOutages_SQL.build_sql_std_outage, 
    build_sql_function_kwargs=dict(
        premise_nbs=mp_df['prem_nb'].unique().tolist(), 
        date_range=[
            prediction_date-pd.Timedelta('31D'), 
            prediction_date+pd.Timedelta('31D')
        ], 
        field_to_split='premise_nbs', 
        include_premise=True
    ), 
    build_consolidated=False
)
dovs_df = dovs.df.copy()

In [None]:
dovs_df = pd.merge(
    dovs_df, 
    mp_df[['prem_nb', 'trsf_pole_nb']].drop_duplicates(), 
    left_on='PREMISE_NB', 
    right_on='prem_nb', 
    how='left'
)
dovs_df

In [None]:
# Split the meter premise and outage data by the model prediction of the associated transformer
#-----
# Transformers predicted to have an outage (y_pred==1)
xfmrs_pred1 = rcpx_final[rcpx_final['y_pred']==1].index.tolist()
mp_df_pred1 = mp_df[mp_df['trsf_pole_nb'].isin(xfmrs_pred1)].copy()
prem_nbs_pred1 = mp_df_pred1['prem_nb'].unique().tolist()
dovs_df_pred1 = dovs_df[dovs_df['PREMISE_NB'].isin(prem_nbs_pred1)]
#-----
# Transformers predicted not to have an outage (y_pred==0)
xfmrs_pred0 = rcpx_final[rcpx_final['y_pred']==0].index.tolist()
mp_df_pred0 = mp_df[mp_df['trsf_pole_nb'].isin(xfmrs_pred0)].copy()
prem_nbs_pred0 = mp_df_pred0['prem_nb'].unique().tolist()
dovs_df_pred0 = dovs_df[dovs_df['PREMISE_NB'].isin(prem_nbs_pred0)]

In [None]:
dovs_df_pred1['DT_OFF_TS_FULL'].nunique()

In [None]:
dovs_df_pred0['DT_OFF_TS_FULL'].nunique()

In [None]:
dovs_df_pred1['DT_OFF_TS_FULL']

In [None]:
natsorted(dovs_df_pred1['DT_OFF_TS_FULL'].unique())

In [None]:
prediction_date

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.plot.scatter(ax=ax, x='DT_OFF_TS_FULL', y='CI_NB')

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.plot.scatter(ax=ax, x='DT_OFF_TS_FULL', y='CMI_NB')

In [None]:
dovs_df_pred1

In [None]:
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL')).count()

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['OUTG_REC_NB'].count().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred1.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CMI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
fig,ax = Plot_General.default_subplots()
dovs_df_pred0.groupby(pd.Grouper(freq='1D', key='DT_OFF_TS_FULL'))['CMI_NB'].sum().plot(ax=ax, kind="bar")

In [None]:
dovs_df['OUTG_REC_NB'].nunique()

In [None]:
dovs_df['trsf_pole_nb'].nunique()

In [None]:
rcpx_final[rcpx_final['y_pred']==1]

In [None]:
set(dovs_df['trsf_pole_nb'].unique()).intersection(set(rcpx_final[rcpx_final['y_pred']==1].index))

In [None]:
natsorted(rcpx_final[rcpx_final['y_pred']==1].index)

In [None]:
natsorted(dovs_df['trsf_pole_nb'].unique())

In [None]:
set(dovs_df['trsf_pole_nb'].unique()).intersection(set(rcpx_final.index))