# SEE 'Directions for running' below!

In [None]:
%run ./model_end_events_for_outages_METHODS.ipynb

In [None]:
from importlib import reload
#reload(Utilities)
#reload(clm)
# NOTE: To reload a class imported as, e.g., 
# from module import class
# One must call:
#   1. import module
#   2. reload module
#   3. from module import class

import sys, os
import re
from pathlib import Path
import json
import pickle

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype
from scipy import stats
import datetime
import time
from natsort import natsorted, ns, natsort_keygen
from packaging import version
import copy

import itertools

import pyodbc
#---------------------------------------------------------------------
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
from matplotlib import dates
import matplotlib.colors as mcolors
import matplotlib.cm as cm #e.g. for cmap=cm.jet
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#-----
import CommonLearningMethods as clm
#-----
from MeterPremise import MeterPremise
from EEMSP import EEMSP
#-----
from AMI_SQL import AMI_SQL
from AMINonVee_SQL import AMINonVee_SQL
from AMIEndEvents_SQL import AMIEndEvents_SQL
from AMIUsgInst_SQL import AMIUsgInst_SQL
from DOVSOutages_SQL import DOVSOutages_SQL
#-----
from GenAn import GenAn
from AMINonVee import AMINonVee
from AMIEndEvents import AMIEndEvents
from AMIEDE_DEV import AMIEDE_DEV
from MECPODf import MECPODf
from MECPOAn import MECPOAn
from AMIUsgInst import AMIUsgInst
from DOVSOutages import DOVSOutages
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_sql_aids_dir())
import Utilities_sql
import TableInfos
from TableInfos import TableInfo
from SQLElement import SQLElement
from SQLElementsCollection import SQLElementsCollection
from SQLSelect import SQLSelectElement, SQLSelect
from SQLFrom import SQLFrom
from SQLWhere import SQLWhereElement, SQLWhere
from SQLJoin import SQLJoin, SQLJoinCollection
from SQLGroupBy import SQLGroupByElement, SQLGroupBy
from SQLHaving import SQLHaving
from SQLOrderBy import SQLOrderByElement, SQLOrderBy
from SQLQuery import SQLQuery
from SQLQueryGeneric import SQLQueryGeneric
#---------------------------------------------------------------------
#sys.path.insert(0, os.path.join(os.path.realpath('..'), 'Utilities'))
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
from Utilities_df import DFConstructType
import Utilities_dt
import Plot_General
import Plot_Box_sns
import Plot_Hist
import Plot_Bar
import GrubbsTest
import DataFrameSubsetSlicer
from DataFrameSubsetSlicer import DataFrameSubsetSlicer as DFSlicer

# Directions for running:

For the purposes of this demonstration, let's assume your AEP User ID is s123456, and your local Documents directory
is located at C:\Users\s123456\Documents

1. If not already done, clone the Analysis GitHub repo (https://github.aepsc.com/s346557/Analysis).
<br>- I will assume the repo was cloned into the Documents directory, i.e. I assume your local copy of the repo is located at C:\Users\s123456\Documents\Analysis (and therefore, this Jupyter notebook should be located at C:\Users\s123456\Documents\Analysis\JupyterNbs\IT_Demo.ipynb)
<br><br>

2. Create a simple text file containing your AEP passwords.
- I suggest you use the file pwd_file_template.txt in the Analysis directory (C:\Users\s123456\Documents\Analysis\pwd_file_template.txt) to create your own password file.
    - DO NOT ALTER the pwd_file_template.txt file, create a new pwd_file.txt file!
- I further suggest you name your password file pwd_file.txt and place it in the Analysis directory (C:\Users\s123456\Documents\Analysis\pwd_file.txt).
    - The Git repo is set up to ignore pwd_file.txt in the Analysis directory, so your information will not be pushed up to the repo if saved in this manner.
- NOTE: At one point, my Athena and Oracle passwords were different, which is why there is a 'Main' and 'Oracle' entry in the password file.  Likely you will put the same password for both entries.
<br><br>

3. IF NOT ALREADY DONE, run the method Utilities_config.generate_initial_config_file to initiate your config.yaml file
- I suggest you input arguments for all three parameters (aep_user_id, pwd_file_path, and local_data_dir)
    - If no aep_user_id is given, the code will attempt to determine your AEP User ID from the contents of your C:\Users directory
    - If no pwd_file_path is given, it is assumed to exist, be named pwd_file.txt, and be located in the Analysis directory (C:\Users\s123456\Documents\Analysis\pwd_file.txt)
    - If local_data_dir is not None, it should point to a directory where you plan to store any results (my personal local_data_dir is located at C:\Users\s346557\Documents\LocalData\).
        - If you are not planning to save or load any files locally, I believe this can be kept as None

# ----------------------------------------------------------------------------------------------------
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ONLY NEED TO RUN ONCE!
So, if you have already run Utilities_config.generate_initial_config_file (and your configuration has not changed since), there is no need to run again

In [None]:
# Set to True only for the one-time creation of the config.yaml file; while False, the
#   `if run_config:` cell below does nothing.
run_config=False

In [None]:
if run_config:
    # REPLACE VALUES BELOW WITH YOUR OWN
    # NOTE: The user ID embedded in both paths should agree with aep_user_id
    aep_user_id = 's123456'
    pwd_file_path = r'C:\Users\s123456\Documents\Analysis\pwd_file.txt'
    local_data_dir = r'C:\Users\s123456\Documents\LocalData'

    # One-time creation of the config.yaml file (see 'Directions for running')
    Utilities_config.generate_initial_config_file(
        aep_user_id=aep_user_id,
        pwd_file_path=pwd_file_path,
        local_data_dir=local_data_dir
    )

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# ----------------------------------------------------------------------------------------------------

In [None]:
# Randomly grabbed trsf_pole_nbs to use for our purposes
# The list is passed to the MeterPremise query and to the events_summary_vw query below.
# Entries are kept as strings: some identifiers contain letters (e.g., '41830153D30129').
trsf_pole_nbs = [
    '1881678764270',
    '1783411710022',
    '41830153D30129',
    '1872247711298',
    '1820228711653',
    '1924989734074',
    '1834492774877',
    '1853148757927',
    '1840493692059',
    '41810769C10088',
    '1882624786848',
    '1867392684831',
    '1951078726448',
    '40820458D10181',
    '41840501D30023',
    '41810982D20079',
    '1899769749646',
    '41830806B20022',
    '40820458A40051',
    '1894765679964',
    '1914519715083',
    '2295283520733',
    '1865162714841',
    '1819362730022',
    '39830853B40009',
    '1842568672564',
    '41840982B20038',
    '1827991743027',
    '1864172691901',
    '1839189769213',
    '1872465763559',
    '1878400683201',
    '41810796A20156',
    '1905880759206',
    '1839712741136',
    '41810748B40188',
    '41810958D40169',
    '1829134693468',
    '40820506D10218',
    '1887542761309',
    '1871334715206',
    '1904835767206',
    '1902838697810',
    '40820507A10072',
    '41810819C40139',
    '1865431765527',
    '1891170737034',
    '40820707C20002',
    '1836231721573',
    '1912124700405',
    '1842330805220',
    '1866130766088',
    '40820588D30084',
    '41840898A10069',
    '1871696719469',
    '41810794A40161',
    '41811008A20198',
    '1875598765170',
    '1861922686377',
    '1948667782803',
    '40820506D40238',
    '41840898C20057',
    '1833284756491',
    '39830855D10027',
    '40820507B20057',
    '1856743480538',
    '1914662698440',
    '1832714711800',
    '40820660D10199',
    '40820507C10103',
    '41830440D40074',
    '41810796A40071',
    '1859120778206',
    '1866200767783',
    '41830750A30052',
    '1919909700535',
    '1868442737650',
    '1879608712393',
    '40820507A40022',
    '1823173736584',
    '1885575704441',
    '1853216748770',
    '40810165C10068',
    '1829736736745',
    '41840874B30085',
    '1867971718645',
    '40820482B20025',
    '41810748D20072',
    '1877687713638',
    '1835644777440',
    '1870217709319',
    '41840874B30044',
    '40810190A10166',
    '1796307716407',
    '1875834764859',
    '40810002A10009',
    '1881341721010',
    '40820506D10055',
    '1898393738979',
    '40820507C30093'
]

In [None]:
# Data window: from 31 days before to 1 day before the prediction date
prediction_date = pd.to_datetime('2023-05-01')
date_range = [prediction_date - pd.Timedelta(days=n_days) for n_days in (31, 1)]

# For this example...
Since we are grabbing data for specific transformers, and since 

In [None]:
# Build the MeterPremise object by running a SQL query restricted to the selected transformers
mp = MeterPremise(
    df_construct_type=DFConstructType.kRunSqlQuery,
    contstruct_df_args=None,
    build_sql_function=None,
    build_sql_function_kwargs={
        'trsf_pole_nbs': trsf_pole_nbs,
        'field_to_split': 'trsf_pole_nbs',
    },
    init_df_in_constructor=True,
    save_args=False,
)
# Work on a copy so the DataFrame held by mp is left untouched
mp_df = mp.df.copy()

# NOTE:
This is a somewhat simple-minded example, as I am naively joining the meter_events.end_device_event table with default.meter_premise table.
<br>As we have discussed, to correctly join with meter_premise, one must determine which meters were active AT THE TIME IN QUESTION (i.e., during the date_range defined above), not those simply in default.meter_premise.
<br>Thus, to be completely correct, one should use methods contained in the MeterPremise class.
<br>However, for the purposes here, this simple-minded join is acceptable.

In [None]:
# Build AMIEndEvents object housing data from the meter_events.end_device_event table.
# Events are restricted to opco='oh', to the dates in date_range, and to the premise numbers found in mp_df.
# join_mp_args requests a LEFT join with MeterPremise (built as a CTE) using the column pairs in
#   list_of_columns_to_join (presumably [end-events column, meter-premise column] -- confirm in AMIEndEvents_SQL).
ami_ede = AMIEndEvents(
    df_construct_type=DFConstructType.kRunSqlQuery, 
    contstruct_df_args=None, 
    init_df_in_constructor=True, 
    build_sql_function=None, 
    build_sql_function_kwargs=dict(
        opco='oh', 
        date_range=date_range, 
        premise_nbs=mp_df['prem_nb'].unique().tolist(), 
        field_to_split='premise_nbs', 
        join_mp_args=dict(
            join_with_CTE=True, 
            build_mp_kwargs=dict(cols_of_interest=TableInfos.MeterPremise_TI.std_columns_of_interest), 
            join_type='LEFT', 
            list_of_columns_to_join=[
                ['serialnumber', 'mfr_devc_ser_nbr'], 
                ['aep_premise_nb', 'prem_nb']
            ]
        )
    ), 
    save_args=False
)

In [None]:
# The SQL statement run by the above block is printed below
print(ami_ede.get_sql_statement())

In [None]:
# Grab the pandas.DataFrame (DF) object from ami_ede
end_events_df = ami_ede.df.copy()
# Also, grab the full list of reasons found
reasons_full = sorted(end_events_df['reason'].unique().tolist())

In [None]:
# Print some basic info about the DF
print(f'end_events_df.shape = {end_events_df.shape}')
print(f"Number of unique reasons = {end_events_df['reason'].nunique()}")
end_events_df.head()

# -------------------------

In [None]:
# Below, when running reduce_end_event_reasons_in_df, the default behavior (placement_col=None) is to simply replace
#   the entries in the 'reason' column by their reduced versions.
# For purposes here, I will keep both the full reason column (renamed to 'reason_full') and the reduced reason column (named 'reason')

print(f'end_events_df.shape[0]                          = {end_events_df.shape[0]}')
print(f"end_events_df['serialnumber'].nunique()         = {end_events_df['serialnumber'].nunique()}")
print(f"end_events_df['enddeviceeventtypeid'].nunique() = {end_events_df['enddeviceeventtypeid'].nunique()}")
print('\n\n')

# Keep the original reasons under 'reason_full'; the reduced reasons are written to a new 'reason' column
end_events_df = end_events_df.rename(columns={'reason':'reason_full'})
end_events_df = AMIEndEvents.reduce_end_event_reasons_in_df(
    df=end_events_df,
    reason_col='reason_full',
    placement_col='reason'
)

# The "before" count is read from 'reason_full' (the untouched original reasons), so the label names that column
print('BEFORE REASON REDUCTION')
print(f"end_events_df['reason_full'].nunique()          = {end_events_df['reason_full'].nunique()}")
print('AFTER REASON REDUCTION')
print(f"end_events_df['reason'].nunique()               = {end_events_df['reason'].nunique()}")

### Build the reason_counts_per_group DF grouped by serial number, and including the number of serial numbers connected to the transformer
### This is a closer mock-up (although still not exactly what we want) of what we're ultimately looking for from IT

In [None]:
# Reason counts per transformer using the full get_reason_counts_per_group method.
# The call is timed so it can be compared against get_reason_counts_per_group_QUICK.
# NOTE(review): 'inclue_zero_counts' looks misspelled -- confirm it matches the parameter name
#   expected by AMIEndEvents.get_reason_counts_per_group before changing it.
start=time.time()
rcpo_final_xfmr_v0 = AMIEndEvents.get_reason_counts_per_group(
    end_events_df = end_events_df, 
    group_cols=['trsf_pole_nb'], 
    group_freq=None, 
    gpby_dropna=False, 
    serial_number_col='serialnumber', 
    reason_col='reason', 
    include_normalize_by_nSNs=False, 
    inclue_zero_counts=True,
    possible_reasons=None, 
    include_nSNs=True, 
    include_SNs=False, 
    prem_nb_col='aep_premise_nb', 
    include_nprem_nbs=True,
    include_prem_nbs=False,   
    return_form = dict(return_multiindex_outg_reason = False, 
                       return_normalized_separately  = False)
)
print(time.time()-start)
#-------------------------
# Remove groups labelled 'NETWORK', 'PRIMARY' or 'TRANSMISSION' (presumably not real pole numbers -- confirm)
#   and any group whose label is NaN (kept by gpby_dropna=False above)
rcpo_final_xfmr_v0=rcpo_final_xfmr_v0.loc[~rcpo_final_xfmr_v0.index.isin(['NETWORK', 'PRIMARY', 'TRANSMISSION'])]
rcpo_final_xfmr_v0=rcpo_final_xfmr_v0.loc[rcpo_final_xfmr_v0.index.notna()]
#-------------------------
# Number of distinct serial numbers per transformer according to mp_df
n_SNs_per_xfmr = mp_df.groupby('trsf_pole_nb')['mfr_devc_ser_nbr'].apply(lambda x: len(set(x)))
n_SNs_per_xfmr.name = 'n_SNs_on_xfmr'
#-------------------------
# Number of distinct premise numbers per transformer according to mp_df
n_PNs_per_xfmr = mp_df.groupby('trsf_pole_nb')['prem_nb'].apply(lambda x: len(set(x)))
n_PNs_per_xfmr.name = 'n_PNs_on_xfmr'
#-------------------------
# Attach both counts; how='left' keeps every transformer in rcpo_final_xfmr_v0,
#   leaving NaN where the transformer is not found in mp_df
rcpo_final_xfmr_v0 = pd.merge(
    rcpo_final_xfmr_v0, 
    n_SNs_per_xfmr, 
    left_on='trsf_pole_nb', 
    right_index=True, 
    how='left'
)
#-------------------------
rcpo_final_xfmr_v0 = pd.merge(
    rcpo_final_xfmr_v0, 
    n_PNs_per_xfmr, 
    left_on='trsf_pole_nb', 
    right_index=True, 
    how='left'
)
#-------------------------
# Convert the non-null counts to int (only the rows where the merge found a match)
rcpo_final_xfmr_v0.loc[rcpo_final_xfmr_v0['n_SNs_on_xfmr'].notna(), 'n_SNs_on_xfmr'] = rcpo_final_xfmr_v0[rcpo_final_xfmr_v0['n_SNs_on_xfmr'].notna()]['n_SNs_on_xfmr'].astype(int)
rcpo_final_xfmr_v0.loc[rcpo_final_xfmr_v0['n_PNs_on_xfmr'].notna(), 'n_PNs_on_xfmr'] = rcpo_final_xfmr_v0[rcpo_final_xfmr_v0['n_PNs_on_xfmr'].notna()]['n_PNs_on_xfmr'].astype(int)
#-----
# The four count columns are moved to the front so that iloc[:, 4:] below selects every remaining
#   column when computing the total number of events
rcpo_final_xfmr_v0=Utilities_df.move_cols_to_front(rcpo_final_xfmr_v0, ['_nSNs', '_nprem_nbs', 'n_SNs_on_xfmr', 'n_PNs_on_xfmr'])
rcpo_final_xfmr_v0['n_events_tot'] = rcpo_final_xfmr_v0.iloc[:, 4:].sum(axis=1)
#-----
rcpo_final_xfmr_v0=Utilities_df.move_cols_to_front(rcpo_final_xfmr_v0, ['_nSNs', '_nprem_nbs', 'n_SNs_on_xfmr', 'n_PNs_on_xfmr', 'n_events_tot'])
#-------------------------
# Starred names: counts returned by get_reason_counts_per_group (presumably meters/premises with events).
# Unstarred names: counts computed from mp_df above.
rcpo_final_xfmr_v0=rcpo_final_xfmr_v0.rename(columns={
    '_nSNs':        'n_SNs*', 
    '_nprem_nbs':   'n_PNs*', 
    'n_SNs_on_xfmr':'n_SNs', 
    'n_PNs_on_xfmr':'n_PNs'
})

In [None]:
# Reason counts per transformer using get_reason_counts_per_group_QUICK (timed for comparison with
#   the full get_reason_counts_per_group method)
start=time.time()
rcpo_final_xfmr = AMIEndEvents.get_reason_counts_per_group_QUICK(
    end_events_df=end_events_df, 
    group_cols=['trsf_pole_nb'], 
    group_freq=None, 
    gpby_dropna=False, 
    reason_col='reason'
)
print(time.time()-start)
#-------------------------
# Remove groups labelled 'NETWORK', 'PRIMARY' or 'TRANSMISSION' (presumably not real pole numbers -- confirm)
#   and any group whose label is NaN (kept by gpby_dropna=False above)
rcpo_final_xfmr=rcpo_final_xfmr.loc[~rcpo_final_xfmr.index.isin(['NETWORK', 'PRIMARY', 'TRANSMISSION'])]
rcpo_final_xfmr=rcpo_final_xfmr.loc[rcpo_final_xfmr.index.notna()]
#-------------------------
# Number of distinct serial numbers per transformer according to mp_df
n_SNs_per_xfmr = mp_df.groupby('trsf_pole_nb')['mfr_devc_ser_nbr'].apply(lambda x: len(set(x)))
n_SNs_per_xfmr.name = 'n_SNs_on_xfmr'
#-------------------------
# Number of distinct premise numbers per transformer according to mp_df
n_PNs_per_xfmr = mp_df.groupby('trsf_pole_nb')['prem_nb'].apply(lambda x: len(set(x)))
n_PNs_per_xfmr.name = 'n_PNs_on_xfmr'
#-------------------------
# Attach both counts; how='left' keeps every transformer in rcpo_final_xfmr,
#   leaving NaN where the transformer is not found in mp_df
rcpo_final_xfmr = pd.merge(
    rcpo_final_xfmr, 
    n_SNs_per_xfmr, 
    left_on='trsf_pole_nb', 
    right_index=True, 
    how='left'
)
#-------------------------
rcpo_final_xfmr = pd.merge(
    rcpo_final_xfmr, 
    n_PNs_per_xfmr, 
    left_on='trsf_pole_nb', 
    right_index=True, 
    how='left'
)
#-------------------------
# Convert the non-null counts to int (only the rows where the merge found a match)
rcpo_final_xfmr.loc[rcpo_final_xfmr['n_SNs_on_xfmr'].notna(), 'n_SNs_on_xfmr'] = rcpo_final_xfmr[rcpo_final_xfmr['n_SNs_on_xfmr'].notna()]['n_SNs_on_xfmr'].astype(int)
rcpo_final_xfmr.loc[rcpo_final_xfmr['n_PNs_on_xfmr'].notna(), 'n_PNs_on_xfmr'] = rcpo_final_xfmr[rcpo_final_xfmr['n_PNs_on_xfmr'].notna()]['n_PNs_on_xfmr'].astype(int)
#-----
# The two count columns are moved to the front so that iloc[:, 2:] below selects every remaining
#   column when computing the total number of events
rcpo_final_xfmr=Utilities_df.move_cols_to_front(rcpo_final_xfmr, ['n_SNs_on_xfmr', 'n_PNs_on_xfmr'])
rcpo_final_xfmr['n_events_tot'] = rcpo_final_xfmr.iloc[:, 2:].sum(axis=1)
#-----
rcpo_final_xfmr=Utilities_df.move_cols_to_front(rcpo_final_xfmr, ['n_SNs_on_xfmr', 'n_PNs_on_xfmr', 'n_events_tot'])
#-------------------------
# Shorter final names for the counts computed from mp_df
rcpo_final_xfmr=rcpo_final_xfmr.rename(columns={
    'n_SNs_on_xfmr':'n_SNs', 
    'n_PNs_on_xfmr':'n_PNs'
})

In [None]:
rcpo_final_xfmr_v0[rcpo_final_xfmr.columns].equals(rcpo_final_xfmr)

In [None]:
conn_aws = Utilities.get_athena_qa_aws_connection()

In [None]:
# Arguments handed to AMIEndEvents_SQL.build_sql_end_events: all columns of
#   meter_events.events_summary_vw for the selected transformers, dates, and opco
end_events_sql_function_kwargs = {
    'schema_name': 'meter_events',
    'table_name': 'events_summary_vw',
    'cols_of_interest': ['*'],
    'date_range': date_range,
    'trsf_pole_nbs': trsf_pole_nbs,
    'opco': 'oh',
}
#-----
# Run the query through the connection opened above and build the object right away
end_events = AMIEndEvents(
    df_construct_type=DFConstructType.kRunSqlQuery,
    contstruct_df_args={'conn_db': conn_aws},
    build_sql_function=AMIEndEvents_SQL.build_sql_end_events,
    build_sql_function_kwargs=end_events_sql_function_kwargs,
    init_df_in_constructor=True,
    save_args=False,
)

In [None]:
end_events.df

In [None]:
dev_df = end_events.df.copy()

In [None]:
dev_df.drop(columns=['serialnumber', 'aep_premise_nb'])

In [None]:
# For full functionality, won't want to drop aep_event_dt (and maybe not aep_opco)
cols_to_drop = ['serialnumber', 'aep_premise_nb', 'aep_opco', 'aep_event_dt']

In [None]:
# Per-transformer totals of every remaining column.
# The string alias 'sum' is used instead of np.sum: passing the NumPy callable to groupby.agg
#   is deprecated in recent pandas, while 'sum' keeps the same (groupby sum) behavior.
dev_df.drop(columns=cols_to_drop).groupby(['trsf_pole_nb']).agg('sum')

In [None]:
dev_df.drop(columns=cols_to_drop).columns.tolist()

In [None]:
# Aggregation recipe for the per-transformer groupby: every remaining column is summed, except
#   xf_meter_cnt, for which the maximum is taken (presumably a per-transformer meter count that
#   must not be added across rows -- confirm).
# String aliases replace np.sum/np.max, whose use in groupby.agg is deprecated in recent pandas.
agg_dict = {col: 'sum' for col in dev_df.drop(columns=cols_to_drop+['trsf_pole_nb']).columns.tolist()}
agg_dict['xf_meter_cnt'] = 'max'

In [None]:
dev_df.drop(columns=cols_to_drop).groupby(['trsf_pole_nb']).agg(agg_dict)

In [None]:
dev_df['trsf_pole_nb'].nunique()

In [None]:
rcpo_final_xfmr_new = dev_df.drop(columns=cols_to_drop).groupby(['trsf_pole_nb']).agg(agg_dict)

In [None]:
rcpo_final_xfmr

In [None]:
rcpo_final_xfmr_new

In [None]:
# Read the regex setup table (all values as strings) and build the pivot_id -> regex_report_title
#   lookup used to rename the event-count columns
sql = """
SELECT * FROM meter_events.event_summ_regex_setup
"""
regex_setup_df = pd.read_sql(sql, conn_aws, dtype=str)
rename_cols_dict = dict(zip(regex_setup_df['pivot_id'], regex_setup_df['regex_report_title']))

In [None]:
# First translate the pivot ids to report titles, then align the two bookkeeping columns
#   with the names used in rcpo_final_xfmr
rcpo_final_xfmr_new = (
    rcpo_final_xfmr_new
    .rename(columns=rename_cols_dict)
    .rename(columns={'xf_meter_cnt': 'n_SNs', 'events_tot': 'n_events_tot'})
)

In [None]:
rcpo_final_xfmr

In [None]:
rcpo_final_xfmr_new

In [None]:
cols_only_in_old = list(set(rcpo_final_xfmr.columns).difference(set(rcpo_final_xfmr_new.columns)))
cols_only_in_old

In [None]:
cols_only_in_new = list(set(rcpo_final_xfmr_new.columns).difference(set(rcpo_final_xfmr.columns)))
assert(rcpo_final_xfmr_new[cols_only_in_new].sum().sum()==0)

In [None]:
overlap_cols = list(set(rcpo_final_xfmr.columns).intersection(set(rcpo_final_xfmr_new.columns)))
overlap_idxs = list(set(rcpo_final_xfmr.index).intersection(set(rcpo_final_xfmr_new.index)))

In [None]:
print(len(overlap_idxs))
print(rcpo_final_xfmr.shape[0])
print(rcpo_final_xfmr_new.shape[0])

In [None]:
# Restrict both results to the shared transformers and shared columns (same ordering for both)
df_old = rcpo_final_xfmr.loc[overlap_idxs, overlap_cols].copy()
df_new = rcpo_final_xfmr_new.loc[overlap_idxs, overlap_cols].copy()

In [None]:
df_old.equals(df_new)

In [None]:
df_old

In [None]:
df_new

In [None]:
diffs = Utilities_df.get_dfs_diff(df_old, df_new)

In [None]:
diffs

In [None]:
dev_df_2 = end_events.df.copy()

In [None]:
# aep_event_dt must be a datetime column so it can be used as the key of a pd.Grouper below
dev_df_2['aep_event_dt'] = pd.to_datetime(dev_df_2['aep_event_dt'])

# Unlike the earlier version of cols_to_drop, aep_event_dt is kept here (it is needed for the
#   time-based grouping); aep_opco is still dropped
cols_to_drop = ['serialnumber', 'aep_premise_nb', 'aep_opco']

# Every remaining column except the grouping keys is summed; xf_meter_cnt takes the maximum instead.
# String aliases replace np.sum/np.max, whose use in groupby.agg is deprecated in recent pandas.
agg_dict = {col: 'sum' for col in dev_df_2.drop(columns=cols_to_drop+['trsf_pole_nb', 'aep_event_dt']).columns.tolist()}
agg_dict['xf_meter_cnt'] = 'max'

# rcpo_final_xfmr_new = dev_df_2.drop(columns=cols_to_drop).groupby(['trsf_pole_nb']).agg(agg_dict)

In [None]:
freq='D'
group_cols=['trsf_pole_nb']
group_freq=pd.Grouper(freq=freq, key='aep_event_dt')
dev_df_2.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

In [None]:
dev_df_2.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

In [None]:
freq='5D'
group_cols=['trsf_pole_nb']
group_freq=pd.Grouper(freq=freq, key='aep_event_dt')
dev_df_2.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)

In [None]:
dev_rcpx = dev_df_2.drop(columns=cols_to_drop).groupby(group_cols+[group_freq]).agg(agg_dict)
dev_rcpx

In [None]:
dev_rcpx.index.get_level_values(1).unique()

In [None]:
dev_rcpx.index.get_level_values(0).nunique()

In [None]:
# One meter count per transformer: drop the time level, keep the distinct
#   (trsf_pole_nb, xf_meter_cnt) pairs, and index the result by trsf_pole_nb.
# squeeze(axis=1) collapses only the column axis, so the result is a Series even when a single
#   transformer is present (a bare squeeze() would return a scalar in that case).
xf_meter_cnt_srs = (
    dev_rcpx.droplevel(1, axis=0)['xf_meter_cnt']
    .reset_index()
    .drop_duplicates()
    .set_index('trsf_pole_nb')
    .squeeze(axis=1)
)
# Sanity check: a transformer reporting different meter counts in different periods would
#   appear more than once here
assert xf_meter_cnt_srs.shape[0]==xf_meter_cnt_srs.index.nunique()
all_trsf_pole_nbs = dev_rcpx.index.get_level_values(0).unique().tolist()
xf_meter_cnt_srs.name='nSNs'
xf_meter_cnt_srs

In [None]:
# Read the regex setup table (all values as strings) and build the pivot_id -> regex_report_title
#   lookup used to rename the event-count columns
sql = """
SELECT * FROM meter_events.event_summ_regex_setup
"""
regex_setup_df = pd.read_sql(sql, conn_aws, dtype=str)
rename_cols_dict = dict(zip(regex_setup_df['pivot_id'], regex_setup_df['regex_report_title']))

In [None]:
date_pd1 = pd.to_datetime('2023-04-01')
assert(date_pd1 in dev_rcpx.index.get_level_values(1).unique())

In [None]:
# Rows of dev_rcpx belonging to the period labelled date_pd1, indexed by transformer only
dev_rcpx_pd1 = dev_rcpx[dev_rcpx.index.get_level_values(1)==date_pd1].copy()
dev_rcpx_pd1 = dev_rcpx_pd1.droplevel(1, axis=0)
# Use the same name for the meter-count column as in the zero-filled rows built below.
# Without this, pd.concat keeps both 'xf_meter_cnt' and 'nSNs', each half-filled with NaN.
dev_rcpx_pd1 = dev_rcpx_pd1.rename(columns={'xf_meter_cnt':'nSNs'})
#-------------------------
# Transformers without any row in this period get an all-zero row
no_events_pd1 = list(set(all_trsf_pole_nbs).difference(set(dev_rcpx_pd1.index.get_level_values(0).unique())))
no_events_pd1_df = pd.DataFrame(
    columns=dev_rcpx.columns,
    index=no_events_pd1,
    data=np.zeros((len(no_events_pd1), dev_rcpx.shape[1]))
)
#-----
# The meter count is not an event count, so the zero placeholder is replaced by the known value
no_events_pd1_df = no_events_pd1_df.drop(columns=['xf_meter_cnt']).merge(
    xf_meter_cnt_srs,
    left_index=True,
    right_index=True,
    how='left'
)
assert no_events_pd1_df['nSNs'].notna().all()
#-------------------------
dev_rcpx_pd1 = pd.concat([dev_rcpx_pd1, no_events_pd1_df])
dev_rcpx_pd1 = Utilities_df.move_cols_to_front(df=dev_rcpx_pd1, cols_to_move=['nSNs'])
#-----
dev_rcpx_pd1=dev_rcpx_pd1.rename(columns=rename_cols_dict)
#-----
# Tag every column with the period it belongs to (new outermost column level)
dev_rcpx_pd1 = Utilities_df.prepend_level_to_MultiIndex(
    df=dev_rcpx_pd1,
    level_val=date_pd1,
    level_name=None,
    axis=1
)

In [None]:
dev_rcpx_pd1

In [None]:
# Build one DataFrame per time period, each containing a row for every transformer
#   (all-zero event counts when the transformer has no row in that period)
pd_dfs = []
# Loop-invariant pieces, computed once
period_labels = dev_rcpx.index.get_level_values(1)
all_trsf_pole_nbs_set = set(all_trsf_pole_nbs)
for date_pd_i in natsorted(period_labels.unique()):
    dev_rcpx_pd_i = dev_rcpx[period_labels==date_pd_i].copy()
    dev_rcpx_pd_i = dev_rcpx_pd_i.droplevel(1, axis=0)
    # Use the same name for the meter-count column as in the zero-filled rows built below.
    # Without this, pd.concat keeps both 'xf_meter_cnt' and 'nSNs', each half-filled with NaN.
    dev_rcpx_pd_i = dev_rcpx_pd_i.rename(columns={'xf_meter_cnt':'nSNs'})
    #-------------------------
    # Transformers without any row in this period get an all-zero row
    no_events_pd_i = list(all_trsf_pole_nbs_set.difference(set(dev_rcpx_pd_i.index.get_level_values(0).unique())))
    no_events_pd_i_df = pd.DataFrame(
        columns=dev_rcpx.columns,
        index=no_events_pd_i,
        data=np.zeros((len(no_events_pd_i), dev_rcpx.shape[1]))
    )
    #-----
    # The meter count is not an event count, so the zero placeholder is replaced by the known value
    no_events_pd_i_df = no_events_pd_i_df.drop(columns=['xf_meter_cnt']).merge(
        xf_meter_cnt_srs,
        left_index=True,
        right_index=True,
        how='left'
    )
    assert no_events_pd_i_df['nSNs'].notna().all()
    #-------------------------
    dev_rcpx_pd_i = pd.concat([dev_rcpx_pd_i, no_events_pd_i_df])
    dev_rcpx_pd_i = Utilities_df.move_cols_to_front(df=dev_rcpx_pd_i, cols_to_move=['nSNs'])
    #-----
    dev_rcpx_pd_i=dev_rcpx_pd_i.rename(columns=rename_cols_dict)
    #-----
    # Tag every column with the period it belongs to (new outermost column level)
    dev_rcpx_pd_i = Utilities_df.prepend_level_to_MultiIndex(
        df=dev_rcpx_pd_i,
        level_val=date_pd_i,
        level_name=None,
        axis=1
    )
    #-------------------------
    pd_dfs.append(dev_rcpx_pd_i)

In [None]:
final_df = pd.concat(pd_dfs, axis=1)
final_df

In [None]:
final_df.isna().sum().sum()

In [None]:
final_df.columns.get_level_values(0).unique()

In [None]:
prediction_date

In [None]:
natsorted(dev_df_2['aep_event_dt'].unique())

In [None]:
final_df

# TODO: Make sure full set of reasons match

In [None]:
# Full (un-reduced) reason strings expected in the data; compared below against the columns of tmp_df.
# NOTE: Whitespace inside the strings is significant for those comparisons -- several entries end with
#   a trailing space or contain double spaces, and must be kept exactly as written.
expected_reasons_full = [
    'Access Point has lost connectivity with FHSS 900 MHz band.',
    'Cleared: Meter cleared tamper detection (C1219 Table 3)',
    'Cleared: detected a high temperature condition. (C1219 Table 3)',
    'Demand Reset',
    'Detected end of voltage sag',
    'Detected end of voltage swell',
    'Device Failed: Reason: Security public key mismatch',
    'Device Failed: Reason: unknown (0x4)',
    'Device Failed: Reason: unknown (0x6)',
    'Device exceeded the max allowable trap threshold',
    'Diag1 Condition cleared',
    'Diag1: Polarity, Cross Phase, Reverse Energy Flow: Angle out of tolerance.',
    'Diag6 Condition cleared',
    'Diag6: Under Voltage, Element A',
    'Diag7 Condition cleared',
    'Diag7: Over Voltage, Element A ',
    'Error occurred when attempting to synch meter time with NIC time for device',
    'Ignoring Interval Read data for device as it has time in the future',
    'Ignoring Register Read data for device as it has time in the future',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x00]',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x01]',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x04]',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x10] LG_PF_DETECTOR',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x18] LG_ZERO_X_DETECTOR ,LG_PF_DETECTOR',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x40] LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x41] LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x42] LG_FLAG_FLASH_ERR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x43] LG_FLAG_FLASH_ERR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x48] LG_ZERO_X_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x49] LG_ZERO_X_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x4A] LG_FLAG_FLASH_ERR ,LG_ZERO_X_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x50] LG_PF_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x51] LG_PF_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x58] LG_ZERO_X_DETECTOR ,LG_PF_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp - NIC power lost for device, Fail Reason: [0x59] LG_ZERO_X_DETECTOR ,LG_PF_DETECTOR ,LG_DIRECT_NOTIFICATION',
    'Last Gasp State: EL_EVENT_POWER_FAIL_DETECT_LG_DISABLED, Detector State: EL_EVENT_POWER_FAIL_DETECT_METER_PF_DISABLED',
    'Last Gasp State: EL_EVENT_POWER_FAIL_DETECT_LG_DISABLED, Detector State: EL_EVENT_POWER_FAIL_DETECT_NIC_ZX_DISABLED',
    'Low Battery (C1219 Table 3)',
    'Low Potential (C1219 Table 3)',
    'Low Potential cleared',
    'Measurement Error (C1219 Table 3) ',
    'Measurement Error cleared',
    'Meter Program Seal mismatch for Device',
    'Meter detected a high temperature condition (C1219 Table 3)',
    'Meter detected loss of time (C1219 Table 3)',
    'Meter detected tampering (C1219 Table 3)',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_LP_BX, TS_ERR_BIG_DRIFT [0x44]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX [0x4C]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX, TS_ERR_DST_BX [0x6C]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX, TS_ERR_NEAR_DST_BND [0x5C]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DST_BX [0x64]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_NEAR_DST_BND [0x54]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT [0x45]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX [0x4D]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DST_BX [0x65]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_NEAR_DST_BND [0x55]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_NEAR_LP_BND, TS_ERR_BIG_DRIFT [0x43]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT [0x47]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX [0x4F]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_DAY_BND, TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX, TS_ERR_DST_BX [0x6F]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_BIG_DRIFT [0x42]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT [0x46]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX [0x4E]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX, TS_ERR_DST_BX [0x6E]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DAY_BX, TS_ERR_NEAR_DST_BND [0x5E]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_DST_BX [0x66]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_LP_BX, TS_ERR_BIG_DRIFT, TS_ERR_NEAR_DST_BND [0x56]',
    'Meter needs explicit time sync. Encountered Problems:  TS_ERR_NEAR_LP_BND, TS_ERR_NEAR_DST_BND, TS_ERR_BIG_DRIFT [0x52]',
    'N/A failed consecutively for 1 or more times',
    'NET_MGMT command failed consecutively for 1 or more times',
    'NET_MGMT command was sent with a key that has insufficient privileges: READ SUBID: 65535 ASSOC_ID: 20036',
    'NET_MGMT command was sent with a key that has insufficient privileges: WRITE SUBID: 65535 ASSOC_ID: 45182',
    'NET_MGMT command was sent with a key that has insufficient privileges: WRITE SUBID: 65535 ASSOC_ID: 768',
    'NIC Link Layer Handshake Failed: Rejection Cause: invalid birth certificate',
    'NIC Link Layer Handshake Failed: Rejection Cause: invalid eblob signature',
    'NIC Link Layer Handshake Failed: Rejection Cause: invalid mfg cert',
    'NIC Power Restore Trap Received from device',
    'NVRAM Error (C1219 Table 3) ',
    'NVRAM Error cleared',
    'Over Voltage (Diagnostic 7) : Phase A.',
    'Over Voltage (Diagnostic 7) cleared',
    'Primary Power Down',
    'Primary Power Up',
    'Requested operation could not be applied: JOB_OP_LP_READ ',
    'Requested operation could not be applied: JOB_OP_NEW_DATA_READ ',
    'Requested operation could not be applied: JOB_OP_PROVISION_CONNECT ',
    'Requested operation could not be applied: JOB_OP_PROVISION_DISCONNECT ',
    'Requested operation could not be applied: JOB_OP_PROVISION_GET_STATUS ',
    'Requested operation could not be applied: JOB_OP_REGISTER_CURR_READ ',
    'Requested operation could not be applied: JOB_OP_TYPE_ARB_METER_COMMAND ',
    'Requested operation could not be applied: JOB_OP_TYPE_DEMAND_RESET ',
    'Secure association operation failed consecutively for 1 or more times',
    'System Error (C1219 Table 3: Er000020)',
    'Tamper (Meter Inversion) detected',
    'Test Mode Started',
    'Test Mode Stopped',
    'Under Voltage (CA000400)  Phase  A Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase  A, B Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase  A, B, C Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase  A, C Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase  B Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase  B, C Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase  C Voltage out of tolerance.',
    'Under Voltage (CA000400)  Phase A and C.',
    'Under Voltage (CA000400)  Phase A.',
    'Under Voltage (CA000400)  Phase C.',
    'Under Voltage (CA000400) cleared',
    'Under Voltage (Diagnostic 6)  Phase A.',
    'Under Voltage (Diagnostic 6) cleared'
]

In [None]:
# Number of entries in expected_reasons_full.
len(expected_reasons_full)

In [None]:
# Working copy of the final_df columns stored under the 2023-04-01 key.
# NOTE(review): final_df is reassigned further down via pd.concat(pd_dfs, axis=1); this cell
#   presumably relies on a definition made earlier in the session -- confirm the run order.
tmp_df = final_df[pd.to_datetime('2023-04-01 00:00:00')].copy()

In [None]:
# Column names of the selected slice.
tmp_df.columns.tolist()

In [None]:
# The column(s) named '' must sum to zero before being dropped.
assert(tmp_df[''].sum().sum()==0)
tmp_df=tmp_df.drop(columns=[''])

In [None]:
# Disabled: removal of the 'cleared' and 'Test Mode' reason columns from tmp_df.
# tmp_df = MECPODf.remove_reasons_from_rcpo_df(
#     rcpo_df=tmp_df, 
#     regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*'], 
#     ignore_case=True
# )
# tmp_df.shape

In [None]:
# Column names after dropping the '' column.
tmp_df.columns.tolist()

In [None]:
# Columns present in tmp_df but absent from expected_reasons_full.
set(tmp_df.columns).difference(set(expected_reasons_full))

In [None]:
# Expected reasons that have no matching column in tmp_df.
set(expected_reasons_full).difference(set(tmp_df.columns))

In [None]:
# Natural-sorted column names, for comparison against expected_reasons_full.
natsorted(tmp_df.columns.tolist())

In [None]:
# Spot check: second natural-sorted column name ...
natsorted(tmp_df.columns.tolist())[1]

In [None]:
# ... versus the second expected reason ...
expected_reasons_full[1]

In [None]:
# ... and whether the two are equal.
natsorted(tmp_df.columns.tolist())[1]==expected_reasons_full[1]

In [None]:
dev_df = final_df['2023-04-01'].copy()

In [None]:
dev_df.shape

In [None]:
assert(dev_df[''].sum().sum()==0)
dev_df=dev_df.drop(columns=[''])

In [None]:
dev_df = MECPODf.remove_reasons_from_rcpo_df(
    rcpo_df=dev_df, 
    regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*'], 
    ignore_case=True
)
dev_df.shape

In [None]:
dev_df

In [None]:
# Since irrelevant cleared and test columns removed, need to recalculate events_tot
non_reason_cols = ['xf_meter_cnt', 'events_tot']
total_counts_col = 'events_tot'
assert(total_counts_col in non_reason_cols)
dev_df[total_counts_col] = dev_df.drop(columns=non_reason_cols).sum(axis=1)

In [None]:
dev_df = MECPODf.combine_cpo_df_reasons(rcpo_df=dev_df)
dev_df.shape

In [None]:
include_power_down_minus_up=False
if include_power_down_minus_up:
    dev_df = MECPODf.delta_cpo_df_reasons(
        rcpo_df=dev_df, 
        reasons_1='Primary Power Down',
        reasons_2='Primary Power Up',
        delta_reason_name='Power Down Minus Up'
    )

In [None]:
# Reason columns kept individually; any other reason is folded into a combined column
#   by the get_reasons_subset_from_cpo_df call below (combine_others=True).
# 'Other Reasons' and 'total_counts' are deliberately left out of this list
#   (they were previously listed here and are kept commented out for reference):
#     'Other Reasons',
#     'total_counts'
final_reason_cols = [
    'NET_MGMT command failed consecutively',
    'Primary Power Down',
    'Primary Power Up',
    'Error occurred when attempting to synch meter time with NIC time for device',
    'NIC Power Restore Trap Received from device',
    'Under Voltage',
    'Meter needs explicit time sync',
    'Last Gasp',
    'Over Voltage',
    'Detected end of voltage sag',
]

In [None]:
len(final_reason_cols)

In [None]:
set(final_reason_cols).difference(set(dev_df.columns.tolist()))

In [None]:
assert(len(set(final_reason_cols).difference(set(dev_df.columns.tolist())))==0)
dev_df = MECPODf.get_reasons_subset_from_cpo_df(
    cpo_df=dev_df, 
    reasons_to_include=final_reason_cols, 
    combine_others=True, 
    output_combine_others_col='Other Reasons', 
    SNs_tags=None, 
    is_norm=False, 
    counts_col='xf_meter_cnt', 
    normalize_by_nSNs_included=False, 
    level_0_raw_col = 'counts', 
    level_0_nrm_col = 'counts_norm', 
    cols_to_ignore = ['events_tot']
)

In [None]:
dev_df

In [None]:
dev_df.iloc[:, :-1].sum(axis=1).equals(dev_df.iloc[:, -1])

In [None]:
# Natural-sorted unique values of index level 1 of dev_rcpx.
# The cells below iterate over these values as the per-period keys.
natsorted(dev_rcpx.index.get_level_values(1).unique())

In [None]:
# Stored DataFrame whose per-period column lists are read in the loop below
#   (full_data_df[final_time_pd_i].columns).
# NOTE: pd.read_pickle unpickles the file, which can execute arbitrary code; only load trusted files.
# NOTE(review): the path is user/machine specific.
full_data_df = pd.read_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230615\Models\All_EEMSP_agg_Top10_v2\full_data_df.pkl')

In [None]:
# Period labels, in chronological order, used as the top-level column keys of full_data_df.
final_time_pds = [
    '01-06 Days',
    '06-11 Days',
    '11-16 Days',
    '16-21 Days',
    '21-26 Days',
    '26-31 Days',
]
# Map each period found in dev_rcpx (index level 1, natural-sorted) to its label by position.
date_pds_sorted = natsorted(dev_rcpx.index.get_level_values(1).unique())
# zip() silently stops at the shorter input, so a mismatch in the number of periods would
#   leave periods unmapped or assign the wrong label; fail loudly instead.
assert len(date_pds_sorted)==len(final_time_pds), (
    f"Found {len(date_pds_sorted)} periods in dev_rcpx but {len(final_time_pds)} labels in final_time_pds"
)
time_pds_rename = dict(zip(
    date_pds_sorted, 
    final_time_pds
))

In [None]:
# Final names of the two non-reason columns.
total_counts_col = 'total_counts'
nSNs_col = 'nSNs'

# Raw column name -> final column name.
rename_cols = {
    'events_tot': total_counts_col,
    'xf_meter_cnt': nSNs_col,
}

# Columns that are not event reasons (left out when summing reason counts).
non_reason_cols = [nSNs_col, total_counts_col]

# When True, a 'Power Down Minus Up' column is added for every period.
include_power_down_minus_up = False
#-------------------------
# Build one reason-count DataFrame per period (index level 1 of dev_rcpx) and collect them in pd_dfs.
# For each period: add all-zero rows for members of all_trsf_pole_nbs without events, remove the
#   'cleared'/'Test Mode' reasons, recompute the total, combine reasons, keep only the reason columns
#   that full_data_df has for that period, and prepend the period label as a column level.
#-------------------------
# Apply rename_cols (events_tot -> total_counts, xf_meter_cnt -> nSNs) to dev_rcpx.
dev_rcpx=dev_rcpx.rename(columns=rename_cols)
#-------------------------
pd_dfs = []
for date_pd_i in natsorted(dev_rcpx.index.get_level_values(1).unique()):
    # Label under which this period is stored in full_data_df
    final_time_pd_i = time_pds_rename[date_pd_i]
    #-----
    # Reason columns full_data_df holds for this period (non-reason columns and 'Other Reasons' excluded)
    final_reason_cols_i = full_data_df[final_time_pd_i].columns.tolist()
    final_reason_cols_i = [x for x in final_reason_cols_i if x not in non_reason_cols+['Other Reasons']]
    #-------------------------
    # Rows of this period only; the period level is then dropped from the index
    dev_rcpx_pd_i = dev_rcpx[dev_rcpx.index.get_level_values(1)==date_pd_i].copy()
    dev_rcpx_pd_i = dev_rcpx_pd_i.droplevel(1, axis=0)
    #-------------------------
    # Members of all_trsf_pole_nbs missing from this period's index get an all-zero row
    no_events_pd_i = list(set(all_trsf_pole_nbs).difference(set(dev_rcpx_pd_i.index.get_level_values(0).unique())))
    no_events_pd_i_df = pd.DataFrame(
        columns=dev_rcpx.columns, 
        index=no_events_pd_i, 
        data=np.zeros((len(no_events_pd_i), dev_rcpx.shape[1]))
    )
    #-----
    # Replace the zero-filled nSNs column with the values from xf_meter_cnt_srs
    # NOTE(review): the assert below requires the merged column to be called 'nSNs', i.e.
    #   presumably xf_meter_cnt_srs.name == 'nSNs' -- confirm.
    no_events_pd_i_df = no_events_pd_i_df.drop(columns=['nSNs']).merge(
        xf_meter_cnt_srs, 
        left_index=True, 
        right_index=True, 
        how='left'
    )
    assert(no_events_pd_i_df['nSNs'].notna().all())
    #-----
    no_events_pd_i_df = Utilities_df.move_cols_to_front(df=no_events_pd_i_df, cols_to_move=['nSNs'])
    #-------------------------
    dev_rcpx_pd_i = pd.concat([dev_rcpx_pd_i, no_events_pd_i_df])
    #-------------------------
    # NOTE(review): rename_cols_dict is not defined in this cell (only rename_cols is). Confirm it is
    #   defined elsewhere, or whether rename_cols was intended here.
    dev_rcpx_pd_i=dev_rcpx_pd_i.rename(columns=rename_cols_dict)
    #--------------------------------------------------
    # The column(s) named '' must sum to zero before being dropped
    assert(dev_rcpx_pd_i[''].sum().sum()==0)
    dev_rcpx_pd_i=dev_rcpx_pd_i.drop(columns=[''])
    #-------------------------
    dev_rcpx_pd_i = MECPODf.remove_reasons_from_rcpo_df(
        rcpo_df=dev_rcpx_pd_i, 
        regex_patterns_to_remove=['.*cleared.*', '.*Test Mode.*'], 
        ignore_case=True
    )
    #-----
    # Since irrelevant cleared and test columns removed, need to recalculate the total counts column
    #   (total_counts_col) as the row-wise sum of all columns not in non_reason_cols
    assert(total_counts_col in non_reason_cols)
    dev_rcpx_pd_i[total_counts_col] = dev_rcpx_pd_i.drop(columns=non_reason_cols).sum(axis=1)
    #-------------------------
    dev_rcpx_pd_i = MECPODf.combine_cpo_df_reasons(rcpo_df=dev_rcpx_pd_i)
    #-------------------------
    # Optional 'Power Down Minus Up' column
    if include_power_down_minus_up:
        dev_rcpx_pd_i = MECPODf.delta_cpo_df_reasons(
            rcpo_df=dev_rcpx_pd_i, 
            reasons_1='Primary Power Down',
            reasons_2='Primary Power Up',
            delta_reason_name='Power Down Minus Up'
        )
    #-------------------------
    # All reasons required for this period must be present before subsetting;
    #   the remaining reasons are collected in 'Other Reasons'
    assert(len(set(final_reason_cols_i).difference(set(dev_rcpx_pd_i.columns.tolist())))==0)
    dev_rcpx_pd_i = MECPODf.get_reasons_subset_from_cpo_df(
        cpo_df=dev_rcpx_pd_i, 
        reasons_to_include=final_reason_cols_i, 
        combine_others=True, 
        output_combine_others_col='Other Reasons', 
        SNs_tags=None, 
        is_norm=False, 
        counts_col='nSNs', 
        normalize_by_nSNs_included=False, 
        level_0_raw_col = 'counts', 
        level_0_nrm_col = 'counts_norm', 
        cols_to_ignore = ['total_counts'], 
        include_counts_col_in_output=True
    )    
    #--------------------------------------------------
    # NOTE(review): dev_rcpx was already renamed with rename_cols before the loop; presumably this
    #   call is a safeguard in case the helper above returns the original names -- confirm.
    dev_rcpx_pd_i = dev_rcpx_pd_i.rename(columns=rename_cols)
    #-------------------------
    # Don't want nSNs in each pd individually
    dev_rcpx_pd_i = dev_rcpx_pd_i.drop(columns=[nSNs_col])
    #-------------------------
    # Prepend the period label as a new column level
    dev_rcpx_pd_i = Utilities_df.prepend_level_to_MultiIndex(
        df=dev_rcpx_pd_i, 
        level_val=final_time_pd_i, 
        level_name=None, 
        axis=1
    )
    #-------------------------
    pd_dfs.append(dev_rcpx_pd_i)

In [None]:
final_df = pd.concat(pd_dfs, axis=1)
final_df

In [None]:
final_df=final_df.merge(
    xf_meter_cnt_srs.to_frame(name=('nSNs', 'nSNs')), 
    left_index=True, 
    right_index=True, 
    how='left'
)
assert(final_df['nSNs'].notna().all().all())

In [None]:
final_df

# EEMSP

In [None]:
# Connection object passed to pd.read_sql_query below when running sql_EEMSP.
# NOTE(review): presumably an Athena (production AWS) connection, judging by the helper's name -- confirm.
conn_aws = Utilities.get_athena_prod_aws_connection()

In [None]:
# EEMSP settings.
merge_eemsp = True
mult_strategy = 'agg'
#-----
# Columns of the EEMSP table that are kept after the reduction step.
cols_of_interest_eemsp = [
    'location_nb',
    'mfgr_nm',
    'install_dt',
    'last_trans_desc',
    'eqtype_id',
    'coolant',
    'info',
    'kva_size',
    'phase_cnt',
    'prim_voltage',
    'protection',
    'pru_number',
    'sec_voltage',
    'special_char',
    'taps',
    'xftype',
]
# Columns actually queried: the above plus three that are only needed along the way.
cols_of_interest_eemsp_full = [
    *cols_of_interest_eemsp,
    'latest_status',
    'removal_dt',
    'serial_nb',
]

In [None]:
# Pieces of the EEMSP query:
#   - the selected columns (unquoted) and the location numbers (quoted) as SQL lists
#   - install_dt must be on or before date_range[0]
#   - removal_dt must be NULL or after date_range[1]
eemsp_select_list   = Utilities_sql.join_list(cols_of_interest_eemsp_full, quotes_needed=False)
eemsp_location_list = Utilities_sql.join_list(trsf_pole_nbs, quotes_needed=True)
sql_EEMSP = """
SELECT {} 
FROM meter_events.eems_transformer_nameplate
WHERE location_nb IN ({})
AND install_dt <= '{}'
AND (removal_dt IS NULL OR removal_dt > '{}')
""".format(eemsp_select_list, eemsp_location_list, date_range[0], date_range[1])

In [None]:
# Show the final SQL text before running it.
print(sql_EEMSP)

In [None]:
# Run the query over conn_aws and load the result into a DataFrame.
df_eemsp = pd.read_sql_query(sql_EEMSP, conn_aws)

In [None]:
df_eemsp

In [None]:
# No need to run reduce1_eemsp_for_outg_trsf for this case, as all rows share the same date restrictions,
#   which were already imposed in sql_EEMSP
#-----
# reduce2_eemsp_for_outg_trsf was designed to be used with outg_rec_nb/no_outg_rec_nb.
# That is not needed here, but the function still expects such a column, so a temporary
#   'outg_rec_nb' column is created as a copy of 'location_nb'.
df_eemsp['outg_rec_nb'] = df_eemsp['location_nb']

In [None]:
df_eemsp

In [None]:
df_eemsp_reduce2 = reduce2_eemsp_for_outg_trsf(
    df_eemsp=df_eemsp, 
    mult_strategy='agg', 
    include_n_eemsp=True, 
    outg_rec_nb_col='outg_rec_nb', 
    location_nb_col='location_nb', 
    numeric_cols = ['kva_size'], 
    dt_cols = ['install_dt', 'removal_dt'], 
    ignore_cols = ['serial_nb'], 
    cat_cols_as_strings=True
)

In [None]:
df_eemsp['location_nb'].value_counts()

In [None]:
df_eemsp_reduce2.columns.tolist()

In [None]:
#-------------------------
# Regardless of the mult_strategy used, df_eemsp_reduce2 must now hold exactly one row
#   per (outg_rec_nb, location_nb) pair
eemsp_pair_counts = df_eemsp_reduce2[['outg_rec_nb', 'location_nb']].value_counts()
assert (eemsp_pair_counts == 1).all()

#----------------------------------------------------------------------------------------------------
# Clean up df_eemsp_reduce2 ahead of the merge
#--------------------------------------------------
# Remove the columns that were queried only as helpers (those in cols_of_interest_eemsp_full but not
#   in cols_of_interest_eemsp).  Selecting df_eemsp_reduce2[cols_of_interest_eemsp] instead would also
#   discard columns added along the way (e.g. the outg_rec_nb helper column, and any others which may
#   be added in the future).
helper_only_cols = set(cols_of_interest_eemsp_full) - set(cols_of_interest_eemsp)
cols_to_drop = [x for x in helper_only_cols if x in df_eemsp_reduce2.columns]
if cols_to_drop:
    df_eemsp_reduce2 = df_eemsp_reduce2.drop(columns=cols_to_drop)

In [None]:
    # Sanity checks on df_eemsp_reduce2 before merging.
    # NOTE(review): the live lines of this cell are indented as if lifted from inside a function;
    #   presumably the cell only runs because IPython strips a cell's leading indentation --
    #   consider dedenting.
    #-------------------------
    # One row per (outg_rec_nb, location_nb) group
    assert(df_eemsp_reduce2.shape[0]==df_eemsp_reduce2.groupby(['outg_rec_nb', 'location_nb']).ngroups)
    # Compare the number of distinct location_nb values found with the number of entries in trsf_pole_nbs
    print(f"df_eemsp_reduce2['location_nb'].nunique() = {df_eemsp_reduce2['location_nb'].nunique()}")
    print(f"len(trsf_pole_nbs)                        = {len(trsf_pole_nbs)}")
    print(f"Diff                                      = {len(trsf_pole_nbs)-df_eemsp_reduce2['location_nb'].nunique()}")
    print()
    #-------------------------
# Disabled: earlier version of the merge, targeting merged_df_full.
#   A merge_rcpx_with_eemsp call on final_df appears in a later cell.
#     print("\nShapes BEFORE merging")
#     print(f"merged_df_full.shape          = {merged_df_full.shape}")
#     #-------------------------
#     merged_df_full = merge_rcpx_with_eemsp(
#         df_rcpx=merged_df_full, 
#         df_eemsp=df_eemsp_reduce2, 
#         outg_rec_nb_idfr_rcpx ='index_0', 
#         trsf_pole_nb_idfr_rcpx='index_1', 
#         outg_rec_nb_idfr_eemsp='OUTG_REC_NB_TO_MERGE', 
#         location_nb_idfr_eemsp='LOCATION_NB', 
#         set_index=True
#     )
#     #-------------------------
#     print("\nShapes AFTER merging")
#     print(f"merged_df_full.shape          = {merged_df_full.shape}")

In [None]:
df_eemsp_reduce2 = Utilities_df.make_all_column_names_uppercase(df_eemsp_reduce2, cols_to_exclude=['n_eemsp'])

In [None]:
final_df = final_df.set_index([final_df.index, final_df.index])

In [None]:
print("\nShapes BEFORE merging")
print(f"final_df.shape          = {final_df.shape}")
#-------------------------
final_df = merge_rcpx_with_eemsp(
    df_rcpx=final_df, 
    df_eemsp=df_eemsp_reduce2, 
    outg_rec_nb_idfr_rcpx ='index_0', 
    trsf_pole_nb_idfr_rcpx='index_1', 
    outg_rec_nb_idfr_eemsp='OUTG_REC_NB', 
    location_nb_idfr_eemsp='LOCATION_NB', 
    set_index=True
)
#-------------------------
print("\nShapes AFTER merging")
print(f"final_df.shape          = {final_df.shape}")

In [None]:
final_df

In [None]:
final_df=final_df.droplevel(0, axis=0)
final_df[('EEMSP_0', 'INSTALL_DT')] = (prediction_date-final_df[('EEMSP_0', 'INSTALL_DT')]).dt.total_seconds()/(60*60*24*365)

In [None]:
final_df

In [None]:
final_df

In [None]:
final_df

In [None]:
import joblib
forest_clf = joblib.load(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230615\Models\All_EEMSP_agg_Top10_v2\forest_clf.joblib')
scaler = joblib.load(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230615\Models\All_EEMSP_agg_Top10_v2\scaler.joblib')

In [None]:
full_data_df = pd.read_pickle(r'C:\Users\s346557\Documents\LocalData\dovs_and_end_events_data\20230615\Models\All_EEMSP_agg_Top10_v2\full_data_df.pkl')

In [None]:
set(full_data_df.columns.tolist()).difference(set(final_df.columns.tolist()))

In [None]:
set(final_df.columns.tolist()).difference(set(full_data_df.columns.tolist()))

In [None]:
full_data_df.columns.get_level_values(1).unique()

In [None]:
full_data_df.columns.get_level_values(1).nunique()

In [None]:
prediction_date.month

In [None]:
final_df[('dummy_lvl_0', 'outg_month')] = prediction_date.month

In [None]:
set(full_data_df.columns.tolist()).difference(set(final_df.columns.tolist()))

In [None]:
X_test = final_df[full_data_df.drop(columns=[('from_outg','from_outg')]).columns].copy()

In [None]:
# Encode the non-numeric EEMSP_0 columns of X_test as integer labels.
# For each column the encoder is fitted on full_data_df and then applied to X_test.
# NOTE: both full_data_df and X_test have these columns cast to str in place.
# NOTE(review): `preprocessing` is presumably sklearn.preprocessing, whose LabelEncoder.transform
#   raises on values that were not present when fitting -- confirm.
le = preprocessing.LabelEncoder()
numeric_cols = ['KVA_SIZE', 'INSTALL_DT']
cols_to_encode = [x for x in full_data_df['EEMSP_0'].columns if x not in numeric_cols]
for col in cols_to_encode:
    eemsp_col_key = ('EEMSP_0', col)
    full_data_df[eemsp_col_key] = full_data_df[eemsp_col_key].astype(str)
    X_test[eemsp_col_key] = X_test[eemsp_col_key].astype(str)
    #-----
    le.fit(full_data_df[eemsp_col_key])
    #-----
    X_test[eemsp_col_key] = le.transform(X_test[eemsp_col_key])

In [None]:
X_test

In [None]:
X_test = scaler.transform(X_test)

In [None]:
y_pred = forest_clf.predict(X_test)

In [None]:
print(y_pred.sum())
print(y_pred.shape[0])
print(100*y_pred.sum()/y_pred.shape[0])

In [None]:
y_pred.sum()