In [1]:
import sys, os
import re

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
from scipy import stats
import datetime

import pyodbc
#---------------------------------------------------------------------
sys.path.insert(0, os.path.realpath('..'))
import Utilities_config
#---------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.ticker as ticker
#---------------------------------------------------------------------
sys.path.insert(0, Utilities_config.get_utilities_dir())
import Utilities
import Utilities_df
import Plot_Box_sns
import GrubbsTest

In [2]:
# Location of the single-meter lookup CSV.  The original hard-coded user path is kept
# as the default; set MY_METER_PREMISE_INFO_CSV to run the notebook on another machine.
my_meter_premise_info_path = os.environ.get(
    'MY_METER_PREMISE_INFO_CSV',
    r'C:\Users\s346557\Documents\my_meter_premise_info.csv'
)
# NOTE(review): read_csv infers dtypes, so a premise number with a leading zero (such as
# those in the premise_nbs list later in this notebook) would lose it -- consider dtype=str.
my_meter_premise_info = pd.read_csv(my_meter_premise_info_path)
my_meter_premise_info = Utilities_df.remove_prepend_from_columns_in_df(my_meter_premise_info)
# The SQL built further down reads scalar values ('prem_nb', 'mfr_devc_ser_nbr') out of
# this dict, which only works when the file holds exactly one row, so fail loudly here
# instead of producing a malformed query.
assert my_meter_premise_info.shape[0]==1, (
    f'Expected exactly one row in {my_meter_premise_info_path}, '
    f'found {my_meter_premise_info.shape[0]}'
)
# Column name -> value for that single row
my_meter_premise_info_dict = my_meter_premise_info.iloc[0].to_dict()

In [3]:
conn_aws = Utilities.get_athena_prod_aws_connection()

In [4]:
# Columns requested from usage_instantaneous.inst_msr_consume, in SELECT order.
cols_of_interest_usage_inst = [
    # read type and meter / premise identifiers
    'read_type', 'serialnumber', 'aep_premise_nb',
    # read-time columns
    'timezoneoffset', 'aep_readtime', 'aep_readtime_utc',
    # the reading itself
    'measurement_type', 'measurement_value',
    # location columns
    'longitude', 'latitude',
    # opco and read-date columns (both also appear in the WHERE clause of the query)
    'aep_opco', 'aep_read_dt',
]

In [5]:
date_range = ['2021-10-12', '2021-11-13']

In [6]:
# Query for the readings of the one meter described by my_meter_premise_info_dict,
# restricted to date_range (BETWEEN makes both end dates inclusive).
sql_usg_inst = (
"""
SELECT {cols}
FROM usage_instantaneous.inst_msr_consume
WHERE aep_opco = 'oh' 
AND aep_read_dt BETWEEN '{date_0}' AND '{date_1}'
AND aep_premise_nb = '{prem_nb}'
AND serialnumber = '{serial_nb}'
"""
).format(
    cols=','.join(cols_of_interest_usage_inst),
    date_0=date_range[0],
    date_1=date_range[1],
    prem_nb=my_meter_premise_info_dict['prem_nb'],
    serial_nb=my_meter_premise_info_dict['mfr_devc_ser_nbr'],
)

In [7]:
print(sql_usg_inst)


SELECT read_type,serialnumber,aep_premise_nb,timezoneoffset,aep_readtime,aep_readtime_utc,measurement_type,measurement_value,longitude,latitude,aep_opco,aep_read_dt
FROM usage_instantaneous.inst_msr_consume
WHERE aep_opco = 'oh' 
AND aep_read_dt BETWEEN '2021-10-12' AND '2021-11-13'
AND aep_premise_nb = '104752350'
AND serialnumber = '879838382'



In [8]:
# Run the query on the Athena connection and pass the result straight through
# Utilities_df.remove_table_aliases; the _OG frame is kept as the untouched original.
df_usg_inst_OG = Utilities_df.remove_table_aliases(
    pd.read_sql(sql_usg_inst, conn_aws)
)

In [9]:
df_usg_inst_OG.head()

Unnamed: 0,read_type,serialnumber,aep_premise_nb,timezoneoffset,aep_readtime,aep_readtime_utc,measurement_type,measurement_value,longitude,latitude,aep_opco,aep_read_dt
0,mt113_st28_pm_temp_powerquality,879838382,104752350,-04:00,2021-10-28 09:35:17,1635413717,temperature_f,80.6,-83.012651,40.061243,oh,2021-10-28
1,mt113_st28_am_temp_powerquality,879838382,104752350,-04:00,2021-10-28 13:48:26,1635428906,instantaneous_ia,2.91,-83.012651,40.061243,oh,2021-10-28
2,mt113_st28_pm_temp_powerquality,879838382,104752350,-04:00,2021-10-28 05:43:09,1635399789,temperature_f,80.6,-83.012651,40.061243,oh,2021-10-28
3,mt113_st28_pm_temp_powerquality,879838382,104752350,-04:00,2021-10-28 05:43:09,1635399789,instantaneous_ia,0.76,-83.012651,40.061243,oh,2021-10-28
4,mt113_st28_pm_temp_powerquality,879838382,104752350,-04:00,2021-10-28 05:43:09,1635399789,voltage_phase_a,240.9,-83.012651,40.061243,oh,2021-10-28


# -----

In [10]:
df_usg_inst = df_usg_inst_OG.copy()

In [11]:
# Report the frame's dimensions, then the distinct measurement types under a dashed rule
frame_shape = df_usg_inst.shape
print(f'df_usg_inst.shape = {frame_shape}')

unq_msr_types = df_usg_inst['measurement_type'].unique()
# Three positional arguments, joined by print's default single-space separator
print(
    "\ndf_usg_inst['measurement_type'].unique()\n",
    '-'*25+'\n',
    unq_msr_types
)

df_usg_inst.shape = (1017, 12)

df_usg_inst['measurement_type'].unique()
 -------------------------
 ['temperature_f' 'instantaneous_ia' 'voltage_phase_a'
 'power_factor_phase_a' 'instantaneous_kw' 'capacitor_voltage'
 'rom_error_status' 'device_firmware_version_support_status'
 'demand_overload_status' 'unprogrammed_status' 'loss_of_program_status'
 'low_battery_status' 'capacitor_fault_bit' 'capacitor_fault_status'
 'dsp_error_status' 'nvram_error_status' 'reverse_energy_flow_status'
 'ram_error_status' 'system_error_status' 'leading_kvarh_status'
 'under_voltage_status' 'clock_error_status' 'meter_inversion_status']


In [None]:
# Convert the aep_readtime column to pandas datetimes (the sort in the next cell uses it)
df_usg_inst['aep_readtime'] = pd.to_datetime(df_usg_inst['aep_readtime'])
# Cast measurement_value to float through the project helper.  to_numeric_errors='coerce'
# is presumably forwarded to pd.to_numeric, in which case non-numeric readings become NaN
# -- TODO confirm in Utilities_df.
# NOTE(review): inplace=True is combined with re-assignment.  If convert_col_types follows
# the pandas convention of returning None when inplace=True, df_usg_inst would be None
# after this cell -- confirm the helper returns the frame in both modes.
df_usg_inst = Utilities_df.convert_col_types(
    df=df_usg_inst, 
    cols_and_types_dict={'measurement_value':float}, 
    to_numeric_errors='coerce', 
    inplace=True
)

In [None]:
sort_by = ['aep_readtime', 'measurement_type']
df_usg_inst = df_usg_inst.sort_values(by=sort_by, ignore_index=True)

In [None]:
# Number of distinct read timestamps on each read date, printed one per line
n_readtimes_by_dt = df_usg_inst.groupby('aep_read_dt')['aep_readtime'].nunique()
for n_readtimes in n_readtimes_by_dt:
    print(n_readtimes)

### Why are there entries that are EXACTLY the same except for the read time (aep_readtime, aep_readtime_utc)?

In [None]:
df_usg_inst.iloc[1]==df_usg_inst.iloc[6]

In [None]:
df_usg_inst[(df_usg_inst['aep_read_dt']=='2021-10-12') & (df_usg_inst['measurement_type']=='instantaneous_kw')]

In [None]:
my_df_usg_inst = df_usg_inst.copy()

In [None]:
# Every column except the two read-time columns defines "the same reading"
non_time_cols = [
    col for col in my_df_usg_inst.columns
    if col not in ('aep_readtime', 'aep_readtime_utc')
]
# keep=False flags every member of a duplicate set, not only the repeats
is_dupl = my_df_usg_inst.duplicated(subset=non_time_cols, keep=False)
my_df_usg_inst_dupl = my_df_usg_inst[is_dupl]
# One group per set of rows that agree on all of the non-time columns
my_df_usg_inst_dupl_gpd = my_df_usg_inst_dupl.groupby(non_time_cols)

In [None]:
# Display the rows of one duplicate set; change get_group_idx to step through the sets
get_group_idx=0
group_keys = list(my_df_usg_inst_dupl_gpd.groups)
my_df_usg_inst_dupl_gpd.get_group(group_keys[get_group_idx])

In [None]:
my_df_usg_inst.shape

In [None]:
my_df_usg_inst.drop_duplicates(subset=[x for x in my_df_usg_inst.columns 
                                       if x not in ['aep_readtime', 'aep_readtime_utc']]).shape

# Setup df_usage_inst

In [None]:
# Connection used by the remaining queries in this notebook.
# NOTE(review): conn_aws, created by the same helper call near the top of the notebook,
# is already bound at this point -- consider reusing it instead of binding a second name.
conn = Utilities.get_athena_prod_aws_connection()

In [None]:
# Query setup: roughly one month of readings for a fixed list of premises
#date_range = ['2020-10-12', '2020-10-13']
date_range = ['2020-10-12', '2020-11-12']

# Randomly selected
premise_nbs = [
    '072163781', '100166573', '101258511', '105347161', '109612790',
    '075671313', '106737082', '102970840', '103596600', '107782860',
]

state_abbr_txs = ['OH']

cols_of_interest_usage_inst = [
    'read_type', 'serialnumber', 'aep_premise_nb',
    'timezoneoffset', 'aep_readtime', 'aep_readtime_utc',
    'measurement_type', 'measurement_value',
    'longitude', 'latitude',
    'aep_opco', 'aep_read_dt',
]

# Single-quote each value and comma-join it, giving the body of a SQL IN (...) list
states_sql = ','.join(f"'{state}'" for state in state_abbr_txs)
premises_sql = ','.join(f"'{prem}'" for prem in premise_nbs)

sql_usage_inst = (
"""
SELECT {cols}
FROM usage_instantaneous.inst_msr_consume
WHERE aep_opco = 'oh' 
AND aep_state IN ({states})
AND aep_premise_nb IN ({premises})
AND aep_read_dt BETWEEN '{date_0}' AND '{date_1}'
"""
).format(
    cols=','.join(cols_of_interest_usage_inst),
    states=states_sql,
    premises=premises_sql,
    date_0=date_range[0],
    date_1=date_range[1],
)

# sql_usage_inst = (
# """
# SELECT {}
# FROM usage_instantaneous.inst_msr_consume
# WHERE aep_opco = 'oh' 
# AND aep_state IN ({})
# AND aep_read_dt BETWEEN '{}' AND '{}'
# LIMIT 100000
# """
# ).format(','.join(cols_of_interest_usage_inst), 
#          ','.join(["'{}'".format(x) for x in state_abbr_txs]), 
#          date_range[0], 
#          date_range[1])

In [None]:
print(sql_usage_inst)

In [None]:
# Run the premise-list query and pass the result straight through
# Utilities_df.remove_table_aliases; the _OG frame is kept as the untouched original.
df_usage_inst_OG = Utilities_df.remove_table_aliases(
    pd.read_sql(sql_usage_inst, conn)
)

In [None]:
df_usage_inst = df_usage_inst_OG.copy()

In [None]:
print(f'df_usage_inst.shape = {df_usage_inst.shape}')

In [None]:
# Cast measurement_value to float through the project helper.  to_numeric_errors='coerce'
# is presumably forwarded to pd.to_numeric, in which case non-numeric readings become NaN
# -- TODO confirm in Utilities_df.
# NOTE(review): inplace=True is combined with re-assignment.  If convert_col_types follows
# the pandas convention of returning None when inplace=True, df_usage_inst would be None
# after this cell -- confirm the helper returns the frame in both modes.
df_usage_inst = Utilities_df.convert_col_types(
    df=df_usage_inst, 
    cols_and_types_dict={'measurement_value':float}, 
    to_numeric_errors='coerce', 
    inplace=True
)

In [None]:
df_usage_inst

In [None]:
df_usage_inst.dtypes

In [None]:
df_usage_inst.drop_duplicates(subset=[x for x in df_usage_inst.columns 
                                      if x not in ['aep_readtime', 'aep_readtime_utc']]).shape

In [None]:
print(df_usage_inst.columns)
df_usage_inst.head(10)

In [None]:
df_usage_inst['read_type'].unique()

In [None]:
len(df_usage_inst['serialnumber'].unique())

In [None]:
df_usage_inst.groupby(['serialnumber', 'aep_read_dt']).ngroups

In [None]:
# Number of distinct read timestamps per (meter, read date) pair, printed one per line
n_readtimes_by_sn_dt = (
    df_usage_inst
    .groupby(['serialnumber', 'aep_read_dt'])['aep_readtime']
    .nunique()
)
for n_readtimes in n_readtimes_by_sn_dt:
    print(n_readtimes)

In [None]:
df_usage_inst['read_type'].unique()

In [None]:
df_usage_inst['measurement_type'].unique()

In [None]:
df_usage_inst.groupby('measurement_type')['measurement_value'].mean()

In [None]:
fig_num = 0

In [None]:
# tmp_df = df_usage_inst[df_usage_inst['measurement_type']=='power_factor_phase_a']
# fig, ax = plt.subplots(1, 1, num=fig_num, figsize=[11, 8.5])
# #fig.suptitle('Transformer Voltages By Date', fontsize=25, fontweight='bold')
# sns.stripplot(ax=ax, x='serialnumber', y='measurement_value', data=tmp_df, jitter=False)
# ax.tick_params(axis='x', labelrotation=90, labelsize=7.0, direction='in');
# fig_num +=1


In [None]:
# measurement_types = df_usage_inst['measurement_type'].unique().tolist()
# for measurement_type in measurement_types:
#     tmp_df = df_usage_inst[df_usage_inst['measurement_type']==measurement_type]
#     fig, ax = plt.subplots(1, 1, num=fig_num, figsize=[11, 8.5])
#     fig.suptitle(f'Measurement Type = {measurement_type}', fontsize=25, fontweight='bold')
#     sns.stripplot(ax=ax, x='serialnumber', y='measurement_value', data=tmp_df, jitter=False)
#     ax.tick_params(axis='x', labelrotation=90, labelsize=7.0, direction='in');
#     fig_num +=1

In [None]:
# measurement_types = df_usage_inst['measurement_type'].unique().tolist()
# for measurement_type in measurement_types:
#     tmp_df = df_usage_inst[df_usage_inst['measurement_type']==measurement_type]
#     fig, ax = plt.subplots(1, 1, num=fig_num, figsize=[11, 8.5])
#     fig.suptitle(f'Measurement Type = {measurement_type}', fontsize=25, fontweight='bold')
#     sns.boxplot(ax=ax, x='read_type', y='measurement_value', data=tmp_df)
#     ax.tick_params(axis='x', labelrotation=90, labelsize=7.0, direction='in');
#     fig_num +=1

In [None]:
state_abbr_txs = ['OH']

# limit = 10000
# date_range = ['2020-10-12', '2020-10-13']

# Small sample per read type over a wide date window
limit = 1000
date_range = ['2020-01-01', '2022-01-01']

cols_of_interest_usage_inst = [
    'read_type', 'serialnumber', 'aep_premise_nb',
    'timezoneoffset', 'aep_readtime', 'aep_readtime_utc',
    'measurement_type', 'measurement_value',
    'longitude', 'latitude',
    'aep_opco', 'aep_read_dt',
]


# Template filled in two passes: the date range and row limit are fixed here, while the
# SELECT list and the read_type literal are left as '{}' placeholders for the second pass.
sql_gen = (
"""
SELECT {select_list}
FROM usage_instantaneous.inst_msr_consume
WHERE aep_opco = 'oh' 
AND aep_read_dt BETWEEN '{date_0}' AND '{date_1}'
AND read_type = {read_type}
LIMIT {row_limit}
"""
).format(
    select_list='{}',
    date_0=date_range[0],
    date_1=date_range[1],
    read_type='{}',
    row_limit=limit,
)

# Second pass: one query per read type, each read type passed as a quoted SQL literal
select_list = ','.join(cols_of_interest_usage_inst)
sql_1, sql_2, sql_3, sql_4, sql_5, sql_6, sql_7 = [
    sql_gen.format(select_list, f"'{read_type_i}'")
    for read_type_i in (
        'mt110_voltage_event',
        'mt113_st28_am_temp_powerquality',
        'mt115_load_control_status',
        'mt117_capacitor_voltage',
        'mt72_kv2c',
        'mt113_st28_pm_temp_powerquality',
        'st03_fatal_errors',
    )
]

In [None]:
# One query per read type, executed in the order sql_1 .. sql_7
df_1, df_2, df_3, df_4, df_5, df_6, df_7 = [
    pd.read_sql(sql_i, conn)
    for sql_i in (sql_1, sql_2, sql_3, sql_4, sql_5, sql_6, sql_7)
]

In [None]:
# Stack the seven per-read-type samples into one frame.  Each df_i was read without an
# index column and so carries its own default 0..n-1 row labels; ignore_index=True gives
# the combined frame fresh, unique labels.  Without it every label would repeat once per
# input frame, which makes label-based selection ambiguous and can break index-aligned
# operations on the combined frame.
df_usage_inst_2_OG = pd.concat(
    [df_1, df_2, df_3, df_4, df_5, df_6, df_7],
    ignore_index=True
)

In [None]:
df_usage_inst_2 = df_usage_inst_2_OG.copy()

In [None]:
df_usage_inst_2 = Utilities_df.remove_table_aliases(df_usage_inst_2)

# Keep only rows whose measurement_value is not null, not a single blank ' ',
# and not the literal 'FAULT' (all three filters applied as one combined mask).
msr_vals = df_usage_inst_2['measurement_value']
is_usable = msr_vals.notna() & (msr_vals!=' ') & (msr_vals!='FAULT')
df_usage_inst_2 = df_usage_inst_2[is_usable]

In [None]:
# Cast measurement_value to float through the project helper.  to_numeric_errors='coerce'
# is presumably forwarded to pd.to_numeric, in which case any remaining non-numeric
# readings become NaN -- TODO confirm in Utilities_df.
# NOTE(review): inplace=True is combined with re-assignment.  If convert_col_types follows
# the pandas convention of returning None when inplace=True, df_usage_inst_2 would be None
# after this cell -- confirm the helper returns the frame in both modes.
df_usage_inst_2 = Utilities_df.convert_col_types(
    df=df_usage_inst_2, 
    cols_and_types_dict={'measurement_value':float}, 
    to_numeric_errors='coerce', 
    inplace=True
)

In [None]:
# measurement_types = df_usage_inst_2['measurement_type'].unique().tolist()
# for measurement_type in measurement_types:
#     tmp_df = df_usage_inst_2[df_usage_inst_2['measurement_type']==measurement_type]
#     fig, ax = plt.subplots(1, 1, num=fig_num, figsize=[11, 8.5])
#     fig.suptitle(f'Measurement Type = {measurement_type}', fontsize=25, fontweight='bold')
#     sns.boxplot(ax=ax, x='read_type', y='measurement_value', data=tmp_df)
#     ax.tick_params(axis='x', labelrotation=90, labelsize=7.0, direction='in');
#     fig_num +=1

In [None]:
df_usage_inst_2

In [None]:
# Sorted list of every measurement type; the plots below do not use it, but the
# LaTeX-table cell that follows reads it.
measurement_types = sorted(df_usage_inst_2['measurement_type'].unique().tolist())
read_types = df_usage_inst_2['read_type'].unique().tolist()
# One letter-size figure per read type: box plot of measurement_value per measurement type
for read_type in read_types:
    is_read_type = df_usage_inst_2['read_type']==read_type
    tmp_df = df_usage_inst_2[is_read_type]
    fig, ax = plt.subplots(nrows=1, ncols=1, num=fig_num, figsize=[11, 8.5])
    fig.suptitle(f'Read Type = {read_type}', fontsize=25, fontweight='bold')
    sns.boxplot(x='measurement_type', y='measurement_value', data=tmp_df, ax=ax)
    # Rotate the measurement-type labels so the long names do not overlap
    ax.tick_params(axis='x', labelrotation=90, labelsize=7.0, direction='in')
    # fig_num is a notebook-level counter, so each figure gets its own number
    fig_num += 1

In [None]:
# measurement_types for LaTeX document
#-------------------------
# Simple, single columned
# for msr_type in measurement_types:
#     print(f'  {msr_type} \\\\ \hline')
#-------------------------
# Two columns
# Print the sorted measurement types as rows of a two-column LaTeX table: the first half
# of the list fills the left column, the second half fills the right column.
left_start = 0
# Ceiling division: with an odd count the left column takes the extra entry
right_start = (len(measurement_types)+1)//2
#-----
left_entries = measurement_types[left_start:right_start]
right_entries = measurement_types[right_start:]
#-----
# Pad the right column with an empty cell so both columns have the same number of rows
if len(left_entries)!=len(right_entries):
    assert(len(left_entries)==len(right_entries)+1)
    right_entries.append('')
assert(len(left_entries)==len(right_entries))
#-----
for left_entry, right_entry in zip(left_entries, right_entries):
    # '\\\\' emits the LaTeX row terminator '\\'.  '\\hline' is written with an escaped
    # backslash because a bare '\h' is an invalid escape sequence in a Python string
    # literal (it triggers a warning); the printed text is unchanged.
    print(f"  {left_entry} & \n  {right_entry} \\\\ \\hline ")

In [None]:
# Map each read type to the measurement types observed under it, printing a sorted
# listing for every read type along the way.
measurement_types_in_each_read_type = {}
banner_rule = '-'*25
for name, group in df_usage_inst_2.groupby('read_type'):
    print(f'{banner_rule}\n{name}\n{banner_rule}')
    unique_measurement_types = group['measurement_type'].unique().tolist()
    print('Unique measurement types:')
    print(*sorted(unique_measurement_types), sep='\n')
    print()
    # groupby yields each read type once, so a repeated key would indicate a problem
    assert(name not in measurement_types_in_each_read_type)
    # The stored list keeps the order of first appearance (only the printout is sorted)
    measurement_types_in_each_read_type[name] = unique_measurement_types

In [None]:
measurement_types_in_each_read_type

In [None]:
df_usage_inst_2.head()

In [None]:
# Collect the distinct combinations of measurement types, and of read types, that occur
# on any single meter (serial number).
# Series.unique() returns values in order of first appearance, so the same combination
# can come back in a different order for different meters.  Each list is sorted before
# the membership test so that a combination is recorded once regardless of row order.
unq_msr_types_by_sn = []
unq_read_types_by_sn = []
for idx, gp in df_usage_inst_2.groupby('serialnumber'):
    msr_types_i = sorted(gp['measurement_type'].unique().tolist())
    read_types_i = sorted(gp['read_type'].unique().tolist())
    if msr_types_i not in unq_msr_types_by_sn:
        unq_msr_types_by_sn.append(msr_types_i)
    if read_types_i not in unq_read_types_by_sn:
        unq_read_types_by_sn.append(read_types_i)

In [None]:
len(unq_msr_types_by_sn)

In [None]:
len(unq_read_types_by_sn)

In [None]:
unq_read_types_by_sn

In [None]:
unq_msr_types_by_sn

In [None]:
df_usage_inst_2.shape

In [None]:
df_usage_inst_2.drop_duplicates(subset=[x for x in df_usage_inst_2.columns 
                                      if x not in ['aep_readtime', 'aep_readtime_utc']]).shape

In [None]:
# Distinct combinations of read types reported by a single meter (serial number).
# Series.unique() returns values in order of first appearance, so each combination is
# sorted before the membership test; otherwise the same set of read types seen in a
# different row order would be recorded as a separate grouping.
possible_read_type_groupings=[]
for idx,gp_df in df_usage_inst_2.groupby('serialnumber'):
    # Compute the unique read types once per meter instead of once per use
    read_types_i = sorted(gp_df['read_type'].unique().tolist())
    if read_types_i not in possible_read_type_groupings:
        possible_read_type_groupings.append(read_types_i)
#     if gp_df['read_type'].nunique()>1:
#         print(f'serialnunber = {idx}')
#         print(gp_df['read_type'].unique())
#         print()

In [None]:
possible_read_type_groupings

In [None]:
sorted(df_usage_inst_2['read_type'].unique().tolist())

In [None]:
# mt110_voltage_event appears by itself or with mt72_kv2c
# mt72_kv2c appears by itself or with mt110_voltage_event

# mt113_st28_am_temp_powerquality appears by itself or with mt113_st28_pm_temp_powerquality or mt115_load_control_status
# mt113_st28_pm_temp_powerquality appears by itself or with mt113_st28_am_temp_powerquality or mt115_load_control_status
# mt115_load_control_status appears by itself or with mt113_st28_am_temp_powerquality or mt113_st28_pm_temp_powerquality

In [None]:
# my_df_usg_inst['read_type'].unique()